orca-sdk 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +30 -0
- orca_sdk/_shared/__init__.py +10 -0
- orca_sdk/_shared/metrics.py +634 -0
- orca_sdk/_shared/metrics_test.py +570 -0
- orca_sdk/_utils/__init__.py +0 -0
- orca_sdk/_utils/analysis_ui.py +196 -0
- orca_sdk/_utils/analysis_ui_style.css +51 -0
- orca_sdk/_utils/auth.py +65 -0
- orca_sdk/_utils/auth_test.py +31 -0
- orca_sdk/_utils/common.py +37 -0
- orca_sdk/_utils/data_parsing.py +129 -0
- orca_sdk/_utils/data_parsing_test.py +244 -0
- orca_sdk/_utils/pagination.py +126 -0
- orca_sdk/_utils/pagination_test.py +132 -0
- orca_sdk/_utils/prediction_result_ui.css +18 -0
- orca_sdk/_utils/prediction_result_ui.py +110 -0
- orca_sdk/_utils/tqdm_file_reader.py +12 -0
- orca_sdk/_utils/value_parser.py +45 -0
- orca_sdk/_utils/value_parser_test.py +39 -0
- orca_sdk/async_client.py +4104 -0
- orca_sdk/classification_model.py +1165 -0
- orca_sdk/classification_model_test.py +887 -0
- orca_sdk/client.py +4096 -0
- orca_sdk/conftest.py +382 -0
- orca_sdk/credentials.py +217 -0
- orca_sdk/credentials_test.py +121 -0
- orca_sdk/datasource.py +576 -0
- orca_sdk/datasource_test.py +463 -0
- orca_sdk/embedding_model.py +712 -0
- orca_sdk/embedding_model_test.py +206 -0
- orca_sdk/job.py +343 -0
- orca_sdk/job_test.py +108 -0
- orca_sdk/memoryset.py +3811 -0
- orca_sdk/memoryset_test.py +1150 -0
- orca_sdk/regression_model.py +841 -0
- orca_sdk/regression_model_test.py +595 -0
- orca_sdk/telemetry.py +742 -0
- orca_sdk/telemetry_test.py +119 -0
- orca_sdk-0.1.9.dist-info/METADATA +98 -0
- orca_sdk-0.1.9.dist-info/RECORD +41 -0
- orca_sdk-0.1.9.dist-info/WHEEL +4 -0
orca_sdk/memoryset.py
ADDED
|
@@ -0,0 +1,3811 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from abc import ABC
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from os import PathLike
|
|
7
|
+
from typing import (
|
|
8
|
+
TYPE_CHECKING,
|
|
9
|
+
Any,
|
|
10
|
+
Generic,
|
|
11
|
+
Iterable,
|
|
12
|
+
Literal,
|
|
13
|
+
Self,
|
|
14
|
+
TypeVar,
|
|
15
|
+
cast,
|
|
16
|
+
overload,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
import pyarrow as pa
|
|
21
|
+
from datasets import Dataset
|
|
22
|
+
from torch.utils.data import DataLoader as TorchDataLoader
|
|
23
|
+
from torch.utils.data import Dataset as TorchDataset
|
|
24
|
+
|
|
25
|
+
from ._utils.common import UNSET, CreateMode, DropMode
|
|
26
|
+
from .async_client import OrcaAsyncClient
|
|
27
|
+
from .client import (
|
|
28
|
+
CascadingEditSuggestion,
|
|
29
|
+
CloneMemorysetRequest,
|
|
30
|
+
CreateMemorysetFromDatasourceRequest,
|
|
31
|
+
CreateMemorysetRequest,
|
|
32
|
+
FilterItem,
|
|
33
|
+
)
|
|
34
|
+
from .client import LabeledMemory as LabeledMemoryResponse
|
|
35
|
+
from .client import (
|
|
36
|
+
LabeledMemoryInsert,
|
|
37
|
+
)
|
|
38
|
+
from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
|
|
39
|
+
from .client import (
|
|
40
|
+
LabeledMemoryUpdate,
|
|
41
|
+
LabeledMemoryWithFeedbackMetrics,
|
|
42
|
+
LabelPredictionMemoryLookup,
|
|
43
|
+
LabelPredictionWithMemoriesAndFeedback,
|
|
44
|
+
ListPredictionsRequest,
|
|
45
|
+
MemoryMetrics,
|
|
46
|
+
MemorysetAnalysisConfigs,
|
|
47
|
+
MemorysetMetadata,
|
|
48
|
+
MemorysetMetrics,
|
|
49
|
+
MemorysetUpdate,
|
|
50
|
+
MemoryType,
|
|
51
|
+
OrcaClient,
|
|
52
|
+
)
|
|
53
|
+
from .client import ScoredMemory as ScoredMemoryResponse
|
|
54
|
+
from .client import (
|
|
55
|
+
ScoredMemoryInsert,
|
|
56
|
+
)
|
|
57
|
+
from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
|
|
58
|
+
from .client import (
|
|
59
|
+
ScoredMemoryUpdate,
|
|
60
|
+
ScoredMemoryWithFeedbackMetrics,
|
|
61
|
+
ScorePredictionMemoryLookup,
|
|
62
|
+
ScorePredictionWithMemoriesAndFeedback,
|
|
63
|
+
TelemetryField,
|
|
64
|
+
TelemetryFilterItem,
|
|
65
|
+
TelemetrySortOptions,
|
|
66
|
+
)
|
|
67
|
+
from .datasource import Datasource
|
|
68
|
+
from .embedding_model import (
|
|
69
|
+
EmbeddingModelBase,
|
|
70
|
+
FinetunedEmbeddingModel,
|
|
71
|
+
PretrainedEmbeddingModel,
|
|
72
|
+
)
|
|
73
|
+
from .job import Job, Status
|
|
74
|
+
from .telemetry import ClassificationPrediction, RegressionPrediction
|
|
75
|
+
|
|
76
|
+
if TYPE_CHECKING:
|
|
77
|
+
from .classification_model import ClassificationModel
|
|
78
|
+
from .regression_model import RegressionModel
|
|
79
|
+
|
|
80
|
+
TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
"""
Sort expression for telemetry data consisting of a field and a direction.

* **`field`**: The field to sort on.
* **`direction`**: The direction to sort in.

Examples:
    >>> ("feedback_metrics.accuracy.avg", "asc")
    >>> ("lookup.count", "desc")
"""

FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"]
"""
Operations that can be used in a filter expression.
"""

FilterValue = str | int | float | bool | datetime | None | list[str | None] | list[int] | list[float] | list[bool]
"""
Values that can be used in a filter expression.
"""

FilterItemTuple = tuple[str, FilterOperation, FilterValue]
"""
Filter expression consisting of a field, an operator, and a value:

* **`field`**: The field to filter on.
* **`operation`**: The operation to apply to the field and value.
* **`value`**: The value to compare the field against.

Examples:
    >>> ("label", "==", 0)
    >>> ("metadata.author", "like", "John")
    >>> ("source_id", "in", ["123", "456"])
    >>> ("feedback_metrics.accuracy.avg", ">", 0.95)
"""

# Supported vector index kinds accepted by the memoryset APIs in this module.
IndexType = Literal["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ", "HNSW", "DISKANN"]

# Columns that exist on every memory row regardless of memory type.
DEFAULT_COLUMN_NAMES = {"value", "source_id", "partition_id"}
# Columns that exist only for one memory type ("label" for LABELED, "score" for SCORED).
TYPE_SPECIFIC_COLUMN_NAMES = {"label", "score"}
# System-managed column names; user metadata keys must not collide with these.
FORBIDDEN_METADATA_COLUMN_NAMES = {
    "memory_id",
    "memory_version",
    "embedding",
    "created_at",
    "updated_at",
    "metrics",
    "feedback_metrics",
    "lookup",
}
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _is_metric_column(column: str):
|
|
134
|
+
return column in ["feedback_metrics", "lookup"]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem | TelemetryFilterItem:
    """
    Convert a `(field, op, value)` filter tuple into an API filter payload.

    Dotted field paths are split into components. A bare field name that is not
    a known column is treated as a metadata key. Filters on telemetry metric
    columns (`feedback_metrics`, `lookup`) are validated and returned as
    `TelemetryFilterItem`; everything else becomes a `FilterItem`.

    Raises:
        ValueError: If a metric filter has a non-numeric value, a malformed
            field path, or uses the unsupported `like` operation.
    """
    field = input[0].split(".")
    # A bare, unrecognized field name is assumed to address a metadata key.
    if (
        len(field) == 1
        and field[0] not in DEFAULT_COLUMN_NAMES | TYPE_SPECIFIC_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES
    ):
        field = ["metadata", field[0]]
    op = input[1]
    value = input[2]
    if isinstance(value, datetime):
        # datetimes are sent to the API as ISO 8601 strings
        value = value.isoformat()
    if _is_metric_column(field[0]):
        # Metric columns only accept numeric values (or lists of numbers for in/not in).
        if not (
            (isinstance(value, list) and all(isinstance(v, float) or isinstance(v, int) for v in value))
            or isinstance(value, float)
            or isinstance(value, int)
        ):
            raise ValueError(f"Invalid value for {field[0]} filter: {value}")
        if field[0] == "feedback_metrics" and (len(field) != 3 or field[2] not in ["avg", "count"]):
            raise ValueError(
                "Feedback metrics filters must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
            )
        elif field[0] == "lookup" and (len(field) != 2 or field[1] != "count"):
            raise ValueError("Lookup filters must follow the format `lookup.count`")
        if op == "like":
            raise ValueError("Like filters are not supported on metric columns")
        op = cast(Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in"], op)
        value = cast(float | int | list[float] | list[int], value)
        return TelemetryFilterItem(field=cast(TelemetryField, tuple(field)), op=op, value=value)

    # Convert list to tuple for FilterItem field type
    return FilterItem(field=tuple(field), op=op, value=value)  # type: ignore[assignment]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _parse_sort_item_from_tuple(
|
|
172
|
+
input: TelemetrySortItem,
|
|
173
|
+
) -> TelemetrySortOptions:
|
|
174
|
+
field = input[0].split(".")
|
|
175
|
+
|
|
176
|
+
if len(field) == 1:
|
|
177
|
+
raise ValueError("Sort field must be a telemetry field with an aggregate function name value")
|
|
178
|
+
if field[0] not in ["feedback_metrics", "lookup"]:
|
|
179
|
+
raise ValueError("Sort field must be one of telemetry fields: feedback_metrics or lookup")
|
|
180
|
+
if field[0] == "feedback_metrics":
|
|
181
|
+
if len(field) != 3:
|
|
182
|
+
raise ValueError(
|
|
183
|
+
"Feedback metrics must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
|
|
184
|
+
)
|
|
185
|
+
if field[2] not in ["avg", "count"]:
|
|
186
|
+
raise ValueError("Feedback metrics can only be sorted on avg or count")
|
|
187
|
+
if field[0] == "lookup":
|
|
188
|
+
if len(field) != 2:
|
|
189
|
+
raise ValueError("Lookup must follow the format `lookup.count`")
|
|
190
|
+
if field[1] != "count":
|
|
191
|
+
raise ValueError("Lookup can only be sorted on count")
|
|
192
|
+
# Convert list to tuple for TelemetryField type
|
|
193
|
+
return TelemetrySortOptions(field=cast(TelemetryField, tuple(field)), direction=input[1])
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _parse_memory_insert(memory: dict[str, Any], type: MemoryType) -> LabeledMemoryInsert | ScoredMemoryInsert:
|
|
197
|
+
value = memory.get("value")
|
|
198
|
+
if not isinstance(value, str):
|
|
199
|
+
raise ValueError("Memory value must be a string")
|
|
200
|
+
source_id = memory.get("source_id")
|
|
201
|
+
if source_id is not None and not isinstance(source_id, str):
|
|
202
|
+
raise ValueError("Memory source_id must be a string")
|
|
203
|
+
partition_id = memory.get("partition_id")
|
|
204
|
+
if partition_id is not None and not isinstance(partition_id, str):
|
|
205
|
+
raise ValueError("Memory partition_id must be a string")
|
|
206
|
+
match type:
|
|
207
|
+
case "LABELED":
|
|
208
|
+
label = memory.get("label")
|
|
209
|
+
if label is not None and not isinstance(label, int):
|
|
210
|
+
raise ValueError("Memory label must be an integer")
|
|
211
|
+
metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"label"}}
|
|
212
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
213
|
+
raise ValueError(
|
|
214
|
+
f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
215
|
+
)
|
|
216
|
+
return {
|
|
217
|
+
"value": value,
|
|
218
|
+
"label": label,
|
|
219
|
+
"source_id": source_id,
|
|
220
|
+
"partition_id": partition_id,
|
|
221
|
+
"metadata": metadata,
|
|
222
|
+
}
|
|
223
|
+
case "SCORED":
|
|
224
|
+
score = memory.get("score")
|
|
225
|
+
if score is not None and not isinstance(score, (int, float)):
|
|
226
|
+
raise ValueError("Memory score must be a number")
|
|
227
|
+
metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"score"}}
|
|
228
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
229
|
+
raise ValueError(
|
|
230
|
+
f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
231
|
+
)
|
|
232
|
+
return {
|
|
233
|
+
"value": value,
|
|
234
|
+
"score": score,
|
|
235
|
+
"source_id": source_id,
|
|
236
|
+
"partition_id": partition_id,
|
|
237
|
+
"metadata": metadata,
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _parse_memory_update(update: dict[str, Any], type: MemoryType) -> LabeledMemoryUpdate | ScoredMemoryUpdate:
|
|
242
|
+
if "memory_id" not in update:
|
|
243
|
+
raise ValueError("memory_id must be specified in the update dictionary")
|
|
244
|
+
memory_id = update["memory_id"]
|
|
245
|
+
if not isinstance(memory_id, str):
|
|
246
|
+
raise ValueError("memory_id must be a string")
|
|
247
|
+
payload: LabeledMemoryUpdate | ScoredMemoryUpdate = {"memory_id": memory_id}
|
|
248
|
+
if "value" in update:
|
|
249
|
+
if not isinstance(update["value"], str):
|
|
250
|
+
raise ValueError("value must be a string or unset")
|
|
251
|
+
payload["value"] = update["value"]
|
|
252
|
+
if "source_id" in update:
|
|
253
|
+
source_id = update["source_id"]
|
|
254
|
+
if source_id is not None and not isinstance(source_id, str):
|
|
255
|
+
raise ValueError("source_id must be a string or None")
|
|
256
|
+
payload["source_id"] = source_id
|
|
257
|
+
if "partition_id" in update:
|
|
258
|
+
partition_id = update["partition_id"]
|
|
259
|
+
if partition_id is not None and not isinstance(partition_id, str):
|
|
260
|
+
raise ValueError("partition_id must be a string or None")
|
|
261
|
+
payload["partition_id"] = partition_id
|
|
262
|
+
match type:
|
|
263
|
+
case "LABELED":
|
|
264
|
+
payload = cast(LabeledMemoryUpdate, payload)
|
|
265
|
+
if "label" in update:
|
|
266
|
+
if not isinstance(update["label"], int):
|
|
267
|
+
raise ValueError("label must be an integer or unset")
|
|
268
|
+
payload["label"] = update["label"]
|
|
269
|
+
metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "label"}}
|
|
270
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
271
|
+
raise ValueError(
|
|
272
|
+
f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
273
|
+
)
|
|
274
|
+
payload["metadata"] = metadata
|
|
275
|
+
return payload
|
|
276
|
+
case "SCORED":
|
|
277
|
+
payload = cast(ScoredMemoryUpdate, payload)
|
|
278
|
+
if "score" in update:
|
|
279
|
+
if not isinstance(update["score"], (int, float)):
|
|
280
|
+
raise ValueError("score must be a number or unset")
|
|
281
|
+
payload["score"] = update["score"]
|
|
282
|
+
metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "score"}}
|
|
283
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
284
|
+
raise ValueError(
|
|
285
|
+
f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
286
|
+
)
|
|
287
|
+
payload["metadata"] = metadata
|
|
288
|
+
return cast(ScoredMemoryUpdate, payload)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
class MemoryBase(ABC):
    """
    Abstract base for a single memory row in a memoryset.

    Wraps the raw API response dict and exposes its fields as attributes.
    Metadata keys are additionally reachable as plain attributes via
    `__getattr__`. Subclasses set `memory_type` and provide `predictions()`.
    """

    value: str
    embedding: list[float]
    source_id: str | None
    partition_id: str | None
    created_at: datetime
    updated_at: datetime
    metadata: dict[str, str | float | int | bool | None]
    metrics: MemoryMetrics
    memory_id: str
    memory_version: int
    feedback_metrics: dict[str, Any]
    lookup_count: int
    memory_type: MemoryType  # defined by subclasses

    def __init__(
        self,
        memoryset_id: str,
        memory: (
            LabeledMemoryResponse
            | LabeledMemoryLookupResponse
            | LabeledMemoryWithFeedbackMetrics
            | LabelPredictionMemoryLookup
            | ScoredMemoryResponse
            | ScoredMemoryLookupResponse
            | ScoredMemoryWithFeedbackMetrics
            | ScorePredictionMemoryLookup
        ),
    ):
        # for internal use only, do not document
        self.memoryset_id = memoryset_id
        self.memory_id = memory["memory_id"]
        self.memory_version = memory["memory_version"]
        self.value = cast(str, memory["value"])
        self.embedding = memory["embedding"]
        self.source_id = memory["source_id"]
        self.partition_id = memory["partition_id"]
        # timestamps arrive from the API as ISO 8601 strings
        self.created_at = datetime.fromisoformat(memory["created_at"])
        self.updated_at = datetime.fromisoformat(memory["updated_at"])
        self.metadata = memory["metadata"]
        # not every response variant carries telemetry fields; default them
        self.metrics = memory["metrics"] if "metrics" in memory else {}
        self.feedback_metrics = memory.get("feedback_metrics", {}) or {}
        self.lookup_count = memory.get("lookup_count", 0)

    def __getattr__(self, key: str) -> Any:
        # Expose metadata keys as attributes. Dunder lookups must fail normally
        # so protocol probes (copy, pickle, etc.) are not intercepted.
        if key.startswith("__") or key not in self.metadata:
            raise AttributeError(f"{key} is not a valid attribute")
        return self.metadata[key]

    def _convert_to_classification_prediction(
        self,
        prediction: LabelPredictionWithMemoriesAndFeedback,
        *,
        memoryset: LabeledMemoryset,
        model: ClassificationModel,
    ) -> ClassificationPrediction:
        """
        Convert internal prediction TypedDict to ClassificationPrediction object.
        """
        input_value = prediction.get("input_value")
        input_value_str: str | None = None
        if input_value is not None:
            # input values may be stored as raw bytes; normalize to str
            input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value

        return ClassificationPrediction(
            prediction_id=prediction["prediction_id"],
            label=prediction.get("label"),
            label_name=prediction.get("label_name"),
            score=None,
            confidence=prediction["confidence"],
            anomaly_score=prediction["anomaly_score"],
            memoryset=memoryset,
            model=model,
            telemetry=prediction,
            logits=prediction.get("logits"),
            input_value=input_value_str,
        )

    def _convert_to_regression_prediction(
        self,
        prediction: ScorePredictionWithMemoriesAndFeedback,
        *,
        memoryset: ScoredMemoryset,
        model: RegressionModel,
    ) -> RegressionPrediction:
        """
        Convert internal prediction TypedDict to RegressionPrediction object.
        """
        input_value = prediction.get("input_value")
        input_value_str: str | None = None
        if input_value is not None:
            # input values may be stored as raw bytes; normalize to str
            input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value

        return RegressionPrediction(
            prediction_id=prediction["prediction_id"],
            label=None,
            label_name=None,
            score=prediction.get("score"),
            confidence=prediction["confidence"],
            anomaly_score=prediction["anomaly_score"],
            memoryset=memoryset,
            model=model,
            telemetry=prediction,
            logits=None,
            input_value=input_value_str,
        )

    def feedback(self) -> dict[str, list[bool] | list[float]]:
        """
        Get feedback metrics computed from predictions that used this memory.

        Returns a dictionary where:
        - Keys are feedback category names
        - Values are lists of feedback values (you may want to look at mean on the raw data)
        """
        # Collect all feedbacks by category, paginating through all predictions
        feedback_by_category: dict[str, list[bool] | list[float]] = {}
        batch_size = 500
        offset = 0

        while True:
            # NOTE(review): `predictions` is provided by concrete subclasses,
            # not defined on this base class
            predictions_batch = self.predictions(limit=batch_size, offset=offset)

            if not predictions_batch:
                break

            for prediction in predictions_batch:
                telemetry = prediction._telemetry
                if "feedbacks" not in telemetry:
                    continue

                for fb in telemetry["feedbacks"]:
                    category_name = fb["category_name"]
                    value = fb["value"]
                    # Convert BINARY (1/0) to boolean, CONTINUOUS to float
                    if fb["category_type"] == "BINARY":
                        value = bool(value)
                        if category_name not in feedback_by_category:
                            feedback_by_category[category_name] = []
                        cast(list[bool], feedback_by_category[category_name]).append(value)
                    else:
                        value = float(value)
                        if category_name not in feedback_by_category:
                            feedback_by_category[category_name] = []
                        cast(list[float], feedback_by_category[category_name]).append(value)

            # a short page means we reached the last page of predictions
            if len(predictions_batch) < batch_size:
                break

            offset += batch_size

        return feedback_by_category

    def _update(
        self,
        *,
        value: str = UNSET,
        source_id: str | None = UNSET,
        partition_id: str | None = UNSET,
        **metadata: None | bool | float | int | str,
    ) -> Self:
        """
        Send a partial update for this memory and refresh this instance in place.

        Fields left as UNSET are omitted from the request payload and therefore
        not changed on the server.
        """
        client = OrcaClient._resolve_client()
        response = client.PATCH(
            "/gpu/memoryset/{name_or_id}/memory",
            params={"name_or_id": self.memoryset_id},
            json=_parse_memory_update(
                {"memory_id": self.memory_id}
                | ({"value": value} if value is not UNSET else {})
                | ({"source_id": source_id} if source_id is not UNSET else {})
                | ({"partition_id": partition_id} if partition_id is not UNSET else {})
                | {k: v for k, v in metadata.items() if v is not UNSET},
                type=self.memory_type,
            ),
        )
        # rebuild from the response and copy the fresh state onto this instance
        self.__dict__.update(self.__class__(self.memoryset_id, response).__dict__)
        return self

    def to_dict(self) -> dict[str, Any]:
        """
        Convert the memory to a dictionary
        """
        return {
            "value": self.value,
            "embedding": self.embedding,
            "source_id": self.source_id,
            "partition_id": self.partition_id,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "metadata": self.metadata,
            "metrics": self.metrics,
            "memory_id": self.memory_id,
            "memory_version": self.memory_version,
            "feedback_metrics": self.feedback_metrics,
            "lookup_count": self.lookup_count,
            "memory_type": self.memory_type,
        }
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
class LabeledMemory(MemoryBase):
    """
    A row of the [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]

    Attributes:
        value: Value represented by the row
        embedding: Embedding of the value of the memory for semantic search, automatically generated
            with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
        label: Class label of the memory
        label_name: Human-readable name of the label, automatically populated from the
            [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
        source_id: Optional unique identifier of the memory in a system of reference
        partition_id: Optional identifier of the partition the memory belongs to
        metrics: Metrics about the memory, generated when running an analysis on the
            [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
        metadata: Metadata associated with the memory that is not used in the model. Metadata
            properties are also accessible as individual attributes on the instance.
        memory_id: Unique identifier for the memory, automatically generated on insert
        memory_version: Version of the memory, automatically updated when the label or value changes
        created_at: When the memory was created, automatically generated on insert
        updated_at: When the memory was last updated, automatically updated on update

    ## Other Attributes:
    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
    """

    label: int | None
    label_name: str | None
    memory_type = "LABELED"

    def __init__(
        self,
        memoryset_id: str,
        memory: (
            LabeledMemoryResponse
            | LabeledMemoryLookupResponse
            | LabelPredictionMemoryLookup
            | LabeledMemoryWithFeedbackMetrics
        ),
    ):
        # for internal use only, do not document
        super().__init__(memoryset_id, memory)
        self.label = memory["label"]
        self.label_name = memory["label_name"]

    def __repr__(self) -> str:
        return (
            "LabeledMemory({ "
            + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
            + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
            + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
            + (f", partition_id: '{self.partition_id}'" if self.partition_id is not None else "")
            + " })"
        )

    def __eq__(self, other: object) -> bool:
        return isinstance(other, LabeledMemory) and self.memory_id == other.memory_id

    def __hash__(self) -> int:
        # Defining __eq__ implicitly sets __hash__ to None, which made memories
        # unusable in sets and as dict keys; hash on memory_id to stay
        # consistent with equality.
        return hash(self.memory_id)

    def update(
        self,
        *,
        value: str = UNSET,
        label: int | None = UNSET,
        source_id: str | None = UNSET,
        partition_id: str | None = UNSET,
        **metadata: None | bool | float | int | str,
    ) -> LabeledMemory:
        """
        Update the memory with new values

        Note:
            If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.

        Params:
            value: New value of the memory
            label: New label of the memory
            source_id: New source ID of the memory
            partition_id: New partition ID of the memory
            **metadata: New values for metadata properties

        Returns:
            The updated memory
        """
        self._update(value=value, label=label, source_id=source_id, partition_id=partition_id, **metadata)
        return self

    def predictions(
        self,
        limit: int = 100,
        offset: int = 0,
        tag: str | None = None,
        sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
        expected_label_match: bool | None = None,
    ) -> list[ClassificationPrediction]:
        """
        Get classification predictions that used this memory.

        Args:
            limit: Maximum number of predictions to return (default: 100)
            offset: Number of predictions to skip for pagination (default: 0)
            tag: Optional tag filter to only include predictions with this tag
            sort: List of (field, direction) tuples for sorting results.
                Valid fields: "anomaly_score", "confidence", "timestamp".
                Valid directions: "asc", "desc"
            expected_label_match: Filter by prediction correctness:
                - True: only return correct predictions (label == expected_label)
                - False: only return incorrect predictions (label != expected_label)
                - None: return all predictions (default)

        Returns:
            List of ClassificationPrediction objects that used this memory
        """

        client = OrcaClient._resolve_client()
        request_json: ListPredictionsRequest = {
            "memory_id": self.memory_id,
            "limit": limit,
            "offset": offset,
            "tag": tag,
            "expected_label_match": expected_label_match,
        }
        if sort:
            request_json["sort"] = sort
        predictions_data = client.POST(
            "/telemetry/prediction",
            json=request_json,
        )

        # Filter to only classification predictions and convert to ClassificationPrediction objects
        classification_predictions = [
            cast(LabelPredictionWithMemoriesAndFeedback, p) for p in predictions_data if "label" in p
        ]

        # imported here to avoid a circular import at module load time
        from .classification_model import ClassificationModel

        # cache memorysets and models so repeated ids are only opened once
        memorysets: dict[str, LabeledMemoryset] = {}
        models: dict[str, ClassificationModel] = {}

        def resolve_memoryset(memoryset_id: str) -> LabeledMemoryset:
            if memoryset_id not in memorysets:
                memorysets[memoryset_id] = LabeledMemoryset.open(memoryset_id)
            return memorysets[memoryset_id]

        def resolve_model(model_id: str) -> ClassificationModel:
            if model_id not in models:
                models[model_id] = ClassificationModel.open(model_id)
            return models[model_id]

        return [
            self._convert_to_classification_prediction(
                p,
                memoryset=resolve_memoryset(p["memoryset_id"]),
                model=resolve_model(p["model_id"]),
            )
            for p in classification_predictions
        ]

    def to_dict(self) -> dict[str, Any]:
        """
        Convert the memory to a dictionary
        """
        super_dict = super().to_dict()
        super_dict["label"] = self.label
        super_dict["label_name"] = self.label_name
        return super_dict
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
class LabeledMemoryLookup(LabeledMemory):
    """
    A [`LabeledMemory`][orca_sdk.memoryset.LabeledMemory] returned from a memoryset lookup.

    In addition to all attributes of `LabeledMemory`, carries how well the
    memory matched the query and, when the lookup happened during a prediction,
    how much attention the model paid to it.

    Attributes:
        lookup_score: Similarity between the memory embedding and search query embedding
        attention_weight: Weight the model assigned to the memory during prediction if this lookup
            happened as part of a prediction
        value: Value represented by the row
        embedding: Embedding of the value of the memory for semantic search, automatically generated
            with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
        label: Class label of the memory
        label_name: Human-readable name of the label, automatically populated from the
            [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
        source_id: Optional unique identifier of the memory in a system of reference
        partition_id: Optional identifier of the partition the memory belongs to
        metrics: Metrics about the memory, generated when running an analysis on the
            [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
        metadata: Metadata associated with the memory that is not used in the model. Metadata
            properties are also accessible as individual attributes on the instance.
        memory_id: The unique identifier for the memory, automatically generated on insert
        memory_version: The version of the memory, automatically updated when the label or value changes
        created_at: When the memory was created, automatically generated on insert
        updated_at: When the memory was last updated, automatically updated on update

    ## Other Attributes:
    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
    """

    lookup_score: float
    attention_weight: float | None

    def __init__(
        self,
        memoryset_id: str,
        memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
    ):
        # for internal use only, do not document
        super().__init__(memoryset_id, memory_lookup)
        self.lookup_score = memory_lookup["lookup_score"]
        # attention weights only exist for lookups made during a prediction
        self.attention_weight = memory_lookup.get("attention_weight")

    def __repr__(self) -> str:
        if self.label_name:
            label_repr = f"<{self.label_name}: {self.label}>"
        else:
            label_repr = str(self.label)
        if isinstance(self.value, str) and len(self.value) > 100:
            value_repr = self.value[:100] + "..."
        else:
            value_repr = self.value
        parts = [f"label: {label_repr}", f"lookup_score: {self.lookup_score:.2f}"]
        if self.attention_weight is not None:
            parts.append(f"attention_weight: {self.attention_weight:.2f}")
        parts.append(f"value: '{value_repr}'")
        if self.source_id is not None:
            parts.append(f"source_id: '{self.source_id}'")
        if self.partition_id is not None:
            parts.append(f"partition_id: '{self.partition_id}'")
        return "LabeledMemoryLookup({ " + ", ".join(parts) + " })"
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
class ScoredMemory(MemoryBase):
    """
    A row of the [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]

    Attributes:
        value: Value represented by the row
        embedding: Embedding of the value of the memory for semantic search, automatically generated
            with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
        score: Score of the memory
        source_id: Optional unique identifier of the memory in a system of reference
        partition_id: Optional identifier of the partition the memory belongs to
        metrics: Metrics about the memory, generated when running an analysis on the
            [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
        metadata: Metadata associated with the memory that is not used in the model. Metadata
            properties are also accessible as individual attributes on the instance.
        memory_id: Unique identifier for the memory, automatically generated on insert
        memory_version: Version of the memory, automatically updated when the score or value changes
        created_at: When the memory was created, automatically generated on insert
        updated_at: When the memory was last updated, automatically updated on update

    ## Other Attributes:
    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
    """

    score: float | None
    memory_type = "SCORED"

    def __init__(
        self,
        memoryset_id: str,
        memory: (
            ScoredMemoryResponse
            | ScoredMemoryLookupResponse
            | ScorePredictionMemoryLookup
            | ScoredMemoryWithFeedbackMetrics
        ),
    ):
        # for internal use only, do not document
        super().__init__(memoryset_id, memory)
        self.score = memory["score"]

    def __repr__(self) -> str:
        # score is float | None — guard the float format spec so repr never raises
        score_repr = f"{self.score:.2f}" if self.score is not None else "None"
        return (
            "ScoredMemory({ "
            + f"score: {score_repr}"
            + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
            + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
            + (f", partition_id: '{self.partition_id}'" if self.partition_id is not None else "")
            + " })"
        )

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ScoredMemory) and self.memory_id == other.memory_id

    def __hash__(self) -> int:
        # Keep instances hashable and consistent with __eq__ (equality is by memory_id);
        # defining __eq__ alone would implicitly set __hash__ to None.
        return hash(self.memory_id)

    def update(
        self,
        *,
        value: str = UNSET,
        score: float | None = UNSET,
        source_id: str | None = UNSET,
        partition_id: str | None = UNSET,
        **metadata: None | bool | float | int | str,
    ) -> ScoredMemory:
        """
        Update the memory with new values

        Note:
            If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.

        Params:
            value: New value of the memory
            score: New score of the memory
            source_id: New source ID of the memory
            partition_id: New partition ID of the memory
            **metadata: New values for metadata properties

        Returns:
            The updated memory
        """
        self._update(value=value, score=score, source_id=source_id, partition_id=partition_id, **metadata)
        return self

    def predictions(
        self,
        limit: int = 100,
        offset: int = 0,
        tag: str | None = None,
        sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] | None = None,
        expected_label_match: bool | None = None,
    ) -> list[RegressionPrediction]:
        """
        Get regression predictions that used this memory.

        Args:
            limit: Maximum number of predictions to return (default: 100)
            offset: Number of predictions to skip for pagination (default: 0)
            tag: Optional tag filter to only include predictions with this tag
            sort: List of (field, direction) tuples for sorting results.
                Valid fields: "anomaly_score", "confidence", "timestamp".
                Valid directions: "asc", "desc"
            expected_label_match: Filter by prediction correctness:
                - True: only return correct predictions (score close to expected_score)
                - False: only return incorrect predictions (score differs from expected_score)
                - None: return all predictions (default)
                Note: For regression, "correctness" is based on score proximity to expected_score.

        Returns:
            List of RegressionPrediction objects that used this memory
        """
        client = OrcaClient._resolve_client()
        request_json: ListPredictionsRequest = {
            "memory_id": self.memory_id,
            "limit": limit,
            "offset": offset,
            "tag": tag,
            "expected_label_match": expected_label_match,
        }
        if sort:
            request_json["sort"] = sort
        predictions_data = client.POST(
            "/telemetry/prediction",
            json=request_json,
        )

        # Filter to only regression predictions and convert to RegressionPrediction objects
        regression_predictions = [
            cast(ScorePredictionWithMemoriesAndFeedback, p) for p in predictions_data if "score" in p
        ]

        # local import to avoid a circular import at module load time
        from .regression_model import RegressionModel

        # caches so each memoryset/model is only opened once across all predictions
        memorysets: dict[str, ScoredMemoryset] = {}
        models: dict[str, RegressionModel] = {}

        def resolve_memoryset(memoryset_id: str) -> ScoredMemoryset:
            if memoryset_id not in memorysets:
                memorysets[memoryset_id] = ScoredMemoryset.open(memoryset_id)
            return memorysets[memoryset_id]

        def resolve_model(model_id: str) -> RegressionModel:
            if model_id not in models:
                models[model_id] = RegressionModel.open(model_id)
            return models[model_id]

        return [
            self._convert_to_regression_prediction(
                p,
                memoryset=resolve_memoryset(p["memoryset_id"]),
                model=resolve_model(p["model_id"]),
            )
            for p in regression_predictions
        ]

    def to_dict(self) -> dict[str, Any]:
        """
        Convert the memory to a dictionary
        """
        super_dict = super().to_dict()
        super_dict["score"] = self.score
        return super_dict
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
class ScoredMemoryLookup(ScoredMemory):
    """
    Lookup result for a memory in a memoryset

    Attributes:
        lookup_score: Similarity between the memory embedding and search query embedding
        attention_weight: Weight the model assigned to the memory during prediction if this lookup
            happened as part of a prediction
        value: Value represented by the row
        embedding: Embedding of the value of the memory for semantic search, automatically generated
            with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
        score: Score of the memory
        source_id: Optional unique identifier of the memory in a system of reference
        partition_id: Optional identifier of the partition the memory belongs to
        metrics: Metrics about the memory, generated when running an analysis on the
            [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
        memory_id: The unique identifier for the memory, automatically generated on insert
        memory_version: The version of the memory, automatically updated when the score or value changes
        created_at: When the memory was created, automatically generated on insert
        updated_at: When the memory was last updated, automatically updated on update

    ## Other Attributes:
    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
    """

    lookup_score: float
    attention_weight: float | None

    def __init__(
        self,
        memoryset_id: str,
        memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup,
    ):
        # for internal use only, do not document
        super().__init__(memoryset_id, memory_lookup)
        self.lookup_score = memory_lookup["lookup_score"]
        # attention_weight is only present when the lookup happened during a prediction
        self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None

    def __repr__(self) -> str:
        # score is float | None (inherited) — guard the float format spec so repr never raises
        score_repr = f"{self.score:.2f}" if self.score is not None else "None"
        return (
            "ScoredMemoryLookup({ "
            + f"score: {score_repr}"
            + f", lookup_score: {self.lookup_score:.2f}"
            + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
            + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
            + (f", partition_id: '{self.partition_id}'" if self.partition_id is not None else "")
            + " })"
        )
|
|
920
|
+
|
|
921
|
+
|
|
922
|
+
# Type parameters for MemorysetBase: MemoryT is the concrete memory-row type,
# MemoryLookupT the concrete lookup-result type; both must derive from MemoryBase.
MemoryT = TypeVar("MemoryT", bound=MemoryBase)
MemoryLookupT = TypeVar("MemoryLookupT", bound=MemoryBase)
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
    """
    A Handle to a collection of memories with labels in the OrcaCloud

    Attributes:
        id: Unique identifier for the memoryset
        name: Unique name of the memoryset
        description: Description of the memoryset
        length: Number of memories in the memoryset
        embedding_model: Embedding model used to embed the memory values for semantic search
        created_at: When the memoryset was created, automatically generated on create
        updated_at: When the memoryset was last updated, automatically updated on updates
    """

    id: str
    name: str
    description: str | None
    memory_type: MemoryType  # defined by subclasses

    length: int
    created_at: datetime
    updated_at: datetime
    # Status of the initial insertion job; None when no insertion job was recorded
    insertion_status: Status | None
    # Resolved from either a pretrained model name or a finetuned model id (see __init__)
    embedding_model: EmbeddingModelBase
    # Vector-index configuration, e.g. "FLAT"; params are passed through as-is
    index_type: IndexType
    index_params: dict[str, Any]
    hidden: bool
|
|
953
|
+
|
|
954
|
+
def __init__(self, metadata: MemorysetMetadata):
|
|
955
|
+
# for internal use only, do not document
|
|
956
|
+
if metadata["pretrained_embedding_model_name"]:
|
|
957
|
+
self.embedding_model = PretrainedEmbeddingModel._get(metadata["pretrained_embedding_model_name"])
|
|
958
|
+
elif metadata["finetuned_embedding_model_id"]:
|
|
959
|
+
self.embedding_model = FinetunedEmbeddingModel.open(metadata["finetuned_embedding_model_id"])
|
|
960
|
+
else:
|
|
961
|
+
raise ValueError("Either pretrained_embedding_model_name or finetuned_embedding_model_id must be provided")
|
|
962
|
+
self.id = metadata["id"]
|
|
963
|
+
self.name = metadata["name"]
|
|
964
|
+
self.description = metadata["description"]
|
|
965
|
+
self.length = metadata["length"]
|
|
966
|
+
self.created_at = datetime.fromisoformat(metadata["created_at"])
|
|
967
|
+
self.updated_at = datetime.fromisoformat(metadata["updated_at"])
|
|
968
|
+
self.insertion_status = (
|
|
969
|
+
Status(metadata["insertion_status"]) if metadata["insertion_status"] is not None else None
|
|
970
|
+
)
|
|
971
|
+
self._last_refresh = datetime.now()
|
|
972
|
+
self.index_type = metadata["index_type"]
|
|
973
|
+
self.index_params = metadata["index_params"]
|
|
974
|
+
self.memory_type = metadata["memory_type"]
|
|
975
|
+
self.hidden = metadata["hidden"]
|
|
976
|
+
|
|
977
|
+
def __eq__(self, other) -> bool:
|
|
978
|
+
return isinstance(other, MemorysetBase) and self.id == other.id
|
|
979
|
+
|
|
980
|
+
def __repr__(self) -> str:
|
|
981
|
+
return (
|
|
982
|
+
f"{self.memory_type.capitalize()}Memoryset(" + "{\n"
|
|
983
|
+
f" name: '{self.name}',\n"
|
|
984
|
+
f" length: {self.length},\n"
|
|
985
|
+
f" embedding_model: {self.embedding_model},\n"
|
|
986
|
+
"})"
|
|
987
|
+
)
|
|
988
|
+
|
|
989
|
+
    @classmethod
    def _handle_if_exists(
        cls,
        name: str,
        *,
        if_exists: CreateMode,
        label_names: list[str] | None,
        embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None,
    ) -> Self | None:
        """
        Handle common `if_exists` logic shared by all creator-style helpers.

        Returns the already-existing memoryset when `if_exists == "open"`, raises for `"error"`,
        and returns `None` when the memoryset does not yet exist.
        """
        if not cls.exists(name):
            # Nothing to reconcile — caller proceeds with creation.
            return None
        if if_exists == "error":
            raise ValueError(f"Memoryset with name {name} already exists")

        existing = cls.open(name)

        # When opening an existing memoryset, verify the caller's requested params match it.
        # label_names only exists on labeled memorysets, hence the hasattr guard.
        if label_names is not None and hasattr(existing, "label_names"):
            existing_label_names = getattr(existing, "label_names")
            if label_names != existing_label_names:
                requested = ", ".join(label_names)
                existing_joined = ", ".join(existing_label_names)
                raise ValueError(
                    f"Memoryset {name} already exists with label names [{existing_joined}] "
                    f"(requested: [{requested}])."
                )

        if embedding_model is not None and embedding_model != existing.embedding_model:
            existing_model = existing.embedding_model
            # Best-effort human-readable identifier: prefer .name, then .path, then str().
            existing_model_name = getattr(existing_model, "name", getattr(existing_model, "path", str(existing_model)))
            requested_name = getattr(embedding_model, "name", getattr(embedding_model, "path", str(embedding_model)))
            raise ValueError(
                f"Memoryset {name} already exists with embedding_model {existing_model_name} "
                f"(requested: {requested_name})."
            )

        return existing
|
|
1031
|
+
|
|
1032
|
+
@classmethod
|
|
1033
|
+
def _create_from_datasource(
|
|
1034
|
+
cls,
|
|
1035
|
+
name: str,
|
|
1036
|
+
*,
|
|
1037
|
+
datasource: Datasource,
|
|
1038
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1039
|
+
value_column: str = "value",
|
|
1040
|
+
label_column: str | None = None,
|
|
1041
|
+
score_column: str | None = None,
|
|
1042
|
+
source_id_column: str | None = None,
|
|
1043
|
+
partition_id_column: str | None = None,
|
|
1044
|
+
description: str | None = None,
|
|
1045
|
+
label_names: list[str] | None = None,
|
|
1046
|
+
max_seq_length_override: int | None = None,
|
|
1047
|
+
prompt: str | None = None,
|
|
1048
|
+
remove_duplicates: bool = True,
|
|
1049
|
+
index_type: IndexType = "FLAT",
|
|
1050
|
+
index_params: dict[str, Any] = {},
|
|
1051
|
+
if_exists: CreateMode = "error",
|
|
1052
|
+
background: bool = False,
|
|
1053
|
+
hidden: bool = False,
|
|
1054
|
+
subsample: int | float | None = None,
|
|
1055
|
+
memory_type: MemoryType | None = None,
|
|
1056
|
+
) -> Self | Job[Self]:
|
|
1057
|
+
"""
|
|
1058
|
+
Create a memoryset from a datasource by calling the API.
|
|
1059
|
+
|
|
1060
|
+
This is a private method that performs the actual API call to create a memoryset from a datasource.
|
|
1061
|
+
"""
|
|
1062
|
+
if embedding_model is None:
|
|
1063
|
+
embedding_model = PretrainedEmbeddingModel.GTE_BASE
|
|
1064
|
+
|
|
1065
|
+
existing = cls._handle_if_exists(
|
|
1066
|
+
name,
|
|
1067
|
+
if_exists=if_exists,
|
|
1068
|
+
label_names=label_names,
|
|
1069
|
+
embedding_model=embedding_model,
|
|
1070
|
+
)
|
|
1071
|
+
if existing is not None:
|
|
1072
|
+
return existing
|
|
1073
|
+
|
|
1074
|
+
payload: CreateMemorysetFromDatasourceRequest = {
|
|
1075
|
+
"name": name,
|
|
1076
|
+
"description": description,
|
|
1077
|
+
"datasource_name_or_id": datasource.id,
|
|
1078
|
+
"datasource_label_column": label_column,
|
|
1079
|
+
"datasource_score_column": score_column,
|
|
1080
|
+
"datasource_value_column": value_column,
|
|
1081
|
+
"datasource_source_id_column": source_id_column,
|
|
1082
|
+
"datasource_partition_id_column": partition_id_column,
|
|
1083
|
+
"label_names": label_names,
|
|
1084
|
+
"max_seq_length_override": max_seq_length_override,
|
|
1085
|
+
"remove_duplicates": remove_duplicates,
|
|
1086
|
+
"index_type": index_type,
|
|
1087
|
+
"index_params": index_params,
|
|
1088
|
+
"hidden": hidden,
|
|
1089
|
+
}
|
|
1090
|
+
if memory_type is not None:
|
|
1091
|
+
payload["memory_type"] = memory_type
|
|
1092
|
+
if subsample is not None:
|
|
1093
|
+
payload["subsample"] = subsample
|
|
1094
|
+
if prompt is not None:
|
|
1095
|
+
payload["prompt"] = prompt
|
|
1096
|
+
if isinstance(embedding_model, PretrainedEmbeddingModel):
|
|
1097
|
+
payload["pretrained_embedding_model_name"] = embedding_model.name
|
|
1098
|
+
elif isinstance(embedding_model, FinetunedEmbeddingModel):
|
|
1099
|
+
payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
|
|
1100
|
+
else:
|
|
1101
|
+
raise ValueError("Invalid embedding model")
|
|
1102
|
+
client = OrcaClient._resolve_client()
|
|
1103
|
+
response = client.POST("/memoryset", json=payload)
|
|
1104
|
+
|
|
1105
|
+
if response["insertion_job_id"] is None:
|
|
1106
|
+
raise ValueError("Create memoryset operation failed to produce an insertion job")
|
|
1107
|
+
|
|
1108
|
+
job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
|
|
1109
|
+
return job if background else job.result()
|
|
1110
|
+
|
|
1111
|
+
    # Overload: no datasource — creates an empty memoryset; always blocking, returns the handle.
    @overload
    @classmethod
    def create(
        cls,
        name: str,
        *,
        datasource: None = None,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        hidden: bool = False,
        memory_type: MemoryType | None = None,
    ) -> Self:
        pass
|
|
1130
|
+
|
|
1131
|
+
    # Overload: datasource provided with background=True — returns a Job handle.
    @overload
    @classmethod
    def create(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = None,
        score_column: str | None = None,
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[True],
        hidden: bool = False,
        subsample: int | float | None = None,
        memory_type: MemoryType | None = None,
    ) -> Job[Self]:
        pass
|
|
1158
|
+
|
|
1159
|
+
    # Overload: datasource provided with background=False (default) — blocks and returns the handle.
    @overload
    @classmethod
    def create(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = None,
        score_column: str | None = None,
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[False] = False,
        hidden: bool = False,
        subsample: int | float | None = None,
        memory_type: MemoryType | None = None,
    ) -> Self:
        pass
|
|
1186
|
+
|
|
1187
|
+
    @classmethod
    def create(
        cls,
        name: str,
        *,
        datasource: Datasource | None = None,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = None,
        score_column: str | None = None,
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: bool = False,
        hidden: bool = False,
        subsample: int | float | None = None,
        memory_type: MemoryType | None = None,
    ) -> Self | Job[Self]:
        """
        Create a new memoryset in the OrcaCloud

        If `datasource` is provided, all columns from the datasource that are not specified in the
        `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
        as metadata in the memoryset.

        If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
        You can add memories later using the `insert` method.

        Params:
            name: Name for the new memoryset (must be unique)
            datasource: Optional source data to populate the memories in the memoryset. If omitted,
                an empty memoryset will be created.
            embedding_model: Embedding model to use for embedding memory values for semantic search.
                If not provided, a default embedding model for the memoryset will be used.
            value_column: Name of the column in the datasource that contains the memory values
            label_column: Name of the column in the datasource that contains the memory labels.
                Must contain categorical values as integers or strings. String labels will be
                converted to integers with the unique strings extracted as `label_names`
            score_column: Name of the column in the datasource that contains the memory scores
            source_id_column: Optional name of the column in the datasource that contains the ids in
                the system of reference
            partition_id_column: Optional name of the column in the datasource that contains the partition ids
            description: Optional description for the memoryset, this will be used in agentic flows,
                so make sure it is concise and describes the contents of your memoryset not the
                datasource or the embedding model.
            label_names: List of human-readable names for the labels in the memoryset, must match
                the number of labels in the `label_column`. Will be automatically inferred if string
                labels are provided or if a [Dataset][datasets.Dataset] with a
                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
                value is longer than this it will be truncated, will default to the model's max
                sequence length if not provided
            prompt: Optional prompt to use when embedding documents/memories for storage
            remove_duplicates: Whether to remove duplicates from the datasource before inserting
                into the memoryset
            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
            index_params: Parameters for the vector index, defaults to `{}`
            if_exists: What to do if a memoryset with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing memoryset.
            background: Whether to run the operation non-blocking and return a job handle.
                Note: This parameter is ignored when creating an empty memoryset (when datasource is None).
            hidden: Whether the memoryset should be hidden
            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
                datasource to insert. Use to limit the size of the initial memoryset.
            memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
                and `"SCORED"` if `score_column` is provided, must be specified for other cases.
        Returns:
            Handle to the new memoryset in the OrcaCloud

        Raises:
            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
                `"open"` and the params do not match those of the existing memoryset.
        """
        # Dispatch to the appropriate private helper based on whether a datasource was given.
        if datasource is None:
            return cls._create_empty(
                name,
                embedding_model=embedding_model,
                description=description,
                label_names=label_names,
                max_seq_length_override=max_seq_length_override,
                prompt=prompt,
                index_type=index_type,
                index_params=index_params,
                if_exists=if_exists,
                hidden=hidden,
                memory_type=memory_type,
            )
        else:
            return cls._create_from_datasource(
                name,
                datasource=datasource,
                embedding_model=embedding_model,
                value_column=value_column,
                label_column=label_column,
                score_column=score_column,
                source_id_column=source_id_column,
                partition_id_column=partition_id_column,
                description=description,
                label_names=label_names,
                max_seq_length_override=max_seq_length_override,
                prompt=prompt,
                remove_duplicates=remove_duplicates,
                index_type=index_type,
                index_params=index_params,
                if_exists=if_exists,
                background=background,
                hidden=hidden,
                subsample=subsample,
                memory_type=memory_type,
            )
|
|
1305
|
+
|
|
1306
|
+
    # Overload: background=True — returns a Job handle instead of blocking.
    @overload
    @classmethod
    def from_datasource(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = None,
        score_column: str | None = None,
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[True],
        hidden: bool = False,
        subsample: int | float | None = None,
        memory_type: MemoryType | None = None,
    ) -> Job[Self]:
        pass
|
|
1333
|
+
|
|
1334
|
+
    # Overload: background=False (default) — blocks until insertion finishes and returns the handle.
    @overload
    @classmethod
    def from_datasource(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = None,
        score_column: str | None = None,
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[False] = False,
        hidden: bool = False,
        subsample: int | float | None = None,
        memory_type: MemoryType | None = None,
    ) -> Self:
        pass
|
|
1361
|
+
|
|
1362
|
+
@classmethod
|
|
1363
|
+
def from_datasource(
|
|
1364
|
+
cls,
|
|
1365
|
+
name: str,
|
|
1366
|
+
*,
|
|
1367
|
+
datasource: Datasource,
|
|
1368
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1369
|
+
value_column: str = "value",
|
|
1370
|
+
label_column: str | None = None,
|
|
1371
|
+
score_column: str | None = None,
|
|
1372
|
+
source_id_column: str | None = None,
|
|
1373
|
+
partition_id_column: str | None = None,
|
|
1374
|
+
description: str | None = None,
|
|
1375
|
+
label_names: list[str] | None = None,
|
|
1376
|
+
max_seq_length_override: int | None = None,
|
|
1377
|
+
prompt: str | None = None,
|
|
1378
|
+
remove_duplicates: bool = True,
|
|
1379
|
+
index_type: IndexType = "FLAT",
|
|
1380
|
+
index_params: dict[str, Any] = {},
|
|
1381
|
+
if_exists: CreateMode = "error",
|
|
1382
|
+
background: bool = False,
|
|
1383
|
+
hidden: bool = False,
|
|
1384
|
+
subsample: int | float | None = None,
|
|
1385
|
+
memory_type: MemoryType | None = None,
|
|
1386
|
+
) -> Self | Job[Self]:
|
|
1387
|
+
"""
|
|
1388
|
+
Create a new memoryset in the OrcaCloud from a datasource.
|
|
1389
|
+
|
|
1390
|
+
This is a convenience method that is equivalent to calling `create` with a datasource.
|
|
1391
|
+
All columns from the datasource that are not specified in the `value_column`,
|
|
1392
|
+
`label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
|
|
1393
|
+
in the memoryset.
|
|
1394
|
+
|
|
1395
|
+
Params:
|
|
1396
|
+
name: Name for the new memoryset (must be unique)
|
|
1397
|
+
datasource: Source data to populate the memories in the memoryset.
|
|
1398
|
+
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
1399
|
+
If not provided, a default embedding model for the memoryset will be used.
|
|
1400
|
+
value_column: Name of the column in the datasource that contains the memory values
|
|
1401
|
+
label_column: Name of the column in the datasource that contains the memory labels.
|
|
1402
|
+
Must contain categorical values as integers or strings. String labels will be
|
|
1403
|
+
converted to integers with the unique strings extracted as `label_names`
|
|
1404
|
+
score_column: Name of the column in the datasource that contains the memory scores
|
|
1405
|
+
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
1406
|
+
the system of reference
|
|
1407
|
+
partition_id_column: Optional name of the column in the datasource that contains the partition ids
|
|
1408
|
+
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
1409
|
+
so make sure it is concise and describes the contents of your memoryset not the
|
|
1410
|
+
datasource or the embedding model.
|
|
1411
|
+
label_names: List of human-readable names for the labels in the memoryset, must match
|
|
1412
|
+
the number of labels in the `label_column`. Will be automatically inferred if string
|
|
1413
|
+
labels are provided or if a [Dataset][datasets.Dataset] with a
|
|
1414
|
+
[`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
|
|
1415
|
+
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
1416
|
+
value is longer than this it will be truncated, will default to the model's max
|
|
1417
|
+
sequence length if not provided
|
|
1418
|
+
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
1419
|
+
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
1420
|
+
into the memoryset
|
|
1421
|
+
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
1422
|
+
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
1423
|
+
index_params: Parameters for the vector index, defaults to `{}`
|
|
1424
|
+
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
1425
|
+
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
1426
|
+
background: Whether to run the operation none blocking and return a job handle.
|
|
1427
|
+
hidden: Whether the memoryset should be hidden
|
|
1428
|
+
subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
|
|
1429
|
+
datasource to insert. Use to limit the size of the initial memoryset.
|
|
1430
|
+
memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
|
|
1431
|
+
and `"SCORED"` if `score_column` is provided, must be specified for other cases.
|
|
1432
|
+
Returns:
|
|
1433
|
+
Handle to the new memoryset in the OrcaCloud
|
|
1434
|
+
|
|
1435
|
+
Raises:
|
|
1436
|
+
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
1437
|
+
`"open"` and the params do not match those of the existing memoryset.
|
|
1438
|
+
"""
|
|
1439
|
+
return cls._create_from_datasource(
|
|
1440
|
+
name,
|
|
1441
|
+
datasource=datasource,
|
|
1442
|
+
embedding_model=embedding_model,
|
|
1443
|
+
value_column=value_column,
|
|
1444
|
+
label_column=label_column,
|
|
1445
|
+
score_column=score_column,
|
|
1446
|
+
source_id_column=source_id_column,
|
|
1447
|
+
partition_id_column=partition_id_column,
|
|
1448
|
+
description=description,
|
|
1449
|
+
label_names=label_names,
|
|
1450
|
+
max_seq_length_override=max_seq_length_override,
|
|
1451
|
+
prompt=prompt,
|
|
1452
|
+
remove_duplicates=remove_duplicates,
|
|
1453
|
+
index_type=index_type,
|
|
1454
|
+
index_params=index_params,
|
|
1455
|
+
if_exists=if_exists,
|
|
1456
|
+
background=background,
|
|
1457
|
+
hidden=hidden,
|
|
1458
|
+
subsample=subsample,
|
|
1459
|
+
memory_type=memory_type,
|
|
1460
|
+
)
|
|
1461
|
+
|
|
1462
|
+
@classmethod
def _create_empty(
    cls,
    name: str,
    *,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    description: str | None = None,
    label_names: list[str] | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] | None = None,
    if_exists: CreateMode = "error",
    hidden: bool = False,
    memory_type: MemoryType | None = None,
) -> Self:
    """
    Create an empty memoryset in the OrcaCloud

    This creates a memoryset with no initial memories. You can add memories later using
    the `insert` method.

    Params:
        name: Name for the new memoryset (must be unique)
        embedding_model: Embedding model to use for embedding memory values for semantic search.
            If not provided, a default embedding model for the memoryset will be used.
        description: Optional description for the memoryset, this will be used in agentic flows,
            so make sure it is concise and describes the contents of your memoryset not the
            datasource or the embedding model.
        label_names: List of human-readable names for the labels in the memoryset
        max_seq_length_override: Maximum sequence length of values in the memoryset, if the
            value is longer than this it will be truncated, will default to the model's max
            sequence length if not provided
        prompt: Optional prompt to use when embedding documents/memories for storage
        index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
            values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
        index_params: Parameters for the vector index, defaults to `{}`
        if_exists: What to do if a memoryset with the same name already exists, defaults to
            `"error"`. Other option is `"open"` to open the existing memoryset.
        hidden: Whether the memoryset should be hidden
        memory_type: Type of memoryset to create, defaults to `"LABELED"` if called from
            `LabeledMemoryset` and `"SCORED"` if called from `ScoredMemoryset`.

    Returns:
        Handle to the new memoryset in the OrcaCloud

    Raises:
        ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
            `"open"` and the params do not match those of the existing memoryset.
    """
    # Avoid a mutable default argument: normalize the None sentinel to a fresh dict per call.
    if index_params is None:
        index_params = {}
    if embedding_model is None:
        embedding_model = PretrainedEmbeddingModel.GTE_BASE

    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=label_names,
        embedding_model=embedding_model,
    )
    if existing is not None:
        return existing

    payload: CreateMemorysetRequest = {
        "name": name,
        "description": description,
        "label_names": label_names,
        "max_seq_length_override": max_seq_length_override,
        "index_type": index_type,
        "index_params": index_params,
        "hidden": hidden,
    }
    # Optional fields are only included when explicitly provided.
    if memory_type is not None:
        payload["memory_type"] = memory_type
    if prompt is not None:
        payload["prompt"] = prompt
    # The API distinguishes pretrained models (by name) from finetuned models (by id).
    if isinstance(embedding_model, PretrainedEmbeddingModel):
        payload["pretrained_embedding_model_name"] = embedding_model.name
    elif isinstance(embedding_model, FinetunedEmbeddingModel):
        payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
    else:
        raise ValueError("Invalid embedding model")

    client = OrcaClient._resolve_client()
    response = client.POST("/memoryset/empty", json=payload)
    return cls.open(response["id"])
|
|
1547
|
+
|
|
1548
|
+
@overload
@classmethod
def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Job[Self]:
    pass

@overload
@classmethod
def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
    pass

@classmethod
def from_hf_dataset(
    cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
) -> Self | Job[Self]:
    """
    Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All features that are not specified to be used as `value_column`, `label_column`, or
    `source_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        hf_dataset: Hugging Face dataset to create the memoryset from
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud
    """
    if_exists = kwargs.get("if_exists", "error")
    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    datasource = Datasource.from_hf_dataset(f"{name}_datasource", hf_dataset, if_exists=if_exists)
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
1595
|
+
|
|
1596
|
+
@overload
@classmethod
def from_pytorch(
    cls,
    name: str,
    torch_data: TorchDataLoader | TorchDataset,
    *,
    column_names: list[str] | None = None,
    background: Literal[True],
    **kwargs: Any,
) -> Job[Self]:
    pass

@overload
@classmethod
def from_pytorch(
    cls,
    name: str,
    torch_data: TorchDataLoader | TorchDataset,
    *,
    column_names: list[str] | None = None,
    background: Literal[False] = False,
    **kwargs: Any,
) -> Self:
    pass

@classmethod
def from_pytorch(
    cls,
    name: str,
    torch_data: TorchDataLoader | TorchDataset,
    *,
    column_names: list[str] | None = None,
    background: bool = False,
    **kwargs: Any,
) -> Self | Job[Self]:
    """
    Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
    [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All properties that are not specified to be used as `value_column`, `label_column`, or
    `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        torch_data: PyTorch data loader or dataset to create the memoryset from
        column_names: If the provided dataset or data loader returns unnamed tuples, this
            argument must be provided to specify the names of the columns.
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud
    """
    if_exists = kwargs.get("if_exists", "error")
    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    datasource = Datasource.from_pytorch(
        f"{name}_datasource", torch_data, column_names=column_names, if_exists=if_exists
    )
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
1669
|
+
|
|
1670
|
+
@overload
@classmethod
def from_list(
    cls,
    name: str,
    data: list[dict],
    *,
    background: Literal[True],
    **kwargs: Any,
) -> Job[Self]:
    pass

@overload
@classmethod
def from_list(
    cls,
    name: str,
    data: list[dict],
    *,
    background: Literal[False] = False,
    **kwargs: Any,
) -> Self:
    pass

@classmethod
def from_list(
    cls,
    name: str,
    data: list[dict],
    *,
    background: bool = False,
    **kwargs: Any,
) -> Self | Job[Self]:
    """
    Create a new memoryset from a list of dictionaries in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All properties that are not specified to be used as `value_column`, `label_column`, or
    `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        data: List of dictionaries to create the memoryset from
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud

    Examples:
        >>> LabeledMemoryset.from_list("my_memoryset", [
        ...     {"value": "hello", "label": 0, "tag": "tag1"},
        ...     {"value": "world", "label": 1, "tag": "tag2"},
        ... ])
    """
    if_exists = kwargs.get("if_exists", "error")
    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=if_exists)
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
1741
|
+
|
|
1742
|
+
@overload
@classmethod
def from_dict(
    cls,
    name: str,
    data: dict,
    *,
    background: Literal[True],
    **kwargs: Any,
) -> Job[Self]:
    pass

@overload
@classmethod
def from_dict(
    cls,
    name: str,
    data: dict,
    *,
    background: Literal[False] = False,
    **kwargs: Any,
) -> Self:
    pass

@classmethod
def from_dict(
    cls,
    name: str,
    data: dict,
    *,
    background: bool = False,
    **kwargs: Any,
) -> Self | Job[Self]:
    """
    Create a new memoryset from a dictionary of columns in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All columns from the datasource that are not specified in the `value_column`,
    `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        data: Dictionary of columns to create the memoryset from
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud

    Examples:
        >>> LabeledMemoryset.from_dict("my_memoryset", {
        ...     "value": ["hello", "world"],
        ...     "label": [0, 1],
        ...     "tag": ["tag1", "tag2"],
        ... })
    """
    if_exists = kwargs.get("if_exists", "error")
    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=if_exists)
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
1814
|
+
|
|
1815
|
+
@overload
@classmethod
def from_pandas(
    cls,
    name: str,
    dataframe: pd.DataFrame,
    *,
    background: Literal[True],
    **kwargs: Any,
) -> Job[Self]:
    pass

@overload
@classmethod
def from_pandas(
    cls,
    name: str,
    dataframe: pd.DataFrame,
    *,
    background: Literal[False] = False,
    **kwargs: Any,
) -> Self:
    pass

@classmethod
def from_pandas(
    cls,
    name: str,
    dataframe: pd.DataFrame,
    *,
    background: bool = False,
    **kwargs: Any,
) -> Self | Job[Self]:
    """
    Create a new memoryset from a pandas [`DataFrame`][pandas.DataFrame] in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All columns that are not specified to be used as `value_column`, `label_column`, or
    `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        dataframe: Dataframe to create the memoryset from
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud
    """
    if_exists = kwargs.get("if_exists", "error")
    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=if_exists)
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
1880
|
+
|
|
1881
|
+
@overload
@classmethod
def from_arrow(
    cls,
    name: str,
    pyarrow_table: pa.Table,
    *,
    background: Literal[True],
    **kwargs: Any,
) -> Job[Self]:
    pass

@overload
@classmethod
def from_arrow(
    cls,
    name: str,
    pyarrow_table: pa.Table,
    *,
    background: Literal[False] = False,
    **kwargs: Any,
) -> Self:
    pass

@classmethod
def from_arrow(
    cls,
    name: str,
    pyarrow_table: pa.Table,
    *,
    background: bool = False,
    **kwargs: Any,
) -> Self | Job[Self]:
    """
    Create a new memoryset from a PyArrow [`Table`][pyarrow.Table] in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All columns that are not specified to be used as `value_column`, `label_column`, or
    `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        pyarrow_table: PyArrow table to create the memoryset from
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud
    """
    if_exists = kwargs.get("if_exists", "error")
    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    datasource = Datasource.from_arrow(f"{name}_datasource", pyarrow_table, if_exists=if_exists)
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
1948
|
+
|
|
1949
|
+
@overload
@classmethod
def from_disk(
    cls,
    name: str,
    file_path: str | PathLike,
    *,
    background: Literal[True],
    **kwargs: Any,
) -> Job[Self]:
    pass

@overload
@classmethod
def from_disk(
    cls,
    name: str,
    file_path: str | PathLike,
    *,
    background: Literal[False] = False,
    **kwargs: Any,
) -> Self:
    pass

@classmethod
def from_disk(
    cls,
    name: str,
    file_path: str | PathLike,
    *,
    background: bool = False,
    **kwargs: Any,
) -> Self | Job[Self]:
    """
    Create a new memoryset from a file on disk in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All columns from the datasource that are not specified in the `value_column`,
    `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        file_path: Path to the file on disk to create the memoryset from. The file type will
            be inferred from the file extension. The following file types are supported:

            - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
            - .json/.jsonl: [`JSON`][json] and JSON Lines files
            - .csv: [`CSV`][csv] files
            - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
            - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud
    """
    if_exists = kwargs.get("if_exists", "error")
    # Short-circuit if a matching memoryset already exists and if_exists allows opening it.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=if_exists)
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
2021
|
+
|
|
2022
|
+
@classmethod
def open(cls, name: str) -> Self:
    """
    Get a handle to a memoryset in the OrcaCloud

    Params:
        name: Name or unique identifier of the memoryset

    Returns:
        Handle to the existing memoryset in the OrcaCloud

    Raises:
        LookupError: If the memoryset does not exist
    """
    # Resolve the active client, fetch the memoryset's metadata, and wrap it in a handle.
    metadata = OrcaClient._resolve_client().GET(
        "/memoryset/{name_or_id}", params={"name_or_id": name}
    )
    return cls(metadata)
|
|
2039
|
+
|
|
2040
|
+
@classmethod
async def aopen(cls, name: str) -> Self:
    """
    Asynchronously get a handle to a memoryset in the OrcaCloud

    Params:
        name: Name or unique identifier of the memoryset

    Returns:
        Handle to the existing memoryset in the OrcaCloud

    Raises:
        LookupError: If the memoryset does not exist
    """
    # Async variant of open(): uses the async client and awaits the metadata fetch.
    async_client = OrcaAsyncClient._resolve_client()
    metadata = await async_client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
    return cls(metadata)
|
|
2057
|
+
|
|
2058
|
+
@classmethod
def exists(cls, name_or_id: str) -> bool:
    """
    Check if a memoryset exists in the OrcaCloud

    Params:
        name_or_id: Name or id of the memoryset

    Returns:
        True if the memoryset exists, False otherwise
    """
    # EAFP: probe via open() and translate the lookup failure into False.
    try:
        cls.open(name_or_id)
    except LookupError:
        return False
    return True
|
|
2074
|
+
|
|
2075
|
+
@classmethod
def all(cls, show_hidden: bool = False) -> list[Self]:
    """
    Get a list of handles to all memorysets in the OrcaCloud

    Params:
        show_hidden: Whether to include hidden memorysets in results, defaults to `False`

    Returns:
        List of handles to all memorysets in the OrcaCloud
    """
    client = OrcaClient._resolve_client()
    # cls.memory_type scopes the listing to the concrete subclass (e.g. labeled vs scored).
    listing = client.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
    return list(map(cls, listing))
|
|
2091
|
+
|
|
2092
|
+
@classmethod
def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
    """
    Delete a memoryset from the OrcaCloud

    Params:
        name_or_id: Name or id of the memoryset
        if_not_exists: What to do if the memoryset does not exist, defaults to `"error"`.
            Other options are `"ignore"` to do nothing if the memoryset does not exist.

    Raises:
        LookupError: If the memoryset does not exist and if_not_exists is `"error"`
    """
    try:
        OrcaClient._resolve_client().DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
        logging.info(f"Deleted memoryset {name_or_id}")
    except LookupError:
        # Swallow the missing-memoryset error only when the caller opted into "ignore".
        if if_not_exists == "error":
            raise
|
|
2112
|
+
|
|
2113
|
+
def set(
    self,
    *,
    name: str = UNSET,
    description: str | None = UNSET,
    label_names: list[str] = UNSET,
    hidden: bool = UNSET,
):
    """
    Update editable attributes of the memoryset

    Note:
        If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.

    Params:
        name: Value to set for the name
        description: Value to set for the description
        label_names: Value to replace existing label names with
        hidden: Value to set for the hidden flag
    """
    payload: MemorysetUpdate = {}
    # Only fields the caller explicitly provided are included in the PATCH body.
    for field, value in (
        ("name", name),
        ("description", description),
        ("label_names", label_names),
        ("hidden", hidden),
    ):
        if value is not UNSET:
            payload[field] = value

    client = OrcaClient._resolve_client()
    client.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
    # Pull the updated metadata back into this handle.
    self.refresh()
|
|
2145
|
+
|
|
2146
|
+
@overload
def clone(
    self,
    name: str,
    *,
    embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    if_exists: CreateMode = "error",
    background: Literal[True],
) -> Job[Self]:
    pass

@overload
def clone(
    self,
    name: str,
    *,
    embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    if_exists: CreateMode = "error",
    background: Literal[False] = False,
) -> Self:
    pass

def clone(
    self,
    name: str,
    *,
    embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
    # NOTE(review): default here is UNSET while the overloads advertise None — the UNSET
    # sentinel lets an explicit None be forwarded to the API; confirm this is intended.
    max_seq_length_override: int | None = UNSET,
    prompt: str | None = None,
    if_exists: CreateMode = "error",
    background: bool = False,
) -> Self | Job[Self]:
    """
    Create a clone of the memoryset with a new name

    Params:
        name: Name for the new memoryset (must be unique)
        embedding_model: Optional new embedding model to use for re-embedding the memory values
        max_seq_length_override: Optional custom max sequence length to use for the cloned
            memoryset. If a memory value is longer than this it will be truncated. If not
            provided, will use the source memoryset's max sequence length.
        prompt: Optional custom prompt to use for the cloned memoryset.
            If not provided, will use the source memoryset's prompt.
        if_exists: What to do if a memoryset with the same name already exists, defaults to
            `"error"`. Other option is `"open"` to open the existing memoryset.
        background: Whether to run the clone in the background and return a
            [`Job`][orca_sdk.Job] handle instead of blocking until it completes.

    Returns:
        Handle to the cloned memoryset in the OrcaCloud, or a Job resolving to it when
        `background` is `True`

    Raises:
        ValueError: If a memoryset with the same name already exists and `if_exists` is
            `"error"`, or it is `"open"` and the existing memoryset has a different
            embedding model.

    Examples:
        >>> memoryset = LabeledMemoryset.open("my_memoryset")
        >>> finetuned_embedding_model = PretrainedEmbeddingModel.GTE_BASE.finetune(
        ...     "gte_base_finetuned", my_memoryset
        ... )
        >>> new_memoryset = memoryset.clone(
        ...     "my_memoryset_finetuned", embedding_model=finetuned_embedding_model,
        ... )

        >>> # Clone with a custom prompt
        >>> new_memoryset = memoryset.clone(
        ...     "my_memoryset_with_prompt",
        ...     prompt="Represent this document for retrieval:",
        ... )
    """
    if self.exists(name):
        if if_exists == "error":
            raise ValueError(f"Memoryset with name {name} already exists")
        elif if_exists == "open":
            existing = self.open(name)
            # Refuse to silently "open" a memoryset that differs in a requested attribute.
            for attribute in {"embedding_model"}:
                if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
                    raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
            return existing
    payload: CloneMemorysetRequest = {"name": name}
    # UNSET (as opposed to None) means "do not send the field at all".
    if max_seq_length_override is not UNSET:
        payload["max_seq_length_override"] = max_seq_length_override
    if prompt is not None:
        payload["prompt"] = prompt
    if isinstance(embedding_model, PretrainedEmbeddingModel):
        payload["pretrained_embedding_model_name"] = embedding_model.name
    elif isinstance(embedding_model, FinetunedEmbeddingModel):
        payload["finetuned_embedding_model_name_or_id"] = embedding_model.id

    client = OrcaClient._resolve_client()
    metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)

    if metadata["insertion_job_id"] is None:
        raise ValueError("Create memoryset operation failed to produce an insertion job")

    # The job resolves to a fresh handle on the newly created memoryset.
    job = Job(
        metadata["insertion_job_id"],
        lambda: self.open(metadata["id"]),
    )
    return job if background else job.result()
|
|
2246
|
+
|
|
2247
|
+
def refresh(self, throttle: float = 0):
    """
    Refresh the information about the memoryset from the OrcaCloud

    Params:
        throttle: Minimum time in seconds between refreshes
    """
    now = datetime.now()
    # Bail out early if the previous refresh is still fresh enough.
    if now - self._last_refresh < timedelta(seconds=throttle):
        return
    latest = self.open(self.id)
    self.__dict__.update(latest.__dict__)
    self._last_refresh = now
|
|
2261
|
+
|
|
2262
|
+
def __len__(self) -> int:
    """Return the number of memories currently in the memoryset."""
    # Refresh at most once every 5 seconds so repeated len() calls stay cheap.
    self.refresh(throttle=5)
    return self.length
|
|
2266
|
+
|
|
2267
|
+
@overload
def __getitem__(self, index: int | str) -> MemoryT:
    pass

@overload
def __getitem__(self, index: slice) -> list[MemoryT]:
    pass

def __getitem__(self, index: int | slice | str) -> MemoryT | list[MemoryT]:
    """
    Get memories from the memoryset by index or memory id

    Params:
        index: Index or memory to retrieve or slice of memories to retrieve or unique
            identifier of the memory to retrieve

    Returns:
        Memory or memories from the memoryset

    Raises:
        LookupError: If the id is not found or the index is out of bounds

    Examples:
        Retrieve the first memory in the memoryset:
        >>> memoryset[0]
        LabeledMemory({ label: <positive: 1>, value: 'I am happy' })

        Retrieve the last memory in the memoryset:
        >>> memoryset[-1]
        LabeledMemory({ label: <negative: 0>, value: 'I am sad' })

        Retrieve a slice of memories in the memoryset:
        >>> memoryset[1:3]
        [
            LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
            LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
        ]

        Retrieve a memory by id:
        >>> memoryset["0195019a-5bc7-7afb-b902-5945ee1fb766"]
        LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
    """
    if isinstance(index, str):
        # A string is treated as a memory id.
        return self.get(index)
    if isinstance(index, int):
        # Negative indices count back from the end, like a list.
        offset = len(self) + index if index < 0 else index
        return self.query(offset=offset, limit=1)[0]
    if isinstance(index, slice):
        total = len(self)
        if index.start is None:
            start = 0
        elif index.start < 0:
            start = total + index.start
        else:
            start = index.start
        if index.stop is None:
            stop = total
        elif index.stop < 0:
            stop = total + index.stop
        else:
            stop = index.stop
        return self.query(offset=start, limit=stop - start)
    raise ValueError(f"Invalid index type: {type(index)}")
|
|
2319
|
+
|
|
2320
|
+
@overload
def search(
    self,
    query: str,
    *,
    count: int = 1,
    prompt: str | None = None,
    partition_id: str | None = None,
    partition_filter_mode: Literal[
        "ignore_partitions", "include_global", "exclude_global", "only_global"
    ] = "include_global",
) -> list[MemoryLookupT]:
    pass

@overload
def search(
    self,
    query: list[str],
    *,
    count: int = 1,
    prompt: str | None = None,
    partition_id: str | None = None,
    partition_filter_mode: Literal[
        "ignore_partitions", "include_global", "exclude_global", "only_global"
    ] = "include_global",
) -> list[list[MemoryLookupT]]:
    pass

def search(
    self,
    query: str | list[str],
    *,
    count: int = 1,
    prompt: str | None = None,
    partition_id: str | None = None,
    partition_filter_mode: Literal[
        "ignore_partitions", "include_global", "exclude_global", "only_global"
    ] = "include_global",
) -> list[MemoryLookupT] | list[list[MemoryLookupT]]:
    """
    Search for memories that are semantically similar to the query

    Params:
        query: Query to lookup memories in the memoryset, can be a single query or a list
        count: Number of memories to return for each query
        prompt: Optional prompt for query embedding during search.
            If not provided, the memoryset's default query prompt will be used if available.
        partition_id: Optional partition ID to filter memories by
        partition_filter_mode: How to filter partitions when searching for memories

            - "ignore_partitions": Ignore partitions
            - "include_global": Include global memories
            - "exclude_global": Exclude global memories
            - "only_global": Only include global memories

    Returns:
        Memories from the memoryset that match the query. If a single query is provided,
        the return value is a flat list of matching memories. If a list of queries is
        provided, the return value is a list of lists of memories, one inner list per query.

    Examples:
        Search for similar memories:
        >>> memoryset.search("I am happy", count=2)
        [
            LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
            LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
        ]

        Search with a custom query prompt for instruction-following models:
        >>> memoryset.search("I am happy", count=2, prompt="Represent this query for sentiment retrieval:")
        [
            LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
            LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
        ]

        Search for similar memories for multiple queries:
        >>> memoryset.search(["I am happy", "I am sad"], count=1)
        [
            [
                LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
            ],
            [
                LabeledMemoryLookup({ label: <negative: 0>, value: 'I am sad' }),
            ],
        ]
    """
    client = OrcaClient._resolve_client()
    # The endpoint always takes a batch of queries; a single query is wrapped in a list.
    response = client.POST(
        "/gpu/memoryset/{name_or_id}/lookup",
        params={"name_or_id": self.id},
        json={
            "query": query if isinstance(query, list) else [query],
            "count": count,
            "prompt": prompt,
            "partition_id": partition_id,
            "partition_filter_mode": partition_filter_mode,
        },
    )
    # Responses with a "label" key are labeled lookups; otherwise scored lookups.
    lookups = [
        [
            cast(
                MemoryLookupT,
                (
                    LabeledMemoryLookup(self.id, lookup_response)
                    if "label" in lookup_response
                    else ScoredMemoryLookup(self.id, lookup_response)
                ),
            )
            for lookup_response in batch
        ]
        for batch in response
    ]
    # Unwrap the singleton batch when the caller passed a single query string.
    return lookups if isinstance(query, list) else lookups[0]
|
|
2431
|
+
|
|
2432
|
+
def query(
    self,
    offset: int = 0,
    limit: int = 100,
    filters: list[FilterItemTuple] | None = None,
    with_feedback_metrics: bool = False,
    sort: list[TelemetrySortItem] | None = None,
    partition_id: str | None = None,
    partition_filter_mode: Literal[
        "ignore_partitions", "include_global", "exclude_global", "only_global"
    ] = "include_global",
) -> list[MemoryT]:
    """
    Query the memoryset for memories that match the filters

    Params:
        offset: The offset of the first memory to return
        limit: The maximum number of memories to return
        filters: List of filters to apply to the query.
        with_feedback_metrics: Whether to include feedback metrics in the response
        sort: Optional sort order; only supported when `with_feedback_metrics` is `True`
        partition_id: Optional partition ID to filter memories by; only supported when
            `with_feedback_metrics` is `False`
        partition_filter_mode: How to filter partitions when querying memories; only
            `"include_global"` is supported when `with_feedback_metrics` is `True`

    Returns:
        List of memories from the memoryset that match the filters

    Raises:
        ValueError: If partition options or metric-column filters are combined with an
            incompatible `with_feedback_metrics` setting

    Examples:
        >>> memoryset.query(filters=[("label", "==", 0)], limit=2)
        [
            LabeledMemory({ label: <positive: 1>, value: "I am happy" }),
            LabeledMemory({ label: <negative: 0>, value: "I am sad" }),
        ]
    """
    # Avoid the mutable-default-argument pitfall: None means "no filters".
    filters = [] if filters is None else list(filters)
    parsed_filters = [
        _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
    ]

    if with_feedback_metrics:
        if partition_id:
            raise ValueError("Partition ID is not supported when with_feedback_metrics is True")
        if partition_filter_mode != "include_global":
            raise ValueError(
                f"Partition filter mode {partition_filter_mode} is not supported when with_feedback_metrics is True. Only 'include_global' is supported."
            )

        client = OrcaClient._resolve_client()
        response = client.POST(
            "/telemetry/memories",
            json={
                "memoryset_id": self.id,
                "offset": offset,
                "limit": limit,
                "filters": parsed_filters,
                "sort": [_parse_sort_item_from_tuple(item) for item in sort] if sort else None,
            },
        )
        return [
            cast(
                MemoryT,
                (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
            )
            for memory in response["items"]
        ]

    # Metric columns only exist on the telemetry endpoint used above. Guard with
    # isinstance so pre-parsed (non-tuple) filter items don't blow up on filter[0].
    if any(isinstance(filter, tuple) and _is_metric_column(filter[0]) for filter in filters):
        raise ValueError("Feedback metrics are only supported when the with_feedback_metrics flag is set to True")

    if sort:
        logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")

    client = OrcaClient._resolve_client()
    response = client.POST(
        "/memoryset/{name_or_id}/memories",
        params={"name_or_id": self.id},
        json={
            "offset": offset,
            "limit": limit,
            "filters": cast(list[FilterItem], parsed_filters),
            "partition_id": partition_id,
            "partition_filter_mode": partition_filter_mode,
        },
    )
    # Responses with a "label" key are labeled memories; otherwise scored.
    return [
        cast(
            MemoryT,
            (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
        )
        for memory in response
    ]
|
|
2519
|
+
|
|
2520
|
+
def to_pandas(
    self,
    offset: int = 0,
    limit: int = 100,
    filters: list[FilterItemTuple] | None = None,
    with_feedback_metrics: bool = False,
    sort: list[TelemetrySortItem] | None = None,
) -> pd.DataFrame:
    """
    Convert a page of the memoryset to a pandas DataFrame

    Params:
        offset: The offset of the first memory to include
        limit: The maximum number of memories to include
        filters: List of filters to apply before converting
        with_feedback_metrics: Whether to include feedback metrics in the DataFrame
        sort: Optional sort order; only supported when `with_feedback_metrics` is `True`

    Returns:
        DataFrame with one row per memory, built from each memory's `to_dict()` output
    """
    # Avoid a mutable default argument; None means "no filters".
    memories = self.query(
        offset=offset,
        limit=limit,
        filters=[] if filters is None else filters,
        with_feedback_metrics=with_feedback_metrics,
        sort=sort,
    )
    return pd.DataFrame([memory.to_dict() for memory in memories])
|
|
2543
|
+
|
|
2544
|
+
def insert(self, items: Iterable[dict[str, Any]] | dict[str, Any], *, batch_size: int = 32) -> None:
    """
    Insert memories into the memoryset

    Params:
        items: List of memories to insert into the memoryset. This should be a list of
            dictionaries with the following keys:

            - `value`: Value of the memory
            - `label`: Label of the memory
            - `score`: Score of the memory
            - `source_id`: Optional unique ID of the memory in a system of reference
            - `...`: Any other metadata to store for the memory

        batch_size: Number of memories to insert in a single API call

    Raises:
        ValueError: If batch_size is not between 1 and 500

    Examples:
        >>> memoryset.insert([
        ...     {"value": "I am happy", "label": 1, "source_id": "data_123", "partition_id": "user_1", "tag": "happy"},
        ...     {"value": "I am sad", "label": 0, "source_id": "data_124", "partition_id": "user_1", "tag": "sad"},
        ... ])
    """
    if not 1 <= batch_size <= 500:
        raise ValueError("batch_size must be between 1 and 500")
    client = OrcaClient._resolve_client()
    # Accept a single memory dict as well as any iterable of dicts.
    if isinstance(items, dict):
        memory_list = cast(list[dict[str, Any]], [items])
    else:
        memory_list = list(items)
    # Insert in batches so a single oversized request cannot time out server-side.
    for start in range(0, len(memory_list), batch_size):
        payload = [
            _parse_memory_insert(item, type=self.memory_type)
            for item in memory_list[start : start + batch_size]
        ]
        client.POST(
            "/gpu/memoryset/{name_or_id}/memory",
            params={"name_or_id": self.id},
            json=cast(list[LabeledMemoryInsert] | list[ScoredMemoryInsert], payload),
        )

    self.refresh()
|
|
2583
|
+
|
|
2584
|
+
async def ainsert(self, items: Iterable[dict[str, Any]] | dict[str, Any], *, batch_size: int = 32) -> None:
    """
    Asynchronously insert memories into the memoryset

    Params:
        items: List of memories to insert into the memoryset. This should be a list of
            dictionaries with the following keys:

            - `value`: Value of the memory
            - `label`: Label of the memory
            - `score`: Score of the memory
            - `source_id`: Optional unique ID of the memory in a system of reference
            - `partition_id`: Optional partition ID of the memory
            - `...`: Any other metadata to store for the memory

        batch_size: Number of memories to insert in a single API call

    Raises:
        ValueError: If batch_size is not between 1 and 500

    Examples:
        >>> await memoryset.ainsert([
        ...     {"value": "I am happy", "label": 1, "source_id": "data_123", "partition_id": "user_1", "tag": "happy"},
        ...     {"value": "I am sad", "label": 0, "source_id": "data_124", "partition_id": "user_1", "tag": "sad"},
        ... ])
    """
    if not 1 <= batch_size <= 500:
        raise ValueError("batch_size must be between 1 and 500")
    client = OrcaAsyncClient._resolve_client()
    # Accept a single memory dict as well as any iterable of dicts.
    if isinstance(items, dict):
        memory_list = cast(list[dict[str, Any]], [items])
    else:
        memory_list = list(items)
    # Insert in batches so a single oversized request cannot time out server-side.
    for start in range(0, len(memory_list), batch_size):
        payload = [
            _parse_memory_insert(item, type=self.memory_type)
            for item in memory_list[start : start + batch_size]
        ]
        await client.POST(
            "/gpu/memoryset/{name_or_id}/memory",
            params={"name_or_id": self.id},
            json=cast(list[LabeledMemoryInsert] | list[ScoredMemoryInsert], payload),
        )

    await self.arefresh()
|
|
2624
|
+
|
|
2625
|
+
async def arefresh(self, throttle: float = 0):
    """
    Asynchronously refresh the information about the memoryset from the OrcaCloud

    Params:
        throttle: Minimum time in seconds between refreshes
    """
    now = datetime.now()
    # Bail out early if the previous refresh is still fresh enough.
    if now - self._last_refresh < timedelta(seconds=throttle):
        return
    refreshed = await type(self).aopen(self.id)
    self.__dict__.update(refreshed.__dict__)
    self._last_refresh = now
|
|
2640
|
+
|
|
2641
|
+
@overload
def get(self, memory_id: str) -> MemoryT:  # type: ignore -- this takes precedence
    pass

@overload
def get(self, memory_id: Iterable[str]) -> list[MemoryT]:
    pass

def get(self, memory_id: str | Iterable[str]) -> MemoryT | list[MemoryT]:
    """
    Fetch a memory or memories from the memoryset

    Params:
        memory_id: Unique identifier of the memory or memories to fetch

    Returns:
        Memory or list of memories from the memoryset

    Raises:
        LookupError: If no memory with the given id is found

    Examples:
        Fetch a single memory:
        >>> memoryset.get("0195019a-5bc7-7afb-b902-5945ee1fb766")
        LabeledMemory({ label: <positive: 1>, value: 'I am happy' })

        Fetch multiple memories:
        >>> memoryset.get([
        ...     "0195019a-5bc7-7afb-b902-5945ee1fb766",
        ...     "019501a1-ea08-76b2-9f62-95e4800b4841",
        ... ])
        [
            LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
            LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
        ]
    """
    client = OrcaClient._resolve_client()

    def _to_memory(data) -> MemoryT:
        # Responses with a "label" key are labeled memories; otherwise scored.
        memory_cls = LabeledMemory if "label" in data else ScoredMemory
        return cast(MemoryT, memory_cls(self.id, data))

    if isinstance(memory_id, str):
        response = client.GET(
            "/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
        )
        return _to_memory(response)
    response = client.POST(
        "/memoryset/{name_or_id}/memories/get",
        params={"name_or_id": self.id},
        json={"memory_ids": list(memory_id)},
    )
    return [_to_memory(memory) for memory in response]
|
|
2700
|
+
|
|
2701
|
+
@overload
def update(self, updates: dict[str, Any], *, batch_size: int = 32) -> MemoryT:
    pass

@overload
def update(self, updates: Iterable[dict[str, Any]], *, batch_size: int = 32) -> list[MemoryT]:
    pass

def update(
    self, updates: dict[str, Any] | Iterable[dict[str, Any]], *, batch_size: int = 32
) -> MemoryT | list[MemoryT]:
    """
    Update one or multiple memories in the memoryset

    Params:
        updates: List of updates to apply to the memories. Each update should be a dictionary
            with the following keys:

            - `memory_id`: Unique identifier of the memory to update (required)
            - `value`: Optional new value of the memory
            - `label`: Optional new label of the memory
            - `source_id`: Optional new source ID of the memory
            - `partition_id`: Optional new partition ID of the memory
            - `...`: Optional new values for metadata properties

        batch_size: Number of memories to update in a single API call

    Returns:
        Updated memory or list of updated memories

    Raises:
        ValueError: If batch_size is not between 1 and 500

    Examples:
        Update a single memory:
        >>> memoryset.update(
        ...     {
        ...         "memory_id": "019501a1-ea08-76b2-9f62-95e4800b4841",
        ...         "tag": "happy",
        ...     },
        ... )

        Update multiple memories:
        >>> memoryset.update(
        ...     {"memory_id": m.memory_id, "label": 2}
        ...     for m in memoryset.query(filters=[("tag", "==", "happy")])
        ... )
    """
    if batch_size <= 0 or batch_size > 500:
        raise ValueError("batch_size must be between 1 and 500")
    client = OrcaClient._resolve_client()
    # Normalize the single-dict form to a list so both shapes share one code path.
    updates_list = cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else list(updates)
    # update memories in batches to avoid API timeouts
    updated_memories: list[MemoryT] = []
    for i in range(0, len(updates_list), batch_size):
        batch = updates_list[i : i + batch_size]
        response = client.PATCH(
            "/gpu/memoryset/{name_or_id}/memories",
            params={"name_or_id": self.id},
            json=cast(
                list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
                [_parse_memory_update(update, type=self.memory_type) for update in batch],
            ),
        )
        # Responses with a "label" key are labeled memories; otherwise scored.
        updated_memories.extend(
            cast(
                MemoryT,
                (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
            )
            for memory in response
        )

    # A dict input returns a single memory; any other iterable returns the full list.
    return updated_memories[0] if isinstance(updates, dict) else updated_memories
|
|
2771
|
+
|
|
2772
|
+
def get_cascading_edits_suggestions(
    self,
    memory: MemoryT,
    *,
    old_label: int,
    new_label: int,
    max_neighbors: int = 50,
    max_validation_neighbors: int = 10,
    similarity_threshold: float | None = None,
    only_if_has_old_label: bool = True,
    exclude_if_new_label: bool = True,
    suggestion_cooldown_time: float = 3600.0 * 24.0,  # 1 day
    label_confirmation_cooldown_time: float = 3600.0 * 24.0 * 7,  # 1 week
) -> list[CascadingEditSuggestion]:
    """
    Suggests cascading edits for a given memory based on nearby points with similar labels.

    This function is triggered after a user changes a memory's label. It looks for nearby
    candidates in embedding space that may be subject to similar relabeling and returns them
    as suggestions. The system uses scoring heuristics, label filters, and cooldown tracking
    to reduce noise and improve usability.

    Params:
        memory: The memory whose label was just changed.
        old_label: The label this memory used to have.
        new_label: The label it was changed to.
        max_neighbors: Maximum number of neighbors to consider.
        max_validation_neighbors: Maximum number of neighbors to use for label suggestion.
        similarity_threshold: If set, only include neighbors with a lookup score above this threshold.
        only_if_has_old_label: If True, only consider neighbors that have the old label.
        exclude_if_new_label: If True, exclude neighbors that already have the new label.
        suggestion_cooldown_time: Minimum time (in seconds) since the last suggestion for a neighbor
            to be considered again.
        label_confirmation_cooldown_time: Minimum time (in seconds) since a neighbor's label was confirmed
            to be considered for suggestions.

    Returns:
        A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
    """
    # TODO: properly integrate this with memory edits and return something that can be applied
    request_body = {
        "old_label": old_label,
        "new_label": new_label,
        "max_neighbors": max_neighbors,
        "max_validation_neighbors": max_validation_neighbors,
        "similarity_threshold": similarity_threshold,
        "only_if_has_old_label": only_if_has_old_label,
        "exclude_if_new_label": exclude_if_new_label,
        "suggestion_cooldown_time": suggestion_cooldown_time,
        "label_confirmation_cooldown_time": label_confirmation_cooldown_time,
    }
    client = OrcaClient._resolve_client()
    return client.POST(
        "/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
        params={"name_or_id": self.id, "memory_id": memory.memory_id},
        json=request_body,
    )
|
|
2828
|
+
|
|
2829
|
+
def delete(self, memory_id: str | Iterable[str], *, batch_size: int = 32) -> None:
    """
    Delete memories from the memoryset

    Params:
        memory_id: unique identifiers of the memories to delete
        batch_size: Number of memories to delete in a single API call

    Raises:
        ValueError: If batch_size is not between 1 and 500

    Examples:
        Delete a single memory:
        >>> memoryset.delete("0195019a-5bc7-7afb-b902-5945ee1fb766")

        Delete multiple memories:
        >>> memoryset.delete([
        ...     "0195019a-5bc7-7afb-b902-5945ee1fb766",
        ...     "019501a1-ea08-76b2-9f62-95e4800b4841",
        ... ])
    """
    if not 1 <= batch_size <= 500:
        raise ValueError("batch_size must be between 1 and 500")
    client = OrcaClient._resolve_client()
    # Accept a single id as well as any iterable of ids.
    memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
    # Delete in batches so a single oversized request cannot time out server-side.
    for start in range(0, len(memory_ids), batch_size):
        client.POST(
            "/memoryset/{name_or_id}/memories/delete",
            params={"name_or_id": self.id},
            json={"memory_ids": memory_ids[start : start + batch_size]},
        )
    logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
    self.refresh()
|
|
2860
|
+
|
|
2861
|
+
@overload
def analyze(
    self,
    *analyses: dict[str, Any] | str,
    lookup_count: int = 15,
    clear_metrics: bool = False,
    background: Literal[True],
    partition_filter_mode: Literal[
        "ignore_partitions", "include_global", "exclude_global", "only_global"
    ] = "include_global",
) -> Job[MemorysetMetrics]:
    # Overload: with background=True the analysis is started asynchronously and a
    # Job handle is returned instead of the metrics themselves.
    pass
|
|
2873
|
+
|
|
2874
|
+
@overload
def analyze(
    self,
    *analyses: dict[str, Any] | str,
    lookup_count: int = 15,
    clear_metrics: bool = False,
    background: Literal[False] = False,
    partition_filter_mode: Literal[
        "ignore_partitions", "include_global", "exclude_global", "only_global"
    ] = "include_global",
) -> MemorysetMetrics:
    # Overload: with background=False (the default) the call blocks until the
    # analysis finishes and returns the computed metrics directly.
    pass
|
|
2886
|
+
|
|
2887
|
+
def analyze(
    self,
    *analyses: dict[str, Any] | str,
    lookup_count: int = 15,
    clear_metrics: bool = False,
    background: bool = False,
    partition_filter_mode: Literal[
        "ignore_partitions", "include_global", "exclude_global", "only_global"
    ] = "include_global",
) -> Job[MemorysetMetrics] | MemorysetMetrics:
    """
    Run analyses on the memoryset to find duplicates, clusters, mislabelings, and more

    The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
    attribute of each memory in the memoryset. Overall memoryset metrics will be returned as a dictionary.

    Params:
        analyses: List of analysis to run on the memoryset, can either be just the name of an
            analysis or a dictionary with a name property and additional config. The available
            analyses are:

            - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
            - **`"cluster"`**: Cluster the memories in the memoryset
            - **`"distribution"`**: Analyze the embedding distribution
            - **`"projection"`**: Create a 2D projection of the embeddings for visualization
            - **`"label"`**: Analyze the labels to find potential mislabelings (labeled memorysets only)
            - **`"class_patterns"`**: Analyze class patterns and find representative memories (labeled memorysets only)
            - **`"concepts"`**: Discover and name conceptual clusters in the memoryset (labeled memorysets only)

        lookup_count: Number of memories to lookup for each memory in the memoryset
        clear_metrics: Whether to clear any existing metrics from the memories before running the analysis
        background: Whether to run the analysis non-blocking and return a job handle
        partition_filter_mode: How to filter partitions when running the analysis
            - "ignore_partitions": Ignore partitions
            - "include_global": Include global memories
            - "exclude_global": Exclude global memories
            - "only_global": Only include global memories

    Returns:
        dictionary with aggregate metrics for each analysis that was run

    Raises:
        ValueError: If an invalid analysis name is provided, or if a config dictionary is
            missing its `name` property

    Examples:
        Run label and duplicate analysis:
        >>> memoryset.analyze("label", {"name": "duplicate", "possible_duplicate_threshold": 0.99})
        { "duplicate": { "num_duplicates": 10 },
          "label": {
            "label_metrics": [{
                "label": 0,
                "label_name": "negative",
                "average_lookup_score": 0.95,
                "memory_count": 100,
            }, {
                "label": 1,
                "label_name": "positive",
                "average_lookup_score": 0.90,
                "memory_count": 100,
            }]
            "neighbor_prediction_accuracy": 0.95,
            "mean_neighbor_label_confidence": 0.95,
            "mean_neighbor_label_entropy": 0.95,
            "mean_neighbor_predicted_label_ambiguity": 0.95,
          }
        }

        Remove all exact duplicates:
        >>> memoryset.delete(
        ...     m.memory_id
        ...     for m in memoryset.query(
        ...         filters=[("metrics.is_duplicate", "==", True)]
        ...     )
        ... )

        Display label analysis to review potential mislabelings:
        >>> memoryset.display_label_analysis()
    """
    # Get valid analysis names from MemorysetAnalysisConfigs
    valid_analysis_names = set(MemorysetAnalysisConfigs.__annotations__)

    configs: MemorysetAnalysisConfigs = {}
    for analysis in analyses:
        if isinstance(analysis, str):
            if analysis not in valid_analysis_names:
                raise ValueError(
                    f"Invalid analysis name: {analysis}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
                )
            configs[analysis] = {}
        else:
            # configs are user-provided dicts; validate the name key explicitly so a
            # missing key raises the documented ValueError instead of a bare KeyError
            if "name" not in analysis:
                raise ValueError(f"Analysis config is missing a 'name' property: {analysis}")
            name = analysis["name"]
            if name not in valid_analysis_names:
                raise ValueError(
                    f"Invalid analysis name: {name}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
                )
            # copy the config without its name key so the caller's dict is not mutated
            configs[name] = {key: value for key, value in analysis.items() if key != "name"}

    client = OrcaClient._resolve_client()
    response = client.POST(
        "/memoryset/{name_or_id}/analysis",
        params={"name_or_id": self.id},
        json={
            "configs": configs,
            "lookup_count": lookup_count,
            "clear_metrics": clear_metrics,
            "partition_filter_mode": partition_filter_mode,
        },
    )

    def get_analysis_result():
        # resolve the client lazily so the job result can be fetched later,
        # potentially from a different client context
        result_client = OrcaClient._resolve_client()
        return result_client.GET(
            "/memoryset/{name_or_id}/analysis/{analysis_job_id}",
            params={"name_or_id": self.id, "analysis_job_id": response["job_id"]},
        )["results"]

    job = Job(response["job_id"], get_analysis_result)
    return job if background else job.result()
|
|
3005
|
+
|
|
3006
|
+
def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
    """Group potential duplicates in the memoryset"""
    client = OrcaClient._resolve_client()
    raw_groups = client.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
    groups: list[list[MemoryT]] = []
    for raw_group in raw_groups:
        group: list[MemoryT] = []
        for raw_memory in raw_group:
            # the presence of a "label" key distinguishes labeled from scored memories
            if "label" in raw_memory:
                memory = LabeledMemory(self.id, raw_memory)
            else:
                memory = ScoredMemory(self.id, raw_memory)
            group.append(cast(MemoryT, memory))
        groups.append(group)
    return groups
|
|
3014
|
+
|
|
3015
|
+
|
|
3016
|
+
class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
    """
    A Handle to a collection of memories with labels in the OrcaCloud

    Attributes:
        id: Unique identifier for the memoryset
        name: Unique name of the memoryset
        description: Description of the memoryset
        label_names: Names for the class labels in the memoryset
        length: Number of memories in the memoryset
        embedding_model: Embedding model used to embed the memory values for semantic search
        created_at: When the memoryset was created, automatically generated on create
        updated_at: When the memoryset was last updated, automatically updated on updates
    """

    label_names: list[str]
    memory_type: MemoryType = "LABELED"

    def __init__(self, metadata: MemorysetMetadata):
        super().__init__(metadata)
        # validate explicitly instead of asserting, since asserts are stripped under -O
        if metadata["label_names"] is None:
            raise ValueError("label_names must not be None for a labeled memoryset")
        self.label_names = metadata["label_names"]

    def __eq__(self, other) -> bool:
        return isinstance(other, LabeledMemoryset) and self.id == other.id

    @overload
    @classmethod
    def create(
        cls,
        name: str,
        *,
        datasource: None = None,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        description: str | None = None,
        label_names: list[str],
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        hidden: bool = False,
    ) -> Self:
        pass

    @overload
    @classmethod
    def create(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = "label",
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[True],
        hidden: bool = False,
        subsample: int | float | None = None,
    ) -> Job[Self]:
        pass

    @overload
    @classmethod
    def create(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = "label",
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[False] = False,
        hidden: bool = False,
        subsample: int | float | None = None,
    ) -> Self:
        pass

    @classmethod
    def create(  # type: ignore[override]
        cls,
        name: str,
        *,
        datasource: Datasource | None = None,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = "label",
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: bool = False,
        hidden: bool = False,
        subsample: int | float | None = None,
    ) -> Self | Job[Self]:
        """
        Create a new labeled memoryset in the OrcaCloud

        If `datasource` is provided, all columns from the datasource that are not specified in the
        `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
        as metadata in the memoryset.

        If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
        You can add memories later using the `insert` method.

        Params:
            name: Name for the new memoryset (must be unique)
            datasource: Optional source data to populate the memories in the memoryset. If omitted,
                an empty memoryset will be created.
            embedding_model: Embedding model to use for embedding memory values for semantic search.
                If not provided, a default embedding model for the memoryset will be used.
            value_column: Name of the column in the datasource that contains the memory values
            label_column: Name of the column in the datasource that contains the memory labels.
                Must contain categorical values as integers or strings. String labels will be
                converted to integers with the unique strings extracted as `label_names`. To create
                a memoryset with all `None` labels, set to `None`.
            source_id_column: Optional name of the column in the datasource that contains the ids in
                the system of reference
            partition_id_column: Optional name of the column in the datasource that contains the partition ids
            description: Optional description for the memoryset, this will be used in agentic flows,
                so make sure it is concise and describes the contents of your memoryset not the
                datasource or the embedding model.
            label_names: List of human-readable names for the labels in the memoryset, must match
                the number of labels in the `label_column`. Will be automatically inferred if string
                labels are provided or if a [Dataset][datasets.Dataset] with a
                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
                value is longer than this it will be truncated, will default to the model's max
                sequence length if not provided
            prompt: Optional prompt to use when embedding documents/memories for storage
            remove_duplicates: Whether to remove duplicates from the datasource before inserting
                into the memoryset
            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
            index_params: Parameters for the vector index, defaults to `{}`
            if_exists: What to do if a memoryset with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing memoryset.
            background: Whether to run the operation non-blocking and return a job handle. Only
                used when a datasource is provided.
            hidden: Whether the memoryset should be hidden
            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
                datasource to insert. Use to limit the size of the initial memoryset. Only used
                when a datasource is provided.

        Returns:
            Handle to the new memoryset in the OrcaCloud

        Raises:
            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
                `"open"` and the params do not match those of the existing memoryset.
        """
        if datasource is None:
            # empty memorysets are created synchronously; datasource-related params do not apply
            return super().create(
                name,
                datasource=None,
                embedding_model=embedding_model,
                description=description,
                label_names=label_names,
                max_seq_length_override=max_seq_length_override,
                prompt=prompt,
                index_type=index_type,
                index_params=index_params,
                if_exists=if_exists,
                hidden=hidden,
                memory_type="LABELED",
            )
        # keyword arguments shared by the blocking and background code paths; the branch
        # below only exists so that the Literal background overloads resolve correctly
        create_kwargs: dict[str, Any] = dict(
            datasource=datasource,
            label_column=label_column,
            score_column=None,
            embedding_model=embedding_model,
            value_column=value_column,
            source_id_column=source_id_column,
            partition_id_column=partition_id_column,
            description=description,
            label_names=label_names,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            remove_duplicates=remove_duplicates,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            hidden=hidden,
            subsample=subsample,
            memory_type="LABELED",
        )
        if background:
            return super().create(name, background=True, **create_kwargs)
        return super().create(name, background=False, **create_kwargs)

    @overload
    @classmethod
    def from_datasource(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = "label",
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[True],
        hidden: bool = False,
        subsample: int | float | None = None,
    ) -> Job[Self]:
        pass

    @overload
    @classmethod
    def from_datasource(
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = "label",
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[False] = False,
        hidden: bool = False,
        subsample: int | float | None = None,
    ) -> Self:
        pass

    @classmethod
    def from_datasource(  # type: ignore[override]
        cls,
        name: str,
        *,
        datasource: Datasource,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        label_column: str | None = "label",
        source_id_column: str | None = None,
        partition_id_column: str | None = None,
        description: str | None = None,
        label_names: list[str] | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: bool = False,
        hidden: bool = False,
        subsample: int | float | None = None,
    ) -> Self | Job[Self]:
        """
        Create a new labeled memoryset in the OrcaCloud from a datasource.

        This is a convenience method that is equivalent to calling `create` with a datasource.
        All columns from the datasource that are not specified in the `value_column`,
        `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
        in the memoryset.

        Params:
            name: Name for the new memoryset (must be unique)
            datasource: Source data to populate the memories in the memoryset.
            embedding_model: Embedding model to use for embedding memory values for semantic search.
                If not provided, a default embedding model for the memoryset will be used.
            value_column: Name of the column in the datasource that contains the memory values
            label_column: Name of the column in the datasource that contains the memory labels.
                Must contain categorical values as integers or strings. String labels will be
                converted to integers with the unique strings extracted as `label_names`. To create
                a memoryset with all `None` labels, set to `None`.
            source_id_column: Optional name of the column in the datasource that contains the ids in
                the system of reference
            partition_id_column: Optional name of the column in the datasource that contains the partition ids
            description: Optional description for the memoryset, this will be used in agentic flows,
                so make sure it is concise and describes the contents of your memoryset not the
                datasource or the embedding model.
            label_names: List of human-readable names for the labels in the memoryset, must match
                the number of labels in the `label_column`. Will be automatically inferred if string
                labels are provided or if a [Dataset][datasets.Dataset] with a
                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
                value is longer than this it will be truncated, will default to the model's max
                sequence length if not provided
            prompt: Optional prompt to use when embedding documents/memories for storage
            remove_duplicates: Whether to remove duplicates from the datasource before inserting
                into the memoryset
            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
            index_params: Parameters for the vector index, defaults to `{}`
            if_exists: What to do if a memoryset with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing memoryset.
            background: Whether to run the operation non-blocking and return a job handle.
            hidden: Whether the memoryset should be hidden
            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
                datasource to insert. Use to limit the size of the initial memoryset.

        Returns:
            Handle to the new memoryset in the OrcaCloud

        Raises:
            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
                `"open"` and the params do not match those of the existing memoryset.
        """
        # delegate to `create`, which handles both the blocking and background paths; the
        # Literal branch below only exists so the typed overloads of `create` resolve
        if background:
            return cls.create(
                name,
                datasource=datasource,
                embedding_model=embedding_model,
                value_column=value_column,
                label_column=label_column,
                source_id_column=source_id_column,
                partition_id_column=partition_id_column,
                description=description,
                label_names=label_names,
                max_seq_length_override=max_seq_length_override,
                prompt=prompt,
                remove_duplicates=remove_duplicates,
                index_type=index_type,
                index_params=index_params,
                if_exists=if_exists,
                background=True,
                hidden=hidden,
                subsample=subsample,
            )
        return cls.create(
            name,
            datasource=datasource,
            embedding_model=embedding_model,
            value_column=value_column,
            label_column=label_column,
            source_id_column=source_id_column,
            partition_id_column=partition_id_column,
            description=description,
            label_names=label_names,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            remove_duplicates=remove_duplicates,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            background=False,
            hidden=hidden,
            subsample=subsample,
        )

    def display_label_analysis(self):
        """
        Display an interactive UI to review and act upon the label analysis results

        Note:
            This method is only available in Jupyter notebooks.
        """
        from ._utils.analysis_ui import display_suggested_memory_relabels

        display_suggested_memory_relabels(self)
|
|
3436
|
+
|
|
3437
|
+
|
|
3438
|
+
class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
|
|
3439
|
+
"""
|
|
3440
|
+
A Handle to a collection of memories with scores in the OrcaCloud
|
|
3441
|
+
|
|
3442
|
+
Attributes:
|
|
3443
|
+
id: Unique identifier for the memoryset
|
|
3444
|
+
name: Unique name of the memoryset
|
|
3445
|
+
description: Description of the memoryset
|
|
3446
|
+
length: Number of memories in the memoryset
|
|
3447
|
+
embedding_model: Embedding model used to embed the memory values for semantic search
|
|
3448
|
+
created_at: When the memoryset was created, automatically generated on create
|
|
3449
|
+
updated_at: When the memoryset was last updated, automatically updated on updates
|
|
3450
|
+
"""
|
|
3451
|
+
|
|
3452
|
+
memory_type: MemoryType = "SCORED"
|
|
3453
|
+
|
|
3454
|
+
def __eq__(self, other) -> bool:
|
|
3455
|
+
return isinstance(other, ScoredMemoryset) and self.id == other.id
|
|
3456
|
+
|
|
3457
|
+
@overload
|
|
3458
|
+
@classmethod
|
|
3459
|
+
def create(
|
|
3460
|
+
cls,
|
|
3461
|
+
name: str,
|
|
3462
|
+
*,
|
|
3463
|
+
datasource: None = None,
|
|
3464
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
3465
|
+
description: str | None = None,
|
|
3466
|
+
max_seq_length_override: int | None = None,
|
|
3467
|
+
prompt: str | None = None,
|
|
3468
|
+
index_type: IndexType = "FLAT",
|
|
3469
|
+
index_params: dict[str, Any] = {},
|
|
3470
|
+
if_exists: CreateMode = "error",
|
|
3471
|
+
hidden: bool = False,
|
|
3472
|
+
) -> Self:
|
|
3473
|
+
pass
|
|
3474
|
+
|
|
3475
|
+
@overload
|
|
3476
|
+
@classmethod
|
|
3477
|
+
def create(
|
|
3478
|
+
cls,
|
|
3479
|
+
name: str,
|
|
3480
|
+
*,
|
|
3481
|
+
datasource: Datasource,
|
|
3482
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
3483
|
+
value_column: str = "value",
|
|
3484
|
+
score_column: str | None = "score",
|
|
3485
|
+
source_id_column: str | None = None,
|
|
3486
|
+
partition_id_column: str | None = None,
|
|
3487
|
+
description: str | None = None,
|
|
3488
|
+
max_seq_length_override: int | None = None,
|
|
3489
|
+
prompt: str | None = None,
|
|
3490
|
+
remove_duplicates: bool = True,
|
|
3491
|
+
index_type: IndexType = "FLAT",
|
|
3492
|
+
index_params: dict[str, Any] = {},
|
|
3493
|
+
if_exists: CreateMode = "error",
|
|
3494
|
+
background: Literal[True],
|
|
3495
|
+
hidden: bool = False,
|
|
3496
|
+
subsample: int | float | None = None,
|
|
3497
|
+
) -> Job[Self]:
|
|
3498
|
+
pass
|
|
3499
|
+
|
|
3500
|
+
@overload
|
|
3501
|
+
@classmethod
|
|
3502
|
+
def create(
|
|
3503
|
+
cls,
|
|
3504
|
+
name: str,
|
|
3505
|
+
*,
|
|
3506
|
+
datasource: Datasource,
|
|
3507
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
3508
|
+
score_column: str | None = "score",
|
|
3509
|
+
value_column: str = "value",
|
|
3510
|
+
source_id_column: str | None = None,
|
|
3511
|
+
partition_id_column: str | None = None,
|
|
3512
|
+
description: str | None = None,
|
|
3513
|
+
max_seq_length_override: int | None = None,
|
|
3514
|
+
prompt: str | None = None,
|
|
3515
|
+
remove_duplicates: bool = True,
|
|
3516
|
+
index_type: IndexType = "FLAT",
|
|
3517
|
+
index_params: dict[str, Any] = {},
|
|
3518
|
+
if_exists: CreateMode = "error",
|
|
3519
|
+
background: Literal[False] = False,
|
|
3520
|
+
hidden: bool = False,
|
|
3521
|
+
subsample: int | float | None = None,
|
|
3522
|
+
) -> Self:
|
|
3523
|
+
pass
|
|
3524
|
+
|
|
3525
|
+
@classmethod
def create(  # type: ignore[override]
    cls,
    name: str,
    *,
    datasource: Datasource | None = None,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    value_column: str = "value",
    score_column: str | None = "score",
    source_id_column: str | None = None,
    partition_id_column: str | None = None,
    description: str | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    remove_duplicates: bool = True,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] | None = None,
    if_exists: CreateMode = "error",
    background: bool = False,
    hidden: bool = False,
    subsample: int | float | None = None,
) -> Self | Job[Self]:
    """
    Create a new scored memoryset in the OrcaCloud

    If `datasource` is provided, all columns from the datasource that are not specified in the
    `value_column`, `score_column`, `source_id_column`, or `partition_id_column` will be stored
    as metadata in the memoryset.

    If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
    You can add memories later using the `insert` method. In that case the datasource-specific
    params (`value_column`, `score_column`, `source_id_column`, `partition_id_column`,
    `remove_duplicates`, `subsample`, and `background`) are ignored and the memoryset handle
    is returned synchronously.

    Params:
        name: Name for the new memoryset (must be unique)
        datasource: Optional source data to populate the memories in the memoryset. If omitted,
            an empty memoryset will be created.
        embedding_model: Embedding model to use for embedding memory values for semantic search.
            If not provided, a default embedding model for the memoryset will be used.
        value_column: Name of the column in the datasource that contains the memory values
        score_column: Name of the column in the datasource that contains the memory scores. Must
            contain numerical values. To create a memoryset with all `None` scores, set to `None`.
        source_id_column: Optional name of the column in the datasource that contains the ids in
            the system of reference
        partition_id_column: Optional name of the column in the datasource that contains the partition ids
        description: Optional description for the memoryset, this will be used in agentic flows,
            so make sure it is concise and describes the contents of your memoryset not the
            datasource or the embedding model.
        max_seq_length_override: Maximum sequence length of values in the memoryset, if the
            value is longer than this it will be truncated, will default to the model's max
            sequence length if not provided
        prompt: Optional prompt to use when embedding documents/memories for storage
        remove_duplicates: Whether to remove duplicates from the datasource before inserting
            into the memoryset
        index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
            values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
        index_params: Parameters for the vector index, defaults to `{}`
        if_exists: What to do if a memoryset with the same name already exists, defaults to
            `"error"`. Other option is `"open"` to open the existing memoryset.
        background: Whether to run the operation non-blocking and return a job handle
        hidden: Whether the memoryset should be hidden
        subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
            datasource to insert. Use to limit the size of the initial memoryset.

    Returns:
        Handle to the new memoryset in the OrcaCloud

    Raises:
        ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
            `"open"` and the params do not match those of the existing memoryset.
    """
    # Avoid the shared-mutable-default pitfall ({} is evaluated once at function
    # definition time); normalize to a fresh dict so super().create still always
    # receives a dict, exactly as before.
    index_params = {} if index_params is None else index_params
    if datasource is None:
        # Empty memoryset: created synchronously, no ingestion job is started.
        return super().create(
            name,
            datasource=None,
            embedding_model=embedding_model,
            description=description,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            hidden=hidden,
            memory_type="SCORED",
        )
    elif background:
        # Pass `background` as a literal (not the bool variable) so the
        # overloaded super().create resolves to the Job-returning variant.
        return super().create(
            name,
            datasource=datasource,
            embedding_model=embedding_model,
            value_column=value_column,
            score_column=score_column,
            source_id_column=source_id_column,
            partition_id_column=partition_id_column,
            description=description,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            remove_duplicates=remove_duplicates,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            background=True,
            hidden=hidden,
            subsample=subsample,
            memory_type="SCORED",
        )
    else:
        return super().create(
            name,
            datasource=datasource,
            embedding_model=embedding_model,
            value_column=value_column,
            score_column=score_column,
            source_id_column=source_id_column,
            partition_id_column=partition_id_column,
            description=description,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            remove_duplicates=remove_duplicates,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            background=False,
            hidden=hidden,
            subsample=subsample,
            memory_type="SCORED",
        )
|
|
3652
|
+
|
|
3653
|
+
@overload
@classmethod
def from_datasource(
    cls,
    name: str,
    *,
    datasource: Datasource,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    value_column: str = "value",
    score_column: str | None = "score",
    source_id_column: str | None = None,
    partition_id_column: str | None = None,
    description: str | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    remove_duplicates: bool = True,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] = {},
    if_exists: CreateMode = "error",
    background: Literal[True],
    hidden: bool = False,
    subsample: int | float | None = None,
) -> Job[Self]:
    # Overload: with background=True the creation runs non-blocking and a
    # Job handle for the new memoryset is returned instead of the memoryset.
    pass
|
|
3677
|
+
|
|
3678
|
+
@overload
@classmethod
def from_datasource(
    cls,
    name: str,
    *,
    datasource: Datasource,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    score_column: str | None = "score",
    value_column: str = "value",
    source_id_column: str | None = None,
    partition_id_column: str | None = None,
    description: str | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    remove_duplicates: bool = True,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] = {},
    if_exists: CreateMode = "error",
    background: Literal[False] = False,
    hidden: bool = False,
    subsample: int | float | None = None,
) -> Self:
    # Overload: with background=False (the default) the call blocks until the
    # memoryset is created and returns the memoryset handle directly.
    pass
|
|
3702
|
+
|
|
3703
|
+
@classmethod
def from_datasource(  # type: ignore[override]
    cls,
    name: str,
    *,
    datasource: Datasource,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    value_column: str = "value",
    score_column: str | None = "score",
    source_id_column: str | None = None,
    partition_id_column: str | None = None,
    description: str | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    remove_duplicates: bool = True,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] | None = None,
    if_exists: CreateMode = "error",
    background: bool = False,
    hidden: bool = False,
    subsample: int | float | None = None,
) -> Self | Job[Self]:
    """
    Create a new scored memoryset in the OrcaCloud from a datasource.

    This is a convenience method that is equivalent to calling `create` with a datasource.
    All columns from the datasource that are not specified in the `value_column`,
    `score_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
    in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        datasource: Source data to populate the memories in the memoryset.
        embedding_model: Embedding model to use for embedding memory values for semantic search.
            If not provided, a default embedding model for the memoryset will be used.
        value_column: Name of the column in the datasource that contains the memory values
        score_column: Name of the column in the datasource that contains the memory scores. Must
            contain numerical values. To create a memoryset with all `None` scores, set to `None`.
        source_id_column: Optional name of the column in the datasource that contains the ids in
            the system of reference
        partition_id_column: Optional name of the column in the datasource that contains the partition ids
        description: Optional description for the memoryset, this will be used in agentic flows,
            so make sure it is concise and describes the contents of your memoryset not the
            datasource or the embedding model.
        max_seq_length_override: Maximum sequence length of values in the memoryset, if the
            value is longer than this it will be truncated, will default to the model's max
            sequence length if not provided
        prompt: Optional prompt to use when embedding documents/memories for storage
        remove_duplicates: Whether to remove duplicates from the datasource before inserting
            into the memoryset
        index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
            values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
        index_params: Parameters for the vector index, defaults to `{}`
        if_exists: What to do if a memoryset with the same name already exists, defaults to
            `"error"`. Other option is `"open"` to open the existing memoryset.
        background: Whether to run the operation non-blocking and return a job handle.
        hidden: Whether the memoryset should be hidden
        subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
            datasource to insert. Use to limit the size of the initial memoryset.

    Returns:
        Handle to the new memoryset in the OrcaCloud

    Raises:
        ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
            `"open"` and the params do not match those of the existing memoryset.
    """
    # Avoid the shared-mutable-default pitfall ({} is evaluated once at function
    # definition time); normalize to a fresh dict so super().create still always
    # receives a dict, exactly as before.
    index_params = {} if index_params is None else index_params
    if background:
        # Pass `background` as a literal (not the bool variable) so the
        # overloaded super().create resolves to the Job-returning variant.
        return super().create(
            name,
            datasource=datasource,
            embedding_model=embedding_model,
            value_column=value_column,
            score_column=score_column,
            source_id_column=source_id_column,
            partition_id_column=partition_id_column,
            description=description,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            remove_duplicates=remove_duplicates,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            background=True,
            hidden=hidden,
            subsample=subsample,
            memory_type="SCORED",
        )
    else:
        return super().create(
            name,
            datasource=datasource,
            embedding_model=embedding_model,
            value_column=value_column,
            score_column=score_column,
            source_id_column=source_id_column,
            partition_id_column=partition_id_column,
            description=description,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            remove_duplicates=remove_duplicates,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            background=False,
            hidden=hidden,
            subsample=subsample,
            memory_type="SCORED",
        )
|