orca-sdk 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. orca_sdk/__init__.py +30 -0
  2. orca_sdk/_shared/__init__.py +10 -0
  3. orca_sdk/_shared/metrics.py +634 -0
  4. orca_sdk/_shared/metrics_test.py +570 -0
  5. orca_sdk/_utils/__init__.py +0 -0
  6. orca_sdk/_utils/analysis_ui.py +196 -0
  7. orca_sdk/_utils/analysis_ui_style.css +51 -0
  8. orca_sdk/_utils/auth.py +65 -0
  9. orca_sdk/_utils/auth_test.py +31 -0
  10. orca_sdk/_utils/common.py +37 -0
  11. orca_sdk/_utils/data_parsing.py +129 -0
  12. orca_sdk/_utils/data_parsing_test.py +244 -0
  13. orca_sdk/_utils/pagination.py +126 -0
  14. orca_sdk/_utils/pagination_test.py +132 -0
  15. orca_sdk/_utils/prediction_result_ui.css +18 -0
  16. orca_sdk/_utils/prediction_result_ui.py +110 -0
  17. orca_sdk/_utils/tqdm_file_reader.py +12 -0
  18. orca_sdk/_utils/value_parser.py +45 -0
  19. orca_sdk/_utils/value_parser_test.py +39 -0
  20. orca_sdk/async_client.py +4104 -0
  21. orca_sdk/classification_model.py +1165 -0
  22. orca_sdk/classification_model_test.py +887 -0
  23. orca_sdk/client.py +4096 -0
  24. orca_sdk/conftest.py +382 -0
  25. orca_sdk/credentials.py +217 -0
  26. orca_sdk/credentials_test.py +121 -0
  27. orca_sdk/datasource.py +576 -0
  28. orca_sdk/datasource_test.py +463 -0
  29. orca_sdk/embedding_model.py +712 -0
  30. orca_sdk/embedding_model_test.py +206 -0
  31. orca_sdk/job.py +343 -0
  32. orca_sdk/job_test.py +108 -0
  33. orca_sdk/memoryset.py +3811 -0
  34. orca_sdk/memoryset_test.py +1150 -0
  35. orca_sdk/regression_model.py +841 -0
  36. orca_sdk/regression_model_test.py +595 -0
  37. orca_sdk/telemetry.py +742 -0
  38. orca_sdk/telemetry_test.py +119 -0
  39. orca_sdk-0.1.9.dist-info/METADATA +98 -0
  40. orca_sdk-0.1.9.dist-info/RECORD +41 -0
  41. orca_sdk-0.1.9.dist-info/WHEEL +4 -0
orca_sdk/memoryset.py ADDED
@@ -0,0 +1,3811 @@
+ from __future__ import annotations
+
+ import logging
+ from abc import ABC
+ from datetime import datetime, timedelta
+ from os import PathLike
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Generic,
+     Iterable,
+     Literal,
+     Self,
+     TypeVar,
+     cast,
+     overload,
+ )
+
+ import pandas as pd
+ import pyarrow as pa
+ from datasets import Dataset
+ from torch.utils.data import DataLoader as TorchDataLoader
+ from torch.utils.data import Dataset as TorchDataset
+
+ from ._utils.common import UNSET, CreateMode, DropMode
+ from .async_client import OrcaAsyncClient
+ from .client import (
+     CascadingEditSuggestion,
+     CloneMemorysetRequest,
+     CreateMemorysetFromDatasourceRequest,
+     CreateMemorysetRequest,
+     FilterItem,
+ )
+ from .client import LabeledMemory as LabeledMemoryResponse
+ from .client import (
+     LabeledMemoryInsert,
+ )
+ from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
+ from .client import (
+     LabeledMemoryUpdate,
+     LabeledMemoryWithFeedbackMetrics,
+     LabelPredictionMemoryLookup,
+     LabelPredictionWithMemoriesAndFeedback,
+     ListPredictionsRequest,
+     MemoryMetrics,
+     MemorysetAnalysisConfigs,
+     MemorysetMetadata,
+     MemorysetMetrics,
+     MemorysetUpdate,
+     MemoryType,
+     OrcaClient,
+ )
+ from .client import ScoredMemory as ScoredMemoryResponse
+ from .client import (
+     ScoredMemoryInsert,
+ )
+ from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
+ from .client import (
+     ScoredMemoryUpdate,
+     ScoredMemoryWithFeedbackMetrics,
+     ScorePredictionMemoryLookup,
+     ScorePredictionWithMemoriesAndFeedback,
+     TelemetryField,
+     TelemetryFilterItem,
+     TelemetrySortOptions,
+ )
+ from .datasource import Datasource
+ from .embedding_model import (
+     EmbeddingModelBase,
+     FinetunedEmbeddingModel,
+     PretrainedEmbeddingModel,
+ )
+ from .job import Job, Status
+ from .telemetry import ClassificationPrediction, RegressionPrediction
+
+ if TYPE_CHECKING:
+     from .classification_model import ClassificationModel
+     from .regression_model import RegressionModel
+
+ TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
+ """
+ Sort expression for telemetry data consisting of a field and a direction.
+
+ * **`field`**: The field to sort on.
+ * **`direction`**: The direction to sort in.
+
+ Examples:
+     >>> ("feedback_metrics.accuracy.avg", "asc")
+     >>> ("lookup.count", "desc")
+ """
+
+ FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"]
+ """
+ Operations that can be used in a filter expression.
+ """
+
+ FilterValue = str | int | float | bool | datetime | None | list[str | None] | list[int] | list[float] | list[bool]
+ """
+ Values that can be used in a filter expression.
+ """
+
+ FilterItemTuple = tuple[str, FilterOperation, FilterValue]
+ """
+ Filter expression consisting of a field, an operator, and a value:
+
+ * **`field`**: The field to filter on.
+ * **`operation`**: The operation to apply to the field and value.
+ * **`value`**: The value to compare the field against.
+
+ Examples:
+     >>> ("label", "==", 0)
+     >>> ("metadata.author", "like", "John")
+     >>> ("source_id", "in", ["123", "456"])
+     >>> ("feedback_metrics.accuracy.avg", ">", 0.95)
+ """
+
+ IndexType = Literal["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ", "HNSW", "DISKANN"]
+
+ DEFAULT_COLUMN_NAMES = {"value", "source_id", "partition_id"}
+ TYPE_SPECIFIC_COLUMN_NAMES = {"label", "score"}
+ FORBIDDEN_METADATA_COLUMN_NAMES = {
+     "memory_id",
+     "memory_version",
+     "embedding",
+     "created_at",
+     "updated_at",
+     "metrics",
+     "feedback_metrics",
+     "lookup",
+ }
+
+
+ def _is_metric_column(column: str):
+     return column in ["feedback_metrics", "lookup"]
+
+
+ def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem | TelemetryFilterItem:
+     field = input[0].split(".")
+     if (
+         len(field) == 1
+         and field[0] not in DEFAULT_COLUMN_NAMES | TYPE_SPECIFIC_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES
+     ):
+         field = ["metadata", field[0]]
+     op = input[1]
+     value = input[2]
+     if isinstance(value, datetime):
+         value = value.isoformat()
+     if _is_metric_column(field[0]):
+         if not (
+             (isinstance(value, list) and all(isinstance(v, float) or isinstance(v, int) for v in value))
+             or isinstance(value, float)
+             or isinstance(value, int)
+         ):
+             raise ValueError(f"Invalid value for {field[0]} filter: {value}")
+         if field[0] == "feedback_metrics" and (len(field) != 3 or field[2] not in ["avg", "count"]):
+             raise ValueError(
+                 "Feedback metrics filters must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
+             )
+         elif field[0] == "lookup" and (len(field) != 2 or field[1] != "count"):
+             raise ValueError("Lookup filters must follow the format `lookup.count`")
+         if op == "like":
+             raise ValueError("Like filters are not supported on metric columns")
+         op = cast(Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in"], op)
+         value = cast(float | int | list[float] | list[int], value)
+         return TelemetryFilterItem(field=cast(TelemetryField, tuple(field)), op=op, value=value)
+
+     # Convert list to tuple for FilterItem field type
+     return FilterItem(field=tuple(field), op=op, value=value)  # type: ignore[assignment]
+
+
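+ # Illustrative examples of the filter parsing helper above; the field names
+ # used here are hypothetical:
+ #
+ #     _parse_filter_item_from_tuple(("author", "==", "Jane"))
+ #     # -> FilterItem on ("metadata", "author"), since "author" is not a
+ #     #    default or type-specific column
+ #     _parse_filter_item_from_tuple(("lookup.count", ">", 10))
+ #     # -> TelemetryFilterItem on ("lookup", "count")
+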
+ def _parse_sort_item_from_tuple(
+     input: TelemetrySortItem,
+ ) -> TelemetrySortOptions:
+     field = input[0].split(".")
+
+     if len(field) == 1:
+         raise ValueError("Sort field must be a telemetry field with an aggregate function name value")
+     if field[0] not in ["feedback_metrics", "lookup"]:
+         raise ValueError("Sort field must be one of telemetry fields: feedback_metrics or lookup")
+     if field[0] == "feedback_metrics":
+         if len(field) != 3:
+             raise ValueError(
+                 "Feedback metrics must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
+             )
+         if field[2] not in ["avg", "count"]:
+             raise ValueError("Feedback metrics can only be sorted on avg or count")
+     if field[0] == "lookup":
+         if len(field) != 2:
+             raise ValueError("Lookup must follow the format `lookup.count`")
+         if field[1] != "count":
+             raise ValueError("Lookup can only be sorted on count")
+     # Convert list to tuple for TelemetryField type
+     return TelemetrySortOptions(field=cast(TelemetryField, tuple(field)), direction=input[1])
+
+
+ def _parse_memory_insert(memory: dict[str, Any], type: MemoryType) -> LabeledMemoryInsert | ScoredMemoryInsert:
+     value = memory.get("value")
+     if not isinstance(value, str):
+         raise ValueError("Memory value must be a string")
+     source_id = memory.get("source_id")
+     if source_id is not None and not isinstance(source_id, str):
+         raise ValueError("Memory source_id must be a string")
+     partition_id = memory.get("partition_id")
+     if partition_id is not None and not isinstance(partition_id, str):
+         raise ValueError("Memory partition_id must be a string")
+     match type:
+         case "LABELED":
+             label = memory.get("label")
+             if label is not None and not isinstance(label, int):
+                 raise ValueError("Memory label must be an integer")
+             metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"label"}}
+             if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                 raise ValueError(
+                     f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                 )
+             return {
+                 "value": value,
+                 "label": label,
+                 "source_id": source_id,
+                 "partition_id": partition_id,
+                 "metadata": metadata,
+             }
+         case "SCORED":
+             score = memory.get("score")
+             if score is not None and not isinstance(score, (int, float)):
+                 raise ValueError("Memory score must be a number")
+             metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"score"}}
+             if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                 raise ValueError(
+                     f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                 )
+             return {
+                 "value": value,
+                 "score": score,
+                 "source_id": source_id,
+                 "partition_id": partition_id,
+                 "metadata": metadata,
+             }
+
+
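+ # A minimal sketch of what _parse_memory_insert produces for a labeled row;
+ # the "topic" metadata key is hypothetical:
+ #
+ #     _parse_memory_insert({"value": "hello", "label": 1, "topic": "greeting"}, type="LABELED")
+ #     # -> {"value": "hello", "label": 1, "source_id": None,
+ #     #     "partition_id": None, "metadata": {"topic": "greeting"}}
+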
+ def _parse_memory_update(update: dict[str, Any], type: MemoryType) -> LabeledMemoryUpdate | ScoredMemoryUpdate:
+     if "memory_id" not in update:
+         raise ValueError("memory_id must be specified in the update dictionary")
+     memory_id = update["memory_id"]
+     if not isinstance(memory_id, str):
+         raise ValueError("memory_id must be a string")
+     payload: LabeledMemoryUpdate | ScoredMemoryUpdate = {"memory_id": memory_id}
+     if "value" in update:
+         if not isinstance(update["value"], str):
+             raise ValueError("value must be a string or unset")
+         payload["value"] = update["value"]
+     if "source_id" in update:
+         source_id = update["source_id"]
+         if source_id is not None and not isinstance(source_id, str):
+             raise ValueError("source_id must be a string or None")
+         payload["source_id"] = source_id
+     if "partition_id" in update:
+         partition_id = update["partition_id"]
+         if partition_id is not None and not isinstance(partition_id, str):
+             raise ValueError("partition_id must be a string or None")
+         payload["partition_id"] = partition_id
+     match type:
+         case "LABELED":
+             payload = cast(LabeledMemoryUpdate, payload)
+             if "label" in update:
+                 if not isinstance(update["label"], int):
+                     raise ValueError("label must be an integer or unset")
+                 payload["label"] = update["label"]
+             metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "label"}}
+             if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                 raise ValueError(
+                     f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                 )
+             payload["metadata"] = metadata
+             return payload
+         case "SCORED":
+             payload = cast(ScoredMemoryUpdate, payload)
+             if "score" in update:
+                 if not isinstance(update["score"], (int, float)):
+                     raise ValueError("score must be a number or unset")
+                 payload["score"] = update["score"]
+             metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "score"}}
+             if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                 raise ValueError(
+                     f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                 )
+             payload["metadata"] = metadata
+             return cast(ScoredMemoryUpdate, payload)
+
+
+ class MemoryBase(ABC):
+     value: str
+     embedding: list[float]
+     source_id: str | None
+     partition_id: str | None
+     created_at: datetime
+     updated_at: datetime
+     metadata: dict[str, str | float | int | bool | None]
+     metrics: MemoryMetrics
+     memory_id: str
+     memory_version: int
+     feedback_metrics: dict[str, Any]
+     lookup_count: int
+     memory_type: MemoryType  # defined by subclasses
+
+     def __init__(
+         self,
+         memoryset_id: str,
+         memory: (
+             LabeledMemoryResponse
+             | LabeledMemoryLookupResponse
+             | LabeledMemoryWithFeedbackMetrics
+             | LabelPredictionMemoryLookup
+             | ScoredMemoryResponse
+             | ScoredMemoryLookupResponse
+             | ScoredMemoryWithFeedbackMetrics
+             | ScorePredictionMemoryLookup
+         ),
+     ):
+         # for internal use only, do not document
+         self.memoryset_id = memoryset_id
+         self.memory_id = memory["memory_id"]
+         self.memory_version = memory["memory_version"]
+         self.value = cast(str, memory["value"])
+         self.embedding = memory["embedding"]
+         self.source_id = memory["source_id"]
+         self.partition_id = memory["partition_id"]
+         self.created_at = datetime.fromisoformat(memory["created_at"])
+         self.updated_at = datetime.fromisoformat(memory["updated_at"])
+         self.metadata = memory["metadata"]
+         self.metrics = memory["metrics"] if "metrics" in memory else {}
+         self.feedback_metrics = memory.get("feedback_metrics", {}) or {}
+         self.lookup_count = memory.get("lookup_count", 0)
+
+     def __getattr__(self, key: str) -> Any:
+         if key.startswith("__") or key not in self.metadata:
+             raise AttributeError(f"{key} is not a valid attribute")
+         return self.metadata[key]
+
+     def _convert_to_classification_prediction(
+         self,
+         prediction: LabelPredictionWithMemoriesAndFeedback,
+         *,
+         memoryset: LabeledMemoryset,
+         model: ClassificationModel,
+     ) -> ClassificationPrediction:
+         """
+         Convert internal prediction TypedDict to ClassificationPrediction object.
+         """
+         input_value = prediction.get("input_value")
+         input_value_str: str | None = None
+         if input_value is not None:
+             input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value
+
+         return ClassificationPrediction(
+             prediction_id=prediction["prediction_id"],
+             label=prediction.get("label"),
+             label_name=prediction.get("label_name"),
+             score=None,
+             confidence=prediction["confidence"],
+             anomaly_score=prediction["anomaly_score"],
+             memoryset=memoryset,
+             model=model,
+             telemetry=prediction,
+             logits=prediction.get("logits"),
+             input_value=input_value_str,
+         )
+
+     def _convert_to_regression_prediction(
+         self,
+         prediction: ScorePredictionWithMemoriesAndFeedback,
+         *,
+         memoryset: ScoredMemoryset,
+         model: RegressionModel,
+     ) -> RegressionPrediction:
+         """
+         Convert internal prediction TypedDict to RegressionPrediction object.
+         """
+         input_value = prediction.get("input_value")
+         input_value_str: str | None = None
+         if input_value is not None:
+             input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value
+
+         return RegressionPrediction(
+             prediction_id=prediction["prediction_id"],
+             label=None,
+             label_name=None,
+             score=prediction.get("score"),
+             confidence=prediction["confidence"],
+             anomaly_score=prediction["anomaly_score"],
+             memoryset=memoryset,
+             model=model,
+             telemetry=prediction,
+             logits=None,
+             input_value=input_value_str,
+         )
+
+     def feedback(self) -> dict[str, list[bool] | list[float]]:
+         """
+         Get feedback metrics computed from predictions that used this memory.
+
+         Returns a dictionary where:
+         - Keys are feedback category names
+         - Values are lists of feedback values (you may want to compute the mean over the raw values)
+         """
+         # Collect all feedbacks by category, paginating through all predictions
+         feedback_by_category: dict[str, list[bool] | list[float]] = {}
+         batch_size = 500
+         offset = 0
+
+         while True:
+             predictions_batch = self.predictions(limit=batch_size, offset=offset)
+
+             if not predictions_batch:
+                 break
+
+             for prediction in predictions_batch:
+                 telemetry = prediction._telemetry
+                 if "feedbacks" not in telemetry:
+                     continue
+
+                 for fb in telemetry["feedbacks"]:
+                     category_name = fb["category_name"]
+                     value = fb["value"]
+                     # Convert BINARY (1/0) to boolean, CONTINUOUS to float
+                     if fb["category_type"] == "BINARY":
+                         value = bool(value)
+                         if category_name not in feedback_by_category:
+                             feedback_by_category[category_name] = []
+                         cast(list[bool], feedback_by_category[category_name]).append(value)
+                     else:
+                         value = float(value)
+                         if category_name not in feedback_by_category:
+                             feedback_by_category[category_name] = []
+                         cast(list[float], feedback_by_category[category_name]).append(value)
+
+             if len(predictions_batch) < batch_size:
+                 break
+
+             offset += batch_size
+
+         return feedback_by_category
+
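+     # A short usage sketch for feedback(), assuming `memory` is a LabeledMemory
+     # with recorded telemetry; the "helpful" category name is hypothetical:
+     #
+     #     fb = memory.feedback()
+     #     if "helpful" in fb:
+     #         ratio = sum(fb["helpful"]) / len(fb["helpful"])  # share of positive feedback
+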
+     def _update(
+         self,
+         *,
+         value: str = UNSET,
+         source_id: str | None = UNSET,
+         partition_id: str | None = UNSET,
+         **metadata: None | bool | float | int | str,
+     ) -> Self:
+         client = OrcaClient._resolve_client()
+         response = client.PATCH(
+             "/gpu/memoryset/{name_or_id}/memory",
+             params={"name_or_id": self.memoryset_id},
+             json=_parse_memory_update(
+                 {"memory_id": self.memory_id}
+                 | ({"value": value} if value is not UNSET else {})
+                 | ({"source_id": source_id} if source_id is not UNSET else {})
+                 | ({"partition_id": partition_id} if partition_id is not UNSET else {})
+                 | {k: v for k, v in metadata.items() if v is not UNSET},
+                 type=self.memory_type,
+             ),
+         )
+         self.__dict__.update(self.__class__(self.memoryset_id, response).__dict__)
+         return self
+
+     def to_dict(self) -> dict[str, Any]:
+         """
+         Convert the memory to a dictionary
+         """
+         return {
+             "value": self.value,
+             "embedding": self.embedding,
+             "source_id": self.source_id,
+             "partition_id": self.partition_id,
+             "created_at": self.created_at,
+             "updated_at": self.updated_at,
+             "metadata": self.metadata,
+             "metrics": self.metrics,
+             "memory_id": self.memory_id,
+             "memory_version": self.memory_version,
+             "feedback_metrics": self.feedback_metrics,
+             "lookup_count": self.lookup_count,
+             "memory_type": self.memory_type,
+         }
+
+
+ class LabeledMemory(MemoryBase):
+     """
+     A row of the [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
+
+     Attributes:
+         value: Value represented by the row
+         embedding: Embedding of the value of the memory for semantic search, automatically generated
+             with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
+         label: Class label of the memory
+         label_name: Human-readable name of the label, automatically populated from the
+             [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
+         source_id: Optional unique identifier of the memory in a system of reference
+         partition_id: Optional identifier of the partition the memory belongs to
+         metrics: Metrics about the memory, generated when running an analysis on the
+             [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
+         metadata: Metadata associated with the memory that is not used in the model. Metadata
+             properties are also accessible as individual attributes on the instance.
+         memory_id: Unique identifier for the memory, automatically generated on insert
+         memory_version: Version of the memory, automatically updated when the label or value changes
+         created_at: When the memory was created, automatically generated on insert
+         updated_at: When the memory was last updated, automatically updated on update
+
+     ## Other Attributes:
+     * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
+     """
+
+     label: int | None
+     label_name: str | None
+     memory_type = "LABELED"
+
+     def __init__(
+         self,
+         memoryset_id: str,
+         memory: (
+             LabeledMemoryResponse
+             | LabeledMemoryLookupResponse
+             | LabelPredictionMemoryLookup
+             | LabeledMemoryWithFeedbackMetrics
+         ),
+     ):
+         # for internal use only, do not document
+         super().__init__(memoryset_id, memory)
+         self.label = memory["label"]
+         self.label_name = memory["label_name"]
+
+     def __repr__(self) -> str:
+         return (
+             "LabeledMemory({ "
+             + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
+             + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
+             + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
+             + (f", partition_id: '{self.partition_id}'" if self.partition_id is not None else "")
+             + " })"
+         )
+
+     def __eq__(self, other: object) -> bool:
+         return isinstance(other, LabeledMemory) and self.memory_id == other.memory_id
+
+     def update(
+         self,
+         *,
+         value: str = UNSET,
+         label: int | None = UNSET,
+         source_id: str | None = UNSET,
+         partition_id: str | None = UNSET,
+         **metadata: None | bool | float | int | str,
+     ) -> LabeledMemory:
+         """
+         Update the memory with new values
+
+         Note:
+             If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
+
+         Params:
+             value: New value of the memory
+             label: New label of the memory
+             source_id: New source ID of the memory
+             partition_id: New partition ID of the memory
+             **metadata: New values for metadata properties
+
+         Returns:
+             The updated memory
+         """
+         self._update(value=value, label=label, source_id=source_id, partition_id=partition_id, **metadata)
+         return self
+
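+     # A brief usage sketch, assuming label 1 is valid for the memoryset; the
+     # "reviewed" metadata key is hypothetical:
+     #
+     #     memory.update(label=1, reviewed=True)  # bumps memory_version
+     #     memory.update(source_id=None)          # clears the source id
+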
+     def predictions(
+         self,
+         limit: int = 100,
+         offset: int = 0,
+         tag: str | None = None,
+         sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
+         expected_label_match: bool | None = None,
+     ) -> list[ClassificationPrediction]:
+         """
+         Get classification predictions that used this memory.
+
+         Args:
+             limit: Maximum number of predictions to return (default: 100)
+             offset: Number of predictions to skip for pagination (default: 0)
+             tag: Optional tag filter to only include predictions with this tag
+             sort: List of (field, direction) tuples for sorting results.
+                 Valid fields: "anomaly_score", "confidence", "timestamp".
+                 Valid directions: "asc", "desc"
+             expected_label_match: Filter by prediction correctness:
+                 - True: only return correct predictions (label == expected_label)
+                 - False: only return incorrect predictions (label != expected_label)
+                 - None: return all predictions (default)
+
+         Returns:
+             List of ClassificationPrediction objects that used this memory
+         """
+
+         client = OrcaClient._resolve_client()
+         request_json: ListPredictionsRequest = {
+             "memory_id": self.memory_id,
+             "limit": limit,
+             "offset": offset,
+             "tag": tag,
+             "expected_label_match": expected_label_match,
+         }
+         if sort:
+             request_json["sort"] = sort
+         predictions_data = client.POST(
+             "/telemetry/prediction",
+             json=request_json,
+         )
+
+         # Filter to only classification predictions and convert to ClassificationPrediction objects
+         classification_predictions = [
+             cast(LabelPredictionWithMemoriesAndFeedback, p) for p in predictions_data if "label" in p
+         ]
+
+         from .classification_model import ClassificationModel
+
+         memorysets: dict[str, LabeledMemoryset] = {}
+         models: dict[str, ClassificationModel] = {}
+
+         def resolve_memoryset(memoryset_id: str) -> LabeledMemoryset:
+             if memoryset_id not in memorysets:
+                 memorysets[memoryset_id] = LabeledMemoryset.open(memoryset_id)
+             return memorysets[memoryset_id]
+
+         def resolve_model(model_id: str) -> ClassificationModel:
+             if model_id not in models:
+                 models[model_id] = ClassificationModel.open(model_id)
+             return models[model_id]
+
+         return [
+             self._convert_to_classification_prediction(
+                 p,
+                 memoryset=resolve_memoryset(p["memoryset_id"]),
+                 model=resolve_model(p["model_id"]),
+             )
+             for p in classification_predictions
+         ]
+
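+     # A usage sketch for predictions(), assuming telemetry has been recorded;
+     # the tag value is hypothetical:
+     #
+     #     wrong = memory.predictions(
+     #         expected_label_match=False,
+     #         sort=[("confidence", "desc")],
+     #         tag="production",
+     #     )  # confidently wrong predictions that retrieved this memory
+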
+     def to_dict(self) -> dict[str, Any]:
+         """
+         Convert the memory to a dictionary
+         """
+         super_dict = super().to_dict()
+         super_dict["label"] = self.label
+         super_dict["label_name"] = self.label_name
+         return super_dict
+
+
+ class LabeledMemoryLookup(LabeledMemory):
+     """
+     Lookup result for a memory in a memoryset
+
+     Attributes:
+         lookup_score: Similarity between the memory embedding and search query embedding
+         attention_weight: Weight the model assigned to the memory during prediction if this lookup
+             happened as part of a prediction
+         value: Value represented by the row
+         embedding: Embedding of the value of the memory for semantic search, automatically generated
+             with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
+         label: Class label of the memory
+         label_name: Human-readable name of the label, automatically populated from the
+             [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
+         source_id: Optional unique identifier of the memory in a system of reference
+         partition_id: Optional identifier of the partition the memory belongs to
+         metrics: Metrics about the memory, generated when running an analysis on the
+             [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
+         metadata: Metadata associated with the memory that is not used in the model. Metadata
+             properties are also accessible as individual attributes on the instance.
+         memory_id: The unique identifier for the memory, automatically generated on insert
+         memory_version: The version of the memory, automatically updated when the label or value changes
+         created_at: When the memory was created, automatically generated on insert
+         updated_at: When the memory was last updated, automatically updated on update
+
+     ## Other Attributes:
+     * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
+     """
+
+     lookup_score: float
+     attention_weight: float | None
+
+     def __init__(
+         self,
+         memoryset_id: str,
+         memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
+     ):
+         # for internal use only, do not document
+         super().__init__(memoryset_id, memory_lookup)
+         self.lookup_score = memory_lookup["lookup_score"]
+         self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
+
+     def __repr__(self) -> str:
+         return (
+             "LabeledMemoryLookup({ "
+             + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
+             + f", lookup_score: {self.lookup_score:.2f}"
+             + (f", attention_weight: {self.attention_weight:.2f}" if self.attention_weight is not None else "")
+             + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
+             + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
+             + (f", partition_id: '{self.partition_id}'" if self.partition_id is not None else "")
+             + " })"
+         )
+
+
+ class ScoredMemory(MemoryBase):
+     """
+     A row of the [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
+
+     Attributes:
+         value: Value represented by the row
+         embedding: Embedding of the value of the memory for semantic search, automatically generated
+             with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
+         score: Score of the memory
+         source_id: Optional unique identifier of the memory in a system of reference
+         partition_id: Optional identifier of the partition the memory belongs to
+         metrics: Metrics about the memory, generated when running an analysis on the
+             [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
+         metadata: Metadata associated with the memory that is not used in the model. Metadata
+             properties are also accessible as individual attributes on the instance.
+         memory_id: Unique identifier for the memory, automatically generated on insert
+         memory_version: Version of the memory, automatically updated when the score or value changes
+         created_at: When the memory was created, automatically generated on insert
+         updated_at: When the memory was last updated, automatically updated on update
+
+     ## Other Attributes:
+     * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
+     """
+
+     score: float | None
+     memory_type = "SCORED"
+
+     def __init__(
+         self,
+         memoryset_id: str,
+         memory: (
+             ScoredMemoryResponse
+             | ScoredMemoryLookupResponse
+             | ScorePredictionMemoryLookup
+             | ScoredMemoryWithFeedbackMetrics
+         ),
+     ):
+         # for internal use only, do not document
+         super().__init__(memoryset_id, memory)
+         self.score = memory["score"]
+
+     def __repr__(self) -> str:
+         return (
+             "ScoredMemory({ "
+             + (f"score: {self.score:.2f}" if self.score is not None else "score: None")
+             + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
+             + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
+             + (f", partition_id: '{self.partition_id}'" if self.partition_id is not None else "")
+             + " })"
+         )
+
+     def __eq__(self, other: object) -> bool:
+         return isinstance(other, ScoredMemory) and self.memory_id == other.memory_id
+
+     def update(
+         self,
+         *,
+         value: str = UNSET,
+         score: float | None = UNSET,
+         source_id: str | None = UNSET,
+         partition_id: str | None = UNSET,
+         **metadata: None | bool | float | int | str,
+     ) -> ScoredMemory:
+         """
+         Update the memory with new values
+
+         Note:
+             If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
+
+         Params:
+             value: New value of the memory
+             score: New score of the memory
+             source_id: New source ID of the memory
+             partition_id: New partition ID of the memory
+             **metadata: New values for metadata properties
+
+         Returns:
+             The updated memory
+         """
+         self._update(value=value, score=score, source_id=source_id, partition_id=partition_id, **metadata)
+         return self
+
+     def predictions(
+         self,
+         limit: int = 100,
+         offset: int = 0,
+         tag: str | None = None,
+         sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
+         expected_label_match: bool | None = None,
+     ) -> list[RegressionPrediction]:
+         """
+         Get regression predictions that used this memory.
+
+         Args:
+             limit: Maximum number of predictions to return (default: 100)
+             offset: Number of predictions to skip for pagination (default: 0)
+             tag: Optional tag filter to only include predictions with this tag
+             sort: List of (field, direction) tuples for sorting results.
+                 Valid fields: "anomaly_score", "confidence", "timestamp".
+                 Valid directions: "asc", "desc"
+             expected_label_match: Filter by prediction correctness:
+                 - True: only return correct predictions (score close to expected_score)
+                 - False: only return incorrect predictions (score differs from expected_score)
+                 - None: return all predictions (default)
+                 Note: For regression, "correctness" is based on score proximity to expected_score.
+
+         Returns:
+             List of RegressionPrediction objects that used this memory
+         """
+         client = OrcaClient._resolve_client()
+         request_json: ListPredictionsRequest = {
+             "memory_id": self.memory_id,
+             "limit": limit,
+             "offset": offset,
+             "tag": tag,
+             "expected_label_match": expected_label_match,
+         }
+         if sort:
+             request_json["sort"] = sort
+         predictions_data = client.POST(
+             "/telemetry/prediction",
+             json=request_json,
+         )
+
+         # Filter to only regression predictions and convert to RegressionPrediction objects
+         regression_predictions = [
+             cast(ScorePredictionWithMemoriesAndFeedback, p) for p in predictions_data if "score" in p
+         ]
+
+         from .regression_model import RegressionModel
+
+         memorysets: dict[str, ScoredMemoryset] = {}
+         models: dict[str, RegressionModel] = {}
+
+         def resolve_memoryset(memoryset_id: str) -> ScoredMemoryset:
+             if memoryset_id not in memorysets:
+                 memorysets[memoryset_id] = ScoredMemoryset.open(memoryset_id)
+             return memorysets[memoryset_id]
+
+         def resolve_model(model_id: str) -> RegressionModel:
+             if model_id not in models:
+                 models[model_id] = RegressionModel.open(model_id)
+             return models[model_id]
+
+         return [
+             self._convert_to_regression_prediction(
+                 p,
+                 memoryset=resolve_memoryset(p["memoryset_id"]),
+                 model=resolve_model(p["model_id"]),
+             )
+             for p in regression_predictions
+         ]
+
+     def to_dict(self) -> dict[str, Any]:
+         """
+         Convert the memory to a dictionary
+         """
+         super_dict = super().to_dict()
+         super_dict["score"] = self.score
+         return super_dict
+
+
+ class ScoredMemoryLookup(ScoredMemory):
+     """
+     Lookup result for a memory in a memoryset
+
+     Attributes:
+         lookup_score: Similarity between the memory embedding and search query embedding
+         attention_weight: Weight the model assigned to the memory during prediction if this lookup
+             happened as part of a prediction
+         value: Value represented by the row
+         embedding: Embedding of the value of the memory for semantic search, automatically generated
+             with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
+         score: Score of the memory
+         source_id: Optional unique identifier of the memory in a system of reference
+         partition_id: Optional identifier of the partition the memory belongs to
+         metrics: Metrics about the memory, generated when running an analysis on the
+             [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
+         memory_id: The unique identifier for the memory, automatically generated on insert
+         memory_version: The version of the memory, automatically updated when the score or value changes
+         created_at: When the memory was created, automatically generated on insert
+         updated_at: When the memory was last updated, automatically updated on update
+
+     ## Other Attributes:
+     * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
+     """
+
+     lookup_score: float
+     attention_weight: float | None
+
+     def __init__(
+         self,
+         memoryset_id: str,
+         memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup,
+     ):
+         # for internal use only, do not document
+         super().__init__(memoryset_id, memory_lookup)
+         self.lookup_score = memory_lookup["lookup_score"]
+         self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
+
+     def __repr__(self) -> str:
+         return (
+             "ScoredMemoryLookup({ "
+             + (f"score: {self.score:.2f}" if self.score is not None else "score: None")
+             + f", lookup_score: {self.lookup_score:.2f}"
+             + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
+             + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
+             + (f", partition_id: '{self.partition_id}'" if self.partition_id is not None else "")
+             + " })"
+         )
+
+
+ MemoryT = TypeVar("MemoryT", bound=MemoryBase)
+ MemoryLookupT = TypeVar("MemoryLookupT", bound=MemoryBase)
+
+
+ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
+     """
+     A handle to a collection of memories in the OrcaCloud
+
+     Attributes:
+         id: Unique identifier for the memoryset
+         name: Unique name of the memoryset
+         description: Description of the memoryset
+         length: Number of memories in the memoryset
+         embedding_model: Embedding model used to embed the memory values for semantic search
+         created_at: When the memoryset was created, automatically generated on create
+         updated_at: When the memoryset was last updated, automatically updated on updates
+     """
+
+     id: str
+     name: str
+     description: str | None
+     memory_type: MemoryType  # defined by subclasses
+
+     length: int
+     created_at: datetime
+     updated_at: datetime
+     insertion_status: Status | None
+     embedding_model: EmbeddingModelBase
+     index_type: IndexType
+     index_params: dict[str, Any]
+     hidden: bool
+
+     def __init__(self, metadata: MemorysetMetadata):
+         # for internal use only, do not document
+         if metadata["pretrained_embedding_model_name"]:
+             self.embedding_model = PretrainedEmbeddingModel._get(metadata["pretrained_embedding_model_name"])
+         elif metadata["finetuned_embedding_model_id"]:
+             self.embedding_model = FinetunedEmbeddingModel.open(metadata["finetuned_embedding_model_id"])
+         else:
+             raise ValueError("Either pretrained_embedding_model_name or finetuned_embedding_model_id must be provided")
+         self.id = metadata["id"]
+         self.name = metadata["name"]
+         self.description = metadata["description"]
+         self.length = metadata["length"]
+         self.created_at = datetime.fromisoformat(metadata["created_at"])
+         self.updated_at = datetime.fromisoformat(metadata["updated_at"])
+         self.insertion_status = (
+             Status(metadata["insertion_status"]) if metadata["insertion_status"] is not None else None
+         )
+         self._last_refresh = datetime.now()
+         self.index_type = metadata["index_type"]
+         self.index_params = metadata["index_params"]
+         self.memory_type = metadata["memory_type"]
+         self.hidden = metadata["hidden"]
+
+     def __eq__(self, other) -> bool:
+         return isinstance(other, MemorysetBase) and self.id == other.id
+
+     def __repr__(self) -> str:
+         return (
+             f"{self.memory_type.capitalize()}Memoryset(" + "{\n"
+             f" name: '{self.name}',\n"
+             f" length: {self.length},\n"
+             f" embedding_model: {self.embedding_model},\n"
+             "})"
+         )
+
+     @classmethod
+     def _handle_if_exists(
+         cls,
+         name: str,
+         *,
+         if_exists: CreateMode,
+         label_names: list[str] | None,
+         embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None,
+     ) -> Self | None:
+         """
+         Handle common `if_exists` logic shared by all creator-style helpers.
+
+         Returns the already-existing memoryset when `if_exists == "open"`, raises for `"error"`,
+         and returns `None` when the memoryset does not yet exist.
+         """
+         if not cls.exists(name):
+             return None
+         if if_exists == "error":
+             raise ValueError(f"Memoryset with name {name} already exists")
+
+         existing = cls.open(name)
+
+         if label_names is not None and hasattr(existing, "label_names"):
+             existing_label_names = getattr(existing, "label_names")
+             if label_names != existing_label_names:
+                 requested = ", ".join(label_names)
+                 existing_joined = ", ".join(existing_label_names)
+                 raise ValueError(
+                     f"Memoryset {name} already exists with label names [{existing_joined}] "
+                     f"(requested: [{requested}])."
+                 )
+
+         if embedding_model is not None and embedding_model != existing.embedding_model:
+             existing_model = existing.embedding_model
+             existing_model_name = getattr(existing_model, "name", getattr(existing_model, "path", str(existing_model)))
+             requested_name = getattr(embedding_model, "name", getattr(embedding_model, "path", str(embedding_model)))
+             raise ValueError(
+                 f"Memoryset {name} already exists with embedding_model {existing_model_name} "
+                 f"(requested: {requested_name})."
+             )
+
+         return existing
+
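+     # A sketch of the if_exists semantics implemented above; the memoryset name
+     # is hypothetical:
+     #
+     #     LabeledMemoryset.create("articles", if_exists="error")  # raises if it exists
+     #     LabeledMemoryset.create("articles", if_exists="open")   # opens the existing
+     #     # memoryset, raising if label_names or embedding_model differ from it
+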
+     @classmethod
+     def _create_from_datasource(
+         cls,
+         name: str,
+         *,
+         datasource: Datasource,
+         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+         value_column: str = "value",
+         label_column: str | None = None,
+         score_column: str | None = None,
+         source_id_column: str | None = None,
+         partition_id_column: str | None = None,
+         description: str | None = None,
+         label_names: list[str] | None = None,
+         max_seq_length_override: int | None = None,
+         prompt: str | None = None,
+         remove_duplicates: bool = True,
+         index_type: IndexType = "FLAT",
+         index_params: dict[str, Any] = {},
+         if_exists: CreateMode = "error",
+         background: bool = False,
+         hidden: bool = False,
+         subsample: int | float | None = None,
+         memory_type: MemoryType | None = None,
+     ) -> Self | Job[Self]:
+         """
+         Create a memoryset from a datasource by calling the API.
+
+         This is a private method that performs the actual API call to create a memoryset from a datasource.
+         """
+         if embedding_model is None:
+             embedding_model = PretrainedEmbeddingModel.GTE_BASE
+
+         existing = cls._handle_if_exists(
+             name,
+             if_exists=if_exists,
+             label_names=label_names,
+             embedding_model=embedding_model,
+         )
+         if existing is not None:
+             return existing
+
+         payload: CreateMemorysetFromDatasourceRequest = {
+             "name": name,
+             "description": description,
+             "datasource_name_or_id": datasource.id,
+             "datasource_label_column": label_column,
+             "datasource_score_column": score_column,
+             "datasource_value_column": value_column,
+             "datasource_source_id_column": source_id_column,
+             "datasource_partition_id_column": partition_id_column,
+             "label_names": label_names,
+             "max_seq_length_override": max_seq_length_override,
+             "remove_duplicates": remove_duplicates,
+             "index_type": index_type,
+             "index_params": index_params,
+             "hidden": hidden,
+         }
+         if memory_type is not None:
+             payload["memory_type"] = memory_type
+         if subsample is not None:
+             payload["subsample"] = subsample
+         if prompt is not None:
+             payload["prompt"] = prompt
+         if isinstance(embedding_model, PretrainedEmbeddingModel):
+             payload["pretrained_embedding_model_name"] = embedding_model.name
+         elif isinstance(embedding_model, FinetunedEmbeddingModel):
+             payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
+         else:
+             raise ValueError("Invalid embedding model")
+         client = OrcaClient._resolve_client()
+         response = client.POST("/memoryset", json=payload)
+
+         if response["insertion_job_id"] is None:
+             raise ValueError("Create memoryset operation failed to produce an insertion job")
+
+         job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
+         return job if background else job.result()
+
+     @overload
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         *,
+         datasource: None = None,
+         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+         description: str | None = None,
+         label_names: list[str] | None = None,
+         max_seq_length_override: int | None = None,
+         prompt: str | None = None,
+         index_type: IndexType = "FLAT",
+         index_params: dict[str, Any] = {},
+         if_exists: CreateMode = "error",
+         hidden: bool = False,
+         memory_type: MemoryType | None = None,
+     ) -> Self:
+         pass
+
+     @overload
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         *,
+         datasource: Datasource,
+         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+         value_column: str = "value",
+         label_column: str | None = None,
+         score_column: str | None = None,
+         source_id_column: str | None = None,
+         partition_id_column: str | None = None,
+         description: str | None = None,
+         label_names: list[str] | None = None,
+         max_seq_length_override: int | None = None,
+         prompt: str | None = None,
+         remove_duplicates: bool = True,
+         index_type: IndexType = "FLAT",
+         index_params: dict[str, Any] = {},
+         if_exists: CreateMode = "error",
+         background: Literal[True],
+         hidden: bool = False,
+         subsample: int | float | None = None,
+         memory_type: MemoryType | None = None,
+     ) -> Job[Self]:
+         pass
+
+     @overload
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         *,
+         datasource: Datasource,
+         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+         value_column: str = "value",
+         label_column: str | None = None,
+         score_column: str | None = None,
+         source_id_column: str | None = None,
+         partition_id_column: str | None = None,
+         description: str | None = None,
+         label_names: list[str] | None = None,
+         max_seq_length_override: int | None = None,
+         prompt: str | None = None,
+         remove_duplicates: bool = True,
+         index_type: IndexType = "FLAT",
+         index_params: dict[str, Any] = {},
+         if_exists: CreateMode = "error",
+         background: Literal[False] = False,
+         hidden: bool = False,
+         subsample: int | float | None = None,
+         memory_type: MemoryType | None = None,
+     ) -> Self:
+         pass
+
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         *,
+         datasource: Datasource | None = None,
+         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+         value_column: str = "value",
+         label_column: str | None = None,
+         score_column: str | None = None,
+         source_id_column: str | None = None,
+         partition_id_column: str | None = None,
+         description: str | None = None,
+         label_names: list[str] | None = None,
+         max_seq_length_override: int | None = None,
+         prompt: str | None = None,
+         remove_duplicates: bool = True,
+         index_type: IndexType = "FLAT",
+         index_params: dict[str, Any] = {},
+         if_exists: CreateMode = "error",
+         background: bool = False,
+         hidden: bool = False,
+         subsample: int | float | None = None,
+         memory_type: MemoryType | None = None,
+     ) -> Self | Job[Self]:
+         """
+         Create a new memoryset in the OrcaCloud
+
+         If `datasource` is provided, all columns from the datasource that are not specified in the
+         `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
+         as metadata in the memoryset.
+
+         If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
+         You can add memories later using the `insert` method.
+
+         Params:
+             name: Name for the new memoryset (must be unique)
+             datasource: Optional source data to populate the memories in the memoryset. If omitted,
+                 an empty memoryset will be created.
+             embedding_model: Embedding model to use for embedding memory values for semantic search.
+                 If not provided, a default embedding model for the memoryset will be used.
+             value_column: Name of the column in the datasource that contains the memory values
+             label_column: Name of the column in the datasource that contains the memory labels.
+                 Must contain categorical values as integers or strings. String labels will be
+                 converted to integers with the unique strings extracted as `label_names`
+             score_column: Name of the column in the datasource that contains the memory scores
+             source_id_column: Optional name of the column in the datasource that contains the ids in
+                 the system of reference
+             partition_id_column: Optional name of the column in the datasource that contains the partition ids
+             description: Optional description for the memoryset; this will be used in agentic flows,
+                 so keep it concise and describe the contents of your memoryset, not the
+                 datasource or the embedding model.
+             label_names: List of human-readable names for the labels in the memoryset, must match
+                 the number of labels in the `label_column`. Will be automatically inferred if string
+                 labels are provided or if a [Dataset][datasets.Dataset] with a
+                 [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
+             max_seq_length_override: Maximum sequence length of values in the memoryset; longer
+                 values will be truncated. Defaults to the model's max sequence length if not
+                 provided
+             prompt: Optional prompt to use when embedding documents/memories for storage
+             remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                 into the memoryset
+             index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                 values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+             index_params: Parameters for the vector index, defaults to `{}`
+             if_exists: What to do if a memoryset with the same name already exists, defaults to
+                 `"error"`. The other option is `"open"` to open the existing memoryset.
+             background: Whether to run the operation non-blocking and return a job handle.
+                 Note: This parameter is ignored when creating an empty memoryset (when datasource is None).
+             hidden: Whether the memoryset should be hidden
+             subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+                 datasource to insert. Use to limit the size of the initial memoryset.
+             memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
+                 and `"SCORED"` if `score_column` is provided, must be specified for other cases.
+
+         Returns:
+             Handle to the new memoryset in the OrcaCloud
+
+         Raises:
+             ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                 `"open"` and the params do not match those of the existing memoryset.
+         """
+         if datasource is None:
+             return cls._create_empty(
+                 name,
+                 embedding_model=embedding_model,
+                 description=description,
+                 label_names=label_names,
+                 max_seq_length_override=max_seq_length_override,
+                 prompt=prompt,
+                 index_type=index_type,
+                 index_params=index_params,
+                 if_exists=if_exists,
+                 hidden=hidden,
+                 memory_type=memory_type,
+             )
+         else:
+             return cls._create_from_datasource(
+                 name,
+                 datasource=datasource,
+                 embedding_model=embedding_model,
+                 value_column=value_column,
+                 label_column=label_column,
+                 score_column=score_column,
+                 source_id_column=source_id_column,
+                 partition_id_column=partition_id_column,
+                 description=description,
+                 label_names=label_names,
+                 max_seq_length_override=max_seq_length_override,
+                 prompt=prompt,
+                 remove_duplicates=remove_duplicates,
+                 index_type=index_type,
+                 index_params=index_params,
+                 if_exists=if_exists,
+                 background=background,
+                 hidden=hidden,
+                 subsample=subsample,
+                 memory_type=memory_type,
+             )
+
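+     # A minimal end-to-end sketch of create(); the datasource name and columns
+     # are hypothetical, and Datasource.open is assumed to resolve an existing
+     # datasource handle:
+     #
+     #     memoryset = LabeledMemoryset.create(
+     #         "ticket-memories",
+     #         datasource=Datasource.open("support-tickets"),
+     #         value_column="text",
+     #         label_column="category",
+     #         if_exists="open",
+     #     )
+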
1306
+ @overload
1307
+ @classmethod
1308
+ def from_datasource(
1309
+ cls,
1310
+ name: str,
1311
+ *,
1312
+ datasource: Datasource,
1313
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1314
+ value_column: str = "value",
1315
+ label_column: str | None = None,
1316
+ score_column: str | None = None,
1317
+ source_id_column: str | None = None,
1318
+ partition_id_column: str | None = None,
1319
+ description: str | None = None,
1320
+ label_names: list[str] | None = None,
1321
+ max_seq_length_override: int | None = None,
1322
+ prompt: str | None = None,
1323
+ remove_duplicates: bool = True,
1324
+ index_type: IndexType = "FLAT",
1325
+ index_params: dict[str, Any] = {},
1326
+ if_exists: CreateMode = "error",
1327
+ background: Literal[True],
1328
+ hidden: bool = False,
1329
+ subsample: int | float | None = None,
1330
+ memory_type: MemoryType | None = None,
1331
+ ) -> Job[Self]:
1332
+ pass
1333
+
1334
+ @overload
1335
+ @classmethod
1336
+ def from_datasource(
1337
+ cls,
1338
+ name: str,
1339
+ *,
1340
+ datasource: Datasource,
1341
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1342
+ value_column: str = "value",
1343
+ label_column: str | None = None,
1344
+ score_column: str | None = None,
1345
+ source_id_column: str | None = None,
1346
+ partition_id_column: str | None = None,
1347
+ description: str | None = None,
1348
+ label_names: list[str] | None = None,
1349
+ max_seq_length_override: int | None = None,
1350
+ prompt: str | None = None,
1351
+ remove_duplicates: bool = True,
1352
+ index_type: IndexType = "FLAT",
1353
+ index_params: dict[str, Any] = {},
1354
+ if_exists: CreateMode = "error",
1355
+ background: Literal[False] = False,
1356
+ hidden: bool = False,
1357
+ subsample: int | float | None = None,
1358
+ memory_type: MemoryType | None = None,
1359
+ ) -> Self:
1360
+ pass
1361
+
1362
+ @classmethod
1363
+ def from_datasource(
1364
+ cls,
1365
+ name: str,
1366
+ *,
1367
+ datasource: Datasource,
1368
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1369
+ value_column: str = "value",
1370
+ label_column: str | None = None,
1371
+ score_column: str | None = None,
1372
+ source_id_column: str | None = None,
1373
+ partition_id_column: str | None = None,
1374
+ description: str | None = None,
1375
+ label_names: list[str] | None = None,
1376
+ max_seq_length_override: int | None = None,
1377
+ prompt: str | None = None,
1378
+ remove_duplicates: bool = True,
1379
+ index_type: IndexType = "FLAT",
1380
+ index_params: dict[str, Any] = {},
1381
+ if_exists: CreateMode = "error",
1382
+ background: bool = False,
1383
+ hidden: bool = False,
1384
+ subsample: int | float | None = None,
1385
+ memory_type: MemoryType | None = None,
1386
+ ) -> Self | Job[Self]:
1387
+ """
1388
+ Create a new memoryset in the OrcaCloud from a datasource.
1389
+
1390
+ This is a convenience method that is equivalent to calling `create` with a datasource.
1391
+ All columns from the datasource that are not specified in the `value_column`,
1392
+ `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
1393
+ in the memoryset.
1394
+
1395
+ Params:
1396
+ name: Name for the new memoryset (must be unique)
1397
+ datasource: Source data to populate the memories in the memoryset.
1398
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
1399
+ If not provided, a default embedding model for the memoryset will be used.
1400
+ value_column: Name of the column in the datasource that contains the memory values
1401
+ label_column: Name of the column in the datasource that contains the memory labels.
1402
+ Must contain categorical values as integers or strings. String labels will be
1403
+ converted to integers with the unique strings extracted as `label_names`
1404
+ score_column: Name of the column in the datasource that contains the memory scores
1405
+ source_id_column: Optional name of the column in the datasource that contains the ids in
1406
+ the system of reference
1407
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
1408
+ description: Optional description for the memoryset, this will be used in agentic flows,
1409
+ so make sure it is concise and describes the contents of your memoryset not the
1410
+ datasource or the embedding model.
1411
+ label_names: List of human-readable names for the labels in the memoryset, must match
1412
+ the number of labels in the `label_column`. Will be automatically inferred if string
1413
+ labels are provided or if a [Dataset][datasets.Dataset] with a
1414
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
1415
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
1416
+ value is longer than this it will be truncated, will default to the model's max
1417
+ sequence length if not provided
1418
+ prompt: Optional prompt to use when embedding documents/memories for storage
1419
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
1420
+ into the memoryset
1421
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
1422
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
1423
+ index_params: Parameters for the vector index, defaults to `{}`
1424
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
1425
+ `"error"`. Other option is `"open"` to open the existing memoryset.
1426
+ background: Whether to run the operation none blocking and return a job handle.
1427
+ hidden: Whether the memoryset should be hidden
1428
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
1429
+ datasource to insert. Use to limit the size of the initial memoryset.
1430
+ memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
1431
+ and `"SCORED"` if `score_column` is provided, must be specified for other cases.
1432
+ Returns:
1433
+ Handle to the new memoryset in the OrcaCloud
1434
+
1435
+ Raises:
1436
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
1437
+ `"open"` and the params do not match those of the existing memoryset.
1438
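+
+ Examples:
+ Illustrative sketch, assuming a datasource with `value` and `label` columns already exists:
+ >>> datasource = Datasource.from_list("my_datasource", [
+ ...     {"value": "I am happy", "label": 1},
+ ...     {"value": "I am sad", "label": 0},
+ ... ])
+ >>> memoryset = LabeledMemoryset.from_datasource(
+ ...     "my_memoryset", datasource=datasource, label_column="label"
+ ... )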
+ """
1439
+ return cls._create_from_datasource(
1440
+ name,
1441
+ datasource=datasource,
1442
+ embedding_model=embedding_model,
1443
+ value_column=value_column,
1444
+ label_column=label_column,
1445
+ score_column=score_column,
1446
+ source_id_column=source_id_column,
1447
+ partition_id_column=partition_id_column,
1448
+ description=description,
1449
+ label_names=label_names,
1450
+ max_seq_length_override=max_seq_length_override,
1451
+ prompt=prompt,
1452
+ remove_duplicates=remove_duplicates,
1453
+ index_type=index_type,
1454
+ index_params=index_params,
1455
+ if_exists=if_exists,
1456
+ background=background,
1457
+ hidden=hidden,
1458
+ subsample=subsample,
1459
+ memory_type=memory_type,
1460
+ )
1461
+
1462
+ @classmethod
1463
+ def _create_empty(
1464
+ cls,
1465
+ name: str,
1466
+ *,
1467
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1468
+ description: str | None = None,
1469
+ label_names: list[str] | None = None,
1470
+ max_seq_length_override: int | None = None,
1471
+ prompt: str | None = None,
1472
+ index_type: IndexType = "FLAT",
1473
+ index_params: dict[str, Any] = {},
1474
+ if_exists: CreateMode = "error",
1475
+ hidden: bool = False,
1476
+ memory_type: MemoryType | None = None,
1477
+ ) -> Self:
1478
+ """
1479
+ Create an empty memoryset in the OrcaCloud
1480
+
1481
+ This creates a memoryset with no initial memories. You can add memories later using
1482
+ the `insert` method.
1483
+
1484
+ Params:
1485
+ name: Name for the new memoryset (must be unique)
1486
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
1487
+ If not provided, a default embedding model for the memoryset will be used.
1488
+ description: Optional description for the memoryset, this will be used in agentic flows,
1489
+ so make sure it is concise and describes the contents of your memoryset not the
1490
+ datasource or the embedding model.
1491
+ label_names: List of human-readable names for the labels in the memoryset
1492
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
1493
+ value is longer than this it will be truncated, will default to the model's max
1494
+ sequence length if not provided
1495
+ prompt: Optional prompt to use when embedding documents/memories for storage
1496
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
1497
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
1498
+ index_params: Parameters for the vector index, defaults to `{}`
1499
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
1500
+ `"error"`. Other option is `"open"` to open the existing memoryset.
1501
+ hidden: Whether the memoryset should be hidden
1502
+ memory_type: Type of memoryset to create, defaults to `"LABELED"` if called from
1503
+ `LabeledMemoryset` and `"SCORED"` if called from `ScoredMemoryset`.
1504
+
1505
+ Returns:
1506
+ Handle to the new memoryset in the OrcaCloud
1507
+
1508
+ Raises:
1509
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
1510
+ `"open"` and the params do not match those of the existing memoryset.
1511
+ """
1512
+ if embedding_model is None:
1513
+ embedding_model = PretrainedEmbeddingModel.GTE_BASE
1514
+
1515
+ existing = cls._handle_if_exists(
1516
+ name,
1517
+ if_exists=if_exists,
1518
+ label_names=label_names,
1519
+ embedding_model=embedding_model,
1520
+ )
1521
+ if existing is not None:
1522
+ return existing
1523
+
1524
+ payload: CreateMemorysetRequest = {
1525
+ "name": name,
1526
+ "description": description,
1527
+ "label_names": label_names,
1528
+ "max_seq_length_override": max_seq_length_override,
1529
+ "index_type": index_type,
1530
+ "index_params": index_params,
1531
+ "hidden": hidden,
1532
+ }
1533
+ if memory_type is not None:
1534
+ payload["memory_type"] = memory_type
1535
+ if prompt is not None:
1536
+ payload["prompt"] = prompt
1537
+ if isinstance(embedding_model, PretrainedEmbeddingModel):
1538
+ payload["pretrained_embedding_model_name"] = embedding_model.name
1539
+ elif isinstance(embedding_model, FinetunedEmbeddingModel):
1540
+ payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
1541
+ else:
1542
+ raise ValueError("Invalid embedding model")
1543
+
1544
+ client = OrcaClient._resolve_client()
1545
+ response = client.POST("/memoryset/empty", json=payload)
1546
+ return cls.open(response["id"])
1547
+
1548
+ @overload
1549
+ @classmethod
1550
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Self:
1551
+ pass
1552
+
1553
+ @overload
1554
+ @classmethod
1555
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
1556
+ pass
1557
+
1558
+ @classmethod
1559
+ def from_hf_dataset(
1560
+ cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
1561
+ ) -> Self | Job[Self]:
1562
+ """
1563
+ Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
1564
+
1565
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1566
+ appended with `_datasource` and use that as the datasource for the memoryset.
1567
+
1568
+ All features that are not specified to be used as `value_column`, `label_column`, or
1569
+ `source_id_column` will be stored as metadata in the memoryset.
1570
+
1571
+ Params:
1572
+ name: Name for the new memoryset (must be unique)
1573
+ hf_dataset: Hugging Face dataset to create the memoryset from
1574
+ kwargs: Additional parameters for creating the memoryset. See
1575
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1576
+
1577
+ Returns:
1578
+ Handle to the new memoryset in the OrcaCloud
1579
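+
+ Examples:
+ Illustrative sketch; any [`Dataset`][datasets.Dataset] with value and label columns works:
+ >>> from datasets import Dataset
+ >>> hf_dataset = Dataset.from_dict({"value": ["I am happy", "I am sad"], "label": [1, 0]})
+ >>> memoryset = LabeledMemoryset.from_hf_dataset("my_memoryset", hf_dataset, label_column="label")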
+ """
1580
+ if_exists = kwargs.get("if_exists", "error")
1581
+ existing = cls._handle_if_exists(
1582
+ name,
1583
+ if_exists=if_exists,
1584
+ label_names=kwargs.get("label_names"),
1585
+ embedding_model=kwargs.get("embedding_model"),
1586
+ )
1587
+ if existing is not None:
1588
+ return existing
1589
+
1590
+ datasource = Datasource.from_hf_dataset(
1591
+ f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
1592
+ )
1593
+ kwargs["background"] = background
1594
+ return cls.create(name, datasource=datasource, **kwargs)
1595
+
1596
+ @overload
1597
+ @classmethod
1598
+ def from_pytorch(
1599
+ cls,
1600
+ name: str,
1601
+ torch_data: TorchDataLoader | TorchDataset,
1602
+ *,
1603
+ column_names: list[str] | None = None,
1604
+ background: Literal[True],
1605
+ **kwargs: Any,
1606
+ ) -> Job[Self]:
1607
+ pass
1608
+
1609
+ @overload
1610
+ @classmethod
1611
+ def from_pytorch(
1612
+ cls,
1613
+ name: str,
1614
+ torch_data: TorchDataLoader | TorchDataset,
1615
+ *,
1616
+ column_names: list[str] | None = None,
1617
+ background: Literal[False] = False,
1618
+ **kwargs: Any,
1619
+ ) -> Self:
1620
+ pass
1621
+
1622
+ @classmethod
1623
+ def from_pytorch(
1624
+ cls,
1625
+ name: str,
1626
+ torch_data: TorchDataLoader | TorchDataset,
1627
+ *,
1628
+ column_names: list[str] | None = None,
1629
+ background: bool = False,
1630
+ **kwargs: Any,
1631
+ ) -> Self | Job[Self]:
1632
+ """
1633
+ Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
1634
+ [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
1635
+
1636
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1637
+ appended with `_datasource` and use that as the datasource for the memoryset.
1638
+
1639
+ All properties that are not specified to be used as `value_column`, `label_column`, or
1640
+ `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1641
+
1642
+ Params:
1643
+ name: Name for the new memoryset (must be unique)
1644
+ torch_data: PyTorch data loader or dataset to create the memoryset from
1645
+ column_names: If the provided dataset or data loader returns unnamed tuples, this
1646
+ argument must be provided to specify the names of the columns.
1647
+ background: Whether to run the operation in the background
1648
+ kwargs: Additional parameters for creating the memoryset. See
1649
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1650
+
1651
+ Returns:
1652
+ Handle to the new memoryset in the OrcaCloud
1653
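+
+ Examples:
+ Illustrative sketch; a dataset that yields unnamed `(value, label)` tuples needs `column_names`:
+ >>> from torch.utils.data import Dataset as TorchDataset
+ >>> class PairDataset(TorchDataset):
+ ...     def __init__(self, pairs): self.pairs = pairs
+ ...     def __len__(self): return len(self.pairs)
+ ...     def __getitem__(self, i): return self.pairs[i]
+ >>> memoryset = LabeledMemoryset.from_pytorch(
+ ...     "my_memoryset", PairDataset([("I am happy", 1), ("I am sad", 0)]),
+ ...     column_names=["value", "label"], label_column="label",
+ ... )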
+ """
1654
+ if_exists = kwargs.get("if_exists", "error")
1655
+ existing = cls._handle_if_exists(
1656
+ name,
1657
+ if_exists=if_exists,
1658
+ label_names=kwargs.get("label_names"),
1659
+ embedding_model=kwargs.get("embedding_model"),
1660
+ )
1661
+ if existing is not None:
1662
+ return existing
1663
+
1664
+ datasource = Datasource.from_pytorch(
1665
+ f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
1666
+ )
1667
+ kwargs["background"] = background
1668
+ return cls.create(name, datasource=datasource, **kwargs)
1669
+
1670
+ @overload
1671
+ @classmethod
1672
+ def from_list(
1673
+ cls,
1674
+ name: str,
1675
+ data: list[dict],
1676
+ *,
1677
+ background: Literal[True],
1678
+ **kwargs: Any,
1679
+ ) -> Job[Self]:
1680
+ pass
1681
+
1682
+ @overload
1683
+ @classmethod
1684
+ def from_list(
1685
+ cls,
1686
+ name: str,
1687
+ data: list[dict],
1688
+ *,
1689
+ background: Literal[False] = False,
1690
+ **kwargs: Any,
1691
+ ) -> Self:
1692
+ pass
1693
+
1694
+ @classmethod
1695
+ def from_list(
1696
+ cls,
1697
+ name: str,
1698
+ data: list[dict],
1699
+ *,
1700
+ background: bool = False,
1701
+ **kwargs: Any,
1702
+ ) -> Self | Job[Self]:
1703
+ """
1704
+ Create a new memoryset from a list of dictionaries in the OrcaCloud
1705
+
1706
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1707
+ appended with `_datasource` and use that as the datasource for the memoryset.
1708
+
1709
+ All properties that are not specified to be used as `value_column`, `label_column`, or
1710
+ `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1711
+
1712
+ Params:
1713
+ name: Name for the new memoryset (must be unique)
1714
+ data: List of dictionaries to create the memoryset from
1715
+ background: Whether to run the operation in the background
1716
+ kwargs: Additional parameters for creating the memoryset. See
1717
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1718
+
1719
+ Returns:
1720
+ Handle to the new memoryset in the OrcaCloud
1721
+
1722
+ Examples:
1723
+ >>> LabeledMemoryset.from_list("my_memoryset", [
1724
+ ... {"value": "hello", "label": 0, "tag": "tag1"},
1725
+ ... {"value": "world", "label": 1, "tag": "tag2"},
1726
+ ... ])
1727
+ """
1728
+ if_exists = kwargs.get("if_exists", "error")
1729
+ existing = cls._handle_if_exists(
1730
+ name,
1731
+ if_exists=if_exists,
1732
+ label_names=kwargs.get("label_names"),
1733
+ embedding_model=kwargs.get("embedding_model"),
1734
+ )
1735
+ if existing is not None:
1736
+ return existing
1737
+
1738
+ datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
1739
+ kwargs["background"] = background
1740
+ return cls.create(name, datasource=datasource, **kwargs)
1741
+
1742
+ @overload
1743
+ @classmethod
1744
+ def from_dict(
1745
+ cls,
1746
+ name: str,
1747
+ data: dict,
1748
+ *,
1749
+ background: Literal[True],
1750
+ **kwargs: Any,
1751
+ ) -> Job[Self]:
1752
+ pass
1753
+
1754
+ @overload
1755
+ @classmethod
1756
+ def from_dict(
1757
+ cls,
1758
+ name: str,
1759
+ data: dict,
1760
+ *,
1761
+ background: Literal[False] = False,
1762
+ **kwargs: Any,
1763
+ ) -> Self:
1764
+ pass
1765
+
1766
+ @classmethod
1767
+ def from_dict(
1768
+ cls,
1769
+ name: str,
1770
+ data: dict,
1771
+ *,
1772
+ background: bool = False,
1773
+ **kwargs: Any,
1774
+ ) -> Self | Job[Self]:
1775
+ """
1776
+ Create a new memoryset from a dictionary of columns in the OrcaCloud
1777
+
1778
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1779
+ appended with `_datasource` and use that as the datasource for the memoryset.
1780
+
1781
+ All columns from the datasource that are not specified in the `value_column`,
1782
+ `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1783
+
1784
+ Params:
1785
+ name: Name for the new memoryset (must be unique)
1786
+ data: Dictionary of columns to create the memoryset from
1787
+ background: Whether to run the operation in the background
1788
+ kwargs: Additional parameters for creating the memoryset. See
1789
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1790
+
1791
+ Returns:
1792
+ Handle to the new memoryset in the OrcaCloud
1793
+
1794
+ Examples:
1795
+ >>> LabeledMemoryset.from_dict("my_memoryset", {
1796
+ ... "value": ["hello", "world"],
1797
+ ... "label": [0, 1],
1798
+ ... "tag": ["tag1", "tag2"],
1799
+ ... })
1800
+ """
1801
+ if_exists = kwargs.get("if_exists", "error")
1802
+ existing = cls._handle_if_exists(
1803
+ name,
1804
+ if_exists=if_exists,
1805
+ label_names=kwargs.get("label_names"),
1806
+ embedding_model=kwargs.get("embedding_model"),
1807
+ )
1808
+ if existing is not None:
1809
+ return existing
1810
+
1811
+ datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
1812
+ kwargs["background"] = background
1813
+ return cls.create(name, datasource=datasource, **kwargs)
1814
+
1815
+ @overload
1816
+ @classmethod
1817
+ def from_pandas(
1818
+ cls,
1819
+ name: str,
1820
+ dataframe: pd.DataFrame,
1821
+ *,
1822
+ background: Literal[True],
1823
+ **kwargs: Any,
1824
+ ) -> Job[Self]:
1825
+ pass
1826
+
1827
+ @overload
1828
+ @classmethod
1829
+ def from_pandas(
1830
+ cls,
1831
+ name: str,
1832
+ dataframe: pd.DataFrame,
1833
+ *,
1834
+ background: Literal[False] = False,
1835
+ **kwargs: Any,
1836
+ ) -> Self:
1837
+ pass
1838
+
1839
+ @classmethod
1840
+ def from_pandas(
1841
+ cls,
1842
+ name: str,
1843
+ dataframe: pd.DataFrame,
1844
+ *,
1845
+ background: bool = False,
1846
+ **kwargs: Any,
1847
+ ) -> Self | Job[Self]:
1848
+ """
1849
+ Create a new memoryset from a pandas [`DataFrame`][pandas.DataFrame] in the OrcaCloud
1850
+
1851
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1852
+ appended with `_datasource` and use that as the datasource for the memoryset.
1853
+
1854
+ All columns that are not specified to be used as `value_column`, `label_column`, or
1855
+ `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1856
+
1857
+ Params:
1858
+ name: Name for the new memoryset (must be unique)
1859
+ dataframe: Dataframe to create the memoryset from
1860
+ background: Whether to run the operation in the background
1861
+ kwargs: Additional parameters for creating the memoryset. See
1862
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1863
+
1864
+ Returns:
1865
+ Handle to the new memoryset in the OrcaCloud
1866
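+
+ Examples:
+ Illustrative sketch; any DataFrame with value and label columns works:
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({"value": ["I am happy", "I am sad"], "label": [1, 0]})
+ >>> memoryset = LabeledMemoryset.from_pandas("my_memoryset", df, label_column="label")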
+ """
1867
+ if_exists = kwargs.get("if_exists", "error")
1868
+ existing = cls._handle_if_exists(
1869
+ name,
1870
+ if_exists=if_exists,
1871
+ label_names=kwargs.get("label_names"),
1872
+ embedding_model=kwargs.get("embedding_model"),
1873
+ )
1874
+ if existing is not None:
1875
+ return existing
1876
+
1877
+ datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
1878
+ kwargs["background"] = background
1879
+ return cls.create(name, datasource=datasource, **kwargs)
1880
+
1881
+ @overload
1882
+ @classmethod
1883
+ def from_arrow(
1884
+ cls,
1885
+ name: str,
1886
+ pyarrow_table: pa.Table,
1887
+ *,
1888
+ background: Literal[True],
1889
+ **kwargs: Any,
1890
+ ) -> Job[Self]:
1891
+ pass
1892
+
1893
+ @overload
1894
+ @classmethod
1895
+ def from_arrow(
1896
+ cls,
1897
+ name: str,
1898
+ pyarrow_table: pa.Table,
1899
+ *,
1900
+ background: Literal[False] = False,
1901
+ **kwargs: Any,
1902
+ ) -> Self:
1903
+ pass
1904
+
1905
+ @classmethod
1906
+ def from_arrow(
1907
+ cls,
1908
+ name: str,
1909
+ pyarrow_table: pa.Table,
1910
+ *,
1911
+ background: bool = False,
1912
+ **kwargs: Any,
1913
+ ) -> Self | Job[Self]:
1914
+ """
1915
+ Create a new memoryset from a PyArrow [`Table`][pyarrow.Table] in the OrcaCloud
1916
+
1917
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1918
+ appended with `_datasource` and use that as the datasource for the memoryset.
1919
+
1920
+ All columns that are not specified to be used as `value_column`, `label_column`, or
1921
+ `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1922
+
1923
+ Params:
1924
+ name: Name for the new memoryset (must be unique)
1925
+ pyarrow_table: PyArrow table to create the memoryset from
1926
+ background: Whether to run the operation in the background
1927
+ kwargs: Additional parameters for creating the memoryset. See
1928
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1929
+
1930
+ Returns:
1931
+ Handle to the new memoryset in the OrcaCloud
1932
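+
+ Examples:
+ Illustrative sketch; mirrors the pandas example with a PyArrow table:
+ >>> import pyarrow as pa
+ >>> table = pa.table({"value": ["I am happy", "I am sad"], "label": [1, 0]})
+ >>> memoryset = LabeledMemoryset.from_arrow("my_memoryset", table, label_column="label")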
+ """
1933
+ if_exists = kwargs.get("if_exists", "error")
1934
+ existing = cls._handle_if_exists(
1935
+ name,
1936
+ if_exists=if_exists,
1937
+ label_names=kwargs.get("label_names"),
1938
+ embedding_model=kwargs.get("embedding_model"),
1939
+ )
1940
+ if existing is not None:
1941
+ return existing
1942
+
1943
+ datasource = Datasource.from_arrow(
1944
+ f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
1945
+ )
1946
+ kwargs["background"] = background
1947
+ return cls.create(name, datasource=datasource, **kwargs)
1948
+
1949
+ @overload
1950
+ @classmethod
1951
+ def from_disk(
1952
+ cls,
1953
+ name: str,
1954
+ file_path: str | PathLike,
1955
+ *,
1956
+ background: Literal[True],
1957
+ **kwargs: Any,
1958
+ ) -> Job[Self]:
1959
+ pass
1960
+
1961
+ @overload
1962
+ @classmethod
1963
+ def from_disk(
1964
+ cls,
1965
+ name: str,
1966
+ file_path: str | PathLike,
1967
+ *,
1968
+ background: Literal[False] = False,
1969
+ **kwargs: Any,
1970
+ ) -> Self:
1971
+ pass
1972
+
1973
+ @classmethod
1974
+ def from_disk(
1975
+ cls,
1976
+ name: str,
1977
+ file_path: str | PathLike,
1978
+ *,
1979
+ background: bool = False,
1980
+ **kwargs: Any,
1981
+ ) -> Self | Job[Self]:
1982
+ """
1983
+ Create a new memoryset from a file on disk in the OrcaCloud
1984
+
1985
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1986
+ appended with `_datasource` and use that as the datasource for the memoryset.
1987
+
1988
+ All columns from the datasource that are not specified in the `value_column`,
1989
+ `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1990
+
1991
+ Params:
1992
+ name: Name for the new memoryset (must be unique)
1993
+ file_path: Path to the file on disk to create the memoryset from. The file type will
1994
+ be inferred from the file extension. The following file types are supported:
1995
+
1996
+ - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
1997
+ - .json/.jsonl: [`JSON`][json] and [`JSON`] Lines files
1998
+ - .csv: [`CSV`][csv] files
1999
+ - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
2000
+ - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
2001
+ background: Whether to run the operation in the background
2002
+ kwargs: Additional parameters for creating the memoryset. See
2003
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
2004
+
2005
+ Returns:
2006
+ Handle to the new memoryset in the OrcaCloud
2007
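+
+ Examples:
+ Illustrative sketch; the file path is hypothetical and must contain the expected columns:
+ >>> memoryset = LabeledMemoryset.from_disk("my_memoryset", "./data/memories.csv", label_column="label")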
+ """
2008
+ if_exists = kwargs.get("if_exists", "error")
2009
+ existing = cls._handle_if_exists(
2010
+ name,
2011
+ if_exists=if_exists,
2012
+ label_names=kwargs.get("label_names"),
2013
+ embedding_model=kwargs.get("embedding_model"),
2014
+ )
2015
+ if existing is not None:
2016
+ return existing
2017
+
2018
+ datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
2019
+ kwargs["background"] = background
2020
+ return cls.create(name, datasource=datasource, **kwargs)
2021
+
2022
+ @classmethod
2023
+ def open(cls, name: str) -> Self:
2024
+ """
2025
+ Get a handle to a memoryset in the OrcaCloud
2026
+
2027
+ Params:
2028
+ name: Name or unique identifier of the memoryset
2029
+
2030
+ Returns:
2031
+ Handle to the existing memoryset in the OrcaCloud
2032
+
2033
+ Raises:
2034
+ LookupError: If the memoryset does not exist
2035
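+
+ Examples:
+ >>> memoryset = LabeledMemoryset.open("my_memoryset")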
+ """
2036
+ client = OrcaClient._resolve_client()
2037
+ metadata = client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
2038
+ return cls(metadata)
2039
+
2040
+ @classmethod
2041
+ async def aopen(cls, name: str) -> Self:
2042
+ """
2043
+ Asynchronously get a handle to a memoryset in the OrcaCloud
2044
+
2045
+ Params:
2046
+ name: Name or unique identifier of the memoryset
2047
+
2048
+ Returns:
2049
+ Handle to the existing memoryset in the OrcaCloud
2050
+
2051
+ Raises:
2052
+ LookupError: If the memoryset does not exist
2053
+ """
2054
+ client = OrcaAsyncClient._resolve_client()
2055
+ metadata = await client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
2056
+ return cls(metadata)
2057
+
2058
+ @classmethod
2059
+ def exists(cls, name_or_id: str) -> bool:
2060
+ """
2061
+ Check if a memoryset exists in the OrcaCloud
2062
+
2063
+ Params:
2064
+ name_or_id: Name or id of the memoryset
2065
+
2066
+ Returns:
2067
+ True if the memoryset exists, False otherwise
2068
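+
+ Examples:
+ >>> LabeledMemoryset.exists("my_memoryset")
+ True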
+ """
2069
+ try:
2070
+ cls.open(name_or_id)
2071
+ return True
2072
+ except LookupError:
2073
+ return False
2074
+
2075
+ @classmethod
2076
+ def all(cls, show_hidden: bool = False) -> list[Self]:
2077
+ """
2078
+ Get a list of handles to all memorysets in the OrcaCloud
2079
+
2080
+ Params:
2081
+ show_hidden: Whether to include hidden memorysets in results, defaults to `False`
2082
+
2083
+ Returns:
2084
+ List of handles to all memorysets in the OrcaCloud
2085
+ """
2086
+ client = OrcaClient._resolve_client()
2087
+ return [
2088
+ cls(metadata)
2089
+ for metadata in client.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
2090
+ ]
2091
+
2092
+ @classmethod
2093
+ def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
2094
+ """
2095
+ Delete a memoryset from the OrcaCloud
2096
+
2097
+ Params:
2098
+ name_or_id: Name or id of the memoryset
2099
+ if_not_exists: What to do if the memoryset does not exist, defaults to `"error"`.
2100
+ Other options are `"ignore"` to do nothing if the memoryset does not exist.
2101
+
2102
+ Raises:
2103
+ LookupError: If the memoryset does not exist and if_not_exists is `"error"`
2104
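+
+ Examples:
+ >>> LabeledMemoryset.drop("my_memoryset", if_not_exists="ignore")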
+ """
2105
+ try:
2106
+ client = OrcaClient._resolve_client()
2107
+ client.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
2108
+ logging.info(f"Deleted memoryset {name_or_id}")
2109
+ except LookupError:
2110
+ if if_not_exists == "error":
2111
+ raise
2112
+
2113
+ def set(
2114
+ self,
2115
+ *,
2116
+ name: str = UNSET,
2117
+ description: str | None = UNSET,
2118
+ label_names: list[str] = UNSET,
2119
+ hidden: bool = UNSET,
2120
+ ):
2121
+ """
2122
+ Update editable attributes of the memoryset
2123
+
2124
+ Note:
2125
+ If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
2126
+
2127
+ Params:
2128
+ description: Value to set for the description
2129
+ name: Value to set for the name
2130
+ label_names: Value to replace existing label names with
2131
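+
+ Examples:
+ Illustrative sketch; only the provided fields are updated:
+ >>> memoryset.set(description="Sentiment analysis snippets", hidden=True)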
+ """
2132
+ payload: MemorysetUpdate = {}
2133
+ if name is not UNSET:
2134
+ payload["name"] = name
2135
+ if description is not UNSET:
2136
+ payload["description"] = description
2137
+ if label_names is not UNSET:
2138
+ payload["label_names"] = label_names
2139
+ if hidden is not UNSET:
2140
+ payload["hidden"] = hidden
2141
+
2142
+ client = OrcaClient._resolve_client()
2143
+ client.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
2144
+ self.refresh()
2145
+
2146
+ @overload
2147
+ def clone(
2148
+ self,
2149
+ name: str,
2150
+ *,
2151
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
2152
+ max_seq_length_override: int | None = None,
2153
+ prompt: str | None = None,
2154
+ if_exists: CreateMode = "error",
2155
+ background: Literal[True],
2156
+ ) -> Job[Self]:
2157
+ pass
2158
+
2159
+ @overload
2160
+ def clone(
2161
+ self,
2162
+ name: str,
2163
+ *,
2164
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
2165
+ max_seq_length_override: int | None = None,
2166
+ prompt: str | None = None,
2167
+ if_exists: CreateMode = "error",
2168
+ background: Literal[False] = False,
2169
+ ) -> Self:
2170
+ pass
2171
+
2172
+ def clone(
2173
+ self,
2174
+ name: str,
2175
+ *,
2176
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
2177
+ max_seq_length_override: int | None = UNSET,
2178
+ prompt: str | None = None,
2179
+ if_exists: CreateMode = "error",
2180
+ background: bool = False,
2181
+ ) -> Self | Job[Self]:
2182
+ """
2183
+ Create a clone of the memoryset with a new name
2184
+
2185
+ Params:
2186
+ name: Name for the new memoryset (must be unique)
2187
+ embedding_model: Optional new embedding model to use for re-embedding the memory values
2188
+ value is longer than this it will be truncated, will default to the model's max
2189
+ sequence length if not provided
2190
+ max_seq_length_override: Optional custom max sequence length to use for the cloned memoryset.
2191
+ If not provided, will use the source memoryset's max sequence length.
2192
+ prompt: Optional custom prompt to use for the cloned memoryset.
2193
+ If not provided, will use the source memoryset's prompt.
2194
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
2195
+ `"error"`. Other option is `"open"` to open the existing memoryset.
2196
+
2197
+ Returns:
2198
+ Handle to the cloned memoryset in the OrcaCloud
2199
+
2200
+ Examples:
2201
+ >>> memoryset = LabeledMemoryset.open("my_memoryset")
2202
+ >>> finetuned_embedding_model = PretrainedEmbeddingModel.GTE_BASE.finetune(
2203
+ ... "gte_base_finetuned", my_memoryset
2204
+ ... )
2205
+ >>> new_memoryset = memoryset.clone(
2206
+ ... "my_memoryset_finetuned", embedding_model=finetuned_embedding_model,
2207
+ ... )
2208
+
2209
+ >>> # Clone with custom prompts
2210
+ >>> new_memoryset = memoryset.clone(
2211
+ ... "my_memoryset_with_prompts",
2212
+ ... document_prompt_override="Represent this document for retrieval:",
2213
+ ... query_prompt_override="Represent this query for retrieval:",
2214
+ ... )
2215
+ """
2216
+ if self.exists(name):
2217
+ if if_exists == "error":
2218
+ raise ValueError(f"Memoryset with name {name} already exists")
2219
+ elif if_exists == "open":
2220
+ existing = self.open(name)
2221
+ for attribute in {"embedding_model"}:
2222
+ if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
2223
+ raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
2224
+ return existing
2225
+ payload: CloneMemorysetRequest = {"name": name}
2226
+ if max_seq_length_override is not UNSET:
2227
+ payload["max_seq_length_override"] = max_seq_length_override
2228
+ if prompt is not None:
2229
+ payload["prompt"] = prompt
2230
+ if isinstance(embedding_model, PretrainedEmbeddingModel):
2231
+ payload["pretrained_embedding_model_name"] = embedding_model.name
2232
+ elif isinstance(embedding_model, FinetunedEmbeddingModel):
2233
+ payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
2234
+
2235
+ client = OrcaClient._resolve_client()
2236
+ metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
2237
+
2238
+ if metadata["insertion_job_id"] is None:
2239
+ raise ValueError("Create memoryset operation failed to produce an insertion job")
2240
+
2241
+ job = Job(
2242
+ metadata["insertion_job_id"],
2243
+ lambda: self.open(metadata["id"]),
2244
+ )
2245
+ return job if background else job.result()
2246
+
+ def refresh(self, throttle: float = 0):
+ """
+ Refresh the information about the memoryset from the OrcaCloud
+
+ Params:
+ throttle: Minimum time in seconds between refreshes
+ """
+ current_time = datetime.now()
+ # Skip refresh if last refresh was too recent
+ if (current_time - self._last_refresh) < timedelta(seconds=throttle):
+ return
+
+ self.__dict__.update(self.open(self.id).__dict__)
+ self._last_refresh = current_time
+
+ def __len__(self) -> int:
+ """Get the number of memories in the memoryset"""
+ self.refresh(throttle=5)
+ return self.length
+
+ @overload
+ def __getitem__(self, index: int | str) -> MemoryT:
+ pass
+
+ @overload
+ def __getitem__(self, index: slice) -> list[MemoryT]:
+ pass
+
+ def __getitem__(self, index: int | slice | str) -> MemoryT | list[MemoryT]:
+ """
+ Get memories from the memoryset by index or memory id
+
+ Params:
+ index: Index of the memory to retrieve, slice of memories to retrieve, or unique
+ identifier of the memory to retrieve
+
+ Returns:
+ Memory or memories from the memoryset
+
+ Raises:
+ LookupError: If the id is not found or the index is out of bounds
+
+ Examples:
+ Retrieve the first memory in the memoryset:
+ >>> memoryset[0]
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
+
+ Retrieve the last memory in the memoryset:
+ >>> memoryset[-1]
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' })
+
+ Retrieve a slice of memories in the memoryset:
+ >>> memoryset[1:3]
+ [
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
+ ]
+
+ Retrieve a memory by id:
+ >>> memoryset["0195019a-5bc7-7afb-b902-5945ee1fb766"]
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
+ """
+ if isinstance(index, int):
+ return self.query(offset=len(self) + index if index < 0 else index, limit=1)[0]
+ elif isinstance(index, str):
+ return self.get(index)
+ elif isinstance(index, slice):
+ start = 0 if index.start is None else (len(self) + index.start) if index.start < 0 else index.start
+ stop = len(self) if index.stop is None else (len(self) + index.stop) if index.stop < 0 else index.stop
+ return self.query(offset=start, limit=stop - start)
+ else:
+ raise ValueError(f"Invalid index type: {type(index)}")
+
+ @overload
+ def search(
+ self,
+ query: str,
+ *,
+ count: int = 1,
+ prompt: str | None = None,
+ partition_id: str | None = None,
+ partition_filter_mode: Literal[
+ "ignore_partitions", "include_global", "exclude_global", "only_global"
+ ] = "include_global",
+ ) -> list[MemoryLookupT]:
+ pass
+
+ @overload
+ def search(
+ self,
+ query: list[str],
+ *,
+ count: int = 1,
+ prompt: str | None = None,
+ partition_id: str | None = None,
+ partition_filter_mode: Literal[
+ "ignore_partitions", "include_global", "exclude_global", "only_global"
+ ] = "include_global",
+ ) -> list[list[MemoryLookupT]]:
+ pass
+
+ def search(
+ self,
+ query: str | list[str],
+ *,
+ count: int = 1,
+ prompt: str | None = None,
+ partition_id: str | None = None,
+ partition_filter_mode: Literal[
+ "ignore_partitions", "include_global", "exclude_global", "only_global"
+ ] = "include_global",
+ ) -> list[MemoryLookupT] | list[list[MemoryLookupT]]:
+ """
+ Search for memories that are semantically similar to the query
+
+ Params:
+ query: Query to lookup memories in the memoryset, can be a single query or a list
+ count: Number of memories to return for each query
+ prompt: Optional prompt for query embedding during search.
+ If not provided, the memoryset's default query prompt will be used if available.
+ partition_id: Optional partition ID to filter memories by
+ partition_filter_mode: How to filter partitions when searching for memories
+ - "ignore_partitions": Ignore partitions
+ - "include_global": Include global memories
+ - "exclude_global": Exclude global memories
+ - "only_global": Only include global memories
+
+ Returns:
+ List of memories from the memoryset that match the query. If a single query is provided,
+ the return value is a single list of memories. If a list of queries is provided,
+ the return value is a list of lists of memories, one list per query.
+
+ Examples:
+ Search for similar memories:
+ >>> memoryset.search("I am happy", count=2)
+ [
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
+ ]
+
+ Search with a custom query prompt for instruction-following models:
+ >>> memoryset.search("I am happy", count=2, prompt="Represent this query for sentiment retrieval:")
+ [
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
+ ]
+
+ Search for similar memories for multiple queries:
+ >>> memoryset.search(["I am happy", "I am sad"], count=1)
+ [
+ [
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
+ ],
+ [
+ LabeledMemoryLookup({ label: <negative: 0>, value: 'I am sad' }),
+ ],
+ ]
+ """
+ client = OrcaClient._resolve_client()
+ response = client.POST(
+ "/gpu/memoryset/{name_or_id}/lookup",
+ params={"name_or_id": self.id},
+ json={
+ "query": query if isinstance(query, list) else [query],
+ "count": count,
+ "prompt": prompt,
+ "partition_id": partition_id,
+ "partition_filter_mode": partition_filter_mode,
+ },
+ )
+ lookups = [
+ [
+ cast(
+ MemoryLookupT,
+ (
+ LabeledMemoryLookup(self.id, lookup_response)
+ if "label" in lookup_response
+ else ScoredMemoryLookup(self.id, lookup_response)
+ ),
+ )
+ for lookup_response in batch
+ ]
+ for batch in response
+ ]
+ return lookups if isinstance(query, list) else lookups[0]
+
+ def query(
+ self,
+ offset: int = 0,
+ limit: int = 100,
+ filters: list[FilterItemTuple] = [],
+ with_feedback_metrics: bool = False,
+ sort: list[TelemetrySortItem] | None = None,
+ partition_id: str | None = None,
+ partition_filter_mode: Literal[
+ "ignore_partitions", "include_global", "exclude_global", "only_global"
+ ] = "include_global",
+ ) -> list[MemoryT]:
+ """
+ Query the memoryset for memories that match the filters
+
+ Params:
+ offset: The offset of the first memory to return
+ limit: The maximum number of memories to return
+ filters: List of filters to apply to the query.
+ with_feedback_metrics: Whether to include feedback metrics in the response
+ sort: Optional list of sort specifications, only applied when `with_feedback_metrics` is True
+ partition_id: Optional partition ID to filter memories by, not supported when
+ `with_feedback_metrics` is True
+ partition_filter_mode: How to filter partitions when querying memories, defaults to
+ `"include_global"`
+
+ Returns:
+ List of memories from the memoryset that match the filters
+
+ Examples:
+ >>> memoryset.query(filters=[("label", "==", 0)], limit=2)
+ [
+ LabeledMemory({ label: <negative: 0>, value: "I am sad" }),
+ LabeledMemory({ label: <negative: 0>, value: "I am upset" }),
+ ]
+ """
+ parsed_filters = [
+ _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
+ ]
+
+ if with_feedback_metrics:
+ if partition_id:
+ raise ValueError("Partition ID is not supported when with_feedback_metrics is True")
+ if partition_filter_mode != "include_global":
+ raise ValueError(
+ f"Partition filter mode {partition_filter_mode} is not supported when with_feedback_metrics is True. Only 'include_global' is supported."
+ )
+
+ client = OrcaClient._resolve_client()
+ response = client.POST(
+ "/telemetry/memories",
+ json={
+ "memoryset_id": self.id,
+ "offset": offset,
+ "limit": limit,
+ "filters": parsed_filters,
+ "sort": [_parse_sort_item_from_tuple(item) for item in sort] if sort else None,
+ },
+ )
+ return [
+ cast(
+ MemoryT,
+ (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
+ )
+ for memory in response["items"]
+ ]
+
+ if any(_is_metric_column(filter[0]) for filter in filters):
+ raise ValueError("Feedback metrics are only supported when the with_feedback_metrics flag is set to True")
+
+ if sort:
+ logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")
+
+ client = OrcaClient._resolve_client()
+ response = client.POST(
+ "/memoryset/{name_or_id}/memories",
+ params={"name_or_id": self.id},
+ json={
+ "offset": offset,
+ "limit": limit,
+ "filters": cast(list[FilterItem], parsed_filters),
+ "partition_id": partition_id,
+ "partition_filter_mode": partition_filter_mode,
+ },
+ )
+ return [
+ cast(
+ MemoryT,
+ (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
+ )
+ for memory in response
+ ]
+
+ def to_pandas(
+ self,
+ offset: int = 0,
+ limit: int = 100,
+ filters: list[FilterItemTuple] = [],
+ with_feedback_metrics: bool = False,
+ sort: list[TelemetrySortItem] | None = None,
+ ) -> pd.DataFrame:
+ """
+ Convert the memoryset to a pandas DataFrame
+
+ Accepts the same `offset`, `limit`, `filters`, `with_feedback_metrics`, and `sort`
+ parameters as `query` and returns a DataFrame with one row per matching memory.
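+
+ Examples:
+ >>> df = memoryset.to_pandas(limit=1000, filters=[("label", "==", 0)])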
+ """
2531
+ return pd.DataFrame(
2532
+ [
2533
+ memory.to_dict()
2534
+ for memory in self.query(
2535
+ offset=offset,
2536
+ limit=limit,
2537
+ filters=filters,
2538
+ with_feedback_metrics=with_feedback_metrics,
2539
+ sort=sort,
2540
+ )
2541
+ ]
2542
+ )
2543
+
2544
+ def insert(self, items: Iterable[dict[str, Any]] | dict[str, Any], *, batch_size: int = 32) -> None:
2545
+ """
2546
+ Insert memories into the memoryset
2547
+
2548
+ Params:
2549
+ items: List of memories to insert into the memoryset. This should be a list of
2550
+ dictionaries with the following keys:
2551
+
2552
+ - `value`: Value of the memory
2553
+ - `label`: Label of the memory
2554
+ - `score`: Score of the memory
2555
+ - `source_id`: Optional unique ID of the memory in a system of reference
2556
+ - `...`: Any other metadata to store for the memory
2557
+
2558
+ batch_size: Number of memories to insert in a single API call
2559
+
2560
+ Examples:
2561
+ >>> memoryset.insert([
2562
+ ... {"value": "I am happy", "label": 1, "source_id": "data_123", "partition_id": "user_1", "tag": "happy"},
2563
+ ... {"value": "I am sad", "label": 0, "source_id": "data_124", "partition_id": "user_1", "tag": "sad"},
2564
+ ... ])
2565
+ """
2566
+ if batch_size <= 0 or batch_size > 500:
2567
+ raise ValueError("batch_size must be between 1 and 500")
2568
+ client = OrcaClient._resolve_client()
2569
+ items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
2570
+ # insert memories in batches to avoid API timeouts
2571
+ for i in range(0, len(items), batch_size):
2572
+ batch = items[i : i + batch_size]
2573
+ client.POST(
2574
+ "/gpu/memoryset/{name_or_id}/memory",
2575
+ params={"name_or_id": self.id},
2576
+ json=cast(
2577
+ list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
2578
+ [_parse_memory_insert(item, type=self.memory_type) for item in batch],
2579
+ ),
2580
+ )
2581
+
2582
+ self.refresh()
2583
+
2584
+ async def ainsert(self, items: Iterable[dict[str, Any]] | dict[str, Any], *, batch_size: int = 32) -> None:
2585
+ """
2586
+ Asynchronously insert memories into the memoryset
2587
+
2588
+ Params:
2589
+ items: List of memories to insert into the memoryset. This should be a list of
2590
+ dictionaries with the following keys:
2591
+
2592
+ - `value`: Value of the memory
2593
+ - `label`: Label of the memory
2594
+ - `score`: Score of the memory
2595
+ - `source_id`: Optional unique ID of the memory in a system of reference
2596
+ - `partition_id`: Optional partition ID of the memory
2597
+ - `...`: Any other metadata to store for the memory
2598
+
2599
+ batch_size: Number of memories to insert in a single API call
2600
+
2601
+ Examples:
2602
+ >>> await memoryset.ainsert([
2603
+ ... {"value": "I am happy", "label": 1, "source_id": "data_123", "partition_id": "user_1", "tag": "happy"},
2604
+ ... {"value": "I am sad", "label": 0, "source_id": "data_124", "partition_id": "user_1", "tag": "sad"},
2605
+ ... ])
2606
+ """
2607
+ if batch_size <= 0 or batch_size > 500:
2608
+ raise ValueError("batch_size must be between 1 and 500")
2609
+ client = OrcaAsyncClient._resolve_client()
2610
+ items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
2611
+ # insert memories in batches to avoid API timeouts
2612
+ for i in range(0, len(items), batch_size):
2613
+ batch = items[i : i + batch_size]
2614
+ await client.POST(
2615
+ "/gpu/memoryset/{name_or_id}/memory",
2616
+ params={"name_or_id": self.id},
2617
+ json=cast(
2618
+ list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
2619
+ [_parse_memory_insert(item, type=self.memory_type) for item in batch],
2620
+ ),
2621
+ )
2622
+
2623
+ await self.arefresh()
2624
+
2625
+ async def arefresh(self, throttle: float = 0):
2626
+ """
2627
+ Asynchronously refresh the information about the memoryset from the OrcaCloud
2628
+
2629
+ Params:
2630
+ throttle: Minimum time in seconds between refreshes
2631
+ """
2632
+ current_time = datetime.now()
2633
+ # Skip refresh if last refresh was too recent
2634
+ if (current_time - self._last_refresh) < timedelta(seconds=throttle):
2635
+ return
2636
+
2637
+ refreshed_memoryset = await type(self).aopen(self.id)
2638
+ self.__dict__.update(refreshed_memoryset.__dict__)
2639
+ self._last_refresh = current_time
2640
+
2641
+ @overload
2642
+ def get(self, memory_id: str) -> MemoryT: # type: ignore -- this takes precedence
2643
+ pass
2644
+
2645
+ @overload
2646
+ def get(self, memory_id: Iterable[str]) -> list[MemoryT]:
2647
+ pass
2648
+
2649
+ def get(self, memory_id: str | Iterable[str]) -> MemoryT | list[MemoryT]:
2650
+ """
2651
+ Fetch a memory or memories from the memoryset
2652
+
2653
+ Params:
2654
+ memory_id: Unique identifier of the memory or memories to fetch
2655
+
2656
+ Returns:
2657
+ Memory or list of memories from the memoryset
2658
+
2659
+ Raises:
2660
+ LookupError: If no memory with the given id is found
2661
+
2662
+ Examples:
2663
+ Fetch a single memory:
2664
+ >>> memoryset.get("0195019a-5bc7-7afb-b902-5945ee1fb766")
2665
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
2666
+
2667
+ Fetch multiple memories:
2668
+ >>> memoryset.get([
2669
+ ... "0195019a-5bc7-7afb-b902-5945ee1fb766",
2670
+ ... "019501a1-ea08-76b2-9f62-95e4800b4841",
2671
+ ... ])
2672
+ [
2673
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
2674
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
2675
+ ]
2676
+ """
2677
+ if isinstance(memory_id, str):
2678
+ client = OrcaClient._resolve_client()
2679
+ response = client.GET(
2680
+ "/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
2681
+ )
2682
+ return cast(
2683
+ MemoryT,
2684
+ (LabeledMemory(self.id, response) if "label" in response else ScoredMemory(self.id, response)),
2685
+ )
2686
+ else:
2687
+ client = OrcaClient._resolve_client()
2688
+ response = client.POST(
2689
+ "/memoryset/{name_or_id}/memories/get",
2690
+ params={"name_or_id": self.id},
2691
+ json={"memory_ids": list(memory_id)},
2692
+ )
2693
+ return [
2694
+ cast(
2695
+ MemoryT,
2696
+ (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
2697
+ )
2698
+ for memory in response
2699
+ ]
2700
+
2701
+ @overload
2702
+ def update(self, updates: dict[str, Any], *, batch_size: int = 32) -> MemoryT:
2703
+ pass
2704
+
2705
+ @overload
2706
+ def update(self, updates: Iterable[dict[str, Any]], *, batch_size: int = 32) -> list[MemoryT]:
2707
+ pass
2708
+
2709
+ def update(
2710
+ self, updates: dict[str, Any] | Iterable[dict[str, Any]], *, batch_size: int = 32
2711
+ ) -> MemoryT | list[MemoryT]:
2712
+ """
2713
+ Update one or multiple memories in the memoryset
2714
+
2715
+ Params:
2716
+ updates: List of updates to apply to the memories. Each update should be a dictionary
2717
+ with the following keys:
2718
+
2719
+ - `memory_id`: Unique identifier of the memory to update (required)
2720
+ - `value`: Optional new value of the memory
2721
+ - `label`: Optional new label of the memory
2722
+ - `source_id`: Optional new source ID of the memory
2723
+ - `partition_id`: Optional new partition ID of the memory
2724
+ - `...`: Optional new values for metadata properties
2725
+
2726
+ batch_size: Number of memories to update in a single API call
2727
+
2728
+ Returns:
2729
+ Updated memory or list of updated memories
2730
+
2731
+ Examples:
2732
+ Update a single memory:
2733
+ >>> memoryset.update(
2734
+ ... {
2735
+ ... "memory_id": "019501a1-ea08-76b2-9f62-95e4800b4841",
2736
+ ... "tag": "happy",
2737
+ ... },
2738
+ ... )
2739
+
2740
+ Update multiple memories:
2741
+ >>> memoryset.update(
2742
+ ... {"memory_id": m.memory_id, "label": 2}
2743
+ ... for m in memoryset.query(filters=[("tag", "==", "happy")])
2744
+ ... )
2745
+ """
2746
+ if batch_size <= 0 or batch_size > 500:
2747
+ raise ValueError("batch_size must be between 1 and 500")
2748
+ client = OrcaClient._resolve_client()
2749
+ updates_list = cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else list(updates)
2750
+ # update memories in batches to avoid API timeouts
2751
+ updated_memories: list[MemoryT] = []
2752
+ for i in range(0, len(updates_list), batch_size):
2753
+ batch = updates_list[i : i + batch_size]
2754
+ response = client.PATCH(
2755
+ "/gpu/memoryset/{name_or_id}/memories",
2756
+ params={"name_or_id": self.id},
2757
+ json=cast(
2758
+ list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
2759
+ [_parse_memory_update(update, type=self.memory_type) for update in batch],
2760
+ ),
2761
+ )
2762
+ updated_memories.extend(
2763
+ cast(
2764
+ MemoryT,
2765
+ (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
2766
+ )
2767
+ for memory in response
2768
+ )
2769
+
2770
+ return updated_memories[0] if isinstance(updates, dict) else updated_memories
2771
+
2772
+ def get_cascading_edits_suggestions(
2773
+ self,
2774
+ memory: MemoryT,
2775
+ *,
2776
+ old_label: int,
2777
+ new_label: int,
2778
+ max_neighbors: int = 50,
2779
+ max_validation_neighbors: int = 10,
2780
+ similarity_threshold: float | None = None,
2781
+ only_if_has_old_label: bool = True,
2782
+ exclude_if_new_label: bool = True,
2783
+ suggestion_cooldown_time: float = 3600.0 * 24.0, # 1 day
2784
+ label_confirmation_cooldown_time: float = 3600.0 * 24.0 * 7, # 1 week
2785
+ ) -> list[CascadingEditSuggestion]:
2786
+ """
2787
+ Suggests cascading edits for a given memory based on nearby points with similar labels.
2788
+
2789
+ This function is triggered after a user changes a memory's label. It looks for nearby
2790
+ candidates in embedding space that may be subject to similar relabeling and returns them
2791
+ as suggestions. The system uses scoring heuristics, label filters, and cooldown tracking
2792
+ to reduce noise and improve usability.
2793
+
2794
+ Params:
2795
+ memory: The memory whose label was just changed.
2796
+ old_label: The label this memory used to have.
2797
+ new_label: The label it was changed to.
2798
+ max_neighbors: Maximum number of neighbors to consider.
2799
+ max_validation_neighbors: Maximum number of neighbors to use for label suggestion.
2800
+ similarity_threshold: If set, only include neighbors with a lookup score above this threshold.
2801
+ only_if_has_old_label: If True, only consider neighbors that have the old label.
2802
+ exclude_if_new_label: If True, exclude neighbors that already have the new label.
2803
+ suggestion_cooldown_time: Minimum time (in seconds) since the last suggestion for a neighbor
2804
+ to be considered again.
2805
+ label_confirmation_cooldown_time: Minimum time (in seconds) since a neighbor's label was confirmed
2806
+ to be considered for suggestions.
2807
+
2808
+ Returns:
2809
+ A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
2810
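+
+ Examples:
+ Illustrative sketch; assumes `memory` was just relabeled from 0 to 1:
+ >>> suggestions = memoryset.get_cascading_edits_suggestions(
+ ...     memory, old_label=0, new_label=1, similarity_threshold=0.9
+ ... )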
+ """
2811
+ # TODO: properly integrate this with memory edits and return something that can be applied
2812
+ client = OrcaClient._resolve_client()
2813
+ return client.POST(
2814
+ "/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
2815
+ params={"name_or_id": self.id, "memory_id": memory.memory_id},
2816
+ json={
2817
+ "old_label": old_label,
2818
+ "new_label": new_label,
2819
+ "max_neighbors": max_neighbors,
2820
+ "max_validation_neighbors": max_validation_neighbors,
2821
+ "similarity_threshold": similarity_threshold,
2822
+ "only_if_has_old_label": only_if_has_old_label,
2823
+ "exclude_if_new_label": exclude_if_new_label,
2824
+ "suggestion_cooldown_time": suggestion_cooldown_time,
2825
+ "label_confirmation_cooldown_time": label_confirmation_cooldown_time,
2826
+ },
2827
+ )
2828
+
2829
+ def delete(self, memory_id: str | Iterable[str], *, batch_size: int = 32) -> None:
2830
+ """
2831
+ Delete memories from the memoryset
2832
+
2833
+ Params:
2834
+ memory_id: unique identifiers of the memories to delete
2835
+ batch_size: Number of memories to delete in a single API call
2836
+
2837
+ Examples:
2838
+ Delete a single memory:
2839
+ >>> memoryset.delete("0195019a-5bc7-7afb-b902-5945ee1fb766")
2840
+
2841
+ Delete multiple memories:
2842
+ >>> memoryset.delete([
2843
+ ... "0195019a-5bc7-7afb-b902-5945ee1fb766",
2844
+ ... "019501a1-ea08-76b2-9f62-95e4800b4841",
2845
+ ... )
2846
+
2847
+ """
2848
+ if batch_size <= 0 or batch_size > 500:
2849
+ raise ValueError("batch_size must be between 1 and 500")
2850
+ client = OrcaClient._resolve_client()
2851
+ memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
2852
+ # delete memories in batches to avoid API timeouts
2853
+ for i in range(0, len(memory_ids), batch_size):
2854
+ batch = memory_ids[i : i + batch_size]
2855
+ client.POST(
2856
+ "/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": batch}
2857
+ )
2858
+ logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
2859
+ self.refresh()
2860
+
2861
+ @overload
2862
+ def analyze(
2863
+ self,
2864
+ *analyses: dict[str, Any] | str,
2865
+ lookup_count: int = 15,
2866
+ clear_metrics: bool = False,
2867
+ background: Literal[True],
2868
+ partition_filter_mode: Literal[
2869
+ "ignore_partitions", "include_global", "exclude_global", "only_global"
2870
+ ] = "include_global",
2871
+ ) -> Job[MemorysetMetrics]:
2872
+ pass
2873
+
2874
+ @overload
2875
+ def analyze(
2876
+ self,
2877
+ *analyses: dict[str, Any] | str,
2878
+ lookup_count: int = 15,
2879
+ clear_metrics: bool = False,
2880
+ background: Literal[False] = False,
2881
+ partition_filter_mode: Literal[
2882
+ "ignore_partitions", "include_global", "exclude_global", "only_global"
2883
+ ] = "include_global",
2884
+ ) -> MemorysetMetrics:
2885
+ pass
2886
+
2887
+ def analyze(
2888
+ self,
2889
+ *analyses: dict[str, Any] | str,
2890
+ lookup_count: int = 15,
2891
+ clear_metrics: bool = False,
2892
+ background: bool = False,
2893
+ partition_filter_mode: Literal[
2894
+ "ignore_partitions", "include_global", "exclude_global", "only_global"
2895
+ ] = "include_global",
2896
+ ) -> Job[MemorysetMetrics] | MemorysetMetrics:
2897
+ """
2898
+ Run analyses on the memoryset to find duplicates, clusters, mislabelings, and more
2899
+
2900
+ The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
2901
+ attribute of each memory in the memoryset. Overall memoryset metrics will be returned as a dictionary.
2902
+
2903
+ Params:
2904
+ analyses: List of analysis to run on the memoryset, can either be just the name of an
2905
+ analysis or a dictionary with a name property and additional config. The available
2906
+ analyses are:
2907
+
2908
+ - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
2909
+ - **`"cluster"`**: Cluster the memories in the memoryset
2910
+ - **`"distribution"`**: Analyze the embedding distribution
2911
+ - **`"projection"`**: Create a 2D projection of the embeddings for visualization
2912
+ - **`"label"`**: Analyze the labels to find potential mislabelings (labeled memorysets only)
2913
+ - **`"class_patterns"`**: Analyze class patterns and find representative memories (labeled memorysets only)
2914
+ - **`"concepts"`**: Discover and name conceptual clusters in the memoryset (labeled memorysets only)
2915
+
2916
+ lookup_count: Number of memories to look up for each memory in the memoryset
2917
+ clear_metrics: Whether to clear any existing metrics from the memories before running the analysis
2918
+ partition_filter_mode: How to filter partitions when running the analysis
2919
+ - "ignore_partitions": Ignore partitions
2920
+ - "include_global": Include global memories
2921
+ - "exclude_global": Exclude global memories
2922
+ - "only_global": Only include global memories
2923
+
2924
+ Returns:
2925
+ Dictionary with aggregate metrics for each analysis that was run, or a job handle if `background` is `True`
2926
+
2927
+ Raises:
2928
+ ValueError: If an invalid analysis name is provided
2929
+
2930
+ Examples:
2931
+ Run label and duplicate analysis:
2932
+ >>> memoryset.analyze("label", {"name": "duplicate", "possible_duplicate_threshold": 0.99})
2933
+ { "duplicate": { "num_duplicates": 10 },
2934
+ "label": {
2935
+ "label_metrics": [{
2936
+ "label": 0,
2937
+ "label_name": "negative",
2938
+ "average_lookup_score": 0.95,
2939
+ "memory_count": 100,
2940
+ }, {
2941
+ "label": 1,
2942
+ "label_name": "positive",
2943
+ "average_lookup_score": 0.90,
2944
+ "memory_count": 100,
2945
+ }],
2946
+ "neighbor_prediction_accuracy": 0.95,
2947
+ "mean_neighbor_label_confidence": 0.95,
2948
+ "mean_neighbor_label_entropy": 0.95,
2949
+ "mean_neighbor_predicted_label_ambiguity": 0.95,
2950
+ }
2951
+ }
2952
+
2953
+ Remove all exact duplicates:
2954
+ >>> memoryset.delete(
2955
+ ... m.memory_id
2956
+ ... for m in memoryset.query(
2957
+ ... filters=[("metrics.is_duplicate", "==", True)]
2958
+ ... )
2959
+ ... )
2960
+
2961
+ Display label analysis to review potential mislabelings:
2962
+ >>> memoryset.display_label_analysis()
2963
+ """
2964
+
2965
+ # Get valid analysis names from MemorysetAnalysisConfigs
2966
+ valid_analysis_names = set(MemorysetAnalysisConfigs.__annotations__)
2967
+
2968
+ configs: MemorysetAnalysisConfigs = {}
2969
+ for analysis in analyses:
2970
+ if isinstance(analysis, str):
2971
+ error_msg = (
2972
+ f"Invalid analysis name: {analysis}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
2973
+ )
2974
+ if analysis not in valid_analysis_names:
2975
+ raise ValueError(error_msg)
2976
+ configs[analysis] = {}
2977
+ else:
2978
+ name = analysis.pop("name")
2979
+ error_msg = f"Invalid analysis name: {name}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
2980
+ if name not in valid_analysis_names:
2981
+ raise ValueError(error_msg)
2982
+ configs[name] = analysis
2983
+
2984
+ client = OrcaClient._resolve_client()
2985
+ analysis = client.POST(
2986
+ "/memoryset/{name_or_id}/analysis",
2987
+ params={"name_or_id": self.id},
2988
+ json={
2989
+ "configs": configs,
2990
+ "lookup_count": lookup_count,
2991
+ "clear_metrics": clear_metrics,
2992
+ "partition_filter_mode": partition_filter_mode,
2993
+ },
2994
+ )
2995
+
2996
+ def get_analysis_result():
2997
+ client = OrcaClient._resolve_client()
2998
+ return client.GET(
2999
+ "/memoryset/{name_or_id}/analysis/{analysis_job_id}",
3000
+ params={"name_or_id": self.id, "analysis_job_id": analysis["job_id"]},
3001
+ )["results"]
3002
+
3003
+ job = Job(analysis["job_id"], get_analysis_result)
3004
+ return job if background else job.result()
3005
+
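For long-running analyses, `background=True` returns a `Job` handle instead of blocking. A minimal sketch, assuming an existing `memoryset` handle:

```python
# Run label and duplicate analyses without blocking the caller
job = memoryset.analyze(
    "label",
    {"name": "duplicate", "possible_duplicate_threshold": 0.99},
    background=True,
)
metrics = job.result()  # blocks until the analysis job completes
print(metrics["duplicate"]["num_duplicates"])
```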
3006
+ def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
3007
+ """Group potential duplicates in the memoryset"""
3008
+ client = OrcaClient._resolve_client()
3009
+ response = client.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
3010
+ return [
3011
+ [cast(MemoryT, LabeledMemory(self.id, m) if "label" in m else ScoredMemory(self.id, m)) for m in ms]
3012
+ for ms in response
3013
+ ]
3014
+
3015
+
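A common follow-up is to keep one representative from each group of potential duplicates. A sketch, assuming each returned memory exposes a `memory_id` attribute as in the `analyze` docstring example:

```python
# Keep the first memory of each potential-duplicate group and delete the rest
for group in memoryset.get_potential_duplicate_groups():
    _keep, *redundant = group
    if redundant:
        memoryset.delete([m.memory_id for m in redundant])
```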
3016
+ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
3017
+ """
3018
+ A handle to a collection of memories with labels in the OrcaCloud
3019
+
3020
+ Attributes:
3021
+ id: Unique identifier for the memoryset
3022
+ name: Unique name of the memoryset
3023
+ description: Description of the memoryset
3024
+ label_names: Names for the class labels in the memoryset
3025
+ length: Number of memories in the memoryset
3026
+ embedding_model: Embedding model used to embed the memory values for semantic search
3027
+ created_at: When the memoryset was created, automatically generated on create
3028
+ updated_at: When the memoryset was last updated, automatically updated on updates
3029
+ """
3030
+
3031
+ label_names: list[str]
3032
+ memory_type: MemoryType = "LABELED"
3033
+
3034
+ def __init__(self, metadata: MemorysetMetadata):
3035
+ super().__init__(metadata)
3036
+ assert metadata["label_names"] is not None
3037
+ self.label_names = metadata["label_names"]
3038
+
3039
+ def __eq__(self, other) -> bool:
3040
+ return isinstance(other, LabeledMemoryset) and self.id == other.id
3041
+
3042
+ @overload
3043
+ @classmethod
3044
+ def create(
3045
+ cls,
3046
+ name: str,
3047
+ *,
3048
+ datasource: None = None,
3049
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3050
+ description: str | None = None,
3051
+ label_names: list[str],
3052
+ max_seq_length_override: int | None = None,
3053
+ prompt: str | None = None,
3054
+ index_type: IndexType = "FLAT",
3055
+ index_params: dict[str, Any] = {},
3056
+ if_exists: CreateMode = "error",
3057
+ hidden: bool = False,
3058
+ ) -> Self:
3059
+ pass
3060
+
3061
+ @overload
3062
+ @classmethod
3063
+ def create(
3064
+ cls,
3065
+ name: str,
3066
+ *,
3067
+ datasource: Datasource,
3068
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3069
+ value_column: str = "value",
3070
+ label_column: str | None = "label",
3071
+ source_id_column: str | None = None,
3072
+ partition_id_column: str | None = None,
3073
+ description: str | None = None,
3074
+ label_names: list[str] | None = None,
3075
+ max_seq_length_override: int | None = None,
3076
+ prompt: str | None = None,
3077
+ remove_duplicates: bool = True,
3078
+ index_type: IndexType = "FLAT",
3079
+ index_params: dict[str, Any] = {},
3080
+ if_exists: CreateMode = "error",
3081
+ background: Literal[True],
3082
+ hidden: bool = False,
3083
+ subsample: int | float | None = None,
3084
+ ) -> Job[Self]:
3085
+ pass
3086
+
3087
+ @overload
3088
+ @classmethod
3089
+ def create(
3090
+ cls,
3091
+ name: str,
3092
+ *,
3093
+ datasource: Datasource,
3094
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3095
+ value_column: str = "value",
3096
+ label_column: str | None = "label",
3097
+ source_id_column: str | None = None,
3098
+ partition_id_column: str | None = None,
3099
+ description: str | None = None,
3100
+ label_names: list[str] | None = None,
3101
+ max_seq_length_override: int | None = None,
3102
+ prompt: str | None = None,
3103
+ remove_duplicates: bool = True,
3104
+ index_type: IndexType = "FLAT",
3105
+ index_params: dict[str, Any] = {},
3106
+ if_exists: CreateMode = "error",
3107
+ background: Literal[False] = False,
3108
+ hidden: bool = False,
3109
+ subsample: int | float | None = None,
3110
+ ) -> Self:
3111
+ pass
3112
+
3113
+ @classmethod
3114
+ def create( # type: ignore[override]
3115
+ cls,
3116
+ name: str,
3117
+ *,
3118
+ datasource: Datasource | None = None,
3119
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3120
+ value_column: str = "value",
3121
+ label_column: str | None = "label",
3122
+ source_id_column: str | None = None,
3123
+ partition_id_column: str | None = None,
3124
+ description: str | None = None,
3125
+ label_names: list[str] | None = None,
3126
+ max_seq_length_override: int | None = None,
3127
+ prompt: str | None = None,
3128
+ remove_duplicates: bool = True,
3129
+ index_type: IndexType = "FLAT",
3130
+ index_params: dict[str, Any] = {},
3131
+ if_exists: CreateMode = "error",
3132
+ background: bool = False,
3133
+ hidden: bool = False,
3134
+ subsample: int | float | None = None,
3135
+ ) -> Self | Job[Self]:
3136
+ """
3137
+ Create a new labeled memoryset in the OrcaCloud
3138
+
3139
+ If `datasource` is provided, all columns from the datasource that are not specified in the
3140
+ `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
3141
+ as metadata in the memoryset.
3142
+
3143
+ If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
3144
+ You can add memories later using the `insert` method.
3145
+
3146
+ Params:
3147
+ name: Name for the new memoryset (must be unique)
3148
+ datasource: Optional source data to populate the memories in the memoryset. If omitted,
3149
+ an empty memoryset will be created.
3150
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
3151
+ If not provided, a default embedding model for the memoryset will be used.
3152
+ value_column: Name of the column in the datasource that contains the memory values
3153
+ label_column: Name of the column in the datasource that contains the memory labels.
3154
+ Must contain categorical values as integers or strings. String labels will be
3155
+ converted to integers with the unique strings extracted as `label_names`. To create
3156
+ a memoryset in which all labels are `None`, set this to `None`.
3157
+ source_id_column: Optional name of the column in the datasource that contains the ids in
3158
+ the system of reference
3159
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
3160
+ description: Optional description for the memoryset. This will be used in agentic flows,
3161
+ so make sure it is concise and describes the contents of your memoryset, not the
3162
+ datasource or the embedding model.
3163
+ label_names: List of human-readable names for the labels in the memoryset, must match
3164
+ the number of labels in the `label_column`. Will be automatically inferred if string
3165
+ labels are provided or if a [Dataset][datasets.Dataset] with a
3166
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
3167
+ max_seq_length_override: Maximum sequence length of values in the memoryset; if a
3168
+ value is longer than this, it will be truncated. Defaults to the model's max
3169
+ sequence length if not provided
3170
+ prompt: Optional prompt to use when embedding documents/memories for storage
3171
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
3172
+ into the memoryset
3173
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
3174
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
3175
+ index_params: Parameters for the vector index, defaults to `{}`
3176
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
3177
+ `"error"`. Other option is `"open"` to open the existing memoryset.
3178
+ background: Whether to run the operation without blocking and return a job handle
3179
+ hidden: Whether the memoryset should be hidden
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+ datasource to insert. Use to limit the size of the initial memoryset.
3180
+
3181
+ Returns:
3182
+ Handle to the new memoryset in the OrcaCloud, or a job handle that resolves to it if `background` is `True`
3183
+
3184
+ Raises:
3185
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
3186
+ `"open"` and the params do not match those of the existing memoryset.
3187
+ """
3188
+ if datasource is None:
3189
+ return super().create(
3190
+ name,
3191
+ datasource=None,
3192
+ embedding_model=embedding_model,
3193
+ description=description,
3194
+ label_names=label_names,
3195
+ max_seq_length_override=max_seq_length_override,
3196
+ prompt=prompt,
3197
+ index_type=index_type,
3198
+ index_params=index_params,
3199
+ if_exists=if_exists,
3200
+ hidden=hidden,
3201
+ memory_type="LABELED",
3202
+ )
3203
+ else:
3204
+ # Type narrowing: datasource is definitely Datasource here
3205
+ assert datasource is not None
3206
+ if background:
3207
+ return super().create(
3208
+ name,
3209
+ datasource=datasource,
3210
+ label_column=label_column,
3211
+ score_column=None,
3212
+ embedding_model=embedding_model,
3213
+ value_column=value_column,
3214
+ source_id_column=source_id_column,
3215
+ partition_id_column=partition_id_column,
3216
+ description=description,
3217
+ label_names=label_names,
3218
+ max_seq_length_override=max_seq_length_override,
3219
+ prompt=prompt,
3220
+ remove_duplicates=remove_duplicates,
3221
+ index_type=index_type,
3222
+ index_params=index_params,
3223
+ if_exists=if_exists,
3224
+ background=True,
3225
+ hidden=hidden,
3226
+ subsample=subsample,
3227
+ memory_type="LABELED",
3228
+ )
3229
+ else:
3230
+ return super().create(
3231
+ name,
3232
+ datasource=datasource,
3233
+ label_column=label_column,
3234
+ score_column=None,
3235
+ embedding_model=embedding_model,
3236
+ value_column=value_column,
3237
+ source_id_column=source_id_column,
3238
+ partition_id_column=partition_id_column,
3239
+ description=description,
3240
+ label_names=label_names,
3241
+ max_seq_length_override=max_seq_length_override,
3242
+ prompt=prompt,
3243
+ remove_duplicates=remove_duplicates,
3244
+ index_type=index_type,
3245
+ index_params=index_params,
3246
+ if_exists=if_exists,
3247
+ background=False,
3248
+ hidden=hidden,
3249
+ subsample=subsample,
3250
+ memory_type="LABELED",
3251
+ )
3252
+
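A minimal end-to-end sketch of creating a labeled memoryset from a datasource. The datasource handle and column names are illustrative assumptions, and the imports assume both classes are exported from the package root:

```python
from orca_sdk import Datasource, LabeledMemoryset

datasource: Datasource = ...  # an existing Datasource handle (creation omitted)

memoryset = LabeledMemoryset.create(
    "product_reviews",         # hypothetical memoryset name
    datasource=datasource,
    value_column="text",       # assumed column names for illustration
    label_column="sentiment",
    if_exists="open",          # reuse the memoryset if it already exists
)
print(memoryset.label_names)   # inferred from string labels in the label column
```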
3253
+ @overload
3254
+ @classmethod
3255
+ def from_datasource(
3256
+ cls,
3257
+ name: str,
3258
+ *,
3259
+ datasource: Datasource,
3260
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3261
+ value_column: str = "value",
3262
+ label_column: str | None = "label",
3263
+ source_id_column: str | None = None,
3264
+ partition_id_column: str | None = None,
3265
+ description: str | None = None,
3266
+ label_names: list[str] | None = None,
3267
+ max_seq_length_override: int | None = None,
3268
+ prompt: str | None = None,
3269
+ remove_duplicates: bool = True,
3270
+ index_type: IndexType = "FLAT",
3271
+ index_params: dict[str, Any] = {},
3272
+ if_exists: CreateMode = "error",
3273
+ background: Literal[True],
3274
+ hidden: bool = False,
3275
+ subsample: int | float | None = None,
3276
+ ) -> Job[Self]:
3277
+ pass
3278
+
3279
+ @overload
3280
+ @classmethod
3281
+ def from_datasource(
3282
+ cls,
3283
+ name: str,
3284
+ *,
3285
+ datasource: Datasource,
3286
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3287
+ value_column: str = "value",
3288
+ label_column: str | None = "label",
3289
+ source_id_column: str | None = None,
3290
+ partition_id_column: str | None = None,
3291
+ description: str | None = None,
3292
+ label_names: list[str] | None = None,
3293
+ max_seq_length_override: int | None = None,
3294
+ prompt: str | None = None,
3295
+ remove_duplicates: bool = True,
3296
+ index_type: IndexType = "FLAT",
3297
+ index_params: dict[str, Any] = {},
3298
+ if_exists: CreateMode = "error",
3299
+ background: Literal[False] = False,
3300
+ hidden: bool = False,
3301
+ subsample: int | float | None = None,
3302
+ ) -> Self:
3303
+ pass
3304
+
3305
+ @classmethod
3306
+ def from_datasource( # type: ignore[override]
3307
+ cls,
3308
+ name: str,
3309
+ *,
3310
+ datasource: Datasource,
3311
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3312
+ value_column: str = "value",
3313
+ label_column: str | None = "label",
3314
+ source_id_column: str | None = None,
3315
+ partition_id_column: str | None = None,
3316
+ description: str | None = None,
3317
+ label_names: list[str] | None = None,
3318
+ max_seq_length_override: int | None = None,
3319
+ prompt: str | None = None,
3320
+ remove_duplicates: bool = True,
3321
+ index_type: IndexType = "FLAT",
3322
+ index_params: dict[str, Any] = {},
3323
+ if_exists: CreateMode = "error",
3324
+ background: bool = False,
3325
+ hidden: bool = False,
3326
+ subsample: int | float | None = None,
3327
+ ) -> Self | Job[Self]:
3328
+ """
3329
+ Create a new labeled memoryset in the OrcaCloud from a datasource.
3330
+
3331
+ This is a convenience method that is equivalent to calling `create` with a datasource.
3332
+ All columns from the datasource that are not specified in the `value_column`,
3333
+ `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
3334
+ in the memoryset.
3335
+
3336
+ Params:
3337
+ name: Name for the new memoryset (must be unique)
3338
+ datasource: Source data to populate the memories in the memoryset.
3339
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
3340
+ If not provided, a default embedding model for the memoryset will be used.
3341
+ value_column: Name of the column in the datasource that contains the memory values
3342
+ label_column: Name of the column in the datasource that contains the memory labels.
3343
+ Must contain categorical values as integers or strings. String labels will be
3344
+ converted to integers with the unique strings extracted as `label_names`. To create
3345
+ a memoryset in which all labels are `None`, set this to `None`.
3346
+ source_id_column: Optional name of the column in the datasource that contains the ids in
3347
+ the system of reference
3348
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
3349
+ description: Optional description for the memoryset. This will be used in agentic flows,
3350
+ so make sure it is concise and describes the contents of your memoryset, not the
3351
+ datasource or the embedding model.
3352
+ label_names: List of human-readable names for the labels in the memoryset, must match
3353
+ the number of labels in the `label_column`. Will be automatically inferred if string
3354
+ labels are provided or if a [Dataset][datasets.Dataset] with a
3355
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
3356
+ max_seq_length_override: Maximum sequence length of values in the memoryset; if a
3357
+ value is longer than this, it will be truncated. Defaults to the model's max
3358
+ sequence length if not provided
3359
+ prompt: Optional prompt to use when embedding documents/memories for storage
3360
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
3361
+ into the memoryset
3362
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
3363
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
3364
+ index_params: Parameters for the vector index, defaults to `{}`
3365
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
3366
+ `"error"`. Other option is `"open"` to open the existing memoryset.
3367
+ background: Whether to run the operation without blocking and return a job handle.
3368
+ hidden: Whether the memoryset should be hidden
3369
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
3370
+ datasource to insert. Use to limit the size of the initial memoryset.
3371
+
3372
+ Returns:
3373
+ Handle to the new memoryset in the OrcaCloud, or a job handle that resolves to it if `background` is `True`
3374
+
3375
+ Raises:
3376
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
3377
+ `"open"` and the params do not match those of the existing memoryset.
3378
+ """
3379
+ if background:
3380
+ return super().create(
3381
+ name,
3382
+ datasource=datasource,
3383
+ label_column=label_column,
3384
+ score_column=None,
3385
+ embedding_model=embedding_model,
3386
+ value_column=value_column,
3387
+ source_id_column=source_id_column,
3388
+ partition_id_column=partition_id_column,
3389
+ description=description,
3390
+ label_names=label_names,
3391
+ max_seq_length_override=max_seq_length_override,
3392
+ prompt=prompt,
3393
+ remove_duplicates=remove_duplicates,
3394
+ index_type=index_type,
3395
+ index_params=index_params,
3396
+ if_exists=if_exists,
3397
+ background=True,
3398
+ hidden=hidden,
3399
+ subsample=subsample,
3400
+ memory_type="LABELED",
3401
+ )
3402
+ else:
3403
+ return super().create(
3404
+ name,
3405
+ datasource=datasource,
3406
+ label_column=label_column,
3407
+ score_column=None,
3408
+ embedding_model=embedding_model,
3409
+ value_column=value_column,
3410
+ source_id_column=source_id_column,
3411
+ partition_id_column=partition_id_column,
3412
+ description=description,
3413
+ label_names=label_names,
3414
+ max_seq_length_override=max_seq_length_override,
3415
+ prompt=prompt,
3416
+ remove_duplicates=remove_duplicates,
3417
+ index_type=index_type,
3418
+ index_params=index_params,
3419
+ if_exists=if_exists,
3420
+ background=False,
3421
+ hidden=hidden,
3422
+ subsample=subsample,
3423
+ memory_type="LABELED",
3424
+ )
3425
+
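Because embedding a large datasource can take a while, `from_datasource` can also run as a background job. A sketch under the same assumptions as the `create` example above:

```python
# Non-blocking creation: returns a Job that resolves to the memoryset handle
job = LabeledMemoryset.from_datasource(
    "product_reviews_async",  # hypothetical name
    datasource=datasource,
    value_column="text",
    label_column="sentiment",
    background=True,
)
memoryset = job.result()  # waits for ingestion and embedding to finish
```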
3426
+ def display_label_analysis(self):
3427
+ """
3428
+ Display an interactive UI to review and act upon the label analysis results
3429
+
3430
+ Note:
3431
+ This method is only available in Jupyter notebooks.
3432
+ """
3433
+ from ._utils.analysis_ui import display_suggested_memory_relabels
3434
+
3435
+ display_suggested_memory_relabels(self)
3436
+
3437
+
3438
+ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
3439
+ """
3440
+ A handle to a collection of memories with scores in the OrcaCloud
3441
+
3442
+ Attributes:
3443
+ id: Unique identifier for the memoryset
3444
+ name: Unique name of the memoryset
3445
+ description: Description of the memoryset
3446
+ length: Number of memories in the memoryset
3447
+ embedding_model: Embedding model used to embed the memory values for semantic search
3448
+ created_at: When the memoryset was created, automatically generated on create
3449
+ updated_at: When the memoryset was last updated, automatically updated on updates
3450
+ """
3451
+
3452
+ memory_type: MemoryType = "SCORED"
3453
+
3454
+ def __eq__(self, other) -> bool:
3455
+ return isinstance(other, ScoredMemoryset) and self.id == other.id
3456
+
3457
+ @overload
3458
+ @classmethod
3459
+ def create(
3460
+ cls,
3461
+ name: str,
3462
+ *,
3463
+ datasource: None = None,
3464
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3465
+ description: str | None = None,
3466
+ max_seq_length_override: int | None = None,
3467
+ prompt: str | None = None,
3468
+ index_type: IndexType = "FLAT",
3469
+ index_params: dict[str, Any] = {},
3470
+ if_exists: CreateMode = "error",
3471
+ hidden: bool = False,
3472
+ ) -> Self:
3473
+ pass
3474
+
3475
+ @overload
3476
+ @classmethod
3477
+ def create(
3478
+ cls,
3479
+ name: str,
3480
+ *,
3481
+ datasource: Datasource,
3482
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3483
+ value_column: str = "value",
3484
+ score_column: str | None = "score",
3485
+ source_id_column: str | None = None,
3486
+ partition_id_column: str | None = None,
3487
+ description: str | None = None,
3488
+ max_seq_length_override: int | None = None,
3489
+ prompt: str | None = None,
3490
+ remove_duplicates: bool = True,
3491
+ index_type: IndexType = "FLAT",
3492
+ index_params: dict[str, Any] = {},
3493
+ if_exists: CreateMode = "error",
3494
+ background: Literal[True],
3495
+ hidden: bool = False,
3496
+ subsample: int | float | None = None,
3497
+ ) -> Job[Self]:
3498
+ pass
3499
+
3500
+ @overload
3501
+ @classmethod
3502
+ def create(
3503
+ cls,
3504
+ name: str,
3505
+ *,
3506
+ datasource: Datasource,
3507
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3508
+ score_column: str | None = "score",
3509
+ value_column: str = "value",
3510
+ source_id_column: str | None = None,
3511
+ partition_id_column: str | None = None,
3512
+ description: str | None = None,
3513
+ max_seq_length_override: int | None = None,
3514
+ prompt: str | None = None,
3515
+ remove_duplicates: bool = True,
3516
+ index_type: IndexType = "FLAT",
3517
+ index_params: dict[str, Any] = {},
3518
+ if_exists: CreateMode = "error",
3519
+ background: Literal[False] = False,
3520
+ hidden: bool = False,
3521
+ subsample: int | float | None = None,
3522
+ ) -> Self:
3523
+ pass
3524
+
3525
+ @classmethod
3526
+ def create( # type: ignore[override]
3527
+ cls,
3528
+ name: str,
3529
+ *,
3530
+ datasource: Datasource | None = None,
3531
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3532
+ value_column: str = "value",
3533
+ score_column: str | None = "score",
3534
+ source_id_column: str | None = None,
3535
+ partition_id_column: str | None = None,
3536
+ description: str | None = None,
3537
+ max_seq_length_override: int | None = None,
3538
+ prompt: str | None = None,
3539
+ remove_duplicates: bool = True,
3540
+ index_type: IndexType = "FLAT",
3541
+ index_params: dict[str, Any] = {},
3542
+ if_exists: CreateMode = "error",
3543
+ background: bool = False,
3544
+ hidden: bool = False,
3545
+ subsample: int | float | None = None,
3546
+ ) -> Self | Job[Self]:
3547
+ """
3548
+ Create a new scored memoryset in the OrcaCloud
3549
+
3550
+ If `datasource` is provided, all columns from the datasource that are not specified in the
3551
+ `value_column`, `score_column`, `source_id_column`, or `partition_id_column` will be stored
3552
+ as metadata in the memoryset.
3553
+
3554
+ If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
3555
+ You can add memories later using the `insert` method.
3556
+
3557
+ Params:
3558
+ name: Name for the new memoryset (must be unique)
3559
+ datasource: Optional source data to populate the memories in the memoryset. If omitted,
3560
+ an empty memoryset will be created.
3561
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
3562
+ If not provided, a default embedding model for the memoryset will be used.
3563
+ value_column: Name of the column in the datasource that contains the memory values
3564
+ score_column: Name of the column in the datasource that contains the memory scores. Must
3565
+ contain numerical values. To create a memoryset in which all scores are `None`, set this to `None`.
3566
+ source_id_column: Optional name of the column in the datasource that contains the ids in
3567
+ the system of reference
3568
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
3569
+ description: Optional description for the memoryset. This will be used in agentic flows,
3570
+ so make sure it is concise and describes the contents of your memoryset, not the
3571
+ datasource or the embedding model.
3572
+ max_seq_length_override: Maximum sequence length of values in the memoryset; if a
3573
+ value is longer than this, it will be truncated. Defaults to the model's max
3574
+ sequence length if not provided
3575
+ prompt: Optional prompt to use when embedding documents/memories for storage
3576
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
3577
+ into the memoryset
3578
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
3579
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
3580
+ index_params: Parameters for the vector index, defaults to `{}`
3581
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
3582
+ `"error"`. Other option is `"open"` to open the existing memoryset.
3583
+ background: Whether to run the operation without blocking and return a job handle
3584
+ hidden: Whether the memoryset should be hidden
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+ datasource to insert. Use to limit the size of the initial memoryset.
3585
+
3586
+ Returns:
3587
+ Handle to the new memoryset in the OrcaCloud, or a job handle that resolves to it if `background` is `True`
3588
+
3589
+ Raises:
3590
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
3591
+ `"open"` and the params do not match those of the existing memoryset.
3592
+ """
3593
+ if datasource is None:
3594
+ return super().create(
3595
+ name,
3596
+ datasource=None,
3597
+ embedding_model=embedding_model,
3598
+ description=description,
3599
+ max_seq_length_override=max_seq_length_override,
3600
+ prompt=prompt,
3601
+ index_type=index_type,
3602
+ index_params=index_params,
3603
+ if_exists=if_exists,
3604
+ hidden=hidden,
3605
+ memory_type="SCORED",
3606
+ )
3607
+ else:
3608
+ # Type narrowing: datasource is definitely Datasource here
3609
+ assert datasource is not None
3610
+ if background:
3611
+ return super().create(
3612
+ name,
3613
+ datasource=datasource,
3614
+ embedding_model=embedding_model,
3615
+ value_column=value_column,
3616
+ score_column=score_column,
3617
+ source_id_column=source_id_column,
3618
+ partition_id_column=partition_id_column,
3619
+ description=description,
3620
+ max_seq_length_override=max_seq_length_override,
3621
+ prompt=prompt,
3622
+ remove_duplicates=remove_duplicates,
3623
+ index_type=index_type,
3624
+ index_params=index_params,
3625
+ if_exists=if_exists,
3626
+ background=True,
3627
+ hidden=hidden,
3628
+ subsample=subsample,
3629
+ memory_type="SCORED",
3630
+ )
3631
+ else:
3632
+ return super().create(
3633
+ name,
3634
+ datasource=datasource,
3635
+ embedding_model=embedding_model,
3636
+ value_column=value_column,
3637
+ score_column=score_column,
3638
+ source_id_column=source_id_column,
3639
+ partition_id_column=partition_id_column,
3640
+ description=description,
3641
+ max_seq_length_override=max_seq_length_override,
3642
+ prompt=prompt,
3643
+ remove_duplicates=remove_duplicates,
3644
+ index_type=index_type,
3645
+ index_params=index_params,
3646
+ if_exists=if_exists,
3647
+ background=False,
3648
+ hidden=hidden,
3649
+ subsample=subsample,
3650
+ memory_type="SCORED",
3651
+ )
3652
+
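The scored variant mirrors the labeled one, with `score_column` in place of `label_column`. A minimal sketch with illustrative names, assuming both classes are exported from the package root:

```python
from orca_sdk import Datasource, ScoredMemoryset

datasource: Datasource = ...  # an existing Datasource handle with a numeric score column

memoryset = ScoredMemoryset.create(
    "review_helpfulness",        # hypothetical memoryset name
    datasource=datasource,
    value_column="text",         # assumed column names for illustration
    score_column="helpfulness",  # must contain numerical values
)
```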
3653
+ @overload
3654
+ @classmethod
3655
+ def from_datasource(
3656
+ cls,
3657
+ name: str,
3658
+ *,
3659
+ datasource: Datasource,
3660
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3661
+ value_column: str = "value",
3662
+ score_column: str | None = "score",
3663
+ source_id_column: str | None = None,
3664
+ partition_id_column: str | None = None,
3665
+ description: str | None = None,
3666
+ max_seq_length_override: int | None = None,
3667
+ prompt: str | None = None,
3668
+ remove_duplicates: bool = True,
3669
+ index_type: IndexType = "FLAT",
3670
+ index_params: dict[str, Any] = {},
3671
+ if_exists: CreateMode = "error",
3672
+ background: Literal[True],
3673
+ hidden: bool = False,
3674
+ subsample: int | float | None = None,
3675
+ ) -> Job[Self]:
3676
+ pass
3677
+
3678
+ @overload
3679
+ @classmethod
3680
+ def from_datasource(
3681
+ cls,
3682
+ name: str,
3683
+ *,
3684
+ datasource: Datasource,
3685
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3686
+ score_column: str | None = "score",
3687
+ value_column: str = "value",
3688
+ source_id_column: str | None = None,
3689
+ partition_id_column: str | None = None,
3690
+ description: str | None = None,
3691
+ max_seq_length_override: int | None = None,
3692
+ prompt: str | None = None,
3693
+ remove_duplicates: bool = True,
3694
+ index_type: IndexType = "FLAT",
3695
+ index_params: dict[str, Any] = {},
3696
+ if_exists: CreateMode = "error",
3697
+ background: Literal[False] = False,
3698
+ hidden: bool = False,
3699
+ subsample: int | float | None = None,
3700
+ ) -> Self:
3701
+ pass
3702
+
3703
+ @classmethod
3704
+ def from_datasource( # type: ignore[override]
3705
+ cls,
3706
+ name: str,
3707
+ *,
3708
+ datasource: Datasource,
3709
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3710
+ value_column: str = "value",
3711
+ score_column: str | None = "score",
3712
+ source_id_column: str | None = None,
3713
+ partition_id_column: str | None = None,
3714
+ description: str | None = None,
3715
+ max_seq_length_override: int | None = None,
3716
+ prompt: str | None = None,
3717
+ remove_duplicates: bool = True,
3718
+ index_type: IndexType = "FLAT",
3719
+ index_params: dict[str, Any] = {},
3720
+ if_exists: CreateMode = "error",
3721
+ background: bool = False,
3722
+ hidden: bool = False,
3723
+ subsample: int | float | None = None,
3724
+ ) -> Self | Job[Self]:
3725
+ """
3726
+ Create a new scored memoryset in the OrcaCloud from a datasource.
3727
+
3728
+ This is a convenience method that is equivalent to calling `create` with a datasource.
3729
+ All columns from the datasource that are not specified in the `value_column`,
3730
+ `score_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
3731
+ in the memoryset.
3732
+
3733
+ Params:
3734
+ name: Name for the new memoryset (must be unique)
3735
+ datasource: Source data to populate the memories in the memoryset.
3736
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
3737
+ If not provided, a default embedding model for the memoryset will be used.
3738
+ value_column: Name of the column in the datasource that contains the memory values
3739
+ score_column: Name of the column in the datasource that contains the memory scores. Must
3740
+ contain numerical values. To create a memoryset in which all scores are `None`, set this to `None`.
3741
+ source_id_column: Optional name of the column in the datasource that contains the ids in
3742
+ the system of reference
3743
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
3744
+ description: Optional description for the memoryset. This will be used in agentic flows,
3745
+ so make sure it is concise and describes the contents of your memoryset, not the
3746
+ datasource or the embedding model.
3747
+ max_seq_length_override: Maximum sequence length of values in the memoryset; if a
3748
+ value is longer than this, it will be truncated. Defaults to the model's max
3749
+ sequence length if not provided
3750
+ prompt: Optional prompt to use when embedding documents/memories for storage
3751
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
3752
+ into the memoryset
3753
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
3754
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
3755
+ index_params: Parameters for the vector index, defaults to `{}`
3756
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
3757
+ `"error"`. Other option is `"open"` to open the existing memoryset.
3758
+ background: Whether to run the operation without blocking and return a job handle.
3759
+ hidden: Whether the memoryset should be hidden
3760
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
3761
+ datasource to insert. Use to limit the size of the initial memoryset.
3762
+
3763
+ Returns:
3764
+ Handle to the new memoryset in the OrcaCloud, or a job handle that resolves to it if `background` is `True`
3765
+
3766
+ Raises:
3767
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
3768
+ `"open"` and the params do not match those of the existing memoryset.
3769
+ """
3770
+ if background:
3771
+ return super().create(
3772
+ name,
3773
+ datasource=datasource,
3774
+ embedding_model=embedding_model,
3775
+ value_column=value_column,
3776
+ score_column=score_column,
3777
+ source_id_column=source_id_column,
3778
+ partition_id_column=partition_id_column,
3779
+ description=description,
3780
+ max_seq_length_override=max_seq_length_override,
3781
+ prompt=prompt,
3782
+ remove_duplicates=remove_duplicates,
3783
+ index_type=index_type,
3784
+ index_params=index_params,
3785
+ if_exists=if_exists,
3786
+ background=True,
3787
+ hidden=hidden,
3788
+ subsample=subsample,
3789
+ memory_type="SCORED",
3790
+ )
3791
+ else:
3792
+ return super().create(
3793
+ name,
3794
+ datasource=datasource,
3795
+ embedding_model=embedding_model,
3796
+ value_column=value_column,
3797
+ score_column=score_column,
3798
+ source_id_column=source_id_column,
3799
+ partition_id_column=partition_id_column,
3800
+ description=description,
3801
+ max_seq_length_override=max_seq_length_override,
3802
+ prompt=prompt,
3803
+ remove_duplicates=remove_duplicates,
3804
+ index_type=index_type,
3805
+ index_params=index_params,
3806
+ if_exists=if_exists,
3807
+ background=False,
3808
+ hidden=hidden,
3809
+ subsample=subsample,
3810
+ memory_type="SCORED",
3811
+ )
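`subsample` limits how much of the datasource is ingested, either as an absolute row count or a fraction. A sketch under the same assumptions as the scored example above:

```python
# Ingest a 10% sample of the datasource for quick experimentation
sample_set = ScoredMemoryset.from_datasource(
    "review_helpfulness_sample",  # hypothetical name
    datasource=datasource,
    score_column="helpfulness",
    subsample=0.1,  # float in (0, 1] = fraction of rows; int = absolute row count
)
```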