orca-sdk 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. orca_sdk/__init__.py +10 -4
  2. orca_sdk/_shared/__init__.py +10 -0
  3. orca_sdk/_shared/metrics.py +393 -0
  4. orca_sdk/_shared/metrics_test.py +273 -0
  5. orca_sdk/_utils/analysis_ui.py +12 -10
  6. orca_sdk/_utils/analysis_ui_style.css +0 -3
  7. orca_sdk/_utils/auth.py +27 -29
  8. orca_sdk/_utils/data_parsing.py +28 -2
  9. orca_sdk/_utils/data_parsing_test.py +15 -15
  10. orca_sdk/_utils/pagination.py +126 -0
  11. orca_sdk/_utils/pagination_test.py +132 -0
  12. orca_sdk/_utils/prediction_result_ui.py +67 -21
  13. orca_sdk/_utils/tqdm_file_reader.py +12 -0
  14. orca_sdk/_utils/value_parser.py +45 -0
  15. orca_sdk/_utils/value_parser_test.py +39 -0
  16. orca_sdk/classification_model.py +439 -129
  17. orca_sdk/classification_model_test.py +334 -104
  18. orca_sdk/client.py +3747 -0
  19. orca_sdk/conftest.py +164 -19
  20. orca_sdk/credentials.py +120 -18
  21. orca_sdk/credentials_test.py +20 -0
  22. orca_sdk/datasource.py +259 -68
  23. orca_sdk/datasource_test.py +242 -0
  24. orca_sdk/embedding_model.py +425 -82
  25. orca_sdk/embedding_model_test.py +39 -13
  26. orca_sdk/job.py +337 -0
  27. orca_sdk/job_test.py +108 -0
  28. orca_sdk/memoryset.py +1341 -305
  29. orca_sdk/memoryset_test.py +350 -111
  30. orca_sdk/regression_model.py +684 -0
  31. orca_sdk/regression_model_test.py +369 -0
  32. orca_sdk/telemetry.py +449 -143
  33. orca_sdk/telemetry_test.py +43 -24
  34. {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/METADATA +34 -16
  35. orca_sdk-0.1.2.dist-info/RECORD +40 -0
  36. {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/WHEEL +1 -1
  37. orca_sdk/_generated_api_client/__init__.py +0 -3
  38. orca_sdk/_generated_api_client/api/__init__.py +0 -193
  39. orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
  40. orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +0 -128
  41. orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +0 -170
  42. orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +0 -156
  43. orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +0 -130
  44. orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +0 -127
  45. orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
  46. orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +0 -183
  47. orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +0 -170
  48. orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +0 -168
  49. orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +0 -154
  50. orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +0 -170
  51. orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +0 -156
  52. orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +0 -161
  53. orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +0 -127
  54. orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +0 -190
  55. orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
  56. orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +0 -167
  57. orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +0 -156
  58. orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +0 -156
  59. orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +0 -127
  60. orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
  61. orca_sdk/_generated_api_client/api/default/healthcheck_get.py +0 -118
  62. orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +0 -118
  63. orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
  64. orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +0 -168
  65. orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +0 -156
  66. orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +0 -189
  67. orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +0 -156
  68. orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +0 -127
  69. orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
  70. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +0 -181
  71. orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +0 -183
  72. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +0 -168
  73. orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +0 -181
  74. orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +0 -167
  75. orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +0 -156
  76. orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +0 -169
  77. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +0 -188
  78. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +0 -169
  79. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +0 -156
  80. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +0 -184
  81. orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +0 -260
  82. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +0 -127
  83. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +0 -193
  84. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +0 -188
  85. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +0 -191
  86. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +0 -187
  87. orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
  88. orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +0 -188
  89. orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +0 -157
  90. orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +0 -127
  91. orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
  92. orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +0 -154
  93. orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +0 -156
  94. orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +0 -243
  95. orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
  96. orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +0 -162
  97. orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +0 -156
  98. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +0 -157
  99. orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +0 -127
  100. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +0 -175
  101. orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +0 -171
  102. orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +0 -181
  103. orca_sdk/_generated_api_client/client.py +0 -216
  104. orca_sdk/_generated_api_client/errors.py +0 -38
  105. orca_sdk/_generated_api_client/models/__init__.py +0 -159
  106. orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +0 -84
  107. orca_sdk/_generated_api_client/models/api_key_metadata.py +0 -118
  108. orca_sdk/_generated_api_client/models/base_model.py +0 -55
  109. orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +0 -176
  110. orca_sdk/_generated_api_client/models/classification_evaluation_result.py +0 -114
  111. orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +0 -150
  112. orca_sdk/_generated_api_client/models/column_info.py +0 -114
  113. orca_sdk/_generated_api_client/models/column_type.py +0 -14
  114. orca_sdk/_generated_api_client/models/conflict_error_response.py +0 -80
  115. orca_sdk/_generated_api_client/models/create_api_key_request.py +0 -99
  116. orca_sdk/_generated_api_client/models/create_api_key_response.py +0 -126
  117. orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +0 -259
  118. orca_sdk/_generated_api_client/models/create_rac_model_request.py +0 -209
  119. orca_sdk/_generated_api_client/models/datasource_metadata.py +0 -142
  120. orca_sdk/_generated_api_client/models/delete_memories_request.py +0 -70
  121. orca_sdk/_generated_api_client/models/embed_request.py +0 -127
  122. orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +0 -9
  123. orca_sdk/_generated_api_client/models/evaluation_request.py +0 -180
  124. orca_sdk/_generated_api_client/models/evaluation_response.py +0 -140
  125. orca_sdk/_generated_api_client/models/feedback_type.py +0 -9
  126. orca_sdk/_generated_api_client/models/field_validation_error.py +0 -103
  127. orca_sdk/_generated_api_client/models/filter_item.py +0 -231
  128. orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +0 -15
  129. orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +0 -16
  130. orca_sdk/_generated_api_client/models/filter_item_op.py +0 -16
  131. orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +0 -70
  132. orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +0 -259
  133. orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +0 -66
  134. orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +0 -166
  135. orca_sdk/_generated_api_client/models/get_memories_request.py +0 -70
  136. orca_sdk/_generated_api_client/models/internal_server_error_response.py +0 -80
  137. orca_sdk/_generated_api_client/models/label_class_metrics.py +0 -108
  138. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +0 -274
  139. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +0 -68
  140. orca_sdk/_generated_api_client/models/label_prediction_result.py +0 -101
  141. orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +0 -232
  142. orca_sdk/_generated_api_client/models/labeled_memory.py +0 -197
  143. orca_sdk/_generated_api_client/models/labeled_memory_insert.py +0 -108
  144. orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +0 -68
  145. orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +0 -258
  146. orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +0 -68
  147. orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +0 -68
  148. orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +0 -277
  149. orca_sdk/_generated_api_client/models/labeled_memory_update.py +0 -171
  150. orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +0 -68
  151. orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +0 -195
  152. orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +0 -9
  153. orca_sdk/_generated_api_client/models/list_memories_request.py +0 -104
  154. orca_sdk/_generated_api_client/models/list_predictions_request.py +0 -234
  155. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +0 -9
  156. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +0 -9
  157. orca_sdk/_generated_api_client/models/lookup_request.py +0 -81
  158. orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +0 -83
  159. orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +0 -9
  160. orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +0 -180
  161. orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +0 -66
  162. orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +0 -9
  163. orca_sdk/_generated_api_client/models/not_found_error_response.py +0 -100
  164. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +0 -20
  165. orca_sdk/_generated_api_client/models/prediction_feedback.py +0 -157
  166. orca_sdk/_generated_api_client/models/prediction_feedback_category.py +0 -115
  167. orca_sdk/_generated_api_client/models/prediction_feedback_request.py +0 -122
  168. orca_sdk/_generated_api_client/models/prediction_feedback_result.py +0 -102
  169. orca_sdk/_generated_api_client/models/prediction_request.py +0 -169
  170. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +0 -97
  171. orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +0 -11
  172. orca_sdk/_generated_api_client/models/rac_head_type.py +0 -11
  173. orca_sdk/_generated_api_client/models/rac_model_metadata.py +0 -191
  174. orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +0 -80
  175. orca_sdk/_generated_api_client/models/task.py +0 -198
  176. orca_sdk/_generated_api_client/models/task_status.py +0 -14
  177. orca_sdk/_generated_api_client/models/task_status_info.py +0 -133
  178. orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +0 -72
  179. orca_sdk/_generated_api_client/models/unauthorized_error_response.py +0 -80
  180. orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +0 -94
  181. orca_sdk/_generated_api_client/models/update_prediction_request.py +0 -93
  182. orca_sdk/_generated_api_client/py.typed +0 -1
  183. orca_sdk/_generated_api_client/types.py +0 -56
  184. orca_sdk/_utils/task.py +0 -73
  185. orca_sdk-0.1.1.dist-info/RECORD +0 -175
orca_sdk/memoryset.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
+ from abc import ABC
4
5
  from datetime import datetime, timedelta
5
6
  from os import PathLike
6
- from typing import Any, Iterable, Literal, cast, overload
7
+ from typing import Any, Generic, Iterable, Literal, Self, TypeVar, cast, overload
7
8
 
8
9
  import pandas as pd
9
10
  import pyarrow as pa
@@ -11,62 +12,62 @@ from datasets import Dataset
11
12
  from torch.utils.data import DataLoader as TorchDataLoader
12
13
  from torch.utils.data import Dataset as TorchDataset
13
14
 
14
- from ._generated_api_client.api import (
15
- clone_memoryset,
16
- create_analysis,
17
- create_memoryset,
18
- delete_memories,
19
- delete_memoryset,
20
- get_analysis,
21
- get_memories,
22
- get_memory,
23
- get_memoryset,
24
- insert_memories_gpu,
25
- list_memorysets,
26
- memoryset_lookup_gpu,
27
- query_memoryset,
28
- update_memories_gpu,
29
- update_memory_gpu,
30
- )
31
- from ._generated_api_client.models import (
32
- AnalyzeNeighborLabelsResult,
33
- CloneLabeledMemorysetRequest,
34
- CreateLabeledMemorysetRequest,
35
- DeleteMemoriesRequest,
15
+ from ._utils.common import UNSET, CreateMode, DropMode
16
+ from .client import (
17
+ CascadingEditSuggestion,
18
+ CloneMemorysetRequest,
19
+ CreateMemorysetRequest,
20
+ EmbeddingModelResult,
36
21
  FilterItem,
37
- FilterItemOp,
38
- FindDuplicatesAnalysisResult,
39
- GetMemoriesRequest,
40
22
  )
41
- from ._generated_api_client.models import LabeledMemory as LabeledMemoryResponse
42
- from ._generated_api_client.models import (
23
+ from .client import LabeledMemory as LabeledMemoryResponse
24
+ from .client import (
43
25
  LabeledMemoryInsert,
44
- LabeledMemoryInsertMetadata,
45
- )
46
- from ._generated_api_client.models import (
47
- LabeledMemoryLookup as LabeledMemoryLookupResponse,
48
26
  )
49
- from ._generated_api_client.models import (
50
- LabeledMemoryMetrics,
51
- LabeledMemorysetMetadata,
27
+ from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
28
+ from .client import (
52
29
  LabeledMemoryUpdate,
53
- LabeledMemoryUpdateMetadataType0,
30
+ LabeledMemoryWithFeedbackMetrics,
54
31
  LabelPredictionMemoryLookup,
55
- ListMemoriesRequest,
56
- LookupRequest,
57
- MemorysetAnalysisRequest,
58
- MemorysetAnalysisRequestType,
59
- TaskStatus,
32
+ MemoryMetrics,
33
+ MemorysetAnalysisConfigs,
34
+ MemorysetMetadata,
35
+ MemorysetMetrics,
36
+ MemorysetUpdate,
37
+ MemoryType,
38
+ )
39
+ from .client import ScoredMemory as ScoredMemoryResponse
40
+ from .client import (
41
+ ScoredMemoryInsert,
42
+ )
43
+ from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
44
+ from .client import (
45
+ ScoredMemoryUpdate,
46
+ ScoredMemoryWithFeedbackMetrics,
47
+ ScorePredictionMemoryLookup,
48
+ TelemetryFilterItem,
49
+ TelemetrySortOptions,
50
+ orca_api,
60
51
  )
61
- from ._generated_api_client.types import UNSET as CLIENT_UNSET
62
- from ._utils.common import UNSET, CreateMode, DropMode
63
- from ._utils.task import wait_for_task
64
52
  from .datasource import Datasource
65
53
  from .embedding_model import (
54
+ EmbeddingModelBase,
66
55
  FinetunedEmbeddingModel,
67
56
  PretrainedEmbeddingModel,
68
- _EmbeddingModel,
69
57
  )
58
+ from .job import Job, Status
59
+
60
+ TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
61
+ """
62
+ Sort expression for telemetry data consisting of a field and a direction.
63
+
64
+ * **`field`**: The field to sort on.
65
+ * **`direction`**: The direction to sort in.
66
+
67
+ Examples:
68
+ >>> ("feedback_metrics.accuracy.avg", "asc")
69
+ >>> ("lookup.count", "desc")
70
+ """
70
71
 
71
72
  FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"]
72
73
  """
@@ -90,62 +91,249 @@ Examples:
90
91
  >>> ("label", "==", 0)
91
92
  >>> ("metadata.author", "like", "John")
92
93
  >>> ("source_id", "in", ["123", "456"])
94
+ >>> ("feedback_metrics.accuracy.avg", ">", 0.95)
93
95
  """
94
96
 
97
+ IndexType = Literal["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ", "HNSW", "DISKANN"]
98
+
99
+ DEFAULT_COLUMN_NAMES = {"value", "source_id"}
100
+ TYPE_SPECIFIC_COLUMN_NAMES = {"label", "score"}
101
+ FORBIDDEN_METADATA_COLUMN_NAMES = {
102
+ "memory_id",
103
+ "memory_version",
104
+ "embedding",
105
+ "created_at",
106
+ "updated_at",
107
+ "metrics",
108
+ "feedback_metrics",
109
+ "lookup",
110
+ }
95
111
 
96
- DEFAULT_COLUMN_NAMES = {"value", "label", "source_id"}
97
- FORBIDDEN_METADATA_COLUMN_NAMES = {"memory_id", "memory_version", "embedding", "created_at", "updated_at", "metrics"}
98
112
 
113
+ def _is_metric_column(column: str):
114
+ return column in ["feedback_metrics", "lookup"]
99
115
 
100
- def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem:
116
+
117
+ def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem | TelemetryFilterItem:
101
118
  field = input[0].split(".")
102
- if len(field) == 1 and field[0] not in DEFAULT_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES:
119
+ if (
120
+ len(field) == 1
121
+ and field[0] not in DEFAULT_COLUMN_NAMES | TYPE_SPECIFIC_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES
122
+ ):
103
123
  field = ["metadata", field[0]]
104
- op = FilterItemOp(input[1])
124
+ op = input[1]
105
125
  value = input[2]
126
+ if isinstance(value, datetime):
127
+ value = value.isoformat()
128
+ if _is_metric_column(field[0]):
129
+ if not (
130
+ (isinstance(value, list) and all(isinstance(v, float) or isinstance(v, int) for v in value))
131
+ or isinstance(value, float)
132
+ or isinstance(value, int)
133
+ ):
134
+ raise ValueError(f"Invalid value for {field[0]} filter: {value}")
135
+ if field[0] == "feedback_metrics" and (len(field) != 3 or field[2] not in ["avg", "count"]):
136
+ raise ValueError(
137
+ "Feedback metrics filters must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
138
+ )
139
+ elif field[0] == "lookup" and (len(field) != 2 or field[1] != "count"):
140
+ raise ValueError("Lookup filters must follow the format `lookup.count`")
141
+ if op == "like":
142
+ raise ValueError("Like filters are not supported on metric columns")
143
+ op = cast(Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in"], op)
144
+ value = cast(float | int | list[float] | list[int], value)
145
+ return TelemetryFilterItem(field=field, op=op, value=value)
146
+
106
147
  return FilterItem(field=field, op=op, value=value)
107
148
 
108
149
 
109
- def _parse_memory_insert(memory: dict[str, Any]) -> LabeledMemoryInsert:
150
+ def _parse_sort_item_from_tuple(
151
+ input: TelemetrySortItem,
152
+ ) -> TelemetrySortOptions:
153
+ field = input[0].split(".")
154
+
155
+ if len(field) == 1:
156
+ raise ValueError("Sort field must be a telemetry field with an aggregate function name value")
157
+ if field[0] not in ["feedback_metrics", "lookup"]:
158
+ raise ValueError("Sort field must be one of telemetry fields: feedback_metrics or lookup")
159
+ if field[0] == "feedback_metrics":
160
+ if len(field) != 3:
161
+ raise ValueError(
162
+ "Feedback metrics must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
163
+ )
164
+ if field[2] not in ["avg", "count"]:
165
+ raise ValueError("Feedback metrics can only be sorted on avg or count")
166
+ if field[0] == "lookup":
167
+ if len(field) != 2:
168
+ raise ValueError("Lookup must follow the format `lookup.count`")
169
+ if field[1] != "count":
170
+ raise ValueError("Lookup can only be sorted on count")
171
+ return TelemetrySortOptions(field=field, direction=input[1])
172
+
173
+
174
+ def _parse_memory_insert(memory: dict[str, Any], type: MemoryType) -> LabeledMemoryInsert | ScoredMemoryInsert:
110
175
  value = memory.get("value")
111
176
  if not isinstance(value, str):
112
177
  raise ValueError("Memory value must be a string")
113
- label = memory.get("label")
114
- if not isinstance(label, int):
115
- raise ValueError("Memory label must be an integer")
116
178
  source_id = memory.get("source_id")
117
179
  if source_id and not isinstance(source_id, str):
118
180
  raise ValueError("Memory source_id must be a string")
119
- metadata = LabeledMemoryInsertMetadata.from_dict({k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES})
120
- if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
121
- raise ValueError(f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}")
122
- return LabeledMemoryInsert(value=value, label=label, source_id=source_id, metadata=metadata)
123
-
124
-
125
- def _parse_memory_update(update: dict[str, Any]) -> LabeledMemoryUpdate:
181
+ match type:
182
+ case "LABELED":
183
+ label = memory.get("label")
184
+ if label is not None and not isinstance(label, int):
185
+ raise ValueError("Memory label must be an integer")
186
+ metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"label"}}
187
+ if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
188
+ raise ValueError(
189
+ f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
190
+ )
191
+ return {"value": value, "label": label, "source_id": source_id, "metadata": metadata}
192
+ case "SCORED":
193
+ score = memory.get("score")
194
+ if score is not None and not isinstance(score, (int, float)):
195
+ raise ValueError("Memory score must be a number")
196
+ metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"score"}}
197
+ if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
198
+ raise ValueError(
199
+ f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
200
+ )
201
+ return {"value": value, "score": score, "source_id": source_id, "metadata": metadata}
202
+
203
+
204
+ def _parse_memory_update(update: dict[str, Any], type: MemoryType) -> LabeledMemoryUpdate | ScoredMemoryUpdate:
126
205
  if "memory_id" not in update:
127
206
  raise ValueError("memory_id must be specified in the update dictionary")
128
207
  memory_id = update["memory_id"]
129
208
  if not isinstance(memory_id, str):
130
209
  raise ValueError("memory_id must be a string")
131
- value = update.get("value", CLIENT_UNSET)
132
- if value is not CLIENT_UNSET and not isinstance(value, str):
133
- raise ValueError("value must be a string or unset")
134
- label = update.get("label", CLIENT_UNSET)
135
- if label is not CLIENT_UNSET and not isinstance(label, int):
136
- raise ValueError("label must be an integer or unset")
137
- source_id = update.get("source_id", CLIENT_UNSET)
138
- if source_id is not CLIENT_UNSET and not isinstance(source_id, str):
139
- raise ValueError("source_id must be a string or unset")
140
- metadata = LabeledMemoryUpdateMetadataType0.from_dict(
141
- {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id"}}
142
- )
143
- if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
144
- raise ValueError(f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}")
145
- return LabeledMemoryUpdate(memory_id=memory_id, value=value, label=label, source_id=source_id, metadata=metadata)
146
-
147
-
148
- class LabeledMemory:
210
+ payload: LabeledMemoryUpdate | ScoredMemoryUpdate = {"memory_id": memory_id}
211
+ if "value" in update:
212
+ if not isinstance(update["value"], str):
213
+ raise ValueError("value must be a string or unset")
214
+ payload["value"] = update["value"]
215
+ if "source_id" in update:
216
+ if not isinstance(update["source_id"], str):
217
+ raise ValueError("source_id must be a string or unset")
218
+ payload["source_id"] = update["source_id"]
219
+ match type:
220
+ case "LABELED":
221
+ payload = cast(LabeledMemoryUpdate, payload)
222
+ if "label" in update:
223
+ if not isinstance(update["label"], int):
224
+ raise ValueError("label must be an integer or unset")
225
+ payload["label"] = update["label"]
226
+ metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "label"}}
227
+ if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
228
+ raise ValueError(
229
+ f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
230
+ )
231
+ payload["metadata"] = metadata
232
+ return payload
233
+ case "SCORED":
234
+ payload = cast(ScoredMemoryUpdate, payload)
235
+ if "score" in update:
236
+ if not isinstance(update["score"], (int, float)):
237
+ raise ValueError("score must be a number or unset")
238
+ payload["score"] = update["score"]
239
+ metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "score"}}
240
+ if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
241
+ raise ValueError(
242
+ f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
243
+ )
244
+ payload["metadata"] = metadata
245
+ return cast(ScoredMemoryUpdate, payload)
246
+
247
+
248
+ class MemoryBase(ABC):
249
+ value: str
250
+ embedding: list[float]
251
+ source_id: str | None
252
+ created_at: datetime
253
+ updated_at: datetime
254
+ metadata: dict[str, str | float | int | bool | None]
255
+ metrics: MemoryMetrics
256
+ memory_id: str
257
+ memory_version: int
258
+ feedback_metrics: dict[str, Any]
259
+ lookup_count: int
260
+ memory_type: MemoryType # defined by subclasses
261
+
262
+ def __init__(
263
+ self,
264
+ memoryset_id: str,
265
+ memory: (
266
+ LabeledMemoryResponse
267
+ | LabeledMemoryLookupResponse
268
+ | LabeledMemoryWithFeedbackMetrics
269
+ | LabelPredictionMemoryLookup
270
+ | ScoredMemoryResponse
271
+ | ScoredMemoryLookupResponse
272
+ | ScoredMemoryWithFeedbackMetrics
273
+ | ScorePredictionMemoryLookup
274
+ ),
275
+ ):
276
+ # for internal use only, do not document
277
+ self.memoryset_id = memoryset_id
278
+ self.memory_id = memory["memory_id"]
279
+ self.memory_version = memory["memory_version"]
280
+ self.value = cast(str, memory["value"])
281
+ self.embedding = memory["embedding"]
282
+ self.source_id = memory["source_id"]
283
+ self.created_at = datetime.fromisoformat(memory["created_at"])
284
+ self.updated_at = datetime.fromisoformat(memory["updated_at"])
285
+ self.metadata = memory["metadata"]
286
+ self.metrics = memory["metrics"] if "metrics" in memory else {}
287
+ self.feedback_metrics = memory.get("feedback_metrics", {}) or {}
288
+ self.lookup_count = memory.get("lookup_count", 0)
289
+
290
+ def __getattr__(self, key: str) -> Any:
291
+ if key.startswith("__") or key not in self.metadata:
292
+ raise AttributeError(f"{key} is not a valid attribute")
293
+ return self.metadata[key]
294
+
295
+ def _update(
296
+ self,
297
+ *,
298
+ value: str = UNSET,
299
+ source_id: str | None = UNSET,
300
+ **metadata: None | bool | float | int | str,
301
+ ) -> Self:
302
+ response = orca_api.PATCH(
303
+ "/gpu/memoryset/{name_or_id}/memory",
304
+ params={"name_or_id": self.memoryset_id},
305
+ json=_parse_memory_update(
306
+ {"memory_id": self.memory_id}
307
+ | ({"value": value} if value is not UNSET else {})
308
+ | ({"source_id": source_id} if source_id is not UNSET else {})
309
+ | {k: v for k, v in metadata.items() if v is not UNSET},
310
+ type=self.memory_type,
311
+ ),
312
+ )
313
+ self.__dict__.update(self.__class__(self.memoryset_id, response).__dict__)
314
+ return self
315
+
316
+ def to_dict(self) -> dict[str, Any]:
317
+ """
318
+ Convert the memory to a dictionary
319
+ """
320
+ return {
321
+ "value": self.value,
322
+ "embedding": self.embedding,
323
+ "source_id": self.source_id,
324
+ "created_at": self.created_at,
325
+ "updated_at": self.updated_at,
326
+ "metadata": self.metadata,
327
+ "metrics": self.metrics,
328
+ "memory_id": self.memory_id,
329
+ "memory_version": self.memory_version,
330
+ "feedback_metrics": self.feedback_metrics,
331
+ "lookup_count": self.lookup_count,
332
+ "memory_type": self.memory_type,
333
+ }
334
+
335
+
336
+ class LabeledMemory(MemoryBase):
149
337
  """
150
338
  A row of the [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
151
339
 
@@ -170,47 +358,30 @@ class LabeledMemory:
170
358
  * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
171
359
  """
172
360
 
173
- value: str
174
- embedding: list[float]
175
- label: int
361
+ label: int | None
176
362
  label_name: str | None
177
- source_id: str | None
178
- created_at: datetime
179
- updated_at: datetime
180
- metadata: dict[str, str | float | int | bool | None]
181
- metrics: LabeledMemoryMetrics | None
182
- memory_id: str
183
- memory_version: int
363
+ memory_type = "LABELED"
184
364
 
185
365
  def __init__(
186
366
  self,
187
367
  memoryset_id: str,
188
- memory: LabeledMemoryResponse | LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
368
+ memory: (
369
+ LabeledMemoryResponse
370
+ | LabeledMemoryLookupResponse
371
+ | LabelPredictionMemoryLookup
372
+ | LabeledMemoryWithFeedbackMetrics
373
+ ),
189
374
  ):
190
375
  # for internal use only, do not document
191
- self.memoryset_id = memoryset_id
192
- self.memory_id = memory.memory_id
193
- self.memory_version = memory.memory_version
194
- self.value = memory.value
195
- self.embedding = memory.embedding
196
- self.label = memory.label
197
- self.label_name = memory.label_name
198
- self.source_id = memory.source_id
199
- self.created_at = memory.created_at
200
- self.updated_at = memory.updated_at
201
- self.metadata = memory.metadata.to_dict()
202
- self.metrics = memory.metrics
203
-
204
- def __getattr__(self, key: str) -> Any:
205
- if key.startswith("__") or key not in self.metadata:
206
- raise AttributeError(f"{key} is not a valid attribute")
207
- return self.metadata[key]
376
+ super().__init__(memoryset_id, memory)
377
+ self.label = memory["label"]
378
+ self.label_name = memory["label_name"]
208
379
 
209
380
  def __repr__(self) -> str:
210
381
  return (
211
382
  "LabeledMemory({ "
212
383
  + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
213
- + f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
384
+ + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
214
385
  + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
215
386
  + " })"
216
387
  )
@@ -222,7 +393,7 @@ class LabeledMemory:
222
393
  self,
223
394
  *,
224
395
  value: str = UNSET,
225
- label: int = UNSET,
396
+ label: int | None = UNSET,
226
397
  source_id: str | None = UNSET,
227
398
  **metadata: None | bool | float | int | str,
228
399
  ) -> LabeledMemory:
@@ -241,19 +412,18 @@ class LabeledMemory:
241
412
  Returns:
242
413
  The updated memory
243
414
  """
244
- response = update_memory_gpu(
245
- self.memoryset_id,
246
- body=_parse_memory_update(
247
- {"memory_id": self.memory_id}
248
- | ({"value": value} if value is not UNSET else {})
249
- | ({"label": label} if label is not UNSET else {})
250
- | ({"source_id": source_id} if source_id is not UNSET else {})
251
- | metadata
252
- ),
253
- )
254
- self.__dict__.update(LabeledMemory(self.memoryset_id, response).__dict__)
415
+ self._update(value=value, label=label, source_id=source_id, **metadata)
255
416
  return self
256
417
 
418
+ def to_dict(self) -> dict[str, Any]:
419
+ """
420
+ Convert the memory to a dictionary
421
+ """
422
+ super_dict = super().to_dict()
423
+ super_dict["label"] = self.label
424
+ super_dict["label_name"] = self.label_name
425
+ return super_dict
426
+
257
427
 
258
428
  class LabeledMemoryLookup(LabeledMemory):
259
429
  """
@@ -289,10 +459,8 @@ class LabeledMemoryLookup(LabeledMemory):
289
459
  def __init__(self, memoryset_id: str, memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup):
290
460
  # for internal use only, do not document
291
461
  super().__init__(memoryset_id, memory_lookup)
292
- self.lookup_score = memory_lookup.lookup_score
293
- self.attention_weight = (
294
- memory_lookup.attention_weight if isinstance(memory_lookup, LabelPredictionMemoryLookup) else None
295
- )
462
+ self.lookup_score = memory_lookup["lookup_score"]
463
+ self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
296
464
 
297
465
  def __repr__(self) -> str:
298
466
  return (
@@ -300,20 +468,155 @@ class LabeledMemoryLookup(LabeledMemory):
300
468
  + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
301
469
  + f", lookup_score: {self.lookup_score:.2f}"
302
470
  + (f", attention_weight: {self.attention_weight:.2f}" if self.attention_weight is not None else "")
303
- + f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
471
+ + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
472
+ + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
473
+ + " })"
474
+ )
475
+
476
+
477
+ class ScoredMemory(MemoryBase):
478
+ """
479
+ A row of the [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
480
+
481
+ Attributes:
482
+ value: Value represented by the row
483
+ embedding: Embedding of the value of the memory for semantic search, automatically generated
484
+ with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
485
+ score: Score of the memory
486
+ source_id: Optional unique identifier of the memory in a system of reference
487
+ metrics: Metrics about the memory, generated when running an analysis on the
488
+ [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
489
+ metadata: Metadata associated with the memory that is not used in the model. Metadata
490
+ properties are also accessible as individual attributes on the instance.
491
+ memory_id: Unique identifier for the memory, automatically generated on insert
492
+ memory_version: Version of the memory, automatically updated when the score or value changes
493
+ created_at: When the memory was created, automatically generated on insert
494
+ updated_at: When the memory was last updated, automatically updated on update
495
+
496
+ ## Other Attributes:
497
+ * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
498
+ """
499
+
500
+ score: float | None
501
+ memory_type = "SCORED"
502
+
503
+ def __init__(
504
+ self,
505
+ memoryset_id: str,
506
+ memory: (
507
+ ScoredMemoryResponse
508
+ | ScoredMemoryLookupResponse
509
+ | ScorePredictionMemoryLookup
510
+ | ScoredMemoryWithFeedbackMetrics
511
+ ),
512
+ ):
513
+ # for internal use only, do not document
514
+ super().__init__(memoryset_id, memory)
515
+ self.score = memory["score"]
516
+
517
+ def __repr__(self) -> str:
518
+ return (
519
+ "ScoredMemory({ "
520
+ + f"score: {self.score:.2f}"
521
+ + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
304
522
  + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
305
523
  + " })"
306
524
  )
307
525
 
526
+ def __eq__(self, other: object) -> bool:
527
+ return isinstance(other, ScoredMemory) and self.memory_id == other.memory_id
528
+
529
+ def update(
530
+ self,
531
+ *,
532
+ value: str = UNSET,
533
+ score: float | None = UNSET,
534
+ source_id: str | None = UNSET,
535
+ **metadata: None | bool | float | int | str,
536
+ ) -> ScoredMemory:
537
+ """
538
+ Update the memory with new values
539
+
540
+ Note:
541
+ If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
542
+
543
+ Params:
544
+ value: New value of the memory
545
+ score: New score of the memory
546
+ source_id: New source ID of the memory
547
+ **metadata: New values for metadata properties
548
+
549
+ Returns:
550
+ The updated memory
551
+ """
552
+ self._update(value=value, score=score, source_id=source_id, **metadata)
553
+ return self
554
+
555
+ def to_dict(self) -> dict[str, Any]:
556
+ """
557
+ Convert the memory to a dictionary
558
+ """
559
+ super_dict = super().to_dict()
560
+ super_dict["score"] = self.score
561
+ return super_dict
308
562
 
309
- class LabeledMemoryset:
563
+
564
+ class ScoredMemoryLookup(ScoredMemory):
565
+ """
566
+ Lookup result for a memory in a memoryset
567
+
568
+ Attributes:
569
+ lookup_score: Similarity between the memory embedding and search query embedding
570
+ attention_weight: Weight the model assigned to the memory during prediction if this lookup
571
+ happened as part of a prediction
572
+ value: Value represented by the row
573
+ embedding: Embedding of the value of the memory for semantic search, automatically generated
574
+ with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
575
+ score: Score of the memory
576
+ source_id: Optional unique identifier of the memory in a system of reference
577
+ metrics: Metrics about the memory, generated when running an analysis on the
578
+ [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
579
+ memory_id: The unique identifier for the memory, automatically generated on insert
580
+ memory_version: The version of the memory, automatically updated when the score or value changes
581
+ created_at: When the memory was created, automatically generated on insert
582
+ updated_at: When the memory was last updated, automatically updated on update
583
+
584
+ ## Other Attributes:
585
+ * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
586
+ """
587
+
588
+ lookup_score: float
589
+ attention_weight: float | None
590
+
591
+ def __init__(self, memoryset_id: str, memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup):
592
+ # for internal use only, do not document
593
+ super().__init__(memoryset_id, memory_lookup)
594
+ self.lookup_score = memory_lookup["lookup_score"]
595
+ self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
596
+
597
+ def __repr__(self) -> str:
598
+ return (
599
+ "ScoredMemoryLookup({ "
600
+ + f"score: {self.score:.2f}"
601
+ + f", lookup_score: {self.lookup_score:.2f}"
602
+ + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
603
+ + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
604
+ + " })"
605
+ )
606
+
607
+
608
+ MemoryT = TypeVar("MemoryT", bound=MemoryBase)
609
+ MemoryLookupT = TypeVar("MemoryLookupT", bound=MemoryBase)
610
+
611
+
612
+ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
310
613
  """
311
614
  A Handle to a collection of memories with labels in the OrcaCloud
312
615
 
313
616
  Attributes:
314
617
  id: Unique identifier for the memoryset
315
618
  name: Unique name of the memoryset
316
- label_names: Names for the class labels in the memoryset
619
+ description: Description of the memoryset
317
620
  length: Number of memories in the memoryset
318
621
  embedding_model: Embedding model used to embed the memory values for semantic search
319
622
  created_at: When the memoryset was created, automatically generated on create
@@ -322,43 +625,52 @@ class LabeledMemoryset:
322
625
 
323
626
  id: str
324
627
  name: str
325
- label_names: list[str]
628
+ description: str | None
629
+ memory_type: MemoryType # defined by subclasses
630
+
326
631
  length: int
327
632
  created_at: datetime
328
633
  updated_at: datetime
329
- insertion_status: TaskStatus
330
- embedding_model: _EmbeddingModel
634
+ insertion_status: Status
635
+ embedding_model: EmbeddingModelBase
636
+ index_type: IndexType
637
+ index_params: dict[str, Any]
638
+ hidden: bool
331
639
 
332
- def __init__(self, metadata: LabeledMemorysetMetadata):
640
+ def __init__(self, metadata: MemorysetMetadata):
333
641
  # for internal use only, do not document
334
- if metadata.pretrained_embedding_model_name:
335
- self.embedding_model = PretrainedEmbeddingModel._get(metadata.pretrained_embedding_model_name)
336
- elif metadata.finetuned_embedding_model_id:
337
- self.embedding_model = FinetunedEmbeddingModel.open(metadata.finetuned_embedding_model_id)
642
+ if metadata["pretrained_embedding_model_name"]:
643
+ self.embedding_model = PretrainedEmbeddingModel._get(metadata["pretrained_embedding_model_name"])
644
+ elif metadata["finetuned_embedding_model_id"]:
645
+ self.embedding_model = FinetunedEmbeddingModel.open(metadata["finetuned_embedding_model_id"])
338
646
  else:
339
647
  raise ValueError("Either pretrained_embedding_model_name or finetuned_embedding_model_id must be provided")
340
- self.id = metadata.id
341
- self.name = metadata.name
342
- self.label_names = metadata.label_names
343
- self.length = metadata.length
344
- self.created_at = metadata.created_at
345
- self.updated_at = metadata.updated_at
346
- self.insertion_status = metadata.insertion_status
648
+ self.id = metadata["id"]
649
+ self.name = metadata["name"]
650
+ self.description = metadata["description"]
651
+ self.length = metadata["length"]
652
+ self.created_at = datetime.fromisoformat(metadata["created_at"])
653
+ self.updated_at = datetime.fromisoformat(metadata["updated_at"])
654
+ self.insertion_status = Status(metadata["insertion_status"])
347
655
  self._last_refresh = datetime.now()
656
+ self.index_type = metadata["index_type"]
657
+ self.index_params = metadata["index_params"]
658
+ self.memory_type = metadata["memory_type"]
659
+ self.hidden = metadata["hidden"]
348
660
 
349
661
  def __eq__(self, other) -> bool:
350
- return isinstance(other, LabeledMemoryset) and self.id == other.id
662
+ return isinstance(other, MemorysetBase) and self.id == other.id
351
663
 
352
664
  def __repr__(self) -> str:
353
665
  return (
354
- "LabeledMemoryset({\n"
666
+ "Memoryset({\n"
355
667
  f" name: '{self.name}',\n"
356
668
  f" length: {self.length},\n"
357
- f" label_names: {self.label_names},\n"
358
669
  f" embedding_model: {self.embedding_model},\n"
359
670
  "})"
360
671
  )
361
672
 
673
+ @overload
362
674
  @classmethod
363
675
  def create(
364
676
  cls,
@@ -367,12 +679,69 @@ class LabeledMemoryset:
367
679
  *,
368
680
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
369
681
  value_column: str = "value",
370
- label_column: str = "label",
682
+ label_column: str | None = None,
683
+ score_column: str | None = None,
684
+ source_id_column: str | None = None,
685
+ description: str | None = None,
686
+ label_names: list[str] | None = None,
687
+ max_seq_length_override: int | None = None,
688
+ prompt: str | None = None,
689
+ remove_duplicates: bool = True,
690
+ index_type: IndexType = "FLAT",
691
+ index_params: dict[str, Any] = {},
692
+ if_exists: CreateMode = "error",
693
+ background: Literal[True],
694
+ hidden: bool = False,
695
+ ) -> Job[Self]:
696
+ pass
697
+
698
+ @overload
699
+ @classmethod
700
+ def create(
701
+ cls,
702
+ name: str,
703
+ datasource: Datasource,
704
+ *,
705
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
706
+ value_column: str = "value",
707
+ label_column: str | None = None,
708
+ score_column: str | None = None,
709
+ source_id_column: str | None = None,
710
+ description: str | None = None,
711
+ label_names: list[str] | None = None,
712
+ max_seq_length_override: int | None = None,
713
+ prompt: str | None = None,
714
+ remove_duplicates: bool = True,
715
+ index_type: IndexType = "FLAT",
716
+ index_params: dict[str, Any] = {},
717
+ if_exists: CreateMode = "error",
718
+ background: Literal[False] = False,
719
+ hidden: bool = False,
720
+ ) -> Self:
721
+ pass
722
+
723
+ @classmethod
724
+ def create(
725
+ cls,
726
+ name: str,
727
+ datasource: Datasource,
728
+ *,
729
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
730
+ value_column: str = "value",
731
+ label_column: str | None = None,
732
+ score_column: str | None = None,
371
733
  source_id_column: str | None = None,
734
+ description: str | None = None,
372
735
  label_names: list[str] | None = None,
373
736
  max_seq_length_override: int | None = None,
737
+ prompt: str | None = None,
738
+ remove_duplicates: bool = True,
739
+ index_type: IndexType = "FLAT",
740
+ index_params: dict[str, Any] = {},
374
741
  if_exists: CreateMode = "error",
375
- ) -> LabeledMemoryset:
742
+ background: bool = False,
743
+ hidden: bool = False,
744
+ ) -> Self | Job[Self]:
376
745
  """
377
746
  Create a new memoryset in the OrcaCloud
378
747
 
@@ -387,8 +756,12 @@ class LabeledMemoryset:
387
756
  value_column: Name of the column in the datasource that contains the memory values
388
757
  label_column: Name of the column in the datasource that contains the memory labels,
389
758
  these must be contiguous integers starting from 0
759
+ score_column: Name of the column in the datasource that contains the memory scores
390
760
  source_id_column: Optional name of the column in the datasource that contains the ids in
391
761
  the system of reference
762
+ description: Optional description for the memoryset, this will be used in agentic flows,
763
+ so make sure it is concise and describes the contents of your memoryset not the
764
+ datasource or the embedding model.
392
765
  label_names: List of human-readable names for the labels in the memoryset, must match
393
766
  the number of labels in the `label_column`. Will be automatically inferred if a
394
767
  [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
@@ -396,8 +769,16 @@ class LabeledMemoryset:
396
769
  max_seq_length_override: Maximum sequence length of values in the memoryset, if the
397
770
  value is longer than this it will be truncated, will default to the model's max
398
771
  sequence length if not provided
772
+ prompt: Optional prompt to use when embedding documents/memories for storage
773
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
774
+ into the memoryset
775
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
776
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
777
+ index_params: Parameters for the vector index, defaults to `{}`
399
778
  if_exists: What to do if a memoryset with the same name already exists, defaults to
400
779
  `"error"`. Other option is `"open"` to open the existing memoryset.
780
+ background: Whether to run the operation none blocking and return a job handle
781
+ hidden: Whether the memoryset should be hidden
401
782
 
402
783
  Returns:
403
784
  Handle to the new memoryset in the OrcaCloud
@@ -407,9 +788,11 @@ class LabeledMemoryset:
407
788
  `"open"` and the params do not match those of the existing memoryset.
408
789
  """
409
790
  if embedding_model is None:
410
- embedding_model = PretrainedEmbeddingModel.CDE_SMALL
791
+ embedding_model = PretrainedEmbeddingModel.GTE_BASE
792
+
793
+ if label_column is None and score_column is None:
794
+ raise ValueError("label_column or score_column must be provided")
411
795
 
412
- logging.info(f"Checking if memoryset with name: {name} exists")
413
796
  if cls.exists(name):
414
797
  if if_exists == "error":
415
798
  raise ValueError(f"Memoryset with name {name} already exists")
@@ -420,29 +803,47 @@ class LabeledMemoryset:
420
803
  raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
421
804
  return existing
422
805
 
423
- logging.info(f"Creating memoryset with name: {name} from datasource: {datasource}")
424
- response = create_memoryset(
425
- body=CreateLabeledMemorysetRequest(
426
- name=name,
427
- datasource_id=datasource.id,
428
- datasource_label_column=label_column,
429
- datasource_value_column=value_column,
430
- datasource_source_id_column=source_id_column,
431
- pretrained_embedding_model_name=(
432
- embedding_model._model_name if isinstance(embedding_model, PretrainedEmbeddingModel) else None
433
- ),
434
- finetuned_embedding_model_id=(
435
- embedding_model.id if isinstance(embedding_model, FinetunedEmbeddingModel) else None
436
- ),
437
- label_names=label_names or [],
438
- max_seq_length_override=max_seq_length_override,
439
- ),
440
- )
441
- wait_for_task(response.insertion_task_id, description="Inserting datasource")
442
- return cls.open(response.id)
806
+ payload: CreateMemorysetRequest = {
807
+ "name": name,
808
+ "description": description,
809
+ "datasource_name_or_id": datasource.id,
810
+ "datasource_label_column": label_column,
811
+ "datasource_score_column": score_column,
812
+ "datasource_value_column": value_column,
813
+ "datasource_source_id_column": source_id_column,
814
+ "label_names": label_names,
815
+ "max_seq_length_override": max_seq_length_override,
816
+ "remove_duplicates": remove_duplicates,
817
+ "index_type": index_type,
818
+ "index_params": index_params,
819
+ "hidden": hidden,
820
+ }
821
+ if prompt is not None:
822
+ payload["prompt"] = prompt
823
+ if isinstance(embedding_model, PretrainedEmbeddingModel):
824
+ payload["pretrained_embedding_model_name"] = embedding_model.name
825
+ elif isinstance(embedding_model, FinetunedEmbeddingModel):
826
+ payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
827
+ else:
828
+ raise ValueError("Invalid embedding model")
829
+ response = orca_api.POST("/memoryset", json=payload)
830
+ job = Job(response["insertion_task_id"], lambda: cls.open(response["id"]))
831
+ return job if background else job.result()
832
+
833
+ @overload
834
+ @classmethod
835
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Self:
836
+ pass
443
837
 
838
+ @overload
444
839
  @classmethod
445
- def from_hf_dataset(cls, name: str, hf_dataset: Dataset, **kwargs: Any) -> LabeledMemoryset:
840
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
841
+ pass
842
+
843
+ @classmethod
844
+ def from_hf_dataset(
845
+ cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
846
+ ) -> Self | Job[Self]:
446
847
  """
447
848
  Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
448
849
 
@@ -456,8 +857,7 @@ class LabeledMemoryset:
456
857
  name: Name for the new memoryset (must be unique)
457
858
  hf_dataset: Hugging Face dataset to create the memoryset from
458
859
  kwargs: Additional parameters for creating the memoryset. See
459
- [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
460
-
860
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
461
861
 
462
862
  Returns:
463
863
  Handle to the new memoryset in the OrcaCloud
@@ -465,9 +865,23 @@ class LabeledMemoryset:
465
865
  datasource = Datasource.from_hf_dataset(
466
866
  f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
467
867
  )
468
- logging.info(f"Datasource: {datasource}")
868
+ kwargs["background"] = background
469
869
  return cls.create(name, datasource, **kwargs)
470
870
 
871
+ @overload
872
+ @classmethod
873
+ def from_pytorch(
874
+ cls,
875
+ name: str,
876
+ torch_data: TorchDataLoader | TorchDataset,
877
+ *,
878
+ column_names: list[str] | None = None,
879
+ background: Literal[True],
880
+ **kwargs: Any,
881
+ ) -> Job[Self]:
882
+ pass
883
+
884
+ @overload
471
885
  @classmethod
472
886
  def from_pytorch(
473
887
  cls,
@@ -475,8 +889,21 @@ class LabeledMemoryset:
475
889
  torch_data: TorchDataLoader | TorchDataset,
476
890
  *,
477
891
  column_names: list[str] | None = None,
892
+ background: Literal[False] = False,
478
893
  **kwargs: Any,
479
- ) -> LabeledMemoryset:
894
+ ) -> Self:
895
+ pass
896
+
897
+ @classmethod
898
+ def from_pytorch(
899
+ cls,
900
+ name: str,
901
+ torch_data: TorchDataLoader | TorchDataset,
902
+ *,
903
+ column_names: list[str] | None = None,
904
+ background: bool = False,
905
+ **kwargs: Any,
906
+ ) -> Self | Job[Self]:
480
907
  """
481
908
  Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
482
909
  [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
@@ -492,9 +919,9 @@ class LabeledMemoryset:
492
919
  torch_data: PyTorch data loader or dataset to create the memoryset from
493
920
  column_names: If the provided dataset or data loader returns unnamed tuples, this
494
921
  argument must be provided to specify the names of the columns.
922
+ background: Whether to run the operation in the background
495
923
  kwargs: Additional parameters for creating the memoryset. See
496
- [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
497
-
924
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
498
925
 
499
926
  Returns:
500
927
  Handle to the new memoryset in the OrcaCloud
@@ -502,10 +929,42 @@ class LabeledMemoryset:
502
929
  datasource = Datasource.from_pytorch(
503
930
  f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
504
931
  )
932
+ kwargs["background"] = background
505
933
  return cls.create(name, datasource, **kwargs)
506
934
 
935
+ @overload
936
+ @classmethod
937
+ def from_list(
938
+ cls,
939
+ name: str,
940
+ data: list[dict],
941
+ *,
942
+ background: Literal[True],
943
+ **kwargs: Any,
944
+ ) -> Job[Self]:
945
+ pass
946
+
947
+ @overload
507
948
  @classmethod
508
- def from_list(cls, name: str, data: list[dict], **kwargs: Any) -> LabeledMemoryset:
949
+ def from_list(
950
+ cls,
951
+ name: str,
952
+ data: list[dict],
953
+ *,
954
+ background: Literal[False] = False,
955
+ **kwargs: Any,
956
+ ) -> Self:
957
+ pass
958
+
959
+ @classmethod
960
+ def from_list(
961
+ cls,
962
+ name: str,
963
+ data: list[dict],
964
+ *,
965
+ background: bool = False,
966
+ **kwargs: Any,
967
+ ) -> Self | Job[Self]:
509
968
  """
510
969
  Create a new memoryset from a list of dictionaries in the OrcaCloud
511
970
 
@@ -518,8 +977,9 @@ class LabeledMemoryset:
518
977
  Params:
519
978
  name: Name for the new memoryset (must be unique)
520
979
  data: List of dictionaries to create the memoryset from
980
+ background: Whether to run the operation in the background
521
981
  kwargs: Additional parameters for creating the memoryset. See
522
- [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
982
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
523
983
 
524
984
  Returns:
525
985
  Handle to the new memoryset in the OrcaCloud
@@ -531,10 +991,42 @@ class LabeledMemoryset:
531
991
  ... ])
532
992
  """
533
993
  datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
994
+ kwargs["background"] = background
534
995
  return cls.create(name, datasource, **kwargs)
535
996
 
997
+ @overload
998
+ @classmethod
999
+ def from_dict(
1000
+ cls,
1001
+ name: str,
1002
+ data: dict,
1003
+ *,
1004
+ background: Literal[True],
1005
+ **kwargs: Any,
1006
+ ) -> Job[Self]:
1007
+ pass
1008
+
1009
+ @overload
536
1010
  @classmethod
537
- def from_dict(cls, name: str, data: dict, **kwargs: Any) -> LabeledMemoryset:
1011
+ def from_dict(
1012
+ cls,
1013
+ name: str,
1014
+ data: dict,
1015
+ *,
1016
+ background: Literal[False] = False,
1017
+ **kwargs: Any,
1018
+ ) -> Self:
1019
+ pass
1020
+
1021
+ @classmethod
1022
+ def from_dict(
1023
+ cls,
1024
+ name: str,
1025
+ data: dict,
1026
+ *,
1027
+ background: bool = False,
1028
+ **kwargs: Any,
1029
+ ) -> Self | Job[Self]:
538
1030
  """
539
1031
  Create a new memoryset from a dictionary of columns in the OrcaCloud
540
1032
 
@@ -547,8 +1039,9 @@ class LabeledMemoryset:
547
1039
  Params:
548
1040
  name: Name for the new memoryset (must be unique)
549
1041
  data: Dictionary of columns to create the memoryset from
1042
+ background: Whether to run the operation in the background
550
1043
  kwargs: Additional parameters for creating the memoryset. See
551
- [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
1044
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
552
1045
 
553
1046
  Returns:
554
1047
  Handle to the new memoryset in the OrcaCloud
@@ -561,10 +1054,42 @@ class LabeledMemoryset:
561
1054
  ... })
562
1055
  """
563
1056
  datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
1057
+ kwargs["background"] = background
564
1058
  return cls.create(name, datasource, **kwargs)
565
1059
 
1060
+ @overload
1061
+ @classmethod
1062
+ def from_pandas(
1063
+ cls,
1064
+ name: str,
1065
+ dataframe: pd.DataFrame,
1066
+ *,
1067
+ background: Literal[True],
1068
+ **kwargs: Any,
1069
+ ) -> Job[Self]:
1070
+ pass
1071
+
1072
+ @overload
566
1073
  @classmethod
567
- def from_pandas(cls, name: str, dataframe: pd.DataFrame, **kwargs: Any) -> LabeledMemoryset:
1074
+ def from_pandas(
1075
+ cls,
1076
+ name: str,
1077
+ dataframe: pd.DataFrame,
1078
+ *,
1079
+ background: Literal[False] = False,
1080
+ **kwargs: Any,
1081
+ ) -> Self:
1082
+ pass
1083
+
1084
+ @classmethod
1085
+ def from_pandas(
1086
+ cls,
1087
+ name: str,
1088
+ dataframe: pd.DataFrame,
1089
+ *,
1090
+ background: bool = False,
1091
+ **kwargs: Any,
1092
+ ) -> Self | Job[Self]:
568
1093
  """
569
1094
  Create a new memoryset from a pandas [`DataFrame`][pandas.DataFrame] in the OrcaCloud
570
1095
 
@@ -577,17 +1102,50 @@ class LabeledMemoryset:
577
1102
  Params:
578
1103
  name: Name for the new memoryset (must be unique)
579
1104
  dataframe: Dataframe to create the memoryset from
1105
+ background: Whether to run the operation in the background
580
1106
  kwargs: Additional parameters for creating the memoryset. See
581
- [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
1107
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
582
1108
 
583
1109
  Returns:
584
1110
  Handle to the new memoryset in the OrcaCloud
585
1111
  """
586
1112
  datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
1113
+ kwargs["background"] = background
587
1114
  return cls.create(name, datasource, **kwargs)
588
1115
 
1116
    @overload
    @classmethod
    def from_arrow(
        cls,
        name: str,
        pyarrow_table: pa.Table,
        *,
        background: Literal[True],
        **kwargs: Any,
    ) -> Job[Self]:
        pass

    @overload
    @classmethod
    def from_arrow(
        cls,
        name: str,
        pyarrow_table: pa.Table,
        *,
        background: Literal[False] = False,
        **kwargs: Any,
    ) -> Self:
        pass

    @classmethod
    def from_arrow(
        cls,
        name: str,
        pyarrow_table: pa.Table,
        *,
        background: bool = False,
        **kwargs: Any,
    ) -> Self | Job[Self]:
        """
        Create a new memoryset from a PyArrow [`Table`][pyarrow.Table] in the OrcaCloud

        A datasource named `{name}_datasource` is created from the table first, and the
        memoryset is then created from that datasource.

        Params:
            name: Name for the new memoryset (must be unique)
            pyarrow_table: PyArrow table to create the memoryset from
            background: Whether to run the operation in the background
            kwargs: Additional parameters for creating the memoryset. See
                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

        Returns:
            Handle to the new memoryset in the OrcaCloud, or a job handle when
            `background` is `True`
        """
        # The intermediate datasource honors the same if_exists policy that will be
        # forwarded to create() via kwargs.
        datasource = Datasource.from_arrow(
            f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
        )
        kwargs["background"] = background
        return cls.create(name, datasource, **kwargs)
613
1173
 
1174
    @overload
    @classmethod
    def from_disk(
        cls,
        name: str,
        file_path: str | PathLike,
        *,
        background: Literal[True],
        **kwargs: Any,
    ) -> Job[Self]:
        pass

    @overload
    @classmethod
    def from_disk(
        cls,
        name: str,
        file_path: str | PathLike,
        *,
        background: Literal[False] = False,
        **kwargs: Any,
    ) -> Self:
        pass

    @classmethod
    def from_disk(
        cls,
        name: str,
        file_path: str | PathLike,
        *,
        background: bool = False,
        **kwargs: Any,
    ) -> Self | Job[Self]:
        """
        Create a new memoryset from a file on disk in the OrcaCloud

        A datasource named `{name}_datasource` is created from the file first, and the
        memoryset is then created from that datasource.

        Params:
            name: Name for the new memoryset (must be unique)
            file_path: Path to the file on disk. Supported formats include:

                - .csv: [`CSV`][csv] files
                - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
                - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
            background: Whether to run the operation in the background
            kwargs: Additional parameters for creating the memoryset. See
                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

        Returns:
            Handle to the new memoryset in the OrcaCloud, or a job handle when
            `background` is `True`
        """
        # The intermediate datasource honors the same if_exists policy that will be
        # forwarded to create() via kwargs.
        datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
        kwargs["background"] = background
        return cls.create(name, datasource, **kwargs)
643
1236
 
644
1237
  @classmethod
645
- def open(cls, name: str) -> LabeledMemoryset:
1238
+ def open(cls, name: str) -> Self:
646
1239
  """
647
1240
  Get a handle to a memoryset in the OrcaCloud
648
1241
 
@@ -655,7 +1248,7 @@ class LabeledMemoryset:
655
1248
  Raises:
656
1249
  LookupError: If the memoryset does not exist
657
1250
  """
658
- metadata = get_memoryset(name)
1251
+ metadata = orca_api.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
659
1252
  return cls(metadata)
660
1253
 
661
1254
  @classmethod
@@ -676,14 +1269,20 @@ class LabeledMemoryset:
676
1269
  return False
677
1270
 
678
1271
  @classmethod
679
- def all(cls) -> list[LabeledMemoryset]:
1272
+ def all(cls, show_hidden: bool = False) -> list[Self]:
680
1273
  """
681
1274
  Get a list of handles to all memorysets in the OrcaCloud
682
1275
 
1276
+ Params:
1277
+ show_hidden: Whether to include hidden memorysets in results, defaults to `False`
1278
+
683
1279
  Returns:
684
1280
  List of handles to all memorysets in the OrcaCloud
685
1281
  """
686
- return [cls(metadata) for metadata in list_memorysets()]
1282
+ return [
1283
+ cls(metadata)
1284
+ for metadata in orca_api.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
1285
+ ]
687
1286
 
688
1287
  @classmethod
689
1288
  def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
@@ -699,29 +1298,92 @@ class LabeledMemoryset:
699
1298
  LookupError: If the memoryset does not exist and if_not_exists is `"error"`
700
1299
  """
701
1300
  try:
702
- delete_memoryset(name_or_id)
1301
+ orca_api.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
703
1302
  logging.info(f"Deleted memoryset {name_or_id}")
704
1303
  except LookupError:
705
1304
  if if_not_exists == "error":
706
1305
  raise
707
1306
 
1307
+ def set(
1308
+ self,
1309
+ *,
1310
+ name: str = UNSET,
1311
+ description: str | None = UNSET,
1312
+ label_names: list[str] = UNSET,
1313
+ hidden: bool = UNSET,
1314
+ ):
1315
+ """
1316
+ Update editable attributes of the memoryset
1317
+
1318
+ Note:
1319
+ If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
1320
+
1321
+ Params:
1322
+ description: Value to set for the description
1323
+ name: Value to set for the name
1324
+ label_names: Value to replace existing label names with
1325
+ """
1326
+ payload: MemorysetUpdate = {}
1327
+ if name is not UNSET:
1328
+ payload["name"] = name
1329
+ if description is not UNSET:
1330
+ payload["description"] = description
1331
+ if label_names is not UNSET:
1332
+ payload["label_names"] = label_names
1333
+ if hidden is not UNSET:
1334
+ payload["hidden"] = hidden
1335
+
1336
+ orca_api.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
1337
+ self.refresh()
1338
+
1339
+ @overload
1340
+ def clone(
1341
+ self,
1342
+ name: str,
1343
+ *,
1344
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
1345
+ max_seq_length_override: int | None = None,
1346
+ prompt: str | None = None,
1347
+ if_exists: CreateMode = "error",
1348
+ background: Literal[True],
1349
+ ) -> Job[Self]:
1350
+ pass
1351
+
1352
+ @overload
708
1353
  def clone(
709
1354
  self,
710
1355
  name: str,
711
1356
  *,
712
1357
  embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
713
1358
  max_seq_length_override: int | None = None,
1359
+ prompt: str | None = None,
1360
+ if_exists: CreateMode = "error",
1361
+ background: Literal[False] = False,
1362
+ ) -> Self:
1363
+ pass
1364
+
1365
+ def clone(
1366
+ self,
1367
+ name: str,
1368
+ *,
1369
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
1370
+ max_seq_length_override: int | None = UNSET,
1371
+ prompt: str | None = None,
714
1372
  if_exists: CreateMode = "error",
715
- ) -> LabeledMemoryset:
1373
+ background: bool = False,
1374
+ ) -> Self | Job[Self]:
716
1375
  """
717
1376
  Create a clone of the memoryset with a new name
718
1377
 
719
1378
  Params:
720
1379
  name: Name for the new memoryset (must be unique)
721
1380
  embedding_model: Optional new embedding model to use for re-embedding the memory values
722
- max_seq_length_override: Maximum sequence length of values in the memoryset, if the
723
1381
  value is longer than this it will be truncated, will default to the model's max
724
1382
  sequence length if not provided
1383
+ max_seq_length_override: Optional custom max sequence length to use for the cloned memoryset.
1384
+ If not provided, will use the source memoryset's max sequence length.
1385
+ prompt: Optional custom prompt to use for the cloned memoryset.
1386
+ If not provided, will use the source memoryset's prompt.
725
1387
  if_exists: What to do if a memoryset with the same name already exists, defaults to
726
1388
  `"error"`. Other option is `"open"` to open the existing memoryset.
727
1389
 
@@ -736,6 +1398,13 @@ class LabeledMemoryset:
736
1398
  >>> new_memoryset = memoryset.clone(
737
1399
  ... "my_memoryset_finetuned", embedding_model=finetuned_embedding_model,
738
1400
  ... )
1401
+
1402
+ >>> # Clone with custom prompts
1403
+ >>> new_memoryset = memoryset.clone(
1404
+ ... "my_memoryset_with_prompts",
1405
+ ... document_prompt_override="Represent this document for retrieval:",
1406
+ ... query_prompt_override="Represent this query for retrieval:",
1407
+ ... )
739
1408
  """
740
1409
  if self.exists(name):
741
1410
  if if_exists == "error":
@@ -746,22 +1415,22 @@ class LabeledMemoryset:
746
1415
  if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
747
1416
  raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
748
1417
  return existing
749
-
750
- metadata = clone_memoryset(
751
- self.id,
752
- body=CloneLabeledMemorysetRequest(
753
- name=name,
754
- pretrained_embedding_model_name=(
755
- embedding_model._model_name if isinstance(embedding_model, PretrainedEmbeddingModel) else None
756
- ),
757
- finetuned_embedding_model_id=(
758
- embedding_model.id if isinstance(embedding_model, FinetunedEmbeddingModel) else None
759
- ),
760
- max_seq_length_override=max_seq_length_override,
761
- ),
1418
+ payload: CloneMemorysetRequest = {"name": name}
1419
+ if max_seq_length_override is not UNSET:
1420
+ payload["max_seq_length_override"] = max_seq_length_override
1421
+ if prompt is not None:
1422
+ payload["prompt"] = prompt
1423
+ if isinstance(embedding_model, PretrainedEmbeddingModel):
1424
+ payload["pretrained_embedding_model_name"] = embedding_model.name
1425
+ elif isinstance(embedding_model, FinetunedEmbeddingModel):
1426
+ payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
1427
+
1428
+ metadata = orca_api.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
1429
+ job = Job(
1430
+ metadata["insertion_task_id"],
1431
+ lambda: self.open(metadata["id"]),
762
1432
  )
763
- wait_for_task(metadata.insertion_task_id, description="Cloning memoryset")
764
- return LabeledMemoryset.open(metadata.id)
1433
+ return job if background else job.result()
765
1434
 
766
1435
  def refresh(self, throttle: float = 0):
767
1436
  """
@@ -775,7 +1444,7 @@ class LabeledMemoryset:
775
1444
  if (current_time - self._last_refresh) < timedelta(seconds=throttle):
776
1445
  return
777
1446
 
778
- self.__dict__.update(LabeledMemoryset.open(self.id).__dict__)
1447
+ self.__dict__.update(self.open(self.id).__dict__)
779
1448
  self._last_refresh = current_time
780
1449
 
781
1450
    def __len__(self) -> int:
        """Return the number of memories in the memoryset (from cached metadata)."""
        return self.length
785
1454
 
786
1455
  @overload
787
- def __getitem__(self, index: int | str) -> LabeledMemory:
1456
+ def __getitem__(self, index: int | str) -> MemoryT:
788
1457
  pass
789
1458
 
790
1459
  @overload
791
- def __getitem__(self, index: slice) -> list[LabeledMemory]:
1460
+ def __getitem__(self, index: slice) -> list[MemoryT]:
792
1461
  pass
793
1462
 
794
- def __getitem__(self, index: int | slice | str) -> LabeledMemory | list[LabeledMemory]:
1463
+ def __getitem__(self, index: int | slice | str) -> MemoryT | list[MemoryT]:
795
1464
  """
796
1465
  Get memories from the memoryset by index or memory id
797
1466
 
@@ -837,22 +1506,24 @@ class LabeledMemoryset:
837
1506
  raise ValueError(f"Invalid index type: {type(index)}")
838
1507
 
839
1508
    @overload
    def search(self, query: str, *, count: int = 1, prompt: str | None = None) -> list[MemoryLookupT]:
        pass

    @overload
    def search(self, query: list[str], *, count: int = 1, prompt: str | None = None) -> list[list[MemoryLookupT]]:
        pass

    def search(
        self, query: str | list[str], *, count: int = 1, prompt: str | None = None
    ) -> list[MemoryLookupT] | list[list[MemoryLookupT]]:
        """
        Search for memories that are semantically similar to the query

        Params:
            query: Query to lookup memories in the memoryset, can be a single query or a list
            count: Number of memories to return for each query
            prompt: Optional prompt for query embedding during search.
                If not provided, the memoryset's default query prompt will be used if available.

        Returns:
            List of memories from the memoryset that match the query. If a single query is provided,
            a single list of lookups is returned; for a list of queries, one list per query is returned.

        Examples:
            Search for similar memories:
            >>> memoryset.search("I am happy", count=2)
            [
                LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
                LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
            ]

            Search with custom query prompt for instruction-following models:
            >>> memoryset.search("I am happy", count=2, prompt="Represent this query for sentiment retrieval:")
            [
                LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
                LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
            ]

            Search for similar memories for multiple queries:
            >>> memoryset.search(["I am happy", "I am sad"], count=1)
            [
                [LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' })],
                [LabeledMemoryLookup({ label: <negative: 0>, value: 'I am sad' })],
            ]
        """
        # Single queries are normalized to a one-element batch for the API call.
        response = orca_api.POST(
            "/gpu/memoryset/{name_or_id}/lookup",
            params={"name_or_id": self.id},
            json={
                "query": query if isinstance(query, list) else [query],
                "count": count,
                "prompt": prompt,
            },
        )
        # Lookup payloads with a "label" key are labeled lookups; otherwise scored.
        lookups = [
            [
                cast(
                    MemoryLookupT,
                    (
                        LabeledMemoryLookup(self.id, lookup_response)
                        if "label" in lookup_response
                        else ScoredMemoryLookup(self.id, lookup_response)
                    ),
                )
                for lookup_response in batch
            ]
            for batch in response
        ]
        # Unwrap the batch dimension for single-query callers.
        return lookups if isinstance(query, list) else lookups[0]
1583
 
891
1584
  def query(
@@ -893,7 +1586,9 @@ class LabeledMemoryset:
893
1586
  offset: int = 0,
894
1587
  limit: int = 100,
895
1588
  filters: list[FilterItemTuple] = [],
896
- ) -> list[LabeledMemory]:
1589
+ with_feedback_metrics: bool = False,
1590
+ sort: list[TelemetrySortItem] | None = None,
1591
+ ) -> list[MemoryT]:
897
1592
  """
898
1593
  Query the memoryset for memories that match the filters
899
1594
 
@@ -901,6 +1596,7 @@ class LabeledMemoryset:
901
1596
  offset: The offset of the first memory to return
902
1597
  limit: The maximum number of memories to return
903
1598
  filters: List of filters to apply to the query.
1599
+ with_feedback_metrics: Whether to include feedback metrics in the response
904
1600
 
905
1601
  Returns:
906
1602
  List of memories from the memoryset that match the filters
@@ -912,21 +1608,76 @@ class LabeledMemoryset:
912
1608
  LabeledMemory({ label: <negative: 0>, value: "I am sad" }),
913
1609
  ]
914
1610
  """
1611
+ parsed_filters = [
1612
+ _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
1613
+ ]
1614
+
1615
+ if with_feedback_metrics:
1616
+ response = orca_api.POST(
1617
+ "/telemetry/memories",
1618
+ json={
1619
+ "memoryset_id": self.id,
1620
+ "offset": offset,
1621
+ "limit": limit,
1622
+ "filters": parsed_filters,
1623
+ "sort": [_parse_sort_item_from_tuple(item) for item in sort] if sort else None,
1624
+ },
1625
+ )
1626
+ return [
1627
+ cast(
1628
+ MemoryT,
1629
+ (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
1630
+ )
1631
+ for memory in response["items"]
1632
+ ]
1633
+
1634
+ if any(_is_metric_column(filter[0]) for filter in filters):
1635
+ raise ValueError("Feedback metrics are only supported when the with_feedback_metrics flag is set to True")
1636
+
1637
+ if sort:
1638
+ logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")
1639
+
1640
+ response = orca_api.POST(
1641
+ "/memoryset/{name_or_id}/memories",
1642
+ params={"name_or_id": self.id},
1643
+ json={
1644
+ "offset": offset,
1645
+ "limit": limit,
1646
+ "filters": cast(list[FilterItem], parsed_filters),
1647
+ },
1648
+ )
915
1649
  return [
916
- LabeledMemory(self.id, memory)
917
- for memory in query_memoryset(
918
- self.id,
919
- body=ListMemoriesRequest(
920
- offset=offset,
921
- limit=limit,
922
- filters=[
923
- _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter
924
- for filter in filters
925
- ],
926
- ),
1650
+ cast(
1651
+ MemoryT,
1652
+ (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
927
1653
  )
1654
+ for memory in response
928
1655
  ]
929
1656
 
1657
+ def to_pandas(
1658
+ self,
1659
+ offset: int = 0,
1660
+ limit: int = 100,
1661
+ filters: list[FilterItemTuple] = [],
1662
+ with_feedback_metrics: bool = False,
1663
+ sort: list[TelemetrySortItem] | None = None,
1664
+ ) -> pd.DataFrame:
1665
+ """
1666
+ Convert the memoryset to a pandas DataFrame
1667
+ """
1668
+ return pd.DataFrame(
1669
+ [
1670
+ memory.to_dict()
1671
+ for memory in self.query(
1672
+ offset=offset,
1673
+ limit=limit,
1674
+ filters=filters,
1675
+ with_feedback_metrics=with_feedback_metrics,
1676
+ sort=sort,
1677
+ )
1678
+ ]
1679
+ )
1680
+
930
1681
  def insert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
931
1682
  """
932
1683
  Insert memories into the memoryset
@@ -937,6 +1688,7 @@ class LabeledMemoryset:
937
1688
 
938
1689
  - `value`: Value of the memory
939
1690
  - `label`: Label of the memory
1691
+ - `score`: Score of the memory
940
1692
  - `source_id`: Optional unique ID of the memory in a system of reference
941
1693
  - `...`: Any other metadata to store for the memory
942
1694
 
@@ -946,26 +1698,28 @@ class LabeledMemoryset:
946
1698
  ... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
947
1699
  ... ])
948
1700
  """
949
- insert_memories_gpu(
950
- self.id,
951
- body=(
1701
+ orca_api.POST(
1702
+ "/gpu/memoryset/{name_or_id}/memory",
1703
+ params={"name_or_id": self.id},
1704
+ json=cast(
1705
+ list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
952
1706
  [
953
- _parse_memory_insert(memory)
1707
+ _parse_memory_insert(memory, type=self.memory_type)
954
1708
  for memory in (cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else items)
955
- ]
1709
+ ],
956
1710
  ),
957
1711
  )
958
1712
  self.refresh()
959
1713
 
960
1714
    @overload
    def get(self, memory_id: str) -> MemoryT:  # type: ignore -- this takes precedence
        pass

    @overload
    def get(self, memory_id: Iterable[str]) -> list[MemoryT]:
        pass

    def get(self, memory_id: str | Iterable[str]) -> MemoryT | list[MemoryT]:
        """
        Fetch a memory or memories from the memoryset

        Params:
            memory_id: Unique identifier of the memory, or an iterable of identifiers

        Returns:
            The memory for a single identifier, or a list of memories for an iterable
            of identifiers
        """
        if isinstance(memory_id, str):
            # Single id: fetch directly via the path parameter.
            response = orca_api.GET(
                "/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
            )
            # Payloads with a "label" key are labeled memories; otherwise scored.
            return cast(
                MemoryT,
                (LabeledMemory(self.id, response) if "label" in response else ScoredMemory(self.id, response)),
            )
        else:
            # Multiple ids: batch fetch in a single request.
            response = orca_api.POST(
                "/memoryset/{name_or_id}/memories/get",
                params={"name_or_id": self.id},
                json={"memory_ids": list(memory_id)},
            )
            return [
                cast(
                    MemoryT,
                    (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
                )
                for memory in response
            ]
1003
1771
 
1004
1772
    @overload
    def update(self, updates: dict[str, Any]) -> MemoryT:
        pass

    @overload
    def update(self, updates: Iterable[dict[str, Any]]) -> list[MemoryT]:
        pass

    def update(self, updates: dict[str, Any] | Iterable[dict[str, Any]]) -> MemoryT | list[MemoryT]:
        """
        Update one or multiple memories in the memoryset

        Params:
            updates: Update dictionary or list of update dictionaries describing the
                fields to change. NOTE(review): assumes each update identifies its
                target memory (e.g. via a memory id field) — confirm against
                `_parse_memory_update`.

        Returns:
            The updated memory when a single update dict was given, otherwise the list
            of updated memories
        """
        # A single dict is wrapped in a list; each update is parsed according to the
        # memoryset's memory type (labeled vs scored) before the GPU patch call.
        response = orca_api.PATCH(
            "/gpu/memoryset/{name_or_id}/memories",
            params={"name_or_id": self.id},
            json=cast(
                list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
                [
                    _parse_memory_update(update, type=self.memory_type)
                    for update in (cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else updates)
                ],
            ),
        )
        # Payloads with a "label" key are labeled memories; otherwise scored.
        updated_memories = [
            cast(
                MemoryT,
                (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
            )
            for memory in response
        ]
        return updated_memories[0] if isinstance(updates, dict) else updated_memories
1053
1831
 
1832
+ def get_cascading_edits_suggestions(
1833
+ self,
1834
+ memory: MemoryT,
1835
+ *,
1836
+ old_label: int,
1837
+ new_label: int,
1838
+ max_neighbors: int = 50,
1839
+ max_validation_neighbors: int = 10,
1840
+ similarity_threshold: float | None = None,
1841
+ only_if_has_old_label: bool = True,
1842
+ exclude_if_new_label: bool = True,
1843
+ suggestion_cooldown_time: float = 3600.0 * 24.0, # 1 day
1844
+ label_confirmation_cooldown_time: float = 3600.0 * 24.0 * 7, # 1 week
1845
+ ) -> list[CascadingEditSuggestion]:
1846
+ """
1847
+ Suggests cascading edits for a given memory based on nearby points with similar labels.
1848
+
1849
+ This function is triggered after a user changes a memory's label. It looks for nearby
1850
+ candidates in embedding space that may be subject to similar relabeling and returns them
1851
+ as suggestions. The system uses scoring heuristics, label filters, and cooldown tracking
1852
+ to reduce noise and improve usability.
1853
+
1854
+ Params:
1855
+ memory: The memory whose label was just changed.
1856
+ old_label: The label this memory used to have.
1857
+ new_label: The label it was changed to.
1858
+ max_neighbors: Maximum number of neighbors to consider.
1859
+ max_validation_neighbors: Maximum number of neighbors to use for label suggestion.
1860
+ similarity_threshold: If set, only include neighbors with a lookup score above this threshold.
1861
+ only_if_has_old_label: If True, only consider neighbors that have the old label.
1862
+ exclude_if_new_label: If True, exclude neighbors that already have the new label.
1863
+ suggestion_cooldown_time: Minimum time (in seconds) since the last suggestion for a neighbor
1864
+ to be considered again.
1865
+ label_confirmation_cooldown_time: Minimum time (in seconds) since a neighbor's label was confirmed
1866
+ to be considered for suggestions.
1867
+
1868
+ Returns:
1869
+ A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
1870
+ """
1871
+ # TODO: properly integrate this with memory edits and return something that can be applied
1872
+ return orca_api.POST(
1873
+ "/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
1874
+ params={"name_or_id": self.id, "memory_id": memory.memory_id},
1875
+ json={
1876
+ "old_label": old_label,
1877
+ "new_label": new_label,
1878
+ "max_neighbors": max_neighbors,
1879
+ "max_validation_neighbors": max_validation_neighbors,
1880
+ "similarity_threshold": similarity_threshold,
1881
+ "only_if_has_old_label": only_if_has_old_label,
1882
+ "exclude_if_new_label": exclude_if_new_label,
1883
+ "suggestion_cooldown_time": suggestion_cooldown_time,
1884
+ "label_confirmation_cooldown_time": label_confirmation_cooldown_time,
1885
+ },
1886
+ )
1887
+
1054
1888
    def delete(self, memory_id: str | Iterable[str]) -> None:
        """
        Delete memories from the memoryset

        Params:
            memory_id: Unique identifier of the memory to delete, or an iterable of
                identifiers to delete several memories at once
        """
        # Normalize to a list of ids so single- and multi-delete share one code path.
        memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
        orca_api.POST(
            "/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": memory_ids}
        )
        logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
        # Refresh cached metadata (e.g. length) after the mutation.
        self.refresh()
  self.refresh()
1076
1912
 
1077
- def find_duplicates(self) -> dict:
1913
+ @overload
1914
+ def analyze(
1915
+ self,
1916
+ *analyses: dict[str, Any] | str,
1917
+ lookup_count: int = 15,
1918
+ clear_metrics: bool = False,
1919
+ background: Literal[True],
1920
+ ) -> Job[MemorysetMetrics]:
1921
+ pass
1922
+
1923
+ @overload
1924
+ def analyze(
1925
+ self,
1926
+ *analyses: dict[str, Any] | str,
1927
+ lookup_count: int = 15,
1928
+ clear_metrics: bool = False,
1929
+ background: Literal[False] = False,
1930
+ ) -> MemorysetMetrics:
1931
+ pass
1932
+
1933
+ def analyze(
1934
+ self,
1935
+ *analyses: dict[str, Any] | str,
1936
+ lookup_count: int = 15,
1937
+ clear_metrics: bool = False,
1938
+ background: bool = False,
1939
+ ) -> Job[MemorysetMetrics] | MemorysetMetrics:
1078
1940
  """
1079
- Run an analysis to find duplicate memories in the memoryset
1941
+ Run analyses on the memoryset to find duplicates, clusters, mislabelings, and more
1080
1942
 
1081
1943
  The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
1082
- attribute of each memory in the memoryset.
1944
+ attribute of each memory in the memoryset. Overall memoryset metrics will be returned as a dictionary.
1083
1945
 
1084
- Returns:
1085
- Summary of analysis with number of duplicate memories found
1086
-
1087
- Examples:
1088
- >>> memoryset.find_duplicate_memories()
1089
- { "num_duplicates": 10 }
1090
- >>> memoryset.delete(
1091
- ... m.memory_id
1092
- ... for m in memoryset.query(
1093
- ... filters=[("metrics.is_duplicate", "==", True)]
1094
- ... )
1095
- ... )
1096
- """
1097
- analysis = create_analysis(
1098
- self.id,
1099
- body=MemorysetAnalysisRequest(
1100
- type=MemorysetAnalysisRequestType.ANALYZE_DUPLICATE_MEMORIES,
1101
- ),
1102
- )
1103
- wait_for_task(analysis.task_id, description="Analyzing duplicates")
1104
- analysis = get_analysis(self.id, analysis.task_id)
1105
- assert isinstance(analysis.result, FindDuplicatesAnalysisResult)
1106
- # TODO: return a custom duplicate analysis class instance with helper methods
1107
- return analysis.result.to_dict()
1946
+ Params:
1947
+ analyses: List of analysis to run on the memoryset, can either be just the name of an
1948
+ analysis or a dictionary with a name property and additional config. The available
1949
+ analyses are:
1108
1950
 
1109
- def analyze_labels(self, neighbor_count: int = 10) -> dict:
1110
- """
1111
- Run an analysis to access if the labels in the memoryset are consistent to detect possibly
1112
- mislabeled memories.
1951
+ - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
1952
+ - **`"cluster"`**: Cluster the memories in the memoryset
1953
+ - **`"label"`**: Analyze the labels to find potential mislabelings
1954
+ - **`"neighbor"`**: Analyze the neighbors to populate anomaly scores
1955
+ - **`"projection"`**: Create a 2D projection of the embeddings for visualization
1113
1956
 
1114
- The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
1115
- attribute of each memory in the memoryset.
1957
+ lookup_count: Number of memories to lookup for each memory in the memoryset
1958
+ clear_metrics: Whether to clear any existing metrics from the memories before running the analysis
1116
1959
 
1117
1960
  Returns:
1118
- Summary of analysis with aggregate metrics for each label class
1961
+ dictionary with aggregate metrics for each analysis that was run
1962
+
1963
+ Raises:
1964
+ ValueError: If an invalid analysis name is provided
1119
1965
 
1120
1966
  Examples:
1121
- >>> memoryset.analyze_labels()
1122
- {
1967
+ Run label and duplicate analysis:
1968
+ >>> memoryset.analyze("label", {"name": "duplicate", "possible_duplicate_threshold": 0.99})
1969
+ { "duplicate": { "num_duplicates": 10 },
1970
+ "label": {
1123
1971
  "label_metrics": [{
1124
1972
  "label": 0,
1125
1973
  "label_name": "negative",
@@ -1131,24 +1979,212 @@ class LabeledMemoryset:
1131
1979
  "average_lookup_score": 0.90,
1132
1980
  "memory_count": 100,
1133
1981
  }]
1982
+ "neighbor_prediction_accuracy": 0.95,
1983
+ "mean_neighbor_label_confidence": 0.95,
1984
+ "mean_neighbor_label_entropy": 0.95,
1985
+ "mean_neighbor_predicted_label_ambiguity": 0.95,
1986
+ }
1134
1987
  }
1988
+
1989
+ Remove all exact duplicates:
1990
+ >>> memoryset.delete(
1991
+ ... m.memory_id
1992
+ ... for m in memoryset.query(
1993
+ ... filters=[("metrics.is_duplicate", "==", True)]
1994
+ ... )
1995
+ ... )
1996
+
1997
+ Display label analysis to review potential mislabelings:
1135
1998
  >>> memoryset.display_label_analysis()
1136
1999
  """
1137
- analysis = create_analysis(
1138
- self.id,
1139
- body=MemorysetAnalysisRequest(
1140
- type=MemorysetAnalysisRequestType.ANALYZE_MEMORY_NEIGHBOR_LABELS,
1141
- neighbor_count=neighbor_count,
1142
- ),
2000
+
2001
+ # Get valid analysis names from MemorysetAnalysisConfigs
2002
+ valid_analysis_names = set(MemorysetAnalysisConfigs.__annotations__)
2003
+
2004
+ configs: MemorysetAnalysisConfigs = {}
2005
+ for analysis in analyses:
2006
+ if isinstance(analysis, str):
2007
+ error_msg = (
2008
+ f"Invalid analysis name: {analysis}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
2009
+ )
2010
+ if analysis not in valid_analysis_names:
2011
+ raise ValueError(error_msg)
2012
+ configs[analysis] = {}
2013
+ else:
2014
+ name = analysis.pop("name")
2015
+ error_msg = f"Invalid analysis name: {name}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
2016
+ if name not in valid_analysis_names:
2017
+ raise ValueError(error_msg)
2018
+ configs[name] = analysis
2019
+
2020
+ analysis = orca_api.POST(
2021
+ "/memoryset/{name_or_id}/analysis",
2022
+ params={"name_or_id": self.id},
2023
+ json={
2024
+ "configs": configs,
2025
+ "lookup_count": lookup_count,
2026
+ "clear_metrics": clear_metrics,
2027
+ },
2028
+ )
2029
+ job = Job(
2030
+ analysis["task_id"],
2031
+ lambda: orca_api.GET(
2032
+ "/memoryset/{name_or_id}/analysis/{analysis_task_id}",
2033
+ params={"name_or_id": self.id, "analysis_task_id": analysis["task_id"]},
2034
+ )["results"],
2035
+ )
2036
+ return job if background else job.result()
2037
+
2038
+ def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
2039
+ """Group potential duplicates in the memoryset"""
2040
+ response = orca_api.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
2041
+ return [
2042
+ [cast(MemoryT, LabeledMemory(self.id, m) if "label" in m else ScoredMemory(self.id, m)) for m in ms]
2043
+ for ms in response
2044
+ ]
2045
+
2046
+ @overload
2047
+ @staticmethod
2048
+ def run_embedding_evaluation(
2049
+ datasource: Datasource,
2050
+ *,
2051
+ value_column: str = "value",
2052
+ label_column: str = "label",
2053
+ source_id_column: str | None = None,
2054
+ neighbor_count: int = 5,
2055
+ embedding_models: list[str] | None = None,
2056
+ background: Literal[True],
2057
+ ) -> Job[list[EmbeddingModelResult]]:
2058
+ pass
2059
+
2060
+ @overload
2061
+ @staticmethod
2062
+ def run_embedding_evaluation(
2063
+ datasource: Datasource,
2064
+ *,
2065
+ value_column: str = "value",
2066
+ label_column: str = "label",
2067
+ source_id_column: str | None = None,
2068
+ neighbor_count: int = 5,
2069
+ embedding_models: list[str] | None = None,
2070
+ background: Literal[False] = False,
2071
+ ) -> list[EmbeddingModelResult]:
2072
+ pass
2073
+
2074
+ @staticmethod
2075
+ def run_embedding_evaluation(
2076
+ datasource: Datasource,
2077
+ *,
2078
+ value_column: str = "value",
2079
+ label_column: str = "label",
2080
+ source_id_column: str | None = None,
2081
+ neighbor_count: int = 5,
2082
+ embedding_models: list[str] | None = None,
2083
+ background: bool = False,
2084
+ ) -> Job[list[EmbeddingModelResult]] | list[EmbeddingModelResult]:
2085
+ """
2086
+ Test the quality of embeddings for the datasource by computing metrics such as prediction accuracy.
2087
+
2088
+ Params:
2089
+ datasource: The datasource to run the embedding evaluation on
2090
+ value_column: Name of the column in the datasource that contains the memory values
2091
+ label_column: Name of the column in the datasource that contains the memory labels,
2092
+ these must be contiguous integers starting from 0
2093
+ source_id_column: Optional name of the column in the datasource that contains the ids in
2094
+ the system of reference
2095
+ neighbor_count: The number of neighbors to select for prediction
2096
+ embedding_models: Optional list of embedding model keys to evaluate, if not provided all
2097
+ available embedding models will be used
2098
+
2099
+ Returns:
2100
+ A dictionary containing the results of the embedding evaluation
2101
+ """
2102
+
2103
+ response = orca_api.POST(
2104
+ "/datasource/{name_or_id}/embedding_evaluation",
2105
+ params={"name_or_id": datasource.id},
2106
+ json={
2107
+ "value_column": value_column,
2108
+ "label_column": label_column,
2109
+ "source_id_column": source_id_column,
2110
+ "neighbor_count": neighbor_count,
2111
+ "embedding_models": embedding_models,
2112
+ },
1143
2113
  )
1144
- wait_for_task(analysis.task_id, description="Analyzing labels")
1145
- analysis = get_analysis(self.id, analysis.task_id)
1146
- assert isinstance(analysis.result, AnalyzeNeighborLabelsResult)
1147
- # TODO: return a custom label analysis class instance with helper methods
1148
- return analysis.result.to_dict()
2114
+
2115
+ def get_value() -> list[EmbeddingModelResult]:
2116
+ res = orca_api.GET(
2117
+ "/datasource/{name_or_id}/embedding_evaluation/{task_id}",
2118
+ params={"name_or_id": datasource.id, "task_id": response["task_id"]},
2119
+ )
2120
+ assert res["result"] is not None
2121
+ return res["result"]["evaluation_results"]
2122
+
2123
+ job = Job(response["task_id"], get_value)
2124
+ return job if background else job.result()
2125
+
2126
+
2127
+ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2128
+ """
2129
+ A Handle to a collection of memories with labels in the OrcaCloud
2130
+
2131
+ Attributes:
2132
+ id: Unique identifier for the memoryset
2133
+ name: Unique name of the memoryset
2134
+ description: Description of the memoryset
2135
+ label_names: Names for the class labels in the memoryset
2136
+ length: Number of memories in the memoryset
2137
+ embedding_model: Embedding model used to embed the memory values for semantic search
2138
+ created_at: When the memoryset was created, automatically generated on create
2139
+ updated_at: When the memoryset was last updated, automatically updated on updates
2140
+ """
2141
+
2142
+ label_names: list[str]
2143
+ memory_type: MemoryType = "LABELED"
2144
+
2145
+ def __init__(self, metadata: MemorysetMetadata):
2146
+ super().__init__(metadata)
2147
+ assert metadata["label_names"] is not None
2148
+ self.label_names = metadata["label_names"]
2149
+
2150
+ def __eq__(self, other) -> bool:
2151
+ return isinstance(other, LabeledMemoryset) and self.id == other.id
2152
+
2153
+ @classmethod
2154
+ def create(cls, name: str, datasource: Datasource, *, label_column: str | None = "label", **kwargs):
2155
+ return super().create(name, datasource, label_column=label_column, score_column=None, **kwargs)
1149
2156
 
1150
2157
  def display_label_analysis(self):
1151
- """Display a UI to review and act upon the label analysis results"""
2158
+ """
2159
+ Display an interactive UI to review and act upon the label analysis results
2160
+
2161
+ Note:
2162
+ This method is only available in Jupyter notebooks.
2163
+ """
1152
2164
  from ._utils.analysis_ui import display_suggested_memory_relabels
1153
2165
 
1154
2166
  display_suggested_memory_relabels(self)
2167
+
2168
+
2169
+ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2170
+ """
2171
+ A Handle to a collection of memories with scores in the OrcaCloud
2172
+
2173
+ Attributes:
2174
+ id: Unique identifier for the memoryset
2175
+ name: Unique name of the memoryset
2176
+ description: Description of the memoryset
2177
+ length: Number of memories in the memoryset
2178
+ embedding_model: Embedding model used to embed the memory values for semantic search
2179
+ created_at: When the memoryset was created, automatically generated on create
2180
+ updated_at: When the memoryset was last updated, automatically updated on updates
2181
+ """
2182
+
2183
+ memory_type: MemoryType = "SCORED"
2184
+
2185
+ def __eq__(self, other) -> bool:
2186
+ return isinstance(other, ScoredMemoryset) and self.id == other.id
2187
+
2188
+ @classmethod
2189
+ def create(cls, name: str, datasource: Datasource, *, score_column: str | None = "score", **kwargs):
2190
+ return super().create(name, datasource, score_column=score_column, label_column=None, **kwargs)