orca-sdk 0.0.78__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. orca_sdk/__init__.py +24 -0
  2. orca_sdk/_generated_api_client/__init__.py +3 -0
  3. orca_sdk/_generated_api_client/api/__init__.py +205 -0
  4. orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
  5. orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +130 -0
  6. orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +172 -0
  7. orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +158 -0
  8. orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +132 -0
  9. orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +129 -0
  10. orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
  11. orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +185 -0
  12. orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +172 -0
  13. orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +170 -0
  14. orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +156 -0
  15. orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +172 -0
  16. orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +158 -0
  17. orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +163 -0
  18. orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +129 -0
  19. orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +192 -0
  20. orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
  21. orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +169 -0
  22. orca_sdk/_generated_api_client/api/datasource/create_embedding_evaluation_datasource_name_or_id_embedding_evaluation_post.py +185 -0
  23. orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +158 -0
  24. orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +158 -0
  25. orca_sdk/_generated_api_client/api/datasource/get_embedding_evaluation_datasource_name_or_id_embedding_evaluation_task_id_get.py +171 -0
  26. orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +129 -0
  27. orca_sdk/_generated_api_client/api/datasource/list_embedding_evaluations_datasource_name_or_id_embedding_evaluation_get.py +237 -0
  28. orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
  29. orca_sdk/_generated_api_client/api/default/healthcheck_get.py +120 -0
  30. orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +120 -0
  31. orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
  32. orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +170 -0
  33. orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +158 -0
  34. orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +191 -0
  35. orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +158 -0
  36. orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +129 -0
  37. orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
  38. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +183 -0
  39. orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +185 -0
  40. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +170 -0
  41. orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +183 -0
  42. orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +169 -0
  43. orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +158 -0
  44. orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +171 -0
  45. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +190 -0
  46. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +171 -0
  47. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +158 -0
  48. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +186 -0
  49. orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +262 -0
  50. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +129 -0
  51. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +195 -0
  52. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +190 -0
  53. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +193 -0
  54. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +189 -0
  55. orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
  56. orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +194 -0
  57. orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +163 -0
  58. orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +129 -0
  59. orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
  60. orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +156 -0
  61. orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +158 -0
  62. orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +245 -0
  63. orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
  64. orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +164 -0
  65. orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +158 -0
  66. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +159 -0
  67. orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +129 -0
  68. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +177 -0
  69. orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +173 -0
  70. orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +183 -0
  71. orca_sdk/_generated_api_client/client.py +216 -0
  72. orca_sdk/_generated_api_client/errors.py +38 -0
  73. orca_sdk/_generated_api_client/models/__init__.py +179 -0
  74. orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +116 -0
  75. orca_sdk/_generated_api_client/models/api_key_metadata.py +137 -0
  76. orca_sdk/_generated_api_client/models/api_key_metadata_scope_item.py +9 -0
  77. orca_sdk/_generated_api_client/models/base_model.py +55 -0
  78. orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +176 -0
  79. orca_sdk/_generated_api_client/models/classification_evaluation_result.py +147 -0
  80. orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +150 -0
  81. orca_sdk/_generated_api_client/models/column_info.py +114 -0
  82. orca_sdk/_generated_api_client/models/column_type.py +14 -0
  83. orca_sdk/_generated_api_client/models/conflict_error_response.py +80 -0
  84. orca_sdk/_generated_api_client/models/create_api_key_request.py +120 -0
  85. orca_sdk/_generated_api_client/models/create_api_key_request_scope_item.py +9 -0
  86. orca_sdk/_generated_api_client/models/create_api_key_response.py +145 -0
  87. orca_sdk/_generated_api_client/models/create_api_key_response_scope_item.py +9 -0
  88. orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +279 -0
  89. orca_sdk/_generated_api_client/models/create_rac_model_request.py +209 -0
  90. orca_sdk/_generated_api_client/models/datasource_metadata.py +142 -0
  91. orca_sdk/_generated_api_client/models/delete_memories_request.py +70 -0
  92. orca_sdk/_generated_api_client/models/embed_request.py +127 -0
  93. orca_sdk/_generated_api_client/models/embedding_evaluation_request.py +179 -0
  94. orca_sdk/_generated_api_client/models/embedding_evaluation_response.py +148 -0
  95. orca_sdk/_generated_api_client/models/embedding_evaluation_result.py +86 -0
  96. orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +9 -0
  97. orca_sdk/_generated_api_client/models/embedding_model_result.py +114 -0
  98. orca_sdk/_generated_api_client/models/evaluation_request.py +180 -0
  99. orca_sdk/_generated_api_client/models/evaluation_response.py +140 -0
  100. orca_sdk/_generated_api_client/models/feedback_type.py +9 -0
  101. orca_sdk/_generated_api_client/models/field_validation_error.py +103 -0
  102. orca_sdk/_generated_api_client/models/filter_item.py +231 -0
  103. orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +15 -0
  104. orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +20 -0
  105. orca_sdk/_generated_api_client/models/filter_item_op.py +16 -0
  106. orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +70 -0
  107. orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +259 -0
  108. orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +66 -0
  109. orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +166 -0
  110. orca_sdk/_generated_api_client/models/get_memories_request.py +70 -0
  111. orca_sdk/_generated_api_client/models/internal_server_error_response.py +80 -0
  112. orca_sdk/_generated_api_client/models/label_class_metrics.py +108 -0
  113. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +274 -0
  114. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +68 -0
  115. orca_sdk/_generated_api_client/models/label_prediction_result.py +115 -0
  116. orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +246 -0
  117. orca_sdk/_generated_api_client/models/labeled_memory.py +197 -0
  118. orca_sdk/_generated_api_client/models/labeled_memory_insert.py +128 -0
  119. orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +68 -0
  120. orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +258 -0
  121. orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +68 -0
  122. orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +68 -0
  123. orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +237 -0
  124. orca_sdk/_generated_api_client/models/labeled_memory_update.py +171 -0
  125. orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +68 -0
  126. orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +195 -0
  127. orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +9 -0
  128. orca_sdk/_generated_api_client/models/list_memories_request.py +104 -0
  129. orca_sdk/_generated_api_client/models/list_predictions_request.py +257 -0
  130. orca_sdk/_generated_api_client/models/lookup_request.py +81 -0
  131. orca_sdk/_generated_api_client/models/memory_metrics.py +156 -0
  132. orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +83 -0
  133. orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +9 -0
  134. orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +180 -0
  135. orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +66 -0
  136. orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +9 -0
  137. orca_sdk/_generated_api_client/models/not_found_error_response.py +100 -0
  138. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +21 -0
  139. orca_sdk/_generated_api_client/models/precision_recall_curve.py +94 -0
  140. orca_sdk/_generated_api_client/models/prediction_feedback.py +157 -0
  141. orca_sdk/_generated_api_client/models/prediction_feedback_category.py +115 -0
  142. orca_sdk/_generated_api_client/models/prediction_feedback_request.py +122 -0
  143. orca_sdk/_generated_api_client/models/prediction_feedback_result.py +102 -0
  144. orca_sdk/_generated_api_client/models/prediction_request.py +169 -0
  145. orca_sdk/_generated_api_client/models/prediction_sort_item_item_type_0.py +10 -0
  146. orca_sdk/_generated_api_client/models/prediction_sort_item_item_type_1.py +9 -0
  147. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +97 -0
  148. orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +12 -0
  149. orca_sdk/_generated_api_client/models/rac_head_type.py +11 -0
  150. orca_sdk/_generated_api_client/models/rac_model_metadata.py +191 -0
  151. orca_sdk/_generated_api_client/models/roc_curve.py +94 -0
  152. orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +80 -0
  153. orca_sdk/_generated_api_client/models/task.py +198 -0
  154. orca_sdk/_generated_api_client/models/task_status.py +14 -0
  155. orca_sdk/_generated_api_client/models/task_status_info.py +133 -0
  156. orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +72 -0
  157. orca_sdk/_generated_api_client/models/unauthorized_error_response.py +80 -0
  158. orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +94 -0
  159. orca_sdk/_generated_api_client/models/update_prediction_request.py +93 -0
  160. orca_sdk/_generated_api_client/py.typed +1 -0
  161. orca_sdk/_generated_api_client/types.py +56 -0
  162. orca_sdk/_utils/__init__.py +0 -0
  163. orca_sdk/_utils/analysis_ui.py +192 -0
  164. orca_sdk/_utils/analysis_ui_style.css +54 -0
  165. orca_sdk/_utils/auth.py +68 -0
  166. orca_sdk/_utils/auth_test.py +31 -0
  167. orca_sdk/_utils/common.py +37 -0
  168. orca_sdk/_utils/data_parsing.py +99 -0
  169. orca_sdk/_utils/data_parsing_test.py +244 -0
  170. orca_sdk/_utils/prediction_result_ui.css +18 -0
  171. orca_sdk/_utils/prediction_result_ui.py +64 -0
  172. orca_sdk/_utils/task.py +73 -0
  173. orca_sdk/classification_model.py +508 -0
  174. orca_sdk/classification_model_test.py +272 -0
  175. orca_sdk/conftest.py +116 -0
  176. orca_sdk/credentials.py +126 -0
  177. orca_sdk/credentials_test.py +37 -0
  178. orca_sdk/datasource.py +333 -0
  179. orca_sdk/datasource_test.py +96 -0
  180. orca_sdk/embedding_model.py +347 -0
  181. orca_sdk/embedding_model_test.py +176 -0
  182. orca_sdk/memoryset.py +1209 -0
  183. orca_sdk/memoryset_test.py +287 -0
  184. orca_sdk/telemetry.py +398 -0
  185. orca_sdk/telemetry_test.py +109 -0
  186. orca_sdk-0.0.78.dist-info/METADATA +79 -0
  187. orca_sdk-0.0.78.dist-info/RECORD +188 -0
  188. orca_sdk-0.0.78.dist-info/WHEEL +4 -0
orca_sdk/memoryset.py ADDED
@@ -0,0 +1,1209 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from datetime import datetime, timedelta
5
+ from os import PathLike
6
+ from typing import Any, Iterable, Literal, cast, overload
7
+
8
+ import pandas as pd
9
+ import pyarrow as pa
10
+ from datasets import Dataset
11
+ from torch.utils.data import DataLoader as TorchDataLoader
12
+ from torch.utils.data import Dataset as TorchDataset
13
+
14
+ from ._generated_api_client.api import (
15
+ clone_memoryset,
16
+ create_analysis,
17
+ create_embedding_evaluation,
18
+ create_memoryset,
19
+ delete_datasource,
20
+ delete_memories,
21
+ delete_memoryset,
22
+ get_analysis,
23
+ get_datasource,
24
+ get_embedding_evaluation,
25
+ get_memories,
26
+ get_memory,
27
+ get_memoryset,
28
+ insert_memories_gpu,
29
+ list_datasources,
30
+ list_memorysets,
31
+ memoryset_lookup_gpu,
32
+ query_memoryset,
33
+ update_memories_gpu,
34
+ update_memory_gpu,
35
+ )
36
+ from ._generated_api_client.models import (
37
+ AnalyzeNeighborLabelsResult,
38
+ CloneLabeledMemorysetRequest,
39
+ ColumnType,
40
+ CreateLabeledMemorysetRequest,
41
+ DatasourceMetadata,
42
+ DeleteMemoriesRequest,
43
+ EmbeddingEvaluationRequest,
44
+ EmbeddingEvaluationResponse,
45
+ FilterItem,
46
+ FilterItemOp,
47
+ FindDuplicatesAnalysisResult,
48
+ GetMemoriesRequest,
49
+ )
50
+ from ._generated_api_client.models import LabeledMemory as LabeledMemoryResponse
51
+ from ._generated_api_client.models import (
52
+ LabeledMemoryInsert,
53
+ LabeledMemoryInsertMetadata,
54
+ )
55
+ from ._generated_api_client.models import (
56
+ LabeledMemoryLookup as LabeledMemoryLookupResponse,
57
+ )
58
+ from ._generated_api_client.models import (
59
+ LabeledMemoryMetrics,
60
+ LabeledMemorysetMetadata,
61
+ LabeledMemoryUpdate,
62
+ LabeledMemoryUpdateMetadataType0,
63
+ LabelPredictionMemoryLookup,
64
+ ListMemoriesRequest,
65
+ LookupRequest,
66
+ MemorysetAnalysisRequest,
67
+ MemorysetAnalysisRequestType,
68
+ PretrainedEmbeddingModelName,
69
+ TaskStatus,
70
+ )
71
+ from ._generated_api_client.types import UNSET as CLIENT_UNSET
72
+ from ._utils.common import UNSET, CreateMode, DropMode
73
+ from ._utils.task import wait_for_task
74
+ from .datasource import Datasource
75
+ from .embedding_model import (
76
+ FinetunedEmbeddingModel,
77
+ PretrainedEmbeddingModel,
78
+ _EmbeddingModel,
79
+ )
80
+
81
+ FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"]
82
+ """
83
+ Operations that can be used in a filter expression.
84
+ """
85
+
86
+ FilterValue = str | int | float | bool | datetime | None | list[str] | list[int] | list[float] | list[bool]
87
+ """
88
+ Values that can be used in a filter expression.
89
+ """
90
+
91
+ FilterItemTuple = tuple[str, FilterOperation, FilterValue]
92
+ """
93
+ Filter expression consisting of a field, an operator, and a value:
94
+
95
+ * **`field`**: The field to filter on.
96
+ * **`operation`**: The operation to apply to the field and value.
97
+ * **`value`**: The value to compare the field against.
98
+
99
+ Examples:
100
+ >>> ("label", "==", 0)
101
+ >>> ("metadata.author", "like", "John")
102
+ >>> ("source_id", "in", ["123", "456"])
103
+ """
104
+
105
+
106
+ DEFAULT_COLUMN_NAMES = {"value", "label", "source_id"}
107
+ FORBIDDEN_METADATA_COLUMN_NAMES = {"memory_id", "memory_version", "embedding", "created_at", "updated_at", "metrics"}
108
+
109
+
110
+ def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem:
111
+ field = input[0].split(".")
112
+ if len(field) == 1 and field[0] not in DEFAULT_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES:
113
+ field = ["metadata", field[0]]
114
+ op = FilterItemOp(input[1])
115
+ value = input[2]
116
+ return FilterItem(field=field, op=op, value=value)
117
+
118
+
119
+ def _parse_memory_insert(memory: dict[str, Any]) -> LabeledMemoryInsert:
120
+ value = memory.get("value")
121
+ if not isinstance(value, str):
122
+ raise ValueError("Memory value must be a string")
123
+ label = memory.get("label")
124
+ if not isinstance(label, int):
125
+ raise ValueError("Memory label must be an integer")
126
+ source_id = memory.get("source_id")
127
+ if source_id and not isinstance(source_id, str):
128
+ raise ValueError("Memory source_id must be a string")
129
+ metadata = LabeledMemoryInsertMetadata.from_dict({k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES})
130
+ if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
131
+ raise ValueError(f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}")
132
+ return LabeledMemoryInsert(value=value, label=label, source_id=source_id, metadata=metadata)
133
+
134
+
135
+ def _parse_memory_update(update: dict[str, Any]) -> LabeledMemoryUpdate:
136
+ if "memory_id" not in update:
137
+ raise ValueError("memory_id must be specified in the update dictionary")
138
+ memory_id = update["memory_id"]
139
+ if not isinstance(memory_id, str):
140
+ raise ValueError("memory_id must be a string")
141
+ value = update.get("value", CLIENT_UNSET)
142
+ if value is not CLIENT_UNSET and not isinstance(value, str):
143
+ raise ValueError("value must be a string or unset")
144
+ label = update.get("label", CLIENT_UNSET)
145
+ if label is not CLIENT_UNSET and not isinstance(label, int):
146
+ raise ValueError("label must be an integer or unset")
147
+ source_id = update.get("source_id", CLIENT_UNSET)
148
+ if source_id is not CLIENT_UNSET and not isinstance(source_id, str):
149
+ raise ValueError("source_id must be a string or unset")
150
+ metadata = LabeledMemoryUpdateMetadataType0.from_dict(
151
+ {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id"}}
152
+ )
153
+ if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
154
+ raise ValueError(f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}")
155
+ return LabeledMemoryUpdate(memory_id=memory_id, value=value, label=label, source_id=source_id, metadata=metadata)
156
+
157
+
158
+ class LabeledMemory:
159
+ """
160
+ A row of the [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
161
+
162
+ Attributes:
163
+ value: Value represented by the row
164
+ embedding: Embedding of the value of the memory for semantic search, automatically generated
165
+ with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
166
+ label: Class label of the memory
167
+ label_name: Human-readable name of the label, automatically populated from the
168
+ [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
169
+ source_id: Optional unique identifier of the memory in a system of reference
170
+ metrics: Metrics about the memory, generated when running an analysis on the
171
+ [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
172
+ metadata: Metadata associated with the memory that is not used in the model. Metadata
173
+ properties are also accessible as individual attributes on the instance.
174
+ memory_id: Unique identifier for the memory, automatically generated on insert
175
+ memory_version: Version of the memory, automatically updated when the label or value changes
176
+ created_at: When the memory was created, automatically generated on insert
177
+ updated_at: When the memory was last updated, automatically updated on update
178
+
179
+ ## Other Attributes:
180
+ * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
181
+ """
182
+
183
+ value: str
184
+ embedding: list[float]
185
+ label: int
186
+ label_name: str | None
187
+ source_id: str | None
188
+ created_at: datetime
189
+ updated_at: datetime
190
+ metadata: dict[str, str | float | int | bool | None]
191
+ metrics: dict[str, Any]
192
+ memory_id: str
193
+ memory_version: int
194
+
195
+ def __init__(
196
+ self,
197
+ memoryset_id: str,
198
+ memory: LabeledMemoryResponse | LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
199
+ ):
200
+ # for internal use only, do not document
201
+ self.memoryset_id = memoryset_id
202
+ self.memory_id = memory.memory_id
203
+ self.memory_version = memory.memory_version
204
+ self.value = memory.value
205
+ self.embedding = memory.embedding
206
+ self.label = memory.label
207
+ self.label_name = memory.label_name
208
+ self.source_id = memory.source_id
209
+ self.created_at = memory.created_at
210
+ self.updated_at = memory.updated_at
211
+ self.metadata = memory.metadata.to_dict()
212
+ self.metrics = memory.metrics.to_dict() if memory.metrics else {}
213
+
214
+ def __getattr__(self, key: str) -> Any:
215
+ if key.startswith("__") or key not in self.metadata:
216
+ raise AttributeError(f"{key} is not a valid attribute")
217
+ return self.metadata[key]
218
+
219
+ def __repr__(self) -> str:
220
+ return (
221
+ "LabeledMemory({ "
222
+ + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
223
+ + f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
224
+ + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
225
+ + " })"
226
+ )
227
+
228
+ def __eq__(self, other: object) -> bool:
229
+ return isinstance(other, LabeledMemory) and self.memory_id == other.memory_id
230
+
231
+ def update(
232
+ self,
233
+ *,
234
+ value: str = UNSET,
235
+ label: int = UNSET,
236
+ source_id: str | None = UNSET,
237
+ **metadata: None | bool | float | int | str,
238
+ ) -> LabeledMemory:
239
+ """
240
+ Update the memory with new values
241
+
242
+ Note:
243
+ If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
244
+
245
+ Params:
246
+ value: New value of the memory
247
+ label: New label of the memory
248
+ source_id: New source ID of the memory
249
+ **metadata: New values for metadata properties
250
+
251
+ Returns:
252
+ The updated memory
253
+ """
254
+ response = update_memory_gpu(
255
+ self.memoryset_id,
256
+ body=_parse_memory_update(
257
+ {"memory_id": self.memory_id}
258
+ | ({"value": value} if value is not UNSET else {})
259
+ | ({"label": label} if label is not UNSET else {})
260
+ | ({"source_id": source_id} if source_id is not UNSET else {})
261
+ | metadata
262
+ ),
263
+ )
264
+ self.__dict__.update(LabeledMemory(self.memoryset_id, response).__dict__)
265
+ return self
266
+
267
+
268
+ class LabeledMemoryLookup(LabeledMemory):
269
+ """
270
+ Lookup result for a memory in a memoryset
271
+
272
+ Attributes:
273
+ lookup_score: Similarity between the memory embedding and search query embedding
274
+ attention_weight: Weight the model assigned to the memory during prediction if this lookup
275
+ happened as part of a prediction
276
+ value: Value represented by the row
277
+ embedding: Embedding of the value of the memory for semantic search, automatically generated
278
+ with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
279
+ label: Class label of the memory
280
+ label_name: Human-readable name of the label, automatically populated from the
281
+ [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
282
+ source_id: Optional unique identifier of the memory in a system of reference
283
+ metrics: Metrics about the memory, generated when running an analysis on the
284
+ [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
285
+ metadata: Metadata associated with the memory that is not used in the model. Metadata
286
+ properties are also accessible as individual attributes on the instance.
287
+ memory_id: The unique identifier for the memory, automatically generated on insert
288
+ memory_version: The version of the memory, automatically updated when the label or value changes
289
+ created_at: When the memory was created, automatically generated on insert
290
+ updated_at: When the memory was last updated, automatically updated on update
291
+
292
+ ## Other Attributes:
293
+ * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
294
+ """
295
+
296
+ lookup_score: float
297
+ attention_weight: float | None
298
+
299
+ def __init__(self, memoryset_id: str, memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup):
300
+ # for internal use only, do not document
301
+ super().__init__(memoryset_id, memory_lookup)
302
+ self.lookup_score = memory_lookup.lookup_score
303
+ self.attention_weight = (
304
+ memory_lookup.attention_weight if isinstance(memory_lookup, LabelPredictionMemoryLookup) else None
305
+ )
306
+
307
+ def __repr__(self) -> str:
308
+ return (
309
+ "LabeledMemoryLookup({ "
310
+ + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
311
+ + f", lookup_score: {self.lookup_score:.2f}"
312
+ + (f", attention_weight: {self.attention_weight:.2f}" if self.attention_weight is not None else "")
313
+ + f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
314
+ + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
315
+ + " })"
316
+ )
317
+
318
+
319
+ class LabeledMemoryset:
320
+ """
321
+ A Handle to a collection of memories with labels in the OrcaCloud
322
+
323
+ Attributes:
324
+ id: Unique identifier for the memoryset
325
+ name: Unique name of the memoryset
326
+ label_names: Names for the class labels in the memoryset
327
+ length: Number of memories in the memoryset
328
+ embedding_model: Embedding model used to embed the memory values for semantic search
329
+ created_at: When the memoryset was created, automatically generated on create
330
+ updated_at: When the memoryset was last updated, automatically updated on updates
331
+ """
332
+
333
+ id: str
334
+ name: str
335
+ label_names: list[str]
336
+ length: int
337
+ created_at: datetime
338
+ updated_at: datetime
339
+ insertion_status: TaskStatus
340
+ embedding_model: _EmbeddingModel
341
+
342
+ def __init__(self, metadata: LabeledMemorysetMetadata):
343
+ # for internal use only, do not document
344
+ if metadata.pretrained_embedding_model_name:
345
+ self.embedding_model = PretrainedEmbeddingModel._get(metadata.pretrained_embedding_model_name)
346
+ elif metadata.finetuned_embedding_model_id:
347
+ self.embedding_model = FinetunedEmbeddingModel.open(metadata.finetuned_embedding_model_id)
348
+ else:
349
+ raise ValueError("Either pretrained_embedding_model_name or finetuned_embedding_model_id must be provided")
350
+ self.id = metadata.id
351
+ self.name = metadata.name
352
+ self.label_names = metadata.label_names
353
+ self.length = metadata.length
354
+ self.created_at = metadata.created_at
355
+ self.updated_at = metadata.updated_at
356
+ self.insertion_status = metadata.insertion_status
357
+ self._last_refresh = datetime.now()
358
+
359
+ def __eq__(self, other) -> bool:
360
+ return isinstance(other, LabeledMemoryset) and self.id == other.id
361
+
362
+ def __repr__(self) -> str:
363
+ return (
364
+ "LabeledMemoryset({\n"
365
+ f" name: '{self.name}',\n"
366
+ f" length: {self.length},\n"
367
+ f" label_names: {self.label_names},\n"
368
+ f" embedding_model: {self.embedding_model},\n"
369
+ "})"
370
+ )
371
+
372
+ @classmethod
373
+ def create(
374
+ cls,
375
+ name: str,
376
+ datasource: Datasource,
377
+ *,
378
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
379
+ value_column: str = "value",
380
+ label_column: str = "label",
381
+ source_id_column: str | None = None,
382
+ label_names: list[str] | None = None,
383
+ max_seq_length_override: int | None = None,
384
+ if_exists: CreateMode = "error",
385
+ ) -> LabeledMemoryset:
386
+ """
387
+ Create a new memoryset in the OrcaCloud
388
+
389
+ All columns from the datasource that are not specified in the `value_column`,
390
+ `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
391
+
392
+ Params:
393
+ name: Name for the new memoryset (must be unique)
394
+ datasource: Source data to populate the memories in the memoryset
395
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
396
+ If not provided, a default embedding model for the memoryset will be used.
397
+ value_column: Name of the column in the datasource that contains the memory values
398
+ label_column: Name of the column in the datasource that contains the memory labels,
399
+ these must be contiguous integers starting from 0
400
+ source_id_column: Optional name of the column in the datasource that contains the ids in
401
+ the system of reference
402
+ label_names: List of human-readable names for the labels in the memoryset, must match
403
+ the number of labels in the `label_column`. Will be automatically inferred if a
404
+ [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
405
+ labels is used as the datasource
406
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
407
+ value is longer than this it will be truncated, will default to the model's max
408
+ sequence length if not provided
409
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
410
+ `"error"`. Other option is `"open"` to open the existing memoryset.
411
+
412
+ Returns:
413
+ Handle to the new memoryset in the OrcaCloud
414
+
415
+ Raises:
416
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
417
+ `"open"` and the params do not match those of the existing memoryset.
418
+ """
419
+ if embedding_model is None:
420
+ embedding_model = PretrainedEmbeddingModel.CDE_SMALL
421
+
422
+ if cls.exists(name):
423
+ if if_exists == "error":
424
+ raise ValueError(f"Memoryset with name {name} already exists")
425
+ elif if_exists == "open":
426
+ existing = cls.open(name)
427
+ for attribute in {"label_names", "embedding_model"}:
428
+ if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
429
+ raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
430
+ return existing
431
+
432
+ response = create_memoryset(
433
+ body=CreateLabeledMemorysetRequest(
434
+ name=name,
435
+ datasource_id=datasource.id,
436
+ datasource_label_column=label_column,
437
+ datasource_value_column=value_column,
438
+ datasource_source_id_column=source_id_column,
439
+ pretrained_embedding_model_name=(
440
+ embedding_model._model_name if isinstance(embedding_model, PretrainedEmbeddingModel) else None
441
+ ),
442
+ finetuned_embedding_model_id=(
443
+ embedding_model.id if isinstance(embedding_model, FinetunedEmbeddingModel) else None
444
+ ),
445
+ label_names=label_names or [],
446
+ max_seq_length_override=max_seq_length_override,
447
+ ),
448
+ )
449
+ wait_for_task(response.insertion_task_id, description="Inserting datasource")
450
+ return cls.open(response.id)
451
+
452
+ @classmethod
453
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, **kwargs: Any) -> LabeledMemoryset:
454
+ """
455
+ Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
456
+
457
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
458
+ appended with `_datasource` and use that as the datasource for the memoryset.
459
+
460
+ All features that are not specified to be used as `value_column`, `label_column`, or
461
+ `source_id_column` will be stored as metadata in the memoryset.
462
+
463
+ Params:
464
+ name: Name for the new memoryset (must be unique)
465
+ hf_dataset: Hugging Face dataset to create the memoryset from
466
+ kwargs: Additional parameters for creating the memoryset. See
467
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
468
+
469
+
470
+ Returns:
471
+ Handle to the new memoryset in the OrcaCloud
472
+ """
473
+ datasource = Datasource.from_hf_dataset(
474
+ f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
475
+ )
476
+ return cls.create(name, datasource, **kwargs)
477
+
478
+ @classmethod
479
+ def from_pytorch(
480
+ cls,
481
+ name: str,
482
+ torch_data: TorchDataLoader | TorchDataset,
483
+ *,
484
+ column_names: list[str] | None = None,
485
+ **kwargs: Any,
486
+ ) -> LabeledMemoryset:
487
+ """
488
+ Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
489
+ [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
490
+
491
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
492
+ appended with `_datasource` and use that as the datasource for the memoryset.
493
+
494
+ All properties that are not specified to be used as `value_column`, `label_column`, or
495
+ `source_id_column` will be stored as metadata in the memoryset.
496
+
497
+ Params:
498
+ name: Name for the new memoryset (must be unique)
499
+ torch_data: PyTorch data loader or dataset to create the memoryset from
500
+ column_names: If the provided dataset or data loader returns unnamed tuples, this
501
+ argument must be provided to specify the names of the columns.
502
+ kwargs: Additional parameters for creating the memoryset. See
503
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
504
+
505
+
506
+ Returns:
507
+ Handle to the new memoryset in the OrcaCloud
508
+ """
509
+ datasource = Datasource.from_pytorch(
510
+ f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
511
+ )
512
+ return cls.create(name, datasource, **kwargs)
513
+
514
+ @classmethod
515
+ def from_list(cls, name: str, data: list[dict], **kwargs: Any) -> LabeledMemoryset:
516
+ """
517
+ Create a new memoryset from a list of dictionaries in the OrcaCloud
518
+
519
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
520
+ appended with `_datasource` and use that as the datasource for the memoryset.
521
+
522
+ All properties that are not specified to be used as `value_column`, `label_column`, or
523
+ `source_id_column` will be stored as metadata in the memoryset.
524
+
525
+ Params:
526
+ name: Name for the new memoryset (must be unique)
527
+ data: List of dictionaries to create the memoryset from
528
+ kwargs: Additional parameters for creating the memoryset. See
529
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
530
+
531
+ Returns:
532
+ Handle to the new memoryset in the OrcaCloud
533
+
534
+ Examples:
535
+ >>> LabeledMemoryset.from_list("my_memoryset", [
536
+ ... {"value": "hello", "label": 0, "tag": "tag1"},
537
+ ... {"value": "world", "label": 1, "tag": "tag2"},
538
+ ... ])
539
+ """
540
+ datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
541
+ return cls.create(name, datasource, **kwargs)
542
+
543
+ @classmethod
544
+ def from_dict(cls, name: str, data: dict, **kwargs: Any) -> LabeledMemoryset:
545
+ """
546
+ Create a new memoryset from a dictionary of columns in the OrcaCloud
547
+
548
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
549
+ appended with `_datasource` and use that as the datasource for the memoryset.
550
+
551
+ All columns from the datasource that are not specified in the `value_column`,
552
+ `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
553
+
554
+ Params:
555
+ name: Name for the new memoryset (must be unique)
556
+ data: Dictionary of columns to create the memoryset from
557
+ kwargs: Additional parameters for creating the memoryset. See
558
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
559
+
560
+ Returns:
561
+ Handle to the new memoryset in the OrcaCloud
562
+
563
+ Examples:
564
+ >>> LabeledMemoryset.from_dict("my_memoryset", {
565
+ ... "value": ["hello", "world"],
566
+ ... "label": [0, 1],
567
+ ... "tag": ["tag1", "tag2"],
568
+ ... })
569
+ """
570
+ datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
571
+ return cls.create(name, datasource, **kwargs)
572
+
573
+ @classmethod
574
+ def from_pandas(cls, name: str, dataframe: pd.DataFrame, **kwargs: Any) -> LabeledMemoryset:
575
+ """
576
+ Create a new memoryset from a pandas [`DataFrame`][pandas.DataFrame] in the OrcaCloud
577
+
578
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
579
+ appended with `_datasource` and use that as the datasource for the memoryset.
580
+
581
+ All columns that are not specified to be used as `value_column`, `label_column`, or
582
+ `source_id_column` will be stored as metadata in the memoryset.
583
+
584
+ Params:
585
+ name: Name for the new memoryset (must be unique)
586
+ dataframe: Dataframe to create the memoryset from
587
+ kwargs: Additional parameters for creating the memoryset. See
588
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
589
+
590
+ Returns:
591
+ Handle to the new memoryset in the OrcaCloud
592
+ """
593
+ datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
594
+ return cls.create(name, datasource, **kwargs)
595
+
596
+ @classmethod
597
+ def from_arrow(cls, name: str, pyarrow_table: pa.Table, **kwargs: Any) -> LabeledMemoryset:
598
+ """
599
+ Create a new memoryset from a PyArrow [`Table`][pyarrow.Table] in the OrcaCloud
600
+
601
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
602
+ appended with `_datasource` and use that as the datasource for the memoryset.
603
+
604
+ All columns that are not specified to be used as `value_column`, `label_column`, or
605
+ `source_id_column` will be stored as metadata in the memoryset.
606
+
607
+ Params:
608
+ name: Name for the new memoryset (must be unique)
609
+ pyarrow_table: PyArrow table to create the memoryset from
610
+ kwargs: Additional parameters for creating the memoryset. See
611
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
612
+
613
+ Returns:
614
+ Handle to the new memoryset in the OrcaCloud
615
+ """
616
+ datasource = Datasource.from_arrow(
617
+ f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
618
+ )
619
+ return cls.create(name, datasource, **kwargs)
620
+
621
+ @classmethod
622
+ def from_disk(cls, name: str, file_path: str | PathLike, **kwargs: Any) -> LabeledMemoryset:
623
+ """
624
+ Create a new memoryset from a file on disk in the OrcaCloud
625
+
626
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
627
+ appended with `_datasource` and use that as the datasource for the memoryset.
628
+
629
+ All columns from the datasource that are not specified in the `value_column`,
630
+ `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
631
+
632
+ Params:
633
+ name: Name for the new memoryset (must be unique)
634
+ file_path: Path to the file on disk to create the memoryset from. The file type will
635
+ be inferred from the file extension. The following file types are supported:
636
+
637
+ - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
638
+ - .json/.jsonl: [`JSON`][json] and [`JSON`] Lines files
639
+ - .csv: [`CSV`][csv] files
640
+ - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
641
+ - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
642
+ kwargs: Additional parameters for creating the memoryset. See
643
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
644
+
645
+ Returns:
646
+ Handle to the new memoryset in the OrcaCloud
647
+ """
648
+ datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
649
+ return cls.create(name, datasource, **kwargs)
650
+
651
+ @classmethod
652
+ def open(cls, name: str) -> LabeledMemoryset:
653
+ """
654
+ Get a handle to a memoryset in the OrcaCloud
655
+
656
+ Params:
657
+ name: Name or unique identifier of the memoryset
658
+
659
+ Returns:
660
+ Handle to the existing memoryset in the OrcaCloud
661
+
662
+ Raises:
663
+ LookupError: If the memoryset does not exist
664
+ """
665
+ metadata = get_memoryset(name)
666
+ return cls(metadata)
667
+
668
+ @classmethod
669
+ def exists(cls, name_or_id: str) -> bool:
670
+ """
671
+ Check if a memoryset exists in the OrcaCloud
672
+
673
+ Params:
674
+ name_or_id: Name or id of the memoryset
675
+
676
+ Returns:
677
+ True if the memoryset exists, False otherwise
678
+ """
679
+ try:
680
+ cls.open(name_or_id)
681
+ return True
682
+ except LookupError:
683
+ return False
684
+
685
+ @classmethod
686
+ def all(cls) -> list[LabeledMemoryset]:
687
+ """
688
+ Get a list of handles to all memorysets in the OrcaCloud
689
+
690
+ Returns:
691
+ List of handles to all memorysets in the OrcaCloud
692
+ """
693
+ return [cls(metadata) for metadata in list_memorysets()]
694
+
695
+ @classmethod
696
+ def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
697
+ """
698
+ Delete a memoryset from the OrcaCloud
699
+
700
+ Params:
701
+ name_or_id: Name or id of the memoryset
702
+ if_not_exists: What to do if the memoryset does not exist, defaults to `"error"`.
703
+ Other options are `"ignore"` to do nothing if the memoryset does not exist.
704
+
705
+ Raises:
706
+ LookupError: If the memoryset does not exist and if_not_exists is `"error"`
707
+ """
708
+ try:
709
+ delete_memoryset(name_or_id)
710
+ logging.info(f"Deleted memoryset {name_or_id}")
711
+ except LookupError:
712
+ if if_not_exists == "error":
713
+ raise
714
+
715
+ def clone(
716
+ self,
717
+ name: str,
718
+ *,
719
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
720
+ max_seq_length_override: int | None = None,
721
+ if_exists: CreateMode = "error",
722
+ ) -> LabeledMemoryset:
723
+ """
724
+ Create a clone of the memoryset with a new name
725
+
726
+ Params:
727
+ name: Name for the new memoryset (must be unique)
728
+ embedding_model: Optional new embedding model to use for re-embedding the memory values
729
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
730
+ value is longer than this it will be truncated, will default to the model's max
731
+ sequence length if not provided
732
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
733
+ `"error"`. Other option is `"open"` to open the existing memoryset.
734
+
735
+ Returns:
736
+ Handle to the cloned memoryset in the OrcaCloud
737
+
738
+ Examples:
739
+ >>> memoryset = LabeledMemoryset.open("my_memoryset")
740
+ >>> finetuned_embedding_model = PretrainedEmbeddingModel.GTE_BASE.finetune(
741
+ ... "gte_base_finetuned", my_memoryset
742
+ ... )
743
+ >>> new_memoryset = memoryset.clone(
744
+ ... "my_memoryset_finetuned", embedding_model=finetuned_embedding_model,
745
+ ... )
746
+ """
747
+ if self.exists(name):
748
+ if if_exists == "error":
749
+ raise ValueError(f"Memoryset with name {name} already exists")
750
+ elif if_exists == "open":
751
+ existing = self.open(name)
752
+ for attribute in {"embedding_model"}:
753
+ if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
754
+ raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
755
+ return existing
756
+
757
+ metadata = clone_memoryset(
758
+ self.id,
759
+ body=CloneLabeledMemorysetRequest(
760
+ name=name,
761
+ pretrained_embedding_model_name=(
762
+ embedding_model._model_name if isinstance(embedding_model, PretrainedEmbeddingModel) else None
763
+ ),
764
+ finetuned_embedding_model_id=(
765
+ embedding_model.id if isinstance(embedding_model, FinetunedEmbeddingModel) else None
766
+ ),
767
+ max_seq_length_override=max_seq_length_override,
768
+ ),
769
+ )
770
+ wait_for_task(metadata.insertion_task_id, description="Cloning memoryset")
771
+ return LabeledMemoryset.open(metadata.id)
772
+
773
+ def refresh(self, throttle: float = 0):
774
+ """
775
+ Refresh the information about the memoryset from the OrcaCloud
776
+
777
+ Params:
778
+ throttle: Minimum time in seconds between refreshes
779
+ """
780
+ current_time = datetime.now()
781
+ # Skip refresh if last refresh was too recent
782
+ if (current_time - self._last_refresh) < timedelta(seconds=throttle):
783
+ return
784
+
785
+ self.__dict__.update(LabeledMemoryset.open(self.id).__dict__)
786
+ self._last_refresh = current_time
787
+
788
    def __len__(self) -> int:
        """Get the number of memories in the memoryset"""
        # Throttled refresh keeps the cached length reasonably fresh without a
        # network round-trip on every len() call.
        self.refresh(throttle=5)
        return self.length
792
+
793
+ @overload
794
+ def __getitem__(self, index: int | str) -> LabeledMemory:
795
+ pass
796
+
797
+ @overload
798
+ def __getitem__(self, index: slice) -> list[LabeledMemory]:
799
+ pass
800
+
801
+ def __getitem__(self, index: int | slice | str) -> LabeledMemory | list[LabeledMemory]:
802
+ """
803
+ Get memories from the memoryset by index or memory id
804
+
805
+ Params:
806
+ index: Index or memory to retrieve or slice of memories to retrieve or unique
807
+ identifier of the memory to retrieve
808
+
809
+ Returns:
810
+ Memory or memories from the memoryset
811
+
812
+ Raises:
813
+ LookupError: If the id is not found or the index is out of bounds
814
+
815
+ Examples:
816
+ Retrieve the first memory in the memoryset:
817
+ >>> memoryset[0]
818
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
819
+
820
+ Retrieve the last memory in the memoryset:
821
+ >>> memoryset[-1]
822
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' })
823
+
824
+ Retrieve a slice of memories in the memoryset:
825
+ >>> memoryset[1:3]
826
+ [
827
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
828
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
829
+ ]
830
+
831
+ Retrieve a memory by id:
832
+ >>> memoryset["0195019a-5bc7-7afb-b902-5945ee1fb766"]
833
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
834
+ """
835
+ if isinstance(index, int):
836
+ return self.query(offset=len(self) + index if index < 0 else index, limit=1)[0]
837
+ elif isinstance(index, str):
838
+ return self.get(index)
839
+ elif isinstance(index, slice):
840
+ start = 0 if index.start is None else (len(self) + index.start) if index.start < 0 else index.start
841
+ stop = len(self) if index.stop is None else (len(self) + index.stop) if index.stop < 0 else index.stop
842
+ return self.query(offset=start, limit=stop - start)
843
+ else:
844
+ raise ValueError(f"Invalid index type: {type(index)}")
845
+
846
+ @overload
847
+ def search(self, query: str, *, count: int = 1) -> list[LabeledMemoryLookup]:
848
+ pass
849
+
850
+ @overload
851
+ def search(self, query: list[str], *, count: int = 1) -> list[list[LabeledMemoryLookup]]:
852
+ pass
853
+
854
+ def search(
855
+ self, query: str | list[str], *, count: int = 1
856
+ ) -> list[LabeledMemoryLookup] | list[list[LabeledMemoryLookup]]:
857
+ """
858
+ Search for memories that are semantically similar to the query
859
+
860
+ Params:
861
+ query: Query to lookup memories in the memoryset, can be a single query or a list
862
+ count: Number of memories to return for each query
863
+
864
+ Returns:
865
+ List of memories from the memoryset that match the query. If a single query is provided,
866
+ the return value is a list containing a single list of memories. If a list of
867
+ queries is provided, the return value is a list of lists of memories.
868
+
869
+ Examples:
870
+ Search for similar memories:
871
+ >>> memoryset.search("I am happy", count=2)
872
+ [
873
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
874
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
875
+ ]
876
+
877
+ Search for similar memories for multiple queries:
878
+ >>> memoryset.search(["I am happy", "I am sad"], count=1)
879
+ [
880
+ [
881
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
882
+ ],
883
+ [
884
+ LabeledMemoryLookup({ label: <negative: 0>, value: 'I am sad' }),
885
+ ],
886
+ ]
887
+ """
888
+ response = memoryset_lookup_gpu(
889
+ name_or_id=self.id,
890
+ body=LookupRequest(
891
+ query=query if isinstance(query, list) else [query],
892
+ count=count,
893
+ ),
894
+ )
895
+ lookups = [[LabeledMemoryLookup(self.id, lookup_response) for lookup_response in batch] for batch in response]
896
+ return lookups if isinstance(query, list) else lookups[0]
897
+
898
+ def query(
899
+ self,
900
+ offset: int = 0,
901
+ limit: int = 100,
902
+ filters: list[FilterItemTuple] = [],
903
+ ) -> list[LabeledMemory]:
904
+ """
905
+ Query the memoryset for memories that match the filters
906
+
907
+ Params:
908
+ offset: The offset of the first memory to return
909
+ limit: The maximum number of memories to return
910
+ filters: List of filters to apply to the query.
911
+
912
+ Returns:
913
+ List of memories from the memoryset that match the filters
914
+
915
+ Examples:
916
+ >>> memoryset.query(filters=[("label", "==", 0)], limit=2)
917
+ [
918
+ LabeledMemory({ label: <positive: 1>, value: "I am happy" }),
919
+ LabeledMemory({ label: <negative: 0>, value: "I am sad" }),
920
+ ]
921
+ """
922
+ return [
923
+ LabeledMemory(self.id, memory)
924
+ for memory in query_memoryset(
925
+ self.id,
926
+ body=ListMemoriesRequest(
927
+ offset=offset,
928
+ limit=limit,
929
+ filters=[
930
+ _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter
931
+ for filter in filters
932
+ ],
933
+ ),
934
+ )
935
+ ]
936
+
937
+ def insert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
938
+ """
939
+ Insert memories into the memoryset
940
+
941
+ Params:
942
+ items: List of memories to insert into the memoryset. This should be a list of
943
+ dictionaries with the following keys:
944
+
945
+ - `value`: Value of the memory
946
+ - `label`: Label of the memory
947
+ - `source_id`: Optional unique ID of the memory in a system of reference
948
+ - `...`: Any other metadata to store for the memory
949
+
950
+ Examples:
951
+ >>> memoryset.insert([
952
+ ... {"value": "I am happy", "label": 1, "source_id": "user_123", "tag": "happy"},
953
+ ... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
954
+ ... ])
955
+ """
956
+ insert_memories_gpu(
957
+ self.id,
958
+ body=(
959
+ [
960
+ _parse_memory_insert(memory)
961
+ for memory in (cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else items)
962
+ ]
963
+ ),
964
+ )
965
+ self.refresh()
966
+
967
+ @overload
968
+ def get(self, memory_id: str) -> LabeledMemory: # type: ignore -- this takes precedence
969
+ pass
970
+
971
+ @overload
972
+ def get(self, memory_id: Iterable[str]) -> list[LabeledMemory]:
973
+ pass
974
+
975
+ def get(self, memory_id: str | Iterable[str]) -> LabeledMemory | list[LabeledMemory]:
976
+ """
977
+ Fetch a memory or memories from the memoryset
978
+
979
+ Params:
980
+ memory_id: Unique identifier of the memory or memories to fetch
981
+
982
+ Returns:
983
+ Memory or list of memories from the memoryset
984
+
985
+ Raises:
986
+ LookupError: If no memory with the given id is found
987
+
988
+ Examples:
989
+ Fetch a single memory:
990
+ >>> memoryset.get("0195019a-5bc7-7afb-b902-5945ee1fb766")
991
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
992
+
993
+ Fetch multiple memories:
994
+ >>> memoryset.get([
995
+ ... "0195019a-5bc7-7afb-b902-5945ee1fb766",
996
+ ... "019501a1-ea08-76b2-9f62-95e4800b4841",
997
+ ... ])
998
+ [
999
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
1000
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
1001
+ ]
1002
+ """
1003
+ if isinstance(memory_id, str):
1004
+ return LabeledMemory(self.id, get_memory(self.id, memory_id))
1005
+ else:
1006
+ return [
1007
+ LabeledMemory(self.id, memory)
1008
+ for memory in get_memories(self.id, body=GetMemoriesRequest(memory_ids=list(memory_id)))
1009
+ ]
1010
+
1011
+ @overload
1012
+ def update(self, updates: dict[str, Any]) -> LabeledMemory:
1013
+ pass
1014
+
1015
+ @overload
1016
+ def update(self, updates: Iterable[dict[str, Any]]) -> list[LabeledMemory]:
1017
+ pass
1018
+
1019
+ def update(self, updates: dict[str, Any] | Iterable[dict[str, Any]]) -> LabeledMemory | list[LabeledMemory]:
1020
+ """
1021
+ Update one or multiple memories in the memoryset
1022
+
1023
+ Params:
1024
+ updates: List of updates to apply to the memories. Each update should be a dictionary
1025
+ with the following keys:
1026
+
1027
+ - `memory_id`: Unique identifier of the memory to update (required)
1028
+ - `value`: Optional new value of the memory
1029
+ - `label`: Optional new label of the memory
1030
+ - `source_id`: Optional new source ID of the memory
1031
+ - `...`: Optional new values for metadata properties
1032
+
1033
+ Returns:
1034
+ Updated memory or list of updated memories
1035
+
1036
+ Examples:
1037
+ Update a single memory:
1038
+ >>> memoryset.update(
1039
+ ... {
1040
+ ... "memory_id": "019501a1-ea08-76b2-9f62-95e4800b4841",
1041
+ ... "tag": "happy",
1042
+ ... },
1043
+ ... )
1044
+
1045
+ Update multiple memories:
1046
+ >>> memoryset.update(
1047
+ ... {"memory_id": m.memory_id, "label": 2}
1048
+ ... for m in memoryset.query(filters=[("tag", "==", "happy")])
1049
+ ... )
1050
+ """
1051
+ response = update_memories_gpu(
1052
+ self.id,
1053
+ body=[
1054
+ _parse_memory_update(update)
1055
+ for update in (cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else updates)
1056
+ ],
1057
+ )
1058
+ updated_memories = [LabeledMemory(self.id, memory) for memory in response]
1059
+ return updated_memories[0] if isinstance(updates, dict) else updated_memories
1060
+
1061
+ def delete(self, memory_id: str | Iterable[str]) -> None:
1062
+ """
1063
+ Delete memories from the memoryset
1064
+
1065
+ Params:
1066
+ memory_id: unique identifiers of the memories to delete
1067
+
1068
+ Examples:
1069
+ Delete a single memory:
1070
+ >>> memoryset.delete("0195019a-5bc7-7afb-b902-5945ee1fb766")
1071
+
1072
+ Delete multiple memories:
1073
+ >>> memoryset.delete([
1074
+ ... "0195019a-5bc7-7afb-b902-5945ee1fb766",
1075
+ ... "019501a1-ea08-76b2-9f62-95e4800b4841",
1076
+ ... )
1077
+
1078
+ """
1079
+ memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
1080
+ delete_memories(self.id, body=DeleteMemoriesRequest(memory_ids=memory_ids))
1081
+ logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
1082
+ self.refresh()
1083
+
1084
+ def find_duplicates(self) -> dict:
1085
+ """
1086
+ Run an analysis to find duplicate memories in the memoryset
1087
+
1088
+ The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
1089
+ attribute of each memory in the memoryset.
1090
+
1091
+ Returns:
1092
+ Summary of analysis with number of duplicate memories found
1093
+
1094
+ Examples:
1095
+ >>> memoryset.find_duplicate_memories()
1096
+ { "num_duplicates": 10 }
1097
+ >>> memoryset.delete(
1098
+ ... m.memory_id
1099
+ ... for m in memoryset.query(
1100
+ ... filters=[("metrics.is_duplicate", "==", True)]
1101
+ ... )
1102
+ ... )
1103
+ """
1104
+ analysis = create_analysis(
1105
+ self.id,
1106
+ body=MemorysetAnalysisRequest(
1107
+ type=MemorysetAnalysisRequestType.ANALYZE_DUPLICATE_MEMORIES,
1108
+ ),
1109
+ )
1110
+ wait_for_task(analysis.task_id, description="Analyzing duplicates")
1111
+ analysis = get_analysis(self.id, analysis.task_id)
1112
+ assert isinstance(analysis.result, FindDuplicatesAnalysisResult)
1113
+ # TODO: return a custom duplicate analysis class instance with helper methods
1114
+ return analysis.result.to_dict()
1115
+
1116
+ def analyze_labels(self, neighbor_count: int = 10) -> dict:
1117
+ """
1118
+ Run an analysis to access if the labels in the memoryset are consistent to detect possibly
1119
+ mislabeled memories.
1120
+
1121
+ The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
1122
+ attribute of each memory in the memoryset.
1123
+
1124
+ Returns:
1125
+ Summary of analysis with aggregate metrics for each label class
1126
+
1127
+ Examples:
1128
+ >>> memoryset.analyze_labels()
1129
+ {
1130
+ "label_metrics": [{
1131
+ "label": 0,
1132
+ "label_name": "negative",
1133
+ "average_lookup_score": 0.95,
1134
+ "memory_count": 100,
1135
+ }, {
1136
+ "label": 1,
1137
+ "label_name": "positive",
1138
+ "average_lookup_score": 0.90,
1139
+ "memory_count": 100,
1140
+ }]
1141
+ }
1142
+ >>> memoryset.display_label_analysis()
1143
+ """
1144
+ analysis = create_analysis(
1145
+ self.id,
1146
+ body=MemorysetAnalysisRequest(
1147
+ type=MemorysetAnalysisRequestType.ANALYZE_MEMORY_NEIGHBOR_LABELS,
1148
+ neighbor_count=neighbor_count,
1149
+ ),
1150
+ )
1151
+ wait_for_task(analysis.task_id, description="Analyzing labels")
1152
+ analysis = get_analysis(self.id, analysis.task_id)
1153
+ assert isinstance(analysis.result, AnalyzeNeighborLabelsResult)
1154
+ # TODO: return a custom label analysis class instance with helper methods
1155
+ return analysis.result.to_dict()
1156
+
1157
    def display_label_analysis(self):
        """Display a UI to review and act upon the label analysis results"""
        # Imported lazily so UI helpers are only loaded when this method is actually used.
        from ._utils.analysis_ui import display_suggested_memory_relabels

        display_suggested_memory_relabels(self)
1162
+
1163
+ @staticmethod
1164
+ def run_embedding_evaluation(
1165
+ datasource: Datasource,
1166
+ value_column: str = "value",
1167
+ label_column: str = "label",
1168
+ source_id_column: str | None = None,
1169
+ neighbor_count: int = 5,
1170
+ embedding_models: list[str] | None = None,
1171
+ ) -> dict:
1172
+ """
1173
+ This function runs an embedding evaluation on the datasource. The embedding evaluation will
1174
+ test the quality of embeddings for the datasource by computing metrics such as prediction accuracy.
1175
+
1176
+ Params:
1177
+ datasource: The datasource to run the embedding evaluation on
1178
+ value_column: Name of the column in the datasource that contains the memory values
1179
+ label_column: Name of the column in the datasource that contains the memory labels,
1180
+ these must be contiguous integers starting from 0
1181
+ source_id_column: Optional name of the column in the datasource that contains the ids in
1182
+ the system of reference
1183
+ neighbor_count: The number of neighbors to select for prediction
1184
+ embedding_models: Optional list of embedding model keys to evaluate, if not provided all
1185
+ available embedding models will be used
1186
+
1187
+ Returns:
1188
+ A dictionary containing the results of the embedding evaluation
1189
+ """
1190
+
1191
+ if embedding_models is not None:
1192
+ embedding_model_enums = [PretrainedEmbeddingModelName(model) for model in embedding_models]
1193
+ else:
1194
+ embedding_model_enums = None
1195
+
1196
+ request = EmbeddingEvaluationRequest(
1197
+ value_column=value_column,
1198
+ label_column=label_column,
1199
+ source_id_column=source_id_column,
1200
+ neighbor_count=neighbor_count,
1201
+ embedding_models=embedding_model_enums,
1202
+ )
1203
+
1204
+ response = create_embedding_evaluation(name_or_id=datasource.id, body=request)
1205
+ wait_for_task(response.task_id, description="Running embedding evaluation")
1206
+
1207
+ response = get_embedding_evaluation(datasource.id, response.task_id)
1208
+ assert response.result is not None
1209
+ return response.result.to_dict()