orca-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. orca_sdk/__init__.py +19 -0
  2. orca_sdk/_generated_api_client/__init__.py +3 -0
  3. orca_sdk/_generated_api_client/api/__init__.py +193 -0
  4. orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
  5. orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +128 -0
  6. orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +170 -0
  7. orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +156 -0
  8. orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +130 -0
  9. orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +127 -0
  10. orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
  11. orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +183 -0
  12. orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +170 -0
  13. orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +168 -0
  14. orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +154 -0
  15. orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +170 -0
  16. orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +156 -0
  17. orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +161 -0
  18. orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +127 -0
  19. orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +190 -0
  20. orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
  21. orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +167 -0
  22. orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +156 -0
  23. orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +156 -0
  24. orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +127 -0
  25. orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
  26. orca_sdk/_generated_api_client/api/default/healthcheck_get.py +118 -0
  27. orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +118 -0
  28. orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
  29. orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +168 -0
  30. orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +156 -0
  31. orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +189 -0
  32. orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +156 -0
  33. orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +127 -0
  34. orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
  35. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +181 -0
  36. orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +183 -0
  37. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +168 -0
  38. orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +181 -0
  39. orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +167 -0
  40. orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +156 -0
  41. orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +169 -0
  42. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +188 -0
  43. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +169 -0
  44. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +156 -0
  45. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +184 -0
  46. orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +260 -0
  47. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +127 -0
  48. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +193 -0
  49. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +188 -0
  50. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +191 -0
  51. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +187 -0
  52. orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
  53. orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +188 -0
  54. orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +157 -0
  55. orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +127 -0
  56. orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
  57. orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +154 -0
  58. orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +156 -0
  59. orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +243 -0
  60. orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
  61. orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +162 -0
  62. orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +156 -0
  63. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +157 -0
  64. orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +127 -0
  65. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +175 -0
  66. orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +171 -0
  67. orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +181 -0
  68. orca_sdk/_generated_api_client/client.py +216 -0
  69. orca_sdk/_generated_api_client/errors.py +38 -0
  70. orca_sdk/_generated_api_client/models/__init__.py +159 -0
  71. orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +84 -0
  72. orca_sdk/_generated_api_client/models/api_key_metadata.py +118 -0
  73. orca_sdk/_generated_api_client/models/base_model.py +55 -0
  74. orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +176 -0
  75. orca_sdk/_generated_api_client/models/classification_evaluation_result.py +114 -0
  76. orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +150 -0
  77. orca_sdk/_generated_api_client/models/column_info.py +114 -0
  78. orca_sdk/_generated_api_client/models/column_type.py +14 -0
  79. orca_sdk/_generated_api_client/models/conflict_error_response.py +80 -0
  80. orca_sdk/_generated_api_client/models/create_api_key_request.py +99 -0
  81. orca_sdk/_generated_api_client/models/create_api_key_response.py +126 -0
  82. orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +259 -0
  83. orca_sdk/_generated_api_client/models/create_rac_model_request.py +209 -0
  84. orca_sdk/_generated_api_client/models/datasource_metadata.py +142 -0
  85. orca_sdk/_generated_api_client/models/delete_memories_request.py +70 -0
  86. orca_sdk/_generated_api_client/models/embed_request.py +127 -0
  87. orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +9 -0
  88. orca_sdk/_generated_api_client/models/evaluation_request.py +180 -0
  89. orca_sdk/_generated_api_client/models/evaluation_response.py +140 -0
  90. orca_sdk/_generated_api_client/models/feedback_type.py +9 -0
  91. orca_sdk/_generated_api_client/models/field_validation_error.py +103 -0
  92. orca_sdk/_generated_api_client/models/filter_item.py +231 -0
  93. orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +15 -0
  94. orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +16 -0
  95. orca_sdk/_generated_api_client/models/filter_item_op.py +16 -0
  96. orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +70 -0
  97. orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +259 -0
  98. orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +66 -0
  99. orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +166 -0
  100. orca_sdk/_generated_api_client/models/get_memories_request.py +70 -0
  101. orca_sdk/_generated_api_client/models/internal_server_error_response.py +80 -0
  102. orca_sdk/_generated_api_client/models/label_class_metrics.py +108 -0
  103. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +274 -0
  104. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +68 -0
  105. orca_sdk/_generated_api_client/models/label_prediction_result.py +101 -0
  106. orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +232 -0
  107. orca_sdk/_generated_api_client/models/labeled_memory.py +197 -0
  108. orca_sdk/_generated_api_client/models/labeled_memory_insert.py +108 -0
  109. orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +68 -0
  110. orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +258 -0
  111. orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +68 -0
  112. orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +68 -0
  113. orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +277 -0
  114. orca_sdk/_generated_api_client/models/labeled_memory_update.py +171 -0
  115. orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +68 -0
  116. orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +195 -0
  117. orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +9 -0
  118. orca_sdk/_generated_api_client/models/list_memories_request.py +104 -0
  119. orca_sdk/_generated_api_client/models/list_predictions_request.py +234 -0
  120. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +9 -0
  121. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +9 -0
  122. orca_sdk/_generated_api_client/models/lookup_request.py +81 -0
  123. orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +83 -0
  124. orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +9 -0
  125. orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +180 -0
  126. orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +66 -0
  127. orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +9 -0
  128. orca_sdk/_generated_api_client/models/not_found_error_response.py +100 -0
  129. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +20 -0
  130. orca_sdk/_generated_api_client/models/prediction_feedback.py +157 -0
  131. orca_sdk/_generated_api_client/models/prediction_feedback_category.py +115 -0
  132. orca_sdk/_generated_api_client/models/prediction_feedback_request.py +122 -0
  133. orca_sdk/_generated_api_client/models/prediction_feedback_result.py +102 -0
  134. orca_sdk/_generated_api_client/models/prediction_request.py +169 -0
  135. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +97 -0
  136. orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +11 -0
  137. orca_sdk/_generated_api_client/models/rac_head_type.py +11 -0
  138. orca_sdk/_generated_api_client/models/rac_model_metadata.py +191 -0
  139. orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +80 -0
  140. orca_sdk/_generated_api_client/models/task.py +198 -0
  141. orca_sdk/_generated_api_client/models/task_status.py +14 -0
  142. orca_sdk/_generated_api_client/models/task_status_info.py +133 -0
  143. orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +72 -0
  144. orca_sdk/_generated_api_client/models/unauthorized_error_response.py +80 -0
  145. orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +94 -0
  146. orca_sdk/_generated_api_client/models/update_prediction_request.py +93 -0
  147. orca_sdk/_generated_api_client/py.typed +1 -0
  148. orca_sdk/_generated_api_client/types.py +56 -0
  149. orca_sdk/_utils/__init__.py +0 -0
  150. orca_sdk/_utils/analysis_ui.py +194 -0
  151. orca_sdk/_utils/analysis_ui_style.css +54 -0
  152. orca_sdk/_utils/auth.py +63 -0
  153. orca_sdk/_utils/auth_test.py +31 -0
  154. orca_sdk/_utils/common.py +37 -0
  155. orca_sdk/_utils/data_parsing.py +99 -0
  156. orca_sdk/_utils/data_parsing_test.py +244 -0
  157. orca_sdk/_utils/prediction_result_ui.css +18 -0
  158. orca_sdk/_utils/prediction_result_ui.py +64 -0
  159. orca_sdk/_utils/task.py +73 -0
  160. orca_sdk/classification_model.py +499 -0
  161. orca_sdk/classification_model_test.py +266 -0
  162. orca_sdk/conftest.py +117 -0
  163. orca_sdk/datasource.py +333 -0
  164. orca_sdk/datasource_test.py +95 -0
  165. orca_sdk/embedding_model.py +336 -0
  166. orca_sdk/embedding_model_test.py +173 -0
  167. orca_sdk/labeled_memoryset.py +1154 -0
  168. orca_sdk/labeled_memoryset_test.py +271 -0
  169. orca_sdk/orca_credentials.py +75 -0
  170. orca_sdk/orca_credentials_test.py +37 -0
  171. orca_sdk/telemetry.py +386 -0
  172. orca_sdk/telemetry_test.py +100 -0
  173. orca_sdk-0.1.0.dist-info/METADATA +39 -0
  174. orca_sdk-0.1.0.dist-info/RECORD +175 -0
  175. orca_sdk-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1154 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from datetime import datetime, timedelta
5
+ from os import PathLike
6
+ from typing import Any, Iterable, Literal, cast, overload
7
+
8
+ import pandas as pd
9
+ import pyarrow as pa
10
+ from datasets import Dataset
11
+ from torch.utils.data import DataLoader as TorchDataLoader
12
+ from torch.utils.data import Dataset as TorchDataset
13
+
14
+ from ._generated_api_client.api import (
15
+ clone_memoryset,
16
+ create_analysis,
17
+ create_memoryset,
18
+ delete_memories,
19
+ delete_memoryset,
20
+ get_analysis,
21
+ get_memories,
22
+ get_memory,
23
+ get_memoryset,
24
+ insert_memories_gpu,
25
+ list_memorysets,
26
+ memoryset_lookup_gpu,
27
+ query_memoryset,
28
+ update_memories_gpu,
29
+ update_memory_gpu,
30
+ )
31
+ from ._generated_api_client.models import (
32
+ AnalyzeNeighborLabelsResult,
33
+ CloneLabeledMemorysetRequest,
34
+ CreateLabeledMemorysetRequest,
35
+ DeleteMemoriesRequest,
36
+ FilterItem,
37
+ FilterItemOp,
38
+ FindDuplicatesAnalysisResult,
39
+ GetMemoriesRequest,
40
+ )
41
+ from ._generated_api_client.models import LabeledMemory as LabeledMemoryResponse
42
+ from ._generated_api_client.models import (
43
+ LabeledMemoryInsert,
44
+ LabeledMemoryInsertMetadata,
45
+ )
46
+ from ._generated_api_client.models import (
47
+ LabeledMemoryLookup as LabeledMemoryLookupResponse,
48
+ )
49
+ from ._generated_api_client.models import (
50
+ LabeledMemoryMetrics,
51
+ LabeledMemorysetMetadata,
52
+ LabeledMemoryUpdate,
53
+ LabeledMemoryUpdateMetadataType0,
54
+ LabelPredictionMemoryLookup,
55
+ ListMemoriesRequest,
56
+ LookupRequest,
57
+ MemorysetAnalysisRequest,
58
+ MemorysetAnalysisRequestType,
59
+ TaskStatus,
60
+ )
61
+ from ._generated_api_client.types import UNSET as CLIENT_UNSET
62
+ from ._utils.common import UNSET, CreateMode, DropMode
63
+ from ._utils.task import wait_for_task
64
+ from .datasource import Datasource
65
+ from .embedding_model import (
66
+ FinetunedEmbeddingModel,
67
+ PretrainedEmbeddingModel,
68
+ _EmbeddingModel,
69
+ )
70
+
71
+ FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"]
72
+ """
73
+ Operations that can be used in a filter expression.
74
+ """
75
+
76
+ FilterValue = str | int | float | bool | datetime | None | list[str] | list[int] | list[float] | list[bool]
77
+ """
78
+ Values that can be used in a filter expression.
79
+ """
80
+
81
+ FilterItemTuple = tuple[str, FilterOperation, FilterValue]
82
+ """
83
+ Filter expression consisting of a field, an operator, and a value:
84
+
85
+ * **`field`**: The field to filter on.
86
+ * **`operation`**: The operation to apply to the field and value.
87
+ * **`value`**: The value to compare the field against.
88
+
89
+ Examples:
90
+ >>> ("label", "==", 0)
91
+ >>> ("metadata.author", "like", "John")
92
+ >>> ("source_id", "in", ["123", "456"])
93
+ """
94
+
95
+
96
+ DEFAULT_COLUMN_NAMES = {"value", "label", "source_id"}
97
+ FORBIDDEN_METADATA_COLUMN_NAMES = {"memory_id", "memory_version", "embedding", "created_at", "updated_at", "metrics"}
98
+
99
+
100
def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem:
    """
    Convert a ``(field, operation, value)`` tuple into a `FilterItem` model.

    A bare field name that is not one of the built-in memory columns is
    interpreted as a metadata property and prefixed with ``"metadata"``.
    """
    field_path, operation, value = input
    field = field_path.split(".")
    reserved = DEFAULT_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES
    if len(field) == 1 and field[0] not in reserved:
        field = ["metadata", field[0]]
    return FilterItem(field=field, op=FilterItemOp(operation), value=value)
107
+
108
+
109
def _parse_memory_insert(memory: dict[str, Any]) -> LabeledMemoryInsert:
    """
    Validate a raw memory dict and convert it into a `LabeledMemoryInsert`.

    All keys besides ``value``, ``label``, and ``source_id`` are collected
    into the insert's metadata.

    Raises:
        ValueError: If ``value`` is not a string, ``label`` is not an integer,
            ``source_id`` is set but not a string, or a reserved column name
            is used as a metadata key.
    """
    value = memory.get("value")
    if not isinstance(value, str):
        raise ValueError("Memory value must be a string")
    label = memory.get("label")
    if not isinstance(label, int):
        raise ValueError("Memory label must be an integer")
    source_id = memory.get("source_id")
    # Compare against None explicitly so falsy non-strings (e.g. 0, False)
    # are rejected here instead of silently slipping through to the API.
    if source_id is not None and not isinstance(source_id, str):
        raise ValueError("Memory source_id must be a string")
    metadata = LabeledMemoryInsertMetadata.from_dict({k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES})
    if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
        raise ValueError(f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}")
    return LabeledMemoryInsert(value=value, label=label, source_id=source_id, metadata=metadata)
123
+
124
+
125
def _parse_memory_update(update: dict[str, Any]) -> LabeledMemoryUpdate:
    """
    Validate a raw update dict and convert it into a `LabeledMemoryUpdate`.

    Fields that are absent from the dict are left unset and will not be
    updated. All keys besides ``memory_id``, ``value``, ``label``, and
    ``source_id`` are collected into the update's metadata.

    Raises:
        ValueError: If ``memory_id`` is missing or not a string, any field has
            an invalid type, or a reserved column name is used as a metadata key.
    """
    if "memory_id" not in update:
        raise ValueError("memory_id must be specified in the update dictionary")
    memory_id = update["memory_id"]
    if not isinstance(memory_id, str):
        raise ValueError("memory_id must be a string")
    value = update.get("value", CLIENT_UNSET)
    if value is not CLIENT_UNSET and not isinstance(value, str):
        raise ValueError("value must be a string or unset")
    label = update.get("label", CLIENT_UNSET)
    if label is not CLIENT_UNSET and not isinstance(label, int):
        raise ValueError("label must be an integer or unset")
    source_id = update.get("source_id", CLIENT_UNSET)
    # source_id may be explicitly set to None to clear it, matching the
    # `str | None` contract of `LabeledMemory.update`.
    if source_id is not CLIENT_UNSET and source_id is not None and not isinstance(source_id, str):
        raise ValueError("source_id must be a string or unset")
    metadata = LabeledMemoryUpdateMetadataType0.from_dict(
        {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id"}}
    )
    if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
        raise ValueError(f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}")
    return LabeledMemoryUpdate(memory_id=memory_id, value=value, label=label, source_id=source_id, metadata=metadata)
146
+
147
+
148
class LabeledMemory:
    """
    A row of the [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]

    Attributes:
        value: Value represented by the row
        embedding: Embedding of the value of the memory for semantic search, automatically generated
            with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
        label: Class label of the memory
        label_name: Human-readable name of the label, automatically populated from the
            [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
        source_id: Optional unique identifier of the memory in a system of reference
        metrics: Metrics about the memory, generated when running an analysis on the
            [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
        metadata: Metadata associated with the memory that is not used in the model. Metadata
            properties are also accessible as individual attributes on the instance.
        memory_id: Unique identifier for the memory, automatically generated on insert
        memory_version: Version of the memory, automatically updated when the label or value changes
        created_at: When the memory was created, automatically generated on insert
        updated_at: When the memory was last updated, automatically updated on update

    ## Other Attributes:
    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
    """

    value: str
    embedding: list[float]
    label: int
    label_name: str | None
    source_id: str | None
    created_at: datetime
    updated_at: datetime
    metadata: dict[str, str | float | int | bool | None]
    metrics: LabeledMemoryMetrics | None
    memory_id: str
    memory_version: int

    def __init__(
        self,
        memoryset_id: str,
        memory: LabeledMemoryResponse | LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
    ):
        # for internal use only, do not document
        self.memoryset_id = memoryset_id
        self.memory_id = memory.memory_id
        self.memory_version = memory.memory_version
        self.value = memory.value
        self.embedding = memory.embedding
        self.label = memory.label
        self.label_name = memory.label_name
        self.source_id = memory.source_id
        self.created_at = memory.created_at
        self.updated_at = memory.updated_at
        self.metadata = memory.metadata.to_dict()
        self.metrics = memory.metrics

    def __getattr__(self, key: str) -> Any:
        # Look up metadata through __dict__ instead of self.metadata: if this
        # is called before __init__ has populated the instance (e.g. during
        # copy/pickle), accessing self.metadata would re-enter __getattr__ and
        # recurse forever.
        metadata = self.__dict__.get("metadata")
        if key.startswith("__") or metadata is None or key not in metadata:
            raise AttributeError(f"{key} is not a valid attribute")
        return metadata[key]

    def __repr__(self) -> str:
        return (
            "LabeledMemory({ "
            + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
            + f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
            + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
            + " })"
        )

    def __eq__(self, other: object) -> bool:
        return isinstance(other, LabeledMemory) and self.memory_id == other.memory_id

    def __hash__(self) -> int:
        # __eq__ compares by memory_id, so hash by it too; without defining
        # __hash__ alongside __eq__, instances would be unhashable and could
        # not be stored in sets or used as dict keys.
        return hash(self.memory_id)

    def update(
        self,
        *,
        value: str = UNSET,
        label: int = UNSET,
        source_id: str | None = UNSET,
        **metadata: None | bool | float | int | str,
    ) -> LabeledMemory:
        """
        Update the memory with new values

        Note:
            If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.

        Params:
            value: New value of the memory
            label: New label of the memory
            source_id: New source ID of the memory
            **metadata: New values for metadata properties

        Returns:
            The updated memory
        """
        response = update_memory_gpu(
            self.memoryset_id,
            body=_parse_memory_update(
                {"memory_id": self.memory_id}
                | ({"value": value} if value is not UNSET else {})
                | ({"label": label} if label is not UNSET else {})
                | ({"source_id": source_id} if source_id is not UNSET else {})
                | metadata
            ),
        )
        # Refresh this handle in place from the server response so callers
        # holding a reference see the updated state.
        self.__dict__.update(LabeledMemory(self.memoryset_id, response).__dict__)
        return self
256
+
257
+
258
class LabeledMemoryLookup(LabeledMemory):
    """
    Lookup result for a memory in a memoryset

    Attributes:
        lookup_score: Similarity between the memory embedding and search query embedding
        attention_weight: Weight the model assigned to the memory during prediction if this lookup
            happened as part of a prediction
        value: Value represented by the row
        embedding: Embedding of the value of the memory for semantic search, automatically generated
            with the [`LabeledMemoryset.embedding_model`][orca_sdk.LabeledMemoryset]
        label: Class label of the memory
        label_name: Human-readable name of the label, automatically populated from the
            [`LabeledMemoryset.label_names`][orca_sdk.LabeledMemoryset]
        source_id: Optional unique identifier of the memory in a system of reference
        metrics: Metrics about the memory, generated when running an analysis on the
            [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
        metadata: Metadata associated with the memory that is not used in the model. Metadata
            properties are also accessible as individual attributes on the instance.
        memory_id: The unique identifier for the memory, automatically generated on insert
        memory_version: The version of the memory, automatically updated when the label or value changes
        created_at: When the memory was created, automatically generated on insert
        updated_at: When the memory was last updated, automatically updated on update

    ## Other Attributes:
    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
    """

    lookup_score: float
    attention_weight: float | None

    def __init__(self, memoryset_id: str, memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup):
        # for internal use only, do not document
        super().__init__(memoryset_id, memory_lookup)
        self.lookup_score = memory_lookup.lookup_score
        # Only prediction lookups carry an attention weight; plain lookups don't.
        if isinstance(memory_lookup, LabelPredictionMemoryLookup):
            self.attention_weight = memory_lookup.attention_weight
        else:
            self.attention_weight = None

    def __repr__(self) -> str:
        label_repr = f"<{self.label_name}: {self.label}>" if self.label_name else str(self.label)
        value_repr = self.value[:100] + "..." if len(self.value) > 100 else self.value
        parts = [f"label: {label_repr}", f"lookup_score: {self.lookup_score:.2f}"]
        if self.attention_weight is not None:
            parts.append(f"attention_weight: {self.attention_weight:.2f}")
        parts.append(f"value: '{value_repr}'")
        if self.source_id is not None:
            parts.append(f"source_id: '{self.source_id}'")
        return "LabeledMemoryLookup({ " + ", ".join(parts) + " })"
307
+
308
+
309
+ class LabeledMemoryset:
310
+ """
311
+ A Handle to a collection of memories with labels in the OrcaCloud
312
+
313
+ Attributes:
314
+ id: Unique identifier for the memoryset
315
+ name: Unique name of the memoryset
316
+ label_names: Names for the class labels in the memoryset
317
+ length: Number of memories in the memoryset
318
+ embedding_model: Embedding model used to embed the memory values for semantic search
319
+ created_at: When the memoryset was created, automatically generated on create
320
+ updated_at: When the memoryset was last updated, automatically updated on updates
321
+ """
322
+
323
+ id: str
324
+ name: str
325
+ label_names: list[str]
326
+ length: int
327
+ created_at: datetime
328
+ updated_at: datetime
329
+ insertion_status: TaskStatus
330
+ embedding_model: _EmbeddingModel
331
+
332
+ def __init__(self, metadata: LabeledMemorysetMetadata):
333
+ # for internal use only, do not document
334
+ if metadata.pretrained_embedding_model_name:
335
+ self.embedding_model = PretrainedEmbeddingModel._get(metadata.pretrained_embedding_model_name)
336
+ elif metadata.finetuned_embedding_model_id:
337
+ self.embedding_model = FinetunedEmbeddingModel.open(metadata.finetuned_embedding_model_id)
338
+ else:
339
+ raise ValueError("Either pretrained_embedding_model_name or finetuned_embedding_model_id must be provided")
340
+ self.id = metadata.id
341
+ self.name = metadata.name
342
+ self.label_names = metadata.label_names
343
+ self.length = metadata.length
344
+ self.created_at = metadata.created_at
345
+ self.updated_at = metadata.updated_at
346
+ self.insertion_status = metadata.insertion_status
347
+ self._last_refresh = datetime.now()
348
+
349
+ def __eq__(self, other) -> bool:
350
+ return isinstance(other, LabeledMemoryset) and self.id == other.id
351
+
352
+ def __repr__(self) -> str:
353
+ return (
354
+ "LabeledMemoryset({\n"
355
+ f" name: '{self.name}',\n"
356
+ f" length: {self.length},\n"
357
+ f" label_names: {self.label_names},\n"
358
+ f" embedding_model: {self.embedding_model},\n"
359
+ "})"
360
+ )
361
+
362
+ @classmethod
363
+ def create(
364
+ cls,
365
+ name: str,
366
+ datasource: Datasource,
367
+ *,
368
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
369
+ value_column: str = "value",
370
+ label_column: str = "label",
371
+ source_id_column: str | None = None,
372
+ label_names: list[str] | None = None,
373
+ max_seq_length_override: int | None = None,
374
+ if_exists: CreateMode = "error",
375
+ ) -> LabeledMemoryset:
376
+ """
377
+ Create a new memoryset in the OrcaCloud
378
+
379
+ All columns from the datasource that are not specified in the `value_column`,
380
+ `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
381
+
382
+ Params:
383
+ name: Name for the new memoryset (must be unique)
384
+ datasource: Source data to populate the memories in the memoryset
385
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
386
+ If not provided, a default embedding model for the memoryset will be used.
387
+ value_column: Name of the column in the datasource that contains the memory values
388
+ label_column: Name of the column in the datasource that contains the memory labels,
389
+ these must be contiguous integers starting from 0
390
+ source_id_column: Optional name of the column in the datasource that contains the ids in
391
+ the system of reference
392
+ label_names: List of human-readable names for the labels in the memoryset, must match
393
+ the number of labels in the `label_column`. Will be automatically inferred if a
394
+ [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
395
+ labels is used as the datasource
396
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
397
+ value is longer than this it will be truncated, will default to the model's max
398
+ sequence length if not provided
399
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
400
+ `"error"`. Other option is `"open"` to open the existing memoryset.
401
+
402
+ Returns:
403
+ Handle to the new memoryset in the OrcaCloud
404
+
405
+ Raises:
406
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
407
+ `"open"` and the params do not match those of the existing memoryset.
408
+ """
409
+ if embedding_model is None:
410
+ embedding_model = PretrainedEmbeddingModel.CDE_SMALL
411
+
412
+ logging.info(f"Checking if memoryset with name: {name} exists")
413
+ if cls.exists(name):
414
+ if if_exists == "error":
415
+ raise ValueError(f"Memoryset with name {name} already exists")
416
+ elif if_exists == "open":
417
+ existing = cls.open(name)
418
+ for attribute in {"label_names", "embedding_model"}:
419
+ if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
420
+ raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
421
+ return existing
422
+
423
+ logging.info(f"Creating memoryset with name: {name} from datasource: {datasource}")
424
+ response = create_memoryset(
425
+ body=CreateLabeledMemorysetRequest(
426
+ name=name,
427
+ datasource_id=datasource.id,
428
+ datasource_label_column=label_column,
429
+ datasource_value_column=value_column,
430
+ datasource_source_id_column=source_id_column,
431
+ pretrained_embedding_model_name=(
432
+ embedding_model._model_name if isinstance(embedding_model, PretrainedEmbeddingModel) else None
433
+ ),
434
+ finetuned_embedding_model_id=(
435
+ embedding_model.id if isinstance(embedding_model, FinetunedEmbeddingModel) else None
436
+ ),
437
+ label_names=label_names or [],
438
+ max_seq_length_override=max_seq_length_override,
439
+ ),
440
+ )
441
+ wait_for_task(response.insertion_task_id, description="Inserting datasource")
442
+ return cls.open(response.id)
443
+
444
+ @classmethod
445
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, **kwargs) -> LabeledMemoryset:
446
+ """
447
+ Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
448
+
449
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
450
+ appended with `_datasource` and use that as the datasource for the memoryset.
451
+
452
+ All features that are not specified to be used as `value_column`, `label_column`, or
453
+ `source_id_column` will be stored as metadata in the memoryset.
454
+
455
+ Params:
456
+ name: Name for the new memoryset (must be unique)
457
+ hf_dataset: Hugging Face dataset to create the memoryset from
458
+ **kwargs: Additional parameters for creating the memoryset. See
459
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
460
+
461
+
462
+ Returns:
463
+ Handle to the new memoryset in the OrcaCloud
464
+ """
465
+ datasource = Datasource.from_hf_dataset(
466
+ f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
467
+ )
468
+ logging.info(f"Datasource: {datasource}")
469
+ return cls.create(name, datasource, **kwargs)
470
+
471
+ @classmethod
472
+ def from_pytorch(
473
+ cls,
474
+ name: str,
475
+ torch_data: TorchDataLoader | TorchDataset,
476
+ *,
477
+ column_names: list[str] | None = None,
478
+ **kwargs,
479
+ ) -> LabeledMemoryset:
480
+ """
481
+ Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
482
+ [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
483
+
484
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
485
+ appended with `_datasource` and use that as the datasource for the memoryset.
486
+
487
+ All properties that are not specified to be used as `value_column`, `label_column`, or
488
+ `source_id_column` will be stored as metadata in the memoryset.
489
+
490
+ Params:
491
+ name: Name for the new memoryset (must be unique)
492
+ torch_data: PyTorch data loader or dataset to create the memoryset from
493
+ column_names: If the provided dataset or data loader returns unnamed tuples, this
494
+ argument must be provided to specify the names of the columns.
495
+ **kwargs: Additional parameters for creating the memoryset. See
496
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
497
+
498
+
499
+ Returns:
500
+ Handle to the new memoryset in the OrcaCloud
501
+ """
502
+ datasource = Datasource.from_pytorch(
503
+ f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
504
+ )
505
+ return cls.create(name, datasource, **kwargs)
506
+
507
+ @classmethod
508
+ def from_list(cls, name: str, data: list[dict], **kwargs) -> LabeledMemoryset:
509
+ """
510
+ Create a new memoryset from a list of dictionaries in the OrcaCloud
511
+
512
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
513
+ appended with `_datasource` and use that as the datasource for the memoryset.
514
+
515
+ All properties that are not specified to be used as `value_column`, `label_column`, or
516
+ `source_id_column` will be stored as metadata in the memoryset.
517
+
518
+ Params:
519
+ name: Name for the new memoryset (must be unique)
520
+ data: List of dictionaries to create the memoryset from
521
+ **kwargs: Additional parameters for creating the memoryset. See
522
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
523
+
524
+ Returns:
525
+ Handle to the new memoryset in the OrcaCloud
526
+
527
+ Examples:
528
+ >>> LabeledMemoryset.from_list("my_memoryset", [
529
+ ... {"value": "hello", "label": 0, "tag": "tag1"},
530
+ ... {"value": "world", "label": 1, "tag": "tag2"},
531
+ ... ])
532
+ """
533
+ datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
534
+ return cls.create(name, datasource, **kwargs)
535
+
536
+ @classmethod
537
+ def from_dict(cls, name: str, data: dict, **kwargs) -> LabeledMemoryset:
538
+ """
539
+ Create a new memoryset from a dictionary of columns in the OrcaCloud
540
+
541
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
542
+ appended with `_datasource` and use that as the datasource for the memoryset.
543
+
544
+ All columns from the datasource that are not specified in the `value_column`,
545
+ `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
546
+
547
+ Params:
548
+ name: Name for the new memoryset (must be unique)
549
+ data: Dictionary of columns to create the memoryset from
550
+ **kwargs: Additional parameters for creating the memoryset. See
551
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
552
+
553
+ Returns:
554
+ Handle to the new memoryset in the OrcaCloud
555
+
556
+ Examples:
557
+ >>> LabeledMemoryset.from_dict("my_memoryset", {
558
+ ... "value": ["hello", "world"],
559
+ ... "label": [0, 1],
560
+ ... "tag": ["tag1", "tag2"],
561
+ ... })
562
+ """
563
+ datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
564
+ return cls.create(name, datasource, **kwargs)
565
+
566
+ @classmethod
567
+ def from_pandas(cls, name: str, dataframe: pd.DataFrame, **kwargs) -> LabeledMemoryset:
568
+ """
569
+ Create a new memoryset from a pandas [`DataFrame`][pandas.DataFrame] in the OrcaCloud
570
+
571
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
572
+ appended with `_datasource` and use that as the datasource for the memoryset.
573
+
574
+ All columns that are not specified to be used as `value_column`, `label_column`, or
575
+ `source_id_column` will be stored as metadata in the memoryset.
576
+
577
+ Params:
578
+ name: Name for the new memoryset (must be unique)
579
+ dataframe: Dataframe to create the memoryset from
580
+ **kwargs: Additional parameters for creating the memoryset. See
581
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
582
+
583
+ Returns:
584
+ Handle to the new memoryset in the OrcaCloud
585
+ """
586
+ datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
587
+ return cls.create(name, datasource, **kwargs)
588
+
589
+ @classmethod
590
+ def from_arrow(cls, name: str, pyarrow_table: pa.Table, **kwargs) -> LabeledMemoryset:
591
+ """
592
+ Create a new memoryset from a PyArrow [`Table`][pyarrow.Table] in the OrcaCloud
593
+
594
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
595
+ appended with `_datasource` and use that as the datasource for the memoryset.
596
+
597
+ All columns that are not specified to be used as `value_column`, `label_column`, or
598
+ `source_id_column` will be stored as metadata in the memoryset.
599
+
600
+ Params:
601
+ name: Name for the new memoryset (must be unique)
602
+ pyarrow_table: PyArrow table to create the memoryset from
603
+ **kwargs: Additional parameters for creating the memoryset. See
604
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
605
+
606
+ Returns:
607
+ Handle to the new memoryset in the OrcaCloud
608
+ """
609
+ datasource = Datasource.from_arrow(
610
+ f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
611
+ )
612
+ return cls.create(name, datasource, **kwargs)
613
+
614
+ @classmethod
615
+ def from_disk(cls, name: str, file_path: str | PathLike, **kwargs) -> LabeledMemoryset:
616
+ """
617
+ Create a new memoryset from a file on disk in the OrcaCloud
618
+
619
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
620
+ appended with `_datasource` and use that as the datasource for the memoryset.
621
+
622
+ All columns from the datasource that are not specified in the `value_column`,
623
+ `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
624
+
625
+ Params:
626
+ name: Name for the new memoryset (must be unique)
627
+ file_path: Path to the file on disk to create the memoryset from. The file type will
628
+ be inferred from the file extension. The following file types are supported:
629
+
630
+ - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
631
+ - .json/.jsonl: [`JSON`][json] and [`JSON`] Lines files
632
+ - .csv: [`CSV`][csv] files
633
+ - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
634
+ - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
635
+ **kwargs: Additional parameters for creating the memoryset. See
636
+ [`create`][orca_sdk.LabeledMemoryset.create] attributes for details.
637
+
638
+ Returns:
639
+ Handle to the new memoryset in the OrcaCloud
640
+ """
641
+ datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
642
+ return cls.create(name, datasource, **kwargs)
643
+
644
+ @classmethod
645
+ def open(cls, name: str) -> LabeledMemoryset:
646
+ """
647
+ Get a handle to a memoryset in the OrcaCloud
648
+
649
+ Params:
650
+ name: Name or unique identifier of the memoryset
651
+
652
+ Returns:
653
+ Handle to the existing memoryset in the OrcaCloud
654
+
655
+ Raises:
656
+ LookupError: If the memoryset does not exist
657
+ """
658
+ metadata = get_memoryset(name)
659
+ return cls(metadata)
660
+
661
+ @classmethod
662
+ def exists(cls, name_or_id: str) -> bool:
663
+ """
664
+ Check if a memoryset exists in the OrcaCloud
665
+
666
+ Params:
667
+ name_or_id: Name or id of the memoryset
668
+
669
+ Returns:
670
+ True if the memoryset exists, False otherwise
671
+ """
672
+ try:
673
+ cls.open(name_or_id)
674
+ return True
675
+ except LookupError:
676
+ return False
677
+
678
+ @classmethod
679
+ def all(cls) -> list[LabeledMemoryset]:
680
+ """
681
+ Get a list of handles to all memorysets in the OrcaCloud
682
+
683
+ Returns:
684
+ List of handles to all memorysets in the OrcaCloud
685
+ """
686
+ return [cls(metadata) for metadata in list_memorysets()]
687
+
688
+ @classmethod
689
+ def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
690
+ """
691
+ Delete a memoryset from the OrcaCloud
692
+
693
+ Params:
694
+ name_or_id: Name or id of the memoryset
695
+ if_not_exists: What to do if the memoryset does not exist, defaults to `"error"`.
696
+ Other options are `"ignore"` to do nothing if the memoryset does not exist.
697
+
698
+ Raises:
699
+ LookupError: If the memoryset does not exist and if_not_exists is `"error"`
700
+ """
701
+ try:
702
+ delete_memoryset(name_or_id)
703
+ logging.info(f"Deleted memoryset {name_or_id}")
704
+ except LookupError:
705
+ if if_not_exists == "error":
706
+ raise
707
+
708
+ def clone(
709
+ self,
710
+ name: str,
711
+ *,
712
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
713
+ max_seq_length_override: int | None = None,
714
+ if_exists: CreateMode = "error",
715
+ ) -> LabeledMemoryset:
716
+ """
717
+ Create a clone of the memoryset with a new name
718
+
719
+ Params:
720
+ name: Name for the new memoryset (must be unique)
721
+ embedding_model: Optional new embedding model to use for re-embedding the memory values
722
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
723
+ value is longer than this it will be truncated, will default to the model's max
724
+ sequence length if not provided
725
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
726
+ `"error"`. Other option is `"open"` to open the existing memoryset.
727
+
728
+ Returns:
729
+ Handle to the cloned memoryset in the OrcaCloud
730
+
731
+ Examples:
732
+ >>> memoryset = LabeledMemoryset.open("my_memoryset")
733
+ >>> finetuned_embedding_model = PretrainedEmbeddingModel.GTE_BASE.finetune(
734
+ ... "gte_base_finetuned", my_memoryset
735
+ ... )
736
+ >>> new_memoryset = memoryset.clone(
737
+ ... "my_memoryset_finetuned", embedding_model=finetuned_embedding_model,
738
+ ... )
739
+ """
740
+ if self.exists(name):
741
+ if if_exists == "error":
742
+ raise ValueError(f"Memoryset with name {name} already exists")
743
+ elif if_exists == "open":
744
+ existing = self.open(name)
745
+ for attribute in {"embedding_model"}:
746
+ if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
747
+ raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
748
+ return existing
749
+
750
+ metadata = clone_memoryset(
751
+ self.id,
752
+ body=CloneLabeledMemorysetRequest(
753
+ name=name,
754
+ pretrained_embedding_model_name=(
755
+ embedding_model._model_name if isinstance(embedding_model, PretrainedEmbeddingModel) else None
756
+ ),
757
+ finetuned_embedding_model_id=(
758
+ embedding_model.id if isinstance(embedding_model, FinetunedEmbeddingModel) else None
759
+ ),
760
+ max_seq_length_override=max_seq_length_override,
761
+ ),
762
+ )
763
+ wait_for_task(metadata.insertion_task_id, description="Cloning memoryset")
764
+ return LabeledMemoryset.open(metadata.id)
765
+
766
+ def refresh(self, throttle: float = 0):
767
+ """
768
+ Refresh the information about the memoryset from the OrcaCloud
769
+
770
+ Params:
771
+ throttle: Minimum time in seconds between refreshes
772
+ """
773
+ current_time = datetime.now()
774
+ # Skip refresh if last refresh was too recent
775
+ if (current_time - self._last_refresh) < timedelta(seconds=throttle):
776
+ return
777
+
778
+ self.__dict__.update(LabeledMemoryset.open(self.id).__dict__)
779
+ self._last_refresh = current_time
780
+
781
+ def __len__(self) -> int:
782
+ """Get the number of memories in the memoryset"""
783
+ self.refresh(throttle=5)
784
+ return self.length
785
+
786
+ @overload
787
+ def __getitem__(self, index: int | str) -> LabeledMemory:
788
+ pass
789
+
790
+ @overload
791
+ def __getitem__(self, index: slice) -> list[LabeledMemory]:
792
+ pass
793
+
794
+ def __getitem__(self, index: int | slice | str) -> LabeledMemory | list[LabeledMemory]:
795
+ """
796
+ Get memories from the memoryset by index or memory id
797
+
798
+ Params:
799
+ index: Index or memory to retrieve or slice of memories to retrieve or unique
800
+ identifier of the memory to retrieve
801
+
802
+ Returns:
803
+ Memory or memories from the memoryset
804
+
805
+ Raises:
806
+ LookupError: If the id is not found or the index is out of bounds
807
+
808
+ Examples:
809
+ Retrieve the first memory in the memoryset:
810
+ >>> memoryset[0]
811
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
812
+
813
+ Retrieve the last memory in the memoryset:
814
+ >>> memoryset[-1]
815
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' })
816
+
817
+ Retrieve a slice of memories in the memoryset:
818
+ >>> memoryset[1:3]
819
+ [
820
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
821
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
822
+ ]
823
+
824
+ Retrieve a memory by id:
825
+ >>> memoryset["0195019a-5bc7-7afb-b902-5945ee1fb766"]
826
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
827
+ """
828
+ if isinstance(index, int):
829
+ return self.query(offset=len(self) + index if index < 0 else index, limit=1)[0]
830
+ elif isinstance(index, str):
831
+ return self.get(index)
832
+ elif isinstance(index, slice):
833
+ start = 0 if index.start is None else (len(self) + index.start) if index.start < 0 else index.start
834
+ stop = len(self) if index.stop is None else (len(self) + index.stop) if index.stop < 0 else index.stop
835
+ return self.query(offset=start, limit=stop - start)
836
+ else:
837
+ raise ValueError(f"Invalid index type: {type(index)}")
838
+
839
+ @overload
840
+ def search(self, query: str, *, count: int = 1) -> list[LabeledMemoryLookup]:
841
+ pass
842
+
843
+ @overload
844
+ def search(self, query: list[str], *, count: int = 1) -> list[list[LabeledMemoryLookup]]:
845
+ pass
846
+
847
+ def search(
848
+ self, query: str | list[str], *, count: int = 1
849
+ ) -> list[LabeledMemoryLookup] | list[list[LabeledMemoryLookup]]:
850
+ """
851
+ Search for memories that are semantically similar to the query
852
+
853
+ Params:
854
+ query: Query to lookup memories in the memoryset, can be a single query or a list
855
+ count: Number of memories to return for each query
856
+
857
+ Returns:
858
+ List of memories from the memoryset that match the query. If a single query is provided,
859
+ the return value is a list containing a single list of memories. If a list of
860
+ queries is provided, the return value is a list of lists of memories.
861
+
862
+ Examples:
863
+ Search for similar memories:
864
+ >>> memoryset.search("I am happy", count=2)
865
+ [
866
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
867
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
868
+ ]
869
+
870
+ Search for similar memories for multiple queries:
871
+ >>> memoryset.search(["I am happy", "I am sad"], count=1)
872
+ [
873
+ [
874
+ LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
875
+ ],
876
+ [
877
+ LabeledMemoryLookup({ label: <negative: 0>, value: 'I am sad' }),
878
+ ],
879
+ ]
880
+ """
881
+ response = memoryset_lookup_gpu(
882
+ name_or_id=self.id,
883
+ body=LookupRequest(
884
+ query=query if isinstance(query, list) else [query],
885
+ count=count,
886
+ ),
887
+ )
888
+ lookups = [[LabeledMemoryLookup(self.id, lookup_response) for lookup_response in batch] for batch in response]
889
+ return lookups if isinstance(query, list) else lookups[0]
890
+
891
+ def query(
892
+ self,
893
+ offset: int = 0,
894
+ limit: int = 100,
895
+ filters: list[FilterItemTuple] = [],
896
+ ) -> list[LabeledMemory]:
897
+ """
898
+ Query the memoryset for memories that match the filters
899
+
900
+ Params:
901
+ offset: The offset of the first memory to return
902
+ limit: The maximum number of memories to return
903
+ filters: List of filters to apply to the query.
904
+
905
+ Returns:
906
+ List of memories from the memoryset that match the filters
907
+
908
+ Examples:
909
+ >>> memoryset.query(filters=[("label", "==", 0)], limit=2)
910
+ [
911
+ LabeledMemory({ label: <positive: 1>, value: "I am happy" }),
912
+ LabeledMemory({ label: <negative: 0>, value: "I am sad" }),
913
+ ]
914
+ """
915
+ return [
916
+ LabeledMemory(self.id, memory)
917
+ for memory in query_memoryset(
918
+ self.id,
919
+ body=ListMemoriesRequest(
920
+ offset=offset,
921
+ limit=limit,
922
+ filters=[
923
+ _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter
924
+ for filter in filters
925
+ ],
926
+ ),
927
+ )
928
+ ]
929
+
930
+ def insert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
931
+ """
932
+ Insert memories into the memoryset
933
+
934
+ Params:
935
+ memories: List of memories to insert into the memoryset. This should be a list of
936
+ dictionaries with the following keys:
937
+
938
+ - `value`: Value of the memory
939
+ - `label`: Label of the memory
940
+ - `source_id`: Optional unique ID of the memory in a system of reference
941
+ - `...`: Any other metadata to store for the memory
942
+
943
+ Examples:
944
+ >>> memoryset.insert([
945
+ ... {"value": "I am happy", "label": 1, "source_id": "user_123", "tag": "happy"},
946
+ ... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
947
+ ... ])
948
+ """
949
+ insert_memories_gpu(
950
+ self.id,
951
+ body=(
952
+ [
953
+ _parse_memory_insert(memory)
954
+ for memory in (cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else items)
955
+ ]
956
+ ),
957
+ )
958
+ self.refresh()
959
+
960
+ @overload
961
+ def get(self, memory_id: str) -> LabeledMemory: # type: ignore -- this takes precedence
962
+ pass
963
+
964
+ @overload
965
+ def get(self, memory_id: Iterable[str]) -> list[LabeledMemory]:
966
+ pass
967
+
968
+ def get(self, memory_id: str | Iterable[str]) -> LabeledMemory | list[LabeledMemory]:
969
+ """
970
+ Fetch a memory or memories from the memoryset
971
+
972
+ Params:
973
+ memory_id: Unique identifier of the memory or memories to fetch
974
+
975
+ Returns:
976
+ Memory or list of memories from the memoryset
977
+
978
+ Raises:
979
+ LookupError: If no memory with the given id is found
980
+
981
+ Examples:
982
+ Fetch a single memory:
983
+ >>> memoryset.get("0195019a-5bc7-7afb-b902-5945ee1fb766")
984
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' })
985
+
986
+ Fetch multiple memories:
987
+ >>> memoryset.get([
988
+ ... "0195019a-5bc7-7afb-b902-5945ee1fb766",
989
+ ... "019501a1-ea08-76b2-9f62-95e4800b4841",
990
+ ... ])
991
+ [
992
+ LabeledMemory({ label: <positive: 1>, value: 'I am happy' }),
993
+ LabeledMemory({ label: <negative: 0>, value: 'I am sad' }),
994
+ ]
995
+ """
996
+ if isinstance(memory_id, str):
997
+ return LabeledMemory(self.id, get_memory(self.id, memory_id))
998
+ else:
999
+ return [
1000
+ LabeledMemory(self.id, memory)
1001
+ for memory in get_memories(self.id, body=GetMemoriesRequest(memory_ids=list(memory_id)))
1002
+ ]
1003
+
1004
+ @overload
1005
+ def update(self, updates: dict[str, Any]) -> LabeledMemory:
1006
+ pass
1007
+
1008
+ @overload
1009
+ def update(self, updates: Iterable[dict[str, Any]]) -> list[LabeledMemory]:
1010
+ pass
1011
+
1012
+ def update(self, updates: dict[str, Any] | Iterable[dict[str, Any]]) -> LabeledMemory | list[LabeledMemory]:
1013
+ """
1014
+ Update one or multiple memories in the memoryset
1015
+
1016
+ Params:
1017
+ updates: List of updates to apply to the memories. Each update should be a dictionary
1018
+ with the following keys:
1019
+
1020
+ - `memory_id`: Unique identifier of the memory to update (required)
1021
+ - `value`: Optional new value of the memory
1022
+ - `label`: Optional new label of the memory
1023
+ - `source_id`: Optional new source ID of the memory
1024
+ - `...`: Optional new values for metadata properties
1025
+
1026
+ Returns:
1027
+ Updated memory or list of updated memories
1028
+
1029
+ Examples:
1030
+ Update a single memory:
1031
+ >>> memoryset.update(
1032
+ ... {
1033
+ ... "memory_id": "019501a1-ea08-76b2-9f62-95e4800b4841",
1034
+ ... "tag": "happy",
1035
+ ... },
1036
+ ... )
1037
+
1038
+ Update multiple memories:
1039
+ >>> memoryset.update(
1040
+ ... {"memory_id": m.memory_id, "label": 2}
1041
+ ... for m in memoryset.query(filters=[("tag", "==", "happy")])
1042
+ ... )
1043
+ """
1044
+ response = update_memories_gpu(
1045
+ self.id,
1046
+ body=[
1047
+ _parse_memory_update(update)
1048
+ for update in (cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else updates)
1049
+ ],
1050
+ )
1051
+ updated_memories = [LabeledMemory(self.id, memory) for memory in response]
1052
+ return updated_memories[0] if isinstance(updates, dict) else updated_memories
1053
+
1054
+ def delete(self, memory_id: str | Iterable[str]) -> None:
1055
+ """
1056
+ Delete memories from the memoryset
1057
+
1058
+ Params:
1059
+ memory_ids: unique identifiers of the memories to delete
1060
+
1061
+ Examples:
1062
+ Delete a single memory:
1063
+ >>> memoryset.delete("0195019a-5bc7-7afb-b902-5945ee1fb766")
1064
+
1065
+ Delete multiple memories:
1066
+ >>> memoryset.delete(
1067
+ ... "0195019a-5bc7-7afb-b902-5945ee1fb766",
1068
+ ... "019501a1-ea08-76b2-9f62-95e4800b4841",
1069
+ ... )
1070
+
1071
+ """
1072
+ memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
1073
+ delete_memories(self.id, body=DeleteMemoriesRequest(memory_ids=memory_ids))
1074
+ logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
1075
+ self.refresh()
1076
+
1077
+ def find_duplicates(self) -> dict:
1078
+ """
1079
+ Run an analysis to find duplicate memories in the memoryset
1080
+
1081
+ The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
1082
+ attribute of each memory in the memoryset.
1083
+
1084
+ Returns:
1085
+ Summary of analysis with number of duplicate memories found
1086
+
1087
+ Examples:
1088
+ >>> memoryset.find_duplicate_memories()
1089
+ { "num_duplicates": 10 }
1090
+ >>> memoryset.delete(
1091
+ ... m.memory_id
1092
+ ... for m in memoryset.query(
1093
+ ... filters=[("metrics.is_duplicate", "==", True)]
1094
+ ... )
1095
+ ... )
1096
+ """
1097
+ analysis = create_analysis(
1098
+ self.id,
1099
+ body=MemorysetAnalysisRequest(
1100
+ type=MemorysetAnalysisRequestType.ANALYZE_DUPLICATE_MEMORIES,
1101
+ ),
1102
+ )
1103
+ wait_for_task(analysis.task_id, description="Analyzing duplicates")
1104
+ analysis = get_analysis(self.id, analysis.task_id)
1105
+ assert isinstance(analysis.result, FindDuplicatesAnalysisResult)
1106
+ # TODO: return a custom duplicate analysis class instance with helper methods
1107
+ return analysis.result.to_dict()
1108
+
1109
+ def analyze_labels(self, neighbor_count: int = 10) -> dict:
1110
+ """
1111
+ Run an analysis to access if the labels in the memoryset are consistent to detect possibly
1112
+ mislabeled memories.
1113
+
1114
+ The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
1115
+ attribute of each memory in the memoryset.
1116
+
1117
+ Returns:
1118
+ Summary of analysis with aggregate metrics for each label class
1119
+
1120
+ Examples:
1121
+ >>> memoryset.analyze_labels()
1122
+ {
1123
+ "label_metrics": [{
1124
+ "label": 0,
1125
+ "label_name": "negative",
1126
+ "average_lookup_score": 0.95,
1127
+ "memory_count": 100,
1128
+ }, {
1129
+ "label": 1,
1130
+ "label_name": "positive",
1131
+ "average_lookup_score": 0.90,
1132
+ "memory_count": 100,
1133
+ }]
1134
+ }
1135
+ >>> memoryset.display_label_analysis()
1136
+ """
1137
+ analysis = create_analysis(
1138
+ self.id,
1139
+ body=MemorysetAnalysisRequest(
1140
+ type=MemorysetAnalysisRequestType.ANALYZE_MEMORY_NEIGHBOR_LABELS,
1141
+ neighbor_count=neighbor_count,
1142
+ ),
1143
+ )
1144
+ wait_for_task(analysis.task_id, description="Analyzing labels")
1145
+ analysis = get_analysis(self.id, analysis.task_id)
1146
+ assert isinstance(analysis.result, AnalyzeNeighborLabelsResult)
1147
+ # TODO: return a custom label analysis class instance with helper methods
1148
+ return analysis.result.to_dict()
1149
+
1150
+ def display_label_analysis(self):
1151
+ from ._utils.analysis_ui import display_suggested_memory_relabels
1152
+
1153
+ """Display a UI to review and act upon the label analysis results"""
1154
+ display_suggested_memory_relabels(self)