orca-sdk 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +10 -4
- orca_sdk/_shared/__init__.py +10 -0
- orca_sdk/_shared/metrics.py +393 -0
- orca_sdk/_shared/metrics_test.py +273 -0
- orca_sdk/_utils/analysis_ui.py +12 -10
- orca_sdk/_utils/analysis_ui_style.css +0 -3
- orca_sdk/_utils/auth.py +31 -29
- orca_sdk/_utils/data_parsing.py +28 -2
- orca_sdk/_utils/data_parsing_test.py +15 -15
- orca_sdk/_utils/pagination.py +126 -0
- orca_sdk/_utils/pagination_test.py +132 -0
- orca_sdk/_utils/prediction_result_ui.py +67 -21
- orca_sdk/_utils/tqdm_file_reader.py +12 -0
- orca_sdk/_utils/value_parser.py +45 -0
- orca_sdk/_utils/value_parser_test.py +39 -0
- orca_sdk/async_client.py +3795 -0
- orca_sdk/classification_model.py +601 -129
- orca_sdk/classification_model_test.py +415 -117
- orca_sdk/client.py +3787 -0
- orca_sdk/conftest.py +184 -38
- orca_sdk/credentials.py +162 -20
- orca_sdk/credentials_test.py +100 -16
- orca_sdk/datasource.py +268 -68
- orca_sdk/datasource_test.py +266 -18
- orca_sdk/embedding_model.py +434 -82
- orca_sdk/embedding_model_test.py +66 -33
- orca_sdk/job.py +343 -0
- orca_sdk/job_test.py +108 -0
- orca_sdk/memoryset.py +1690 -324
- orca_sdk/memoryset_test.py +456 -119
- orca_sdk/regression_model.py +694 -0
- orca_sdk/regression_model_test.py +378 -0
- orca_sdk/telemetry.py +460 -143
- orca_sdk/telemetry_test.py +43 -24
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.3.dist-info}/METADATA +34 -16
- orca_sdk-0.1.3.dist-info/RECORD +41 -0
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.3.dist-info}/WHEEL +1 -1
- orca_sdk/_generated_api_client/__init__.py +0 -3
- orca_sdk/_generated_api_client/api/__init__.py +0 -193
- orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +0 -128
- orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +0 -170
- orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +0 -130
- orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +0 -183
- orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +0 -168
- orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +0 -154
- orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +0 -161
- orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +0 -190
- orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +0 -167
- orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +0 -127
- orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_get.py +0 -118
- orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +0 -118
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +0 -168
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +0 -189
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +0 -183
- orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +0 -168
- orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +0 -167
- orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +0 -184
- orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +0 -260
- orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +0 -193
- orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +0 -191
- orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +0 -187
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +0 -188
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +0 -157
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +0 -154
- orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +0 -156
- orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +0 -243
- orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +0 -162
- orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +0 -157
- orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +0 -127
- orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +0 -175
- orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +0 -171
- orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +0 -181
- orca_sdk/_generated_api_client/client.py +0 -216
- orca_sdk/_generated_api_client/errors.py +0 -38
- orca_sdk/_generated_api_client/models/__init__.py +0 -159
- orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +0 -84
- orca_sdk/_generated_api_client/models/api_key_metadata.py +0 -118
- orca_sdk/_generated_api_client/models/base_model.py +0 -55
- orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +0 -176
- orca_sdk/_generated_api_client/models/classification_evaluation_result.py +0 -114
- orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +0 -150
- orca_sdk/_generated_api_client/models/column_info.py +0 -114
- orca_sdk/_generated_api_client/models/column_type.py +0 -14
- orca_sdk/_generated_api_client/models/conflict_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/create_api_key_request.py +0 -99
- orca_sdk/_generated_api_client/models/create_api_key_response.py +0 -126
- orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +0 -259
- orca_sdk/_generated_api_client/models/create_rac_model_request.py +0 -209
- orca_sdk/_generated_api_client/models/datasource_metadata.py +0 -142
- orca_sdk/_generated_api_client/models/delete_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/embed_request.py +0 -127
- orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +0 -9
- orca_sdk/_generated_api_client/models/evaluation_request.py +0 -180
- orca_sdk/_generated_api_client/models/evaluation_response.py +0 -140
- orca_sdk/_generated_api_client/models/feedback_type.py +0 -9
- orca_sdk/_generated_api_client/models/field_validation_error.py +0 -103
- orca_sdk/_generated_api_client/models/filter_item.py +0 -231
- orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +0 -15
- orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +0 -16
- orca_sdk/_generated_api_client/models/filter_item_op.py +0 -16
- orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +0 -70
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +0 -259
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +0 -66
- orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +0 -166
- orca_sdk/_generated_api_client/models/get_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/internal_server_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/label_class_metrics.py +0 -108
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +0 -274
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/label_prediction_result.py +0 -101
- orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +0 -232
- orca_sdk/_generated_api_client/models/labeled_memory.py +0 -197
- orca_sdk/_generated_api_client/models/labeled_memory_insert.py +0 -108
- orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +0 -258
- orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +0 -277
- orca_sdk/_generated_api_client/models/labeled_memory_update.py +0 -171
- orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +0 -195
- orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_memories_request.py +0 -104
- orca_sdk/_generated_api_client/models/list_predictions_request.py +0 -234
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +0 -9
- orca_sdk/_generated_api_client/models/lookup_request.py +0 -81
- orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +0 -83
- orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +0 -9
- orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +0 -180
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +0 -66
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +0 -9
- orca_sdk/_generated_api_client/models/not_found_error_response.py +0 -100
- orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +0 -20
- orca_sdk/_generated_api_client/models/prediction_feedback.py +0 -157
- orca_sdk/_generated_api_client/models/prediction_feedback_category.py +0 -115
- orca_sdk/_generated_api_client/models/prediction_feedback_request.py +0 -122
- orca_sdk/_generated_api_client/models/prediction_feedback_result.py +0 -102
- orca_sdk/_generated_api_client/models/prediction_request.py +0 -169
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +0 -97
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +0 -11
- orca_sdk/_generated_api_client/models/rac_head_type.py +0 -11
- orca_sdk/_generated_api_client/models/rac_model_metadata.py +0 -191
- orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/task.py +0 -198
- orca_sdk/_generated_api_client/models/task_status.py +0 -14
- orca_sdk/_generated_api_client/models/task_status_info.py +0 -133
- orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +0 -72
- orca_sdk/_generated_api_client/models/unauthorized_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +0 -94
- orca_sdk/_generated_api_client/models/update_prediction_request.py +0 -93
- orca_sdk/_generated_api_client/py.typed +0 -1
- orca_sdk/_generated_api_client/types.py +0 -56
- orca_sdk/_utils/task.py +0 -73
- orca_sdk-0.1.1.dist-info/RECORD +0 -175
orca_sdk/memoryset.py
CHANGED
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 import logging
+from abc import ABC
 from datetime import datetime, timedelta
 from os import PathLike
-from typing import Any, Iterable, Literal, cast, overload
+from typing import Any, Generic, Iterable, Literal, Self, TypeVar, cast, overload
 
 import pandas as pd
 import pyarrow as pa
@@ -11,62 +12,62 @@ from datasets import Dataset
 from torch.utils.data import DataLoader as TorchDataLoader
 from torch.utils.data import Dataset as TorchDataset
 
-from .
-
-
-
-
-
-    get_analysis,
-    get_memories,
-    get_memory,
-    get_memoryset,
-    insert_memories_gpu,
-    list_memorysets,
-    memoryset_lookup_gpu,
-    query_memoryset,
-    update_memories_gpu,
-    update_memory_gpu,
-)
-from ._generated_api_client.models import (
-    AnalyzeNeighborLabelsResult,
-    CloneLabeledMemorysetRequest,
-    CreateLabeledMemorysetRequest,
-    DeleteMemoriesRequest,
+from ._utils.common import UNSET, CreateMode, DropMode
+from .async_client import OrcaAsyncClient
+from .client import (
+    CascadingEditSuggestion,
+    CloneMemorysetRequest,
+    CreateMemorysetRequest,
     FilterItem,
-    FilterItemOp,
-    FindDuplicatesAnalysisResult,
-    GetMemoriesRequest,
 )
-from .
-from .
+from .client import LabeledMemory as LabeledMemoryResponse
+from .client import (
     LabeledMemoryInsert,
-    LabeledMemoryInsertMetadata,
 )
-from .
-
-)
-from ._generated_api_client.models import (
-    LabeledMemoryMetrics,
-    LabeledMemorysetMetadata,
+from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
+from .client import (
     LabeledMemoryUpdate,
-
+    LabeledMemoryWithFeedbackMetrics,
     LabelPredictionMemoryLookup,
-
-
-
-
-
+    MemoryMetrics,
+    MemorysetAnalysisConfigs,
+    MemorysetMetadata,
+    MemorysetMetrics,
+    MemorysetUpdate,
+    MemoryType,
+    OrcaClient,
+)
+from .client import ScoredMemory as ScoredMemoryResponse
+from .client import (
+    ScoredMemoryInsert,
+)
+from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
+from .client import (
+    ScoredMemoryUpdate,
+    ScoredMemoryWithFeedbackMetrics,
+    ScorePredictionMemoryLookup,
+    TelemetryFilterItem,
+    TelemetrySortOptions,
 )
-from ._generated_api_client.types import UNSET as CLIENT_UNSET
-from ._utils.common import UNSET, CreateMode, DropMode
-from ._utils.task import wait_for_task
 from .datasource import Datasource
 from .embedding_model import (
+    EmbeddingModelBase,
     FinetunedEmbeddingModel,
     PretrainedEmbeddingModel,
-    _EmbeddingModel,
 )
+from .job import Job, Status
+
+TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
+"""
+Sort expression for telemetry data consisting of a field and a direction.
+
+* **`field`**: The field to sort on.
+* **`direction`**: The direction to sort in.
+
+Examples:
+    >>> ("feedback_metrics.accuracy.avg", "asc")
+    >>> ("lookup.count", "desc")
+"""
 
 FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"]
 """
@@ -90,62 +91,250 @@ Examples:
     >>> ("label", "==", 0)
     >>> ("metadata.author", "like", "John")
     >>> ("source_id", "in", ["123", "456"])
+    >>> ("feedback_metrics.accuracy.avg", ">", 0.95)
 """
 
+IndexType = Literal["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ", "HNSW", "DISKANN"]
 
-DEFAULT_COLUMN_NAMES = {"value", "
-
+DEFAULT_COLUMN_NAMES = {"value", "source_id"}
+TYPE_SPECIFIC_COLUMN_NAMES = {"label", "score"}
+FORBIDDEN_METADATA_COLUMN_NAMES = {
+    "memory_id",
+    "memory_version",
+    "embedding",
+    "created_at",
+    "updated_at",
+    "metrics",
+    "feedback_metrics",
+    "lookup",
+}
 
 
-def
+def _is_metric_column(column: str):
+    return column in ["feedback_metrics", "lookup"]
+
+
+def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem | TelemetryFilterItem:
     field = input[0].split(".")
-    if
+    if (
+        len(field) == 1
+        and field[0] not in DEFAULT_COLUMN_NAMES | TYPE_SPECIFIC_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES
+    ):
         field = ["metadata", field[0]]
-    op =
+    op = input[1]
     value = input[2]
+    if isinstance(value, datetime):
+        value = value.isoformat()
+    if _is_metric_column(field[0]):
+        if not (
+            (isinstance(value, list) and all(isinstance(v, float) or isinstance(v, int) for v in value))
+            or isinstance(value, float)
+            or isinstance(value, int)
+        ):
+            raise ValueError(f"Invalid value for {field[0]} filter: {value}")
+        if field[0] == "feedback_metrics" and (len(field) != 3 or field[2] not in ["avg", "count"]):
+            raise ValueError(
+                "Feedback metrics filters must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
+            )
+        elif field[0] == "lookup" and (len(field) != 2 or field[1] != "count"):
+            raise ValueError("Lookup filters must follow the format `lookup.count`")
+        if op == "like":
+            raise ValueError("Like filters are not supported on metric columns")
+        op = cast(Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in"], op)
+        value = cast(float | int | list[float] | list[int], value)
+        return TelemetryFilterItem(field=field, op=op, value=value)
+
     return FilterItem(field=field, op=op, value=value)
 
 
-def
+def _parse_sort_item_from_tuple(
+    input: TelemetrySortItem,
+) -> TelemetrySortOptions:
+    field = input[0].split(".")
+
+    if len(field) == 1:
+        raise ValueError("Sort field must be a telemetry field with an aggregate function name value")
+    if field[0] not in ["feedback_metrics", "lookup"]:
+        raise ValueError("Sort field must be one of telemetry fields: feedback_metrics or lookup")
+    if field[0] == "feedback_metrics":
+        if len(field) != 3:
+            raise ValueError(
+                "Feedback metrics must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
+            )
+        if field[2] not in ["avg", "count"]:
+            raise ValueError("Feedback metrics can only be sorted on avg or count")
+    if field[0] == "lookup":
+        if len(field) != 2:
+            raise ValueError("Lookup must follow the format `lookup.count`")
+        if field[1] != "count":
+            raise ValueError("Lookup can only be sorted on count")
+    return TelemetrySortOptions(field=field, direction=input[1])
+
+
+def _parse_memory_insert(memory: dict[str, Any], type: MemoryType) -> LabeledMemoryInsert | ScoredMemoryInsert:
     value = memory.get("value")
     if not isinstance(value, str):
         raise ValueError("Memory value must be a string")
-    label = memory.get("label")
-    if not isinstance(label, int):
-        raise ValueError("Memory label must be an integer")
     source_id = memory.get("source_id")
     if source_id and not isinstance(source_id, str):
         raise ValueError("Memory source_id must be a string")
-
-
-
-
-
-
-
+    match type:
+        case "LABELED":
+            label = memory.get("label")
+            if label is not None and not isinstance(label, int):
+                raise ValueError("Memory label must be an integer")
+            metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"label"}}
+            if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                raise ValueError(
+                    f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                )
+            return {"value": value, "label": label, "source_id": source_id, "metadata": metadata}
+        case "SCORED":
+            score = memory.get("score")
+            if score is not None and not isinstance(score, (int, float)):
+                raise ValueError("Memory score must be a number")
+            metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"score"}}
+            if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                raise ValueError(
+                    f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                )
+            return {"value": value, "score": score, "source_id": source_id, "metadata": metadata}
+
+
+def _parse_memory_update(update: dict[str, Any], type: MemoryType) -> LabeledMemoryUpdate | ScoredMemoryUpdate:
     if "memory_id" not in update:
         raise ValueError("memory_id must be specified in the update dictionary")
     memory_id = update["memory_id"]
     if not isinstance(memory_id, str):
         raise ValueError("memory_id must be a string")
-
-    if value
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    payload: LabeledMemoryUpdate | ScoredMemoryUpdate = {"memory_id": memory_id}
+    if "value" in update:
+        if not isinstance(update["value"], str):
+            raise ValueError("value must be a string or unset")
+        payload["value"] = update["value"]
+    if "source_id" in update:
+        if not isinstance(update["source_id"], str):
+            raise ValueError("source_id must be a string or unset")
+        payload["source_id"] = update["source_id"]
+    match type:
+        case "LABELED":
+            payload = cast(LabeledMemoryUpdate, payload)
+            if "label" in update:
+                if not isinstance(update["label"], int):
+                    raise ValueError("label must be an integer or unset")
+                payload["label"] = update["label"]
+            metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "label"}}
+            if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                raise ValueError(
+                    f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                )
+            payload["metadata"] = metadata
+            return payload
+        case "SCORED":
+            payload = cast(ScoredMemoryUpdate, payload)
+            if "score" in update:
+                if not isinstance(update["score"], (int, float)):
+                    raise ValueError("score must be a number or unset")
+                payload["score"] = update["score"]
+            metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "score"}}
+            if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
+                raise ValueError(
+                    f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
+                )
+            payload["metadata"] = metadata
+            return cast(ScoredMemoryUpdate, payload)
+
+
+class MemoryBase(ABC):
+    value: str
+    embedding: list[float]
+    source_id: str | None
+    created_at: datetime
+    updated_at: datetime
+    metadata: dict[str, str | float | int | bool | None]
+    metrics: MemoryMetrics
+    memory_id: str
+    memory_version: int
+    feedback_metrics: dict[str, Any]
+    lookup_count: int
+    memory_type: MemoryType  # defined by subclasses
+
+    def __init__(
+        self,
+        memoryset_id: str,
+        memory: (
+            LabeledMemoryResponse
+            | LabeledMemoryLookupResponse
+            | LabeledMemoryWithFeedbackMetrics
+            | LabelPredictionMemoryLookup
+            | ScoredMemoryResponse
+            | ScoredMemoryLookupResponse
+            | ScoredMemoryWithFeedbackMetrics
+            | ScorePredictionMemoryLookup
+        ),
+    ):
+        # for internal use only, do not document
+        self.memoryset_id = memoryset_id
+        self.memory_id = memory["memory_id"]
+        self.memory_version = memory["memory_version"]
+        self.value = cast(str, memory["value"])
+        self.embedding = memory["embedding"]
+        self.source_id = memory["source_id"]
+        self.created_at = datetime.fromisoformat(memory["created_at"])
+        self.updated_at = datetime.fromisoformat(memory["updated_at"])
+        self.metadata = memory["metadata"]
+        self.metrics = memory["metrics"] if "metrics" in memory else {}
+        self.feedback_metrics = memory.get("feedback_metrics", {}) or {}
+        self.lookup_count = memory.get("lookup_count", 0)
+
+    def __getattr__(self, key: str) -> Any:
+        if key.startswith("__") or key not in self.metadata:
+            raise AttributeError(f"{key} is not a valid attribute")
+        return self.metadata[key]
+
+    def _update(
+        self,
+        *,
+        value: str = UNSET,
+        source_id: str | None = UNSET,
+        **metadata: None | bool | float | int | str,
+    ) -> Self:
+        client = OrcaClient._resolve_client()
+        response = client.PATCH(
+            "/gpu/memoryset/{name_or_id}/memory",
+            params={"name_or_id": self.memoryset_id},
+            json=_parse_memory_update(
+                {"memory_id": self.memory_id}
+                | ({"value": value} if value is not UNSET else {})
+                | ({"source_id": source_id} if source_id is not UNSET else {})
+                | {k: v for k, v in metadata.items() if v is not UNSET},
+                type=self.memory_type,
+            ),
+        )
+        self.__dict__.update(self.__class__(self.memoryset_id, response).__dict__)
+        return self
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Convert the memory to a dictionary
+        """
+        return {
+            "value": self.value,
+            "embedding": self.embedding,
+            "source_id": self.source_id,
+            "created_at": self.created_at,
+            "updated_at": self.updated_at,
+            "metadata": self.metadata,
+            "metrics": self.metrics,
+            "memory_id": self.memory_id,
+            "memory_version": self.memory_version,
+            "feedback_metrics": self.feedback_metrics,
+            "lookup_count": self.lookup_count,
+            "memory_type": self.memory_type,
+        }
+
+
+class LabeledMemory(MemoryBase):
     """
     A row of the [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
 
@@ -170,47 +359,30 @@ class LabeledMemory:
     * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
     """
 
-
-    embedding: list[float]
-    label: int
+    label: int | None
     label_name: str | None
-
-    created_at: datetime
-    updated_at: datetime
-    metadata: dict[str, str | float | int | bool | None]
-    metrics: LabeledMemoryMetrics | None
-    memory_id: str
-    memory_version: int
+    memory_type = "LABELED"
 
     def __init__(
         self,
         memoryset_id: str,
-        memory:
+        memory: (
+            LabeledMemoryResponse
+            | LabeledMemoryLookupResponse
+            | LabelPredictionMemoryLookup
+            | LabeledMemoryWithFeedbackMetrics
+        ),
     ):
         # for internal use only, do not document
-
-        self.
-        self.
-        self.value = memory.value
-        self.embedding = memory.embedding
-        self.label = memory.label
-        self.label_name = memory.label_name
-        self.source_id = memory.source_id
-        self.created_at = memory.created_at
-        self.updated_at = memory.updated_at
-        self.metadata = memory.metadata.to_dict()
-        self.metrics = memory.metrics
-
-    def __getattr__(self, key: str) -> Any:
-        if key.startswith("__") or key not in self.metadata:
-            raise AttributeError(f"{key} is not a valid attribute")
-        return self.metadata[key]
+        super().__init__(memoryset_id, memory)
+        self.label = memory["label"]
+        self.label_name = memory["label_name"]
 
     def __repr__(self) -> str:
         return (
             "LabeledMemory({ "
             + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
-            + f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
+            + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
             + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
             + " })"
         )
@@ -222,7 +394,7 @@ class LabeledMemory:
         self,
         *,
         value: str = UNSET,
-        label: int = UNSET,
+        label: int | None = UNSET,
         source_id: str | None = UNSET,
         **metadata: None | bool | float | int | str,
     ) -> LabeledMemory:
@@ -241,19 +413,18 @@ class LabeledMemory:
         Returns:
             The updated memory
         """
-
-            self.memoryset_id,
-            body=_parse_memory_update(
-                {"memory_id": self.memory_id}
-                | ({"value": value} if value is not UNSET else {})
-                | ({"label": label} if label is not UNSET else {})
-                | ({"source_id": source_id} if source_id is not UNSET else {})
-                | metadata
-            ),
-        )
-        self.__dict__.update(LabeledMemory(self.memoryset_id, response).__dict__)
+        self._update(value=value, label=label, source_id=source_id, **metadata)
         return self
 
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Convert the memory to a dictionary
+        """
+        super_dict = super().to_dict()
+        super_dict["label"] = self.label
+        super_dict["label_name"] = self.label_name
+        return super_dict
+
 
 class LabeledMemoryLookup(LabeledMemory):
     """
@@ -289,10 +460,8 @@ class LabeledMemoryLookup(LabeledMemory):
     def __init__(self, memoryset_id: str, memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup):
         # for internal use only, do not document
         super().__init__(memoryset_id, memory_lookup)
-        self.lookup_score = memory_lookup
-        self.attention_weight =
-            memory_lookup.attention_weight if isinstance(memory_lookup, LabelPredictionMemoryLookup) else None
-        )
+        self.lookup_score = memory_lookup["lookup_score"]
+        self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
 
     def __repr__(self) -> str:
         return (
@@ -300,20 +469,155 @@ class LabeledMemoryLookup(LabeledMemory):
             + f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
             + f", lookup_score: {self.lookup_score:.2f}"
             + (f", attention_weight: {self.attention_weight:.2f}" if self.attention_weight is not None else "")
-            + f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
+            + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
+            + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
+            + " })"
+        )
+
+
+class ScoredMemory(MemoryBase):
+    """
+    A row of the [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
+
+    Attributes:
+        value: Value represented by the row
+        embedding: Embedding of the value of the memory for semantic search, automatically generated
+            with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
+        score: Score of the memory
+        source_id: Optional unique identifier of the memory in a system of reference
+        metrics: Metrics about the memory, generated when running an analysis on the
+            [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
+        metadata: Metadata associated with the memory that is not used in the model. Metadata
+            properties are also accessible as individual attributes on the instance.
+        memory_id: Unique identifier for the memory, automatically generated on insert
+        memory_version: Version of the memory, automatically updated when the score or value changes
+        created_at: When the memory was created, automatically generated on insert
+        updated_at: When the memory was last updated, automatically updated on update
+
+    ## Other Attributes:
+    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
+    """
+
+    score: float | None
+    memory_type = "SCORED"
+
+    def __init__(
+        self,
+        memoryset_id: str,
+        memory: (
+            ScoredMemoryResponse
+            | ScoredMemoryLookupResponse
+            | ScorePredictionMemoryLookup
+            | ScoredMemoryWithFeedbackMetrics
+        ),
+    ):
+        # for internal use only, do not document
+        super().__init__(memoryset_id, memory)
+        self.score = memory["score"]
+
+    def __repr__(self) -> str:
+        return (
+            "ScoredMemory({ "
+            + f"score: {self.score:.2f}"
+            + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
+            + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
+            + " })"
+        )
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, ScoredMemory) and self.memory_id == other.memory_id
+
+    def update(
+        self,
+        *,
+        value: str = UNSET,
+        score: float | None = UNSET,
+        source_id: str | None = UNSET,
+        **metadata: None | bool | float | int | str,
+    ) -> ScoredMemory:
+        """
+        Update the memory with new values
+
+        Note:
+            If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
+
+        Params:
+            value: New value of the memory
+            score: New score of the memory
+            source_id: New source ID of the memory
+            **metadata: New values for metadata properties
+
+        Returns:
+            The updated memory
+        """
+        self._update(value=value, score=score, source_id=source_id, **metadata)
+        return self
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Convert the memory to a dictionary
+        """
+        super_dict = super().to_dict()
+        super_dict["score"] = self.score
+        return super_dict
+
+
+class ScoredMemoryLookup(ScoredMemory):
+    """
+    Lookup result for a memory in a memoryset
+
+    Attributes:
+        lookup_score: Similarity between the memory embedding and search query embedding
+        attention_weight: Weight the model assigned to the memory during prediction if this lookup
+            happened as part of a prediction
+        value: Value represented by the row
+        embedding: Embedding of the value of the memory for semantic search, automatically generated
+            with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
+        score: Score of the memory
+        source_id: Optional unique identifier of the memory in a system of reference
+        metrics: Metrics about the memory, generated when running an analysis on the
+            [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
+        memory_id: The unique identifier for the memory, automatically generated on insert
+        memory_version: The version of the memory, automatically updated when the score or value changes
+        created_at: When the memory was created, automatically generated on insert
+        updated_at: When the memory was last updated, automatically updated on update
+
+    ## Other Attributes:
+    * **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
+    """
+
+    lookup_score: float
+    attention_weight: float | None
+
+    def __init__(self, memoryset_id: str, memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup):
+        # for internal use only, do not document
+        super().__init__(memoryset_id, memory_lookup)
+        self.lookup_score = memory_lookup["lookup_score"]
+        self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
+
+    def __repr__(self) -> str:
+        return (
+            "ScoredMemoryLookup({ "
+            + f"score: {self.score:.2f}"
+            + f", lookup_score: {self.lookup_score:.2f}"
+            + f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
             + (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
             + " })"
         )
 
 
-
+MemoryT = TypeVar("MemoryT", bound=MemoryBase)
+MemoryLookupT = TypeVar("MemoryLookupT", bound=MemoryBase)
+
+
+class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
     """
     A Handle to a collection of memories with labels in the OrcaCloud
 
     Attributes:
         id: Unique identifier for the memoryset
         name: Unique name of the memoryset
-
+        description: Description of the memoryset
         length: Number of memories in the memoryset
         embedding_model: Embedding model used to embed the memory values for semantic search
        created_at: When the memoryset was created, automatically generated on create
@@ -322,43 +626,96 @@ class LabeledMemoryset:
 
     id: str
     name: str
-
+    description: str | None
+    memory_type: MemoryType  # defined by subclasses
+
     length: int
     created_at: datetime
     updated_at: datetime
-    insertion_status:
-    embedding_model:
+    insertion_status: Status
+    embedding_model: EmbeddingModelBase
+    index_type: IndexType
+    index_params: dict[str, Any]
+    hidden: bool
 
-
+    _batch_size = 32  # max number of memories to insert/update/delete in a single API call
+
+    def __init__(self, metadata: MemorysetMetadata):
         # for internal use only, do not document
-        if metadata
-            self.embedding_model = PretrainedEmbeddingModel._get(metadata
-        elif metadata
-            self.embedding_model = FinetunedEmbeddingModel.open(metadata
+        if metadata["pretrained_embedding_model_name"]:
+            self.embedding_model = PretrainedEmbeddingModel._get(metadata["pretrained_embedding_model_name"])
+        elif metadata["finetuned_embedding_model_id"]:
+            self.embedding_model = FinetunedEmbeddingModel.open(metadata["finetuned_embedding_model_id"])
         else:
             raise ValueError("Either pretrained_embedding_model_name or finetuned_embedding_model_id must be provided")
-        self.id = metadata
-        self.name = metadata
-        self.
-        self.length = metadata
-        self.created_at = metadata
-        self.updated_at = metadata
-        self.insertion_status = metadata
+        self.id = metadata["id"]
+        self.name = metadata["name"]
+        self.description = metadata["description"]
+        self.length = metadata["length"]
+        self.created_at = datetime.fromisoformat(metadata["created_at"])
+        self.updated_at = datetime.fromisoformat(metadata["updated_at"])
+        self.insertion_status = Status(metadata["insertion_status"])
         self._last_refresh = datetime.now()
+        self.index_type = metadata["index_type"]
+        self.index_params = metadata["index_params"]
+        self.memory_type = metadata["memory_type"]
+        self.hidden = metadata["hidden"]
 
     def __eq__(self, other) -> bool:
-        return isinstance(other,
+        return isinstance(other, MemorysetBase) and self.id == other.id
 
     def __repr__(self) -> str:
         return (
-            "
+            "Memoryset({\n"
             f" name: '{self.name}',\n"
             f" length: {self.length},\n"
-            f" label_names: {self.label_names},\n"
             f" embedding_model: {self.embedding_model},\n"
             "})"
         )
 
+    @classmethod
+    def _handle_if_exists(
+        cls,
+        name: str,
+        *,
+        if_exists: CreateMode,
+        label_names: list[str] | None,
+        embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None,
+    ) -> Self | None:
+        """
+        Handle common `if_exists` logic shared by all creator-style helpers.
+
+        Returns the already-existing memoryset when `if_exists == "open"`, raises for `"error"`,
+        and returns `None` when the memoryset does not yet exist.
+        """
+        if not cls.exists(name):
+            return None
+        if if_exists == "error":
+            raise ValueError(f"Memoryset with name {name} already exists")
+
+        existing = cls.open(name)
+
+        if label_names is not None and hasattr(existing, "label_names"):
+            existing_label_names = getattr(existing, "label_names")
+            if label_names != existing_label_names:
+                requested = ", ".join(label_names)
+                existing_joined = ", ".join(existing_label_names)
+                raise ValueError(
+                    f"Memoryset {name} already exists with label names [{existing_joined}] "
+                    f"(requested: [{requested}])."
+                )
+
+        if embedding_model is not None and embedding_model != existing.embedding_model:
+            existing_model = existing.embedding_model
+            existing_model_name = getattr(existing_model, "name", getattr(existing_model, "path", str(existing_model)))
+            requested_name = getattr(embedding_model, "name", getattr(embedding_model, "path", str(embedding_model)))
+            raise ValueError(
+                f"Memoryset {name} already exists with embedding_model {existing_model_name} "
+                f"(requested: {requested_name})."
+            )
+
+        return existing
+
     @classmethod
     def create(
         cls,
@@ -367,12 +724,20 @@ class LabeledMemoryset:
         *,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
-        label_column: str =
+        label_column: str | None = None,
+        score_column: str | None = None,
         source_id_column: str | None = None,
+        description: str | None = None,
         label_names: list[str] | None = None,
         max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
         if_exists: CreateMode = "error",
-
+        background: bool = False,
+        hidden: bool = False,
+    ) -> Self | Job[Self]:
         """
         Create a new memoryset in the OrcaCloud
 
@@ -387,8 +752,12 @@ class LabeledMemoryset:
             value_column: Name of the column in the datasource that contains the memory values
             label_column: Name of the column in the datasource that contains the memory labels,
                 these must be contiguous integers starting from 0
+            score_column: Name of the column in the datasource that contains the memory scores
             source_id_column: Optional name of the column in the datasource that contains the ids in
                 the system of reference
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset not the
+                datasource or the embedding model.
             label_names: List of human-readable names for the labels in the memoryset, must match
                 the number of labels in the `label_column`. Will be automatically inferred if a
                 [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
@@ -396,8 +765,16 @@ class LabeledMemoryset:
             max_seq_length_override: Maximum sequence length of values in the memoryset, if the
                 value is longer than this it will be truncated, will default to the model's max
                 sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
             if_exists: What to do if a memoryset with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation none blocking and return a job handle
+            hidden: Whether the memoryset should be hidden
 
         Returns:
             Handle to the new memoryset in the OrcaCloud
@@ -407,42 +784,62 @@ class LabeledMemoryset:
             `"open"` and the params do not match those of the existing memoryset.
         """
         if embedding_model is None:
-            embedding_model = PretrainedEmbeddingModel.
+            embedding_model = PretrainedEmbeddingModel.GTE_BASE
 
-
-
-        if if_exists == "error":
-            raise ValueError(f"Memoryset with name {name} already exists")
-        elif if_exists == "open":
-            existing = cls.open(name)
-            for attribute in {"label_names", "embedding_model"}:
-                if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
-                    raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
-            return existing
+        if label_column is None and score_column is None:
+            raise ValueError("label_column or score_column must be provided")
 
-
-
-
-
-
-            datasource_label_column=label_column,
-            datasource_value_column=value_column,
-            datasource_source_id_column=source_id_column,
-            pretrained_embedding_model_name=(
-                embedding_model._model_name if isinstance(embedding_model, PretrainedEmbeddingModel) else None
-            ),
-            finetuned_embedding_model_id=(
-                embedding_model.id if isinstance(embedding_model, FinetunedEmbeddingModel) else None
-            ),
-            label_names=label_names or [],
-            max_seq_length_override=max_seq_length_override,
-        ),
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=label_names,
+            embedding_model=embedding_model,
         )
-
-
+        if existing is not None:
+            return existing
+
+        payload: CreateMemorysetRequest = {
+            "name": name,
+            "description": description,
+            "datasource_name_or_id": datasource.id,
+            "datasource_label_column": label_column,
+            "datasource_score_column": score_column,
+            "datasource_value_column": value_column,
+            "datasource_source_id_column": source_id_column,
+            "label_names": label_names,
+            "max_seq_length_override": max_seq_length_override,
+            "remove_duplicates": remove_duplicates,
+            "index_type": index_type,
+            "index_params": index_params,
+            "hidden": hidden,
+        }
+        if prompt is not None:
+            payload["prompt"] = prompt
+        if isinstance(embedding_model, PretrainedEmbeddingModel):
+            payload["pretrained_embedding_model_name"] = embedding_model.name
+        elif isinstance(embedding_model, FinetunedEmbeddingModel):
+            payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
+        else:
+            raise ValueError("Invalid embedding model")
+        client = OrcaClient._resolve_client()
+        response = client.POST("/memoryset", json=payload)
+        job = Job(response["insertion_task_id"], lambda: cls.open(response["id"]))
+        return job if background else job.result()
+
+    @overload
+    @classmethod
+    def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Self:
+        pass
+
+    @overload
+    @classmethod
+    def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
+        pass
 
     @classmethod
-    def from_hf_dataset(
+    def from_hf_dataset(
+        cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
+    ) -> Self | Job[Self]:
         """
         Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
 
@@ -456,18 +853,53 @@ class LabeledMemoryset:
             name: Name for the new memoryset (must be unique)
             hf_dataset: Hugging Face dataset to create the memoryset from
             kwargs: Additional parameters for creating the memoryset. See
-                [`create`][orca_sdk.
-
+                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
 
         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
+
         datasource = Datasource.from_hf_dataset(
             f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
         )
-
+        kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)
 
+    @overload
+    @classmethod
+    def from_pytorch(
+        cls,
+        name: str,
+        torch_data: TorchDataLoader | TorchDataset,
+        *,
+        column_names: list[str] | None = None,
+        background: Literal[True],
+        **kwargs: Any,
+    ) -> Job[Self]:
+        pass
+
+    @overload
+    @classmethod
+    def from_pytorch(
+        cls,
+        name: str,
+        torch_data: TorchDataLoader | TorchDataset,
+        *,
+        column_names: list[str] | None = None,
+        background: Literal[False] = False,
+        **kwargs: Any,
+    ) -> Self:
+        pass
+
     @classmethod
     def from_pytorch(
         cls,
@@ -475,8 +907,9 @@ class LabeledMemoryset:
|
|
|
475
907
|
torch_data: TorchDataLoader | TorchDataset,
|
|
476
908
|
*,
|
|
477
909
|
column_names: list[str] | None = None,
|
|
910
|
+
background: bool = False,
|
|
478
911
|
**kwargs: Any,
|
|
479
|
-
) ->
|
|
912
|
+
) -> Self | Job[Self]:
|
|
480
913
|
"""
|
|
481
914
|
Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
|
|
482
915
|
[`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
|
|
@@ -492,34 +925,77 @@ class LabeledMemoryset:
|
|
|
492
925
|
torch_data: PyTorch data loader or dataset to create the memoryset from
|
|
493
926
|
column_names: If the provided dataset or data loader returns unnamed tuples, this
|
|
494
927
|
argument must be provided to specify the names of the columns.
|
|
928
|
+
background: Whether to run the operation in the background
|
|
495
929
|
kwargs: Additional parameters for creating the memoryset. See
|
|
496
|
-
[`create`][orca_sdk.
|
|
497
|
-
|
|
930
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
498
931
|
|
|
499
932
|
Returns:
|
|
500
933
|
Handle to the new memoryset in the OrcaCloud
|
|
501
934
|
"""
|
|
935
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
936
|
+
existing = cls._handle_if_exists(
|
|
937
|
+
name,
|
|
938
|
+
if_exists=if_exists,
|
|
939
|
+
label_names=kwargs.get("label_names"),
|
|
940
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
941
|
+
)
|
|
942
|
+
if existing is not None:
|
|
943
|
+
return existing
|
|
944
|
+
|
|
502
945
|
datasource = Datasource.from_pytorch(
|
|
503
946
|
f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
|
|
504
947
|
)
|
|
948
|
+
kwargs["background"] = background
|
|
505
949
|
return cls.create(name, datasource, **kwargs)
|
|
506
950
|
|
|
951
|
+
@overload
|
|
507
952
|
@classmethod
|
|
508
|
-
def from_list(
|
|
509
|
-
|
|
510
|
-
|
|
953
|
+
def from_list(
|
|
954
|
+
cls,
|
|
955
|
+
name: str,
|
|
956
|
+
data: list[dict],
|
|
957
|
+
*,
|
|
958
|
+
background: Literal[True],
|
|
959
|
+
**kwargs: Any,
|
|
960
|
+
) -> Job[Self]:
|
|
961
|
+
pass
|
|
511
962
|
|
|
512
|
-
|
|
513
|
-
|
|
963
|
+
@overload
|
|
964
|
+
@classmethod
|
|
965
|
+
def from_list(
|
|
966
|
+
cls,
|
|
967
|
+
name: str,
|
|
968
|
+
data: list[dict],
|
|
969
|
+
*,
|
|
970
|
+
background: Literal[False] = False,
|
|
971
|
+
**kwargs: Any,
|
|
972
|
+
) -> Self:
|
|
973
|
+
pass
|
|
514
974
|
|
|
515
|
-
|
|
516
|
-
|
|
975
|
+
@classmethod
|
|
976
|
+
def from_list(
|
|
977
|
+
cls,
|
|
978
|
+
name: str,
|
|
979
|
+
data: list[dict],
|
|
980
|
+
*,
|
|
981
|
+
background: bool = False,
|
|
982
|
+
**kwargs: Any,
|
|
983
|
+
) -> Self | Job[Self]:
|
|
984
|
+
"""
|
|
985
|
+
Create a new memoryset from a list of dictionaries in the OrcaCloud
|
|
986
|
+
|
|
987
|
+
This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
|
|
988
|
+
appended with `_datasource` and use that as the datasource for the memoryset.
|
|
989
|
+
|
|
990
|
+
All properties that are not specified to be used as `value_column`, `label_column`, or
|
|
991
|
+
`source_id_column` will be stored as metadata in the memoryset.
|
|
517
992
|
|
|
518
993
|
Params:
|
|
519
994
|
name: Name for the new memoryset (must be unique)
|
|
520
995
|
data: List of dictionaries to create the memoryset from
|
|
996
|
+
background: Whether to run the operation in the background
|
|
521
997
|
kwargs: Additional parameters for creating the memoryset. See
|
|
522
|
-
[`create`][orca_sdk.
|
|
998
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
523
999
|
|
|
524
1000
|
Returns:
|
|
525
1001
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -530,11 +1006,53 @@ class LabeledMemoryset:
|
|
|
530
1006
|
... {"value": "world", "label": 1, "tag": "tag2"},
|
|
531
1007
|
... ])
|
|
532
1008
|
"""
|
|
1009
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
1010
|
+
existing = cls._handle_if_exists(
|
|
1011
|
+
name,
|
|
1012
|
+
if_exists=if_exists,
|
|
1013
|
+
label_names=kwargs.get("label_names"),
|
|
1014
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
1015
|
+
)
|
|
1016
|
+
if existing is not None:
|
|
1017
|
+
return existing
|
|
1018
|
+
|
|
533
1019
|
datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
|
|
1020
|
+
kwargs["background"] = background
|
|
534
1021
|
return cls.create(name, datasource, **kwargs)
|
|
535
1022
|
|
|
1023
|
+
@overload
|
|
536
1024
|
@classmethod
|
|
537
|
-
def from_dict(
|
|
1025
|
+
def from_dict(
|
|
1026
|
+
cls,
|
|
1027
|
+
name: str,
|
|
1028
|
+
data: dict,
|
|
1029
|
+
*,
|
|
1030
|
+
background: Literal[True],
|
|
1031
|
+
**kwargs: Any,
|
|
1032
|
+
) -> Job[Self]:
|
|
1033
|
+
pass
|
|
1034
|
+
|
|
1035
|
+
@overload
|
|
1036
|
+
@classmethod
|
|
1037
|
+
def from_dict(
|
|
1038
|
+
cls,
|
|
1039
|
+
name: str,
|
|
1040
|
+
data: dict,
|
|
1041
|
+
*,
|
|
1042
|
+
background: Literal[False] = False,
|
|
1043
|
+
**kwargs: Any,
|
|
1044
|
+
) -> Self:
|
|
1045
|
+
pass
|
|
1046
|
+
|
|
1047
|
+
@classmethod
|
|
1048
|
+
def from_dict(
|
|
1049
|
+
cls,
|
|
1050
|
+
name: str,
|
|
1051
|
+
data: dict,
|
|
1052
|
+
*,
|
|
1053
|
+
background: bool = False,
|
|
1054
|
+
**kwargs: Any,
|
|
1055
|
+
) -> Self | Job[Self]:
|
|
538
1056
|
"""
|
|
539
1057
|
Create a new memoryset from a dictionary of columns in the OrcaCloud
|
|
540
1058
|
|
|
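Each loader above now forwards `background` via `kwargs["background"]` into `create`, so the same Job-versus-handle behaviour applies to all of them. A hedged sketch for `from_pytorch`, with a made-up toy dataset; `column_names` is required here because the dataset yields unnamed tuples, as the docstring notes:

```python
from torch.utils.data import Dataset as TorchDataset
from orca_sdk import LabeledMemoryset

class TinySentiment(TorchDataset):
    """Toy dataset returning unnamed (value, label) tuples."""
    def __init__(self):
        self.rows = [("I am happy", 1), ("I am sad", 0)]
    def __len__(self):
        return len(self.rows)
    def __getitem__(self, idx):
        return self.rows[idx]

# column_names maps the tuple positions to memoryset columns
memoryset = LabeledMemoryset.from_pytorch(
    "toy_sentiment_memoryset", TinySentiment(), column_names=["value", "label"]
)
```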
@@ -547,8 +1065,9 @@ class LabeledMemoryset:
         Params:
             name: Name for the new memoryset (must be unique)
             data: Dictionary of columns to create the memoryset from
+            background: Whether to run the operation in the background
             kwargs: Additional parameters for creating the memoryset. See
-                [`create`][orca_sdk.
+                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

         Returns:
             Handle to the new memoryset in the OrcaCloud
@@ -560,11 +1079,53 @@ class LabeledMemoryset:
             ... "tag": ["tag1", "tag2"],
             ... })
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
+
         datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
+        kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)

+    @overload
     @classmethod
-    def from_pandas(
+    def from_pandas(
+        cls,
+        name: str,
+        dataframe: pd.DataFrame,
+        *,
+        background: Literal[True],
+        **kwargs: Any,
+    ) -> Job[Self]:
+        pass
+
+    @overload
+    @classmethod
+    def from_pandas(
+        cls,
+        name: str,
+        dataframe: pd.DataFrame,
+        *,
+        background: Literal[False] = False,
+        **kwargs: Any,
+    ) -> Self:
+        pass
+
+    @classmethod
+    def from_pandas(
+        cls,
+        name: str,
+        dataframe: pd.DataFrame,
+        *,
+        background: bool = False,
+        **kwargs: Any,
+    ) -> Self | Job[Self]:
         """
         Create a new memoryset from a pandas [`DataFrame`][pandas.DataFrame] in the OrcaCloud

@@ -577,17 +1138,60 @@ class LabeledMemoryset:
         Params:
             name: Name for the new memoryset (must be unique)
             dataframe: Dataframe to create the memoryset from
+            background: Whether to run the operation in the background
             kwargs: Additional parameters for creating the memoryset. See
-                [`create`][orca_sdk.
+                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
+
         datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
+        kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)

+    @overload
+    @classmethod
+    def from_arrow(
+        cls,
+        name: str,
+        pyarrow_table: pa.Table,
+        *,
+        background: Literal[True],
+        **kwargs: Any,
+    ) -> Job[Self]:
+        pass
+
+    @overload
+    @classmethod
+    def from_arrow(
+        cls,
+        name: str,
+        pyarrow_table: pa.Table,
+        *,
+        background: Literal[False] = False,
+        **kwargs: Any,
+    ) -> Self:
+        pass
+
     @classmethod
-    def from_arrow(
+    def from_arrow(
+        cls,
+        name: str,
+        pyarrow_table: pa.Table,
+        *,
+        background: bool = False,
+        **kwargs: Any,
+    ) -> Self | Job[Self]:
         """
         Create a new memoryset from a PyArrow [`Table`][pyarrow.Table] in the OrcaCloud

@@ -600,19 +1204,62 @@ class LabeledMemoryset:
         Params:
             name: Name for the new memoryset (must be unique)
             pyarrow_table: PyArrow table to create the memoryset from
+            background: Whether to run the operation in the background
             kwargs: Additional parameters for creating the memoryset. See
-                [`create`][orca_sdk.
+                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
+
         datasource = Datasource.from_arrow(
             f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
         )
+        kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)

+    @overload
     @classmethod
-    def from_disk(
+    def from_disk(
+        cls,
+        name: str,
+        file_path: str | PathLike,
+        *,
+        background: Literal[True],
+        **kwargs: Any,
+    ) -> Job[Self]:
+        pass
+
+    @overload
+    @classmethod
+    def from_disk(
+        cls,
+        name: str,
+        file_path: str | PathLike,
+        *,
+        background: Literal[False] = False,
+        **kwargs: Any,
+    ) -> Self:
+        pass
+
+    @classmethod
+    def from_disk(
+        cls,
+        name: str,
+        file_path: str | PathLike,
+        *,
+        background: bool = False,
+        **kwargs: Any,
+    ) -> Self | Job[Self]:
         """
         Create a new memoryset from a file on disk in the OrcaCloud

@@ -632,17 +1279,29 @@ class LabeledMemoryset:
             - .csv: [`CSV`][csv] files
             - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
             - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
+            background: Whether to run the operation in the background
             kwargs: Additional parameters for creating the memoryset. See
-                [`create`][orca_sdk.
+                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
+
         datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
+        kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)

     @classmethod
-    def open(cls, name: str) ->
+    def open(cls, name: str) -> Self:
         """
         Get a handle to a memoryset in the OrcaCloud

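The pandas, Arrow, and disk loaders follow the same pattern as the loaders earlier in the diff: create a `{name}_datasource` datasource, forward `background`, and delegate to `create`. A small sketch for `from_pandas`, with illustrative data only:

```python
import pandas as pd
from orca_sdk import LabeledMemoryset

df = pd.DataFrame({"value": ["I am happy", "I am sad"], "label": [1, 0]})

# runs the creation in the background and returns a Job handle
job = LabeledMemoryset.from_pandas("pandas_example_memoryset", df, background=True)
memoryset = job.result()
```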
@@ -655,7 +1314,26 @@ class LabeledMemoryset:
         Raises:
             LookupError: If the memoryset does not exist
         """
-
+        client = OrcaClient._resolve_client()
+        metadata = client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
+        return cls(metadata)
+
+    @classmethod
+    async def aopen(cls, name: str) -> Self:
+        """
+        Asynchronously get a handle to a memoryset in the OrcaCloud
+
+        Params:
+            name: Name or unique identifier of the memoryset
+
+        Returns:
+            Handle to the existing memoryset in the OrcaCloud
+
+        Raises:
+            LookupError: If the memoryset does not exist
+        """
+        client = OrcaAsyncClient._resolve_client()
+        metadata = await client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
         return cls(metadata)

     @classmethod
@@ -676,14 +1354,21 @@ class LabeledMemoryset:
         return False

     @classmethod
-    def all(cls) -> list[
+    def all(cls, show_hidden: bool = False) -> list[Self]:
         """
         Get a list of handles to all memorysets in the OrcaCloud

+        Params:
+            show_hidden: Whether to include hidden memorysets in results, defaults to `False`
+
         Returns:
             List of handles to all memorysets in the OrcaCloud
         """
-
+        client = OrcaClient._resolve_client()
+        return [
+            cls(metadata)
+            for metadata in client.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
+        ]

     @classmethod
     def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
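The new async variant `aopen` mirrors `open` but resolves the `OrcaAsyncClient`, and `all` gains a `show_hidden` flag. A usage sketch, assuming the memoryset name below already exists in the OrcaCloud:

```python
import asyncio
from orca_sdk import LabeledMemoryset

# list every labeled memoryset, including hidden ones
names = [ms.name for ms in LabeledMemoryset.all(show_hidden=True)]

async def main():
    # non-blocking handle lookup; raises LookupError if the name is unknown
    memoryset = await LabeledMemoryset.aopen("reviews_memoryset")
    print(len(memoryset))

asyncio.run(main())
```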
@@ -699,29 +1384,94 @@ class LabeledMemoryset:
             LookupError: If the memoryset does not exist and if_not_exists is `"error"`
         """
         try:
-
+            client = OrcaClient._resolve_client()
+            client.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
             logging.info(f"Deleted memoryset {name_or_id}")
         except LookupError:
             if if_not_exists == "error":
                 raise

+    def set(
+        self,
+        *,
+        name: str = UNSET,
+        description: str | None = UNSET,
+        label_names: list[str] = UNSET,
+        hidden: bool = UNSET,
+    ):
+        """
+        Update editable attributes of the memoryset
+
+        Note:
+            If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
+
+        Params:
+            description: Value to set for the description
+            name: Value to set for the name
+            label_names: Value to replace existing label names with
+        """
+        payload: MemorysetUpdate = {}
+        if name is not UNSET:
+            payload["name"] = name
+        if description is not UNSET:
+            payload["description"] = description
+        if label_names is not UNSET:
+            payload["label_names"] = label_names
+        if hidden is not UNSET:
+            payload["hidden"] = hidden
+
+        client = OrcaClient._resolve_client()
+        client.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
+        self.refresh()
+
+    @overload
     def clone(
         self,
         name: str,
         *,
         embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
         max_seq_length_override: int | None = None,
+        prompt: str | None = None,
         if_exists: CreateMode = "error",
-
+        background: Literal[True],
+    ) -> Job[Self]:
+        pass
+
+    @overload
+    def clone(
+        self,
+        name: str,
+        *,
+        embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        if_exists: CreateMode = "error",
+        background: Literal[False] = False,
+    ) -> Self:
+        pass
+
+    def clone(
+        self,
+        name: str,
+        *,
+        embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
+        max_seq_length_override: int | None = UNSET,
+        prompt: str | None = None,
+        if_exists: CreateMode = "error",
+        background: bool = False,
+    ) -> Self | Job[Self]:
         """
         Create a clone of the memoryset with a new name

         Params:
             name: Name for the new memoryset (must be unique)
             embedding_model: Optional new embedding model to use for re-embedding the memory values
-            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
                 value is longer than this it will be truncated, will default to the model's max
                 sequence length if not provided
+            max_seq_length_override: Optional custom max sequence length to use for the cloned memoryset.
+                If not provided, will use the source memoryset's max sequence length.
+            prompt: Optional custom prompt to use for the cloned memoryset.
+                If not provided, will use the source memoryset's prompt.
             if_exists: What to do if a memoryset with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing memoryset.

@@ -736,6 +1486,13 @@ class LabeledMemoryset:
             >>> new_memoryset = memoryset.clone(
             ... "my_memoryset_finetuned", embedding_model=finetuned_embedding_model,
             ... )
+
+            >>> # Clone with custom prompts
+            >>> new_memoryset = memoryset.clone(
+            ... "my_memoryset_with_prompts",
+            ... document_prompt_override="Represent this document for retrieval:",
+            ... query_prompt_override="Represent this query for retrieval:",
+            ... )
         """
         if self.exists(name):
             if if_exists == "error":
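The new `set` method sends a partial `MemorysetUpdate` containing only the fields that differ from `UNSET`, then refreshes the handle. A minimal sketch, with an illustrative memoryset name:

```python
from orca_sdk import LabeledMemoryset

memoryset = LabeledMemoryset.open("reviews_memoryset")

# only the provided fields are included in the PATCH payload; omitted fields stay UNSET
memoryset.set(
    description="Customer review snippets with sentiment labels",
    label_names=["negative", "positive"],
)
```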
@@ -746,22 +1503,23 @@ class LabeledMemoryset:
             if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
                 raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
             return existing
-
-
-
-
-
-
-
-
-
-
-
-
-
+        payload: CloneMemorysetRequest = {"name": name}
+        if max_seq_length_override is not UNSET:
+            payload["max_seq_length_override"] = max_seq_length_override
+        if prompt is not None:
+            payload["prompt"] = prompt
+        if isinstance(embedding_model, PretrainedEmbeddingModel):
+            payload["pretrained_embedding_model_name"] = embedding_model.name
+        elif isinstance(embedding_model, FinetunedEmbeddingModel):
+            payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
+
+        client = OrcaClient._resolve_client()
+        metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
+        job = Job(
+            metadata["insertion_task_id"],
+            lambda: self.open(metadata["id"]),
         )
-
-        return LabeledMemoryset.open(metadata.id)
+        return job if background else job.result()

     def refresh(self, throttle: float = 0):
         """
@@ -775,7 +1533,7 @@ class LabeledMemoryset:
         if (current_time - self._last_refresh) < timedelta(seconds=throttle):
             return

-        self.__dict__.update(
+        self.__dict__.update(self.open(self.id).__dict__)
         self._last_refresh = current_time

     def __len__(self) -> int:
@@ -784,14 +1542,14 @@ class LabeledMemoryset:
         return self.length

     @overload
-    def __getitem__(self, index: int | str) ->
+    def __getitem__(self, index: int | str) -> MemoryT:
         pass

     @overload
-    def __getitem__(self, index: slice) -> list[
+    def __getitem__(self, index: slice) -> list[MemoryT]:
         pass

-    def __getitem__(self, index: int | slice | str) ->
+    def __getitem__(self, index: int | slice | str) -> MemoryT | list[MemoryT]:
         """
         Get memories from the memoryset by index or memory id

@@ -837,22 +1595,24 @@ class LabeledMemoryset:
         raise ValueError(f"Invalid index type: {type(index)}")

     @overload
-    def search(self, query: str, *, count: int = 1) -> list[
+    def search(self, query: str, *, count: int = 1, prompt: str | None = None) -> list[MemoryLookupT]:
         pass

     @overload
-    def search(self, query: list[str], *, count: int = 1) -> list[list[
+    def search(self, query: list[str], *, count: int = 1, prompt: str | None = None) -> list[list[MemoryLookupT]]:
         pass

     def search(
-        self, query: str | list[str], *, count: int = 1
-    ) -> list[
+        self, query: str | list[str], *, count: int = 1, prompt: str | None = None
+    ) -> list[MemoryLookupT] | list[list[MemoryLookupT]]:
         """
         Search for memories that are semantically similar to the query

         Params:
             query: Query to lookup memories in the memoryset, can be a single query or a list
             count: Number of memories to return for each query
+            prompt: Optional prompt for query embedding during search.
+                If not provided, the memoryset's default query prompt will be used if available.

         Returns:
             List of memories from the memoryset that match the query. If a single query is provided,
@@ -867,6 +1627,13 @@ class LabeledMemoryset:
                 LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
             ]

+            Search with custom query prompt for instruction-following models:
+            >>> memoryset.search("I am happy", count=2, query_prompt="Represent this query for sentiment retrieval:")
+            [
+                LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
+                LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
+            ]
+
             Search for similar memories for multiple queries:
             >>> memoryset.search(["I am happy", "I am sad"], count=1)
             [
@@ -878,14 +1645,30 @@ class LabeledMemoryset:
                 ],
             ]
         """
-
-
-
-
-
-
+        client = OrcaClient._resolve_client()
+        response = client.POST(
+            "/gpu/memoryset/{name_or_id}/lookup",
+            params={"name_or_id": self.id},
+            json={
+                "query": query if isinstance(query, list) else [query],
+                "count": count,
+                "prompt": prompt,
+            },
         )
-        lookups = [
+        lookups = [
+            [
+                cast(
+                    MemoryLookupT,
+                    (
+                        LabeledMemoryLookup(self.id, lookup_response)
+                        if "label" in lookup_response
+                        else ScoredMemoryLookup(self.id, lookup_response)
+                    ),
+                )
+                for lookup_response in batch
+            ]
+            for batch in response
+        ]
         return lookups if isinstance(query, list) else lookups[0]

     def query(
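Based on the final `search` signature above, the optional instruction is passed as `prompt=` (the new docstring example shows `query_prompt=`, but the implementation's keyword is `prompt`). A hedged sketch, assuming lookups expose the `value` and `label` attributes shown in the docstring reprs:

```python
from orca_sdk import LabeledMemoryset

memoryset = LabeledMemoryset.open("reviews_memoryset")

# single query with an instruction-style prompt; returns a list of LabeledMemoryLookup
hits = memoryset.search(
    "I am happy",
    count=2,
    prompt="Represent this query for sentiment retrieval:",
)
for hit in hits:
    print(hit.value, hit.label)
```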
@@ -893,7 +1676,9 @@ class LabeledMemoryset:
         offset: int = 0,
         limit: int = 100,
         filters: list[FilterItemTuple] = [],
-
+        with_feedback_metrics: bool = False,
+        sort: list[TelemetrySortItem] | None = None,
+    ) -> list[MemoryT]:
         """
         Query the memoryset for memories that match the filters

@@ -901,6 +1686,7 @@ class LabeledMemoryset:
             offset: The offset of the first memory to return
             limit: The maximum number of memories to return
             filters: List of filters to apply to the query.
+            with_feedback_metrics: Whether to include feedback metrics in the response

         Returns:
             List of memories from the memoryset that match the filters
@@ -912,21 +1698,78 @@ class LabeledMemoryset:
                 LabeledMemory({ label: <negative: 0>, value: "I am sad" }),
             ]
         """
+        parsed_filters = [
+            _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
+        ]
+
+        if with_feedback_metrics:
+            client = OrcaClient._resolve_client()
+            response = client.POST(
+                "/telemetry/memories",
+                json={
+                    "memoryset_id": self.id,
+                    "offset": offset,
+                    "limit": limit,
+                    "filters": parsed_filters,
+                    "sort": [_parse_sort_item_from_tuple(item) for item in sort] if sort else None,
+                },
+            )
+            return [
+                cast(
+                    MemoryT,
+                    (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
+                )
+                for memory in response["items"]
+            ]
+
+        if any(_is_metric_column(filter[0]) for filter in filters):
+            raise ValueError("Feedback metrics are only supported when the with_feedback_metrics flag is set to True")
+
+        if sort:
+            logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")
+
+        client = OrcaClient._resolve_client()
+        response = client.POST(
+            "/memoryset/{name_or_id}/memories",
+            params={"name_or_id": self.id},
+            json={
+                "offset": offset,
+                "limit": limit,
+                "filters": cast(list[FilterItem], parsed_filters),
+            },
+        )
         return [
-
-
-            self.id,
-            body=ListMemoriesRequest(
-                offset=offset,
-                limit=limit,
-                filters=[
-                    _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter
-                    for filter in filters
-                ],
-            ),
+            cast(
+                MemoryT,
+                (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
             )
+            for memory in response
         ]

+    def to_pandas(
+        self,
+        offset: int = 0,
+        limit: int = 100,
+        filters: list[FilterItemTuple] = [],
+        with_feedback_metrics: bool = False,
+        sort: list[TelemetrySortItem] | None = None,
+    ) -> pd.DataFrame:
+        """
+        Convert the memoryset to a pandas DataFrame
+        """
+        return pd.DataFrame(
+            [
+                memory.to_dict()
+                for memory in self.query(
+                    offset=offset,
+                    limit=limit,
+                    filters=filters,
+                    with_feedback_metrics=with_feedback_metrics,
+                    sort=sort,
+                )
+            ]
+        )
+
     def insert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
         """
         Insert memories into the memoryset
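`query` now routes through the telemetry endpoint when `with_feedback_metrics=True` (the only mode that honours `sort`), and the new `to_pandas` is a thin wrapper over it. A sketch with an illustrative memoryset name:

```python
from orca_sdk import LabeledMemoryset

memoryset = LabeledMemoryset.open("reviews_memoryset")

# plain metadata filter, no feedback metrics
happy = memoryset.query(filters=[("tag", "==", "happy")], limit=50)

# feedback metrics enabled; sorting is only applied on this code path
df = memoryset.to_pandas(limit=50, with_feedback_metrics=True)
print(df.head())
```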
@@ -937,6 +1780,7 @@ class LabeledMemoryset:

             - `value`: Value of the memory
             - `label`: Label of the memory
+            - `score`: Score of the memory
             - `source_id`: Optional unique ID of the memory in a system of reference
             - `...`: Any other metadata to store for the memory

@@ -946,26 +1790,83 @@ class LabeledMemoryset:
             ... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
             ... ])
         """
-
-
-
-
-
-
-
-
-
+        client = OrcaClient._resolve_client()
+        items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
+        # insert memories in batches to avoid API timeouts
+        for i in range(0, len(items), self._batch_size):
+            batch = items[i : i + self._batch_size]
+            client.POST(
+                "/gpu/memoryset/{name_or_id}/memory",
+                params={"name_or_id": self.id},
+                json=cast(
+                    list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
+                    [_parse_memory_insert(item, type=self.memory_type) for item in batch],
+                ),
+            )
+
         self.refresh()

+    async def ainsert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
+        """
+        Asynchronously insert memories into the memoryset
+
+        Params:
+            items: List of memories to insert into the memoryset. This should be a list of
+                dictionaries with the following keys:
+
+                - `value`: Value of the memory
+                - `label`: Label of the memory
+                - `score`: Score of the memory
+                - `source_id`: Optional unique ID of the memory in a system of reference
+                - `...`: Any other metadata to store for the memory
+
+        Examples:
+            >>> await memoryset.ainsert([
+            ... {"value": "I am happy", "label": 1, "source_id": "user_123", "tag": "happy"},
+            ... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
+            ... ])
+        """
+        client = OrcaAsyncClient._resolve_client()
+        items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
+        # insert memories in batches to avoid API timeouts
+        for i in range(0, len(items), self._batch_size):
+            batch = items[i : i + self._batch_size]
+            await client.POST(
+                "/gpu/memoryset/{name_or_id}/memory",
+                params={"name_or_id": self.id},
+                json=cast(
+                    list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
+                    [_parse_memory_insert(item, type=self.memory_type) for item in batch],
+                ),
+            )
+
+        await self.arefresh()
+
+    async def arefresh(self, throttle: float = 0):
+        """
+        Asynchronously refresh the information about the memoryset from the OrcaCloud
+
+        Params:
+            throttle: Minimum time in seconds between refreshes
+        """
+        current_time = datetime.now()
+        # Skip refresh if last refresh was too recent
+        if (current_time - self._last_refresh) < timedelta(seconds=throttle):
+            return
+
+        refreshed_memoryset = await type(self).aopen(self.id)
+        self.__dict__.update(refreshed_memoryset.__dict__)
+        self._last_refresh = current_time
+
     @overload
-    def get(self, memory_id: str) ->
+    def get(self, memory_id: str) -> MemoryT:  # type: ignore -- this takes precedence
         pass

     @overload
-    def get(self, memory_id: Iterable[str]) -> list[
+    def get(self, memory_id: Iterable[str]) -> list[MemoryT]:
         pass

-    def get(self, memory_id: str | Iterable[str]) ->
+    def get(self, memory_id: str | Iterable[str]) -> MemoryT | list[MemoryT]:
         """
         Fetch a memory or memories from the memoryset

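`ainsert` batches uploads exactly like `insert` (using `self._batch_size`) but awaits the async client and finishes with `arefresh`. A usage sketch, assuming the memoryset name below exists:

```python
import asyncio
from orca_sdk import LabeledMemoryset

async def main():
    memoryset = await LabeledMemoryset.aopen("reviews_memoryset")
    await memoryset.ainsert([
        {"value": "I am happy", "label": 1, "source_id": "user_123"},
        {"value": "I am sad", "label": 0, "source_id": "user_124"},
    ])

asyncio.run(main())
```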
@@ -994,22 +1895,38 @@ class LabeledMemoryset:
             ]
         """
         if isinstance(memory_id, str):
-
+            client = OrcaClient._resolve_client()
+            response = client.GET(
+                "/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
+            )
+            return cast(
+                MemoryT,
+                (LabeledMemory(self.id, response) if "label" in response else ScoredMemory(self.id, response)),
+            )
         else:
+            client = OrcaClient._resolve_client()
+            response = client.POST(
+                "/memoryset/{name_or_id}/memories/get",
+                params={"name_or_id": self.id},
+                json={"memory_ids": list(memory_id)},
+            )
             return [
-
-
+                cast(
+                    MemoryT,
+                    (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
+                )
+                for memory in response
             ]

     @overload
-    def update(self, updates: dict[str, Any]) ->
+    def update(self, updates: dict[str, Any]) -> MemoryT:
         pass

     @overload
-    def update(self, updates: Iterable[dict[str, Any]]) -> list[
+    def update(self, updates: Iterable[dict[str, Any]]) -> list[MemoryT]:
         pass

-    def update(self, updates: dict[str, Any] | Iterable[dict[str, Any]]) ->
+    def update(self, updates: dict[str, Any] | Iterable[dict[str, Any]]) -> MemoryT | list[MemoryT]:
         """
         Update one or multiple memories in the memoryset

@@ -1041,16 +1958,87 @@ class LabeledMemoryset:
             ... for m in memoryset.query(filters=[("tag", "==", "happy")])
             ... )
         """
-
-
-
-
-
-            ]
-
-
+        client = OrcaClient._resolve_client()
+        updates_list = cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else list(updates)
+        # update memories in batches to avoid API timeouts
+        updated_memories: list[MemoryT] = []
+        for i in range(0, len(updates_list), self._batch_size):
+            batch = updates_list[i : i + self._batch_size]
+            response = client.PATCH(
+                "/gpu/memoryset/{name_or_id}/memories",
+                params={"name_or_id": self.id},
+                json=cast(
+                    list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
+                    [_parse_memory_update(update, type=self.memory_type) for update in batch],
+                ),
+            )
+            updated_memories.extend(
+                cast(
+                    MemoryT,
+                    (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
+                )
+                for memory in response
+            )
+
         return updated_memories[0] if isinstance(updates, dict) else updated_memories

+    def get_cascading_edits_suggestions(
+        self,
+        memory: MemoryT,
+        *,
+        old_label: int,
+        new_label: int,
+        max_neighbors: int = 50,
+        max_validation_neighbors: int = 10,
+        similarity_threshold: float | None = None,
+        only_if_has_old_label: bool = True,
+        exclude_if_new_label: bool = True,
+        suggestion_cooldown_time: float = 3600.0 * 24.0,  # 1 day
+        label_confirmation_cooldown_time: float = 3600.0 * 24.0 * 7,  # 1 week
+    ) -> list[CascadingEditSuggestion]:
+        """
+        Suggests cascading edits for a given memory based on nearby points with similar labels.
+
+        This function is triggered after a user changes a memory's label. It looks for nearby
+        candidates in embedding space that may be subject to similar relabeling and returns them
+        as suggestions. The system uses scoring heuristics, label filters, and cooldown tracking
+        to reduce noise and improve usability.
+
+        Params:
+            memory: The memory whose label was just changed.
+            old_label: The label this memory used to have.
+            new_label: The label it was changed to.
+            max_neighbors: Maximum number of neighbors to consider.
+            max_validation_neighbors: Maximum number of neighbors to use for label suggestion.
+            similarity_threshold: If set, only include neighbors with a lookup score above this threshold.
+            only_if_has_old_label: If True, only consider neighbors that have the old label.
+            exclude_if_new_label: If True, exclude neighbors that already have the new label.
+            suggestion_cooldown_time: Minimum time (in seconds) since the last suggestion for a neighbor
+                to be considered again.
+            label_confirmation_cooldown_time: Minimum time (in seconds) since a neighbor's label was confirmed
+                to be considered for suggestions.
+
+        Returns:
+            A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
+        """
+        # TODO: properly integrate this with memory edits and return something that can be applied
+        client = OrcaClient._resolve_client()
+        return client.POST(
+            "/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
+            params={"name_or_id": self.id, "memory_id": memory.memory_id},
+            json={
+                "old_label": old_label,
+                "new_label": new_label,
+                "max_neighbors": max_neighbors,
+                "max_validation_neighbors": max_validation_neighbors,
+                "similarity_threshold": similarity_threshold,
+                "only_if_has_old_label": only_if_has_old_label,
+                "exclude_if_new_label": exclude_if_new_label,
+                "suggestion_cooldown_time": suggestion_cooldown_time,
+                "label_confirmation_cooldown_time": label_confirmation_cooldown_time,
+            },
+        )
+
     def delete(self, memory_id: str | Iterable[str]) -> None:
         """
         Delete memories from the memoryset
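The new `get_cascading_edits_suggestions` posts the relabeling context to the cascading-edits endpoint and returns raw suggestions (the TODO in the body notes they are not yet applied automatically). A hedged sketch of how it might be called after a manual relabel; the memoryset name and the choice of memory are illustrative:

```python
from orca_sdk import LabeledMemoryset

memoryset = LabeledMemoryset.open("reviews_memoryset")

# pick a memory whose label was just changed from 1 to 0 (illustrative)
memory = memoryset.query(limit=1)[0]

suggestions = memoryset.get_cascading_edits_suggestions(
    memory,
    old_label=1,
    new_label=0,
    similarity_threshold=0.85,
)
print(f"{len(suggestions)} neighbors may warrant the same relabel")
```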
@@ -1069,57 +2057,75 @@ class LabeledMemoryset:
             ... )

         """
+        client = OrcaClient._resolve_client()
         memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
-
+        # delete memories in batches to avoid API timeouts
+        for i in range(0, len(memory_ids), self._batch_size):
+            batch = memory_ids[i : i + self._batch_size]
+            client.POST(
+                "/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": batch}
+            )
         logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
         self.refresh()

-
+    @overload
+    def analyze(
+        self,
+        *analyses: dict[str, Any] | str,
+        lookup_count: int = 15,
+        clear_metrics: bool = False,
+        background: Literal[True],
+    ) -> Job[MemorysetMetrics]:
+        pass
+
+    @overload
+    def analyze(
+        self,
+        *analyses: dict[str, Any] | str,
+        lookup_count: int = 15,
+        clear_metrics: bool = False,
+        background: Literal[False] = False,
+    ) -> MemorysetMetrics:
+        pass
+
+    def analyze(
+        self,
+        *analyses: dict[str, Any] | str,
+        lookup_count: int = 15,
+        clear_metrics: bool = False,
+        background: bool = False,
+    ) -> Job[MemorysetMetrics] | MemorysetMetrics:
         """
-        Run
+        Run analyses on the memoryset to find duplicates, clusters, mislabelings, and more

         The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
-        attribute of each memory in the memoryset.
+        attribute of each memory in the memoryset. Overall memoryset metrics will be returned as a dictionary.

-
-
-
-
-            >>> memoryset.find_duplicate_memories()
-            { "num_duplicates": 10 }
-            >>> memoryset.delete(
-            ...     m.memory_id
-            ...     for m in memoryset.query(
-            ...         filters=[("metrics.is_duplicate", "==", True)]
-            ...     )
-            ... )
-        """
-        analysis = create_analysis(
-            self.id,
-            body=MemorysetAnalysisRequest(
-                type=MemorysetAnalysisRequestType.ANALYZE_DUPLICATE_MEMORIES,
-            ),
-        )
-        wait_for_task(analysis.task_id, description="Analyzing duplicates")
-        analysis = get_analysis(self.id, analysis.task_id)
-        assert isinstance(analysis.result, FindDuplicatesAnalysisResult)
-        # TODO: return a custom duplicate analysis class instance with helper methods
-        return analysis.result.to_dict()
+        Params:
+            analyses: List of analysis to run on the memoryset, can either be just the name of an
+                analysis or a dictionary with a name property and additional config. The available
+                analyses are:

-
-
-
-
+                - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
+                - **`"cluster"`**: Cluster the memories in the memoryset
+                - **`"label"`**: Analyze the labels to find potential mislabelings
+                - **`"distribution"`**: Analyze the embedding distribution to populate
+                - **`"projection"`**: Create a 2D projection of the embeddings for visualization

-
-
+            lookup_count: Number of memories to lookup for each memory in the memoryset
+            clear_metrics: Whether to clear any existing metrics from the memories before running the analysis

         Returns:
-
+            dictionary with aggregate metrics for each analysis that was run
+
+        Raises:
+            ValueError: If an invalid analysis name is provided

         Examples:
-
-            {
+            Run label and duplicate analysis:
+            >>> memoryset.analyze("label", {"name": "duplicate", "possible_duplicate_threshold": 0.99})
+            { "duplicate": { "num_duplicates": 10 },
+              "label": {
                 "label_metrics": [{
                     "label": 0,
                     "label_name": "negative",
@@ -1131,24 +2137,384 @@ class LabeledMemoryset:
|
|
|
1131
2137
|
"average_lookup_score": 0.90,
|
|
1132
2138
|
"memory_count": 100,
|
|
1133
2139
|
}]
|
|
2140
|
+
"neighbor_prediction_accuracy": 0.95,
|
|
2141
|
+
"mean_neighbor_label_confidence": 0.95,
|
|
2142
|
+
"mean_neighbor_label_entropy": 0.95,
|
|
2143
|
+
"mean_neighbor_predicted_label_ambiguity": 0.95,
|
|
2144
|
+
}
|
|
1134
2145
|
}
|
|
2146
|
+
|
|
2147
|
+
Remove all exact duplicates:
|
|
2148
|
+
>>> memoryset.delete(
|
|
2149
|
+
... m.memory_id
|
|
2150
|
+
... for m in memoryset.query(
|
|
2151
|
+
... filters=[("metrics.is_duplicate", "==", True)]
|
|
2152
|
+
... )
|
|
2153
|
+
... )
|
|
2154
|
+
|
|
2155
|
+
Display label analysis to review potential mislabelings:
|
|
1135
2156
|
>>> memoryset.display_label_analysis()
|
|
1136
2157
|
"""
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
2158
|
+
|
|
2159
|
+
# Get valid analysis names from MemorysetAnalysisConfigs
|
|
2160
|
+
valid_analysis_names = set(MemorysetAnalysisConfigs.__annotations__)
|
|
2161
|
+
|
|
2162
|
+
configs: MemorysetAnalysisConfigs = {}
|
|
2163
|
+
for analysis in analyses:
|
|
2164
|
+
if isinstance(analysis, str):
|
|
2165
|
+
error_msg = (
|
|
2166
|
+
f"Invalid analysis name: {analysis}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
|
|
2167
|
+
)
|
|
2168
|
+
if analysis not in valid_analysis_names:
|
|
2169
|
+
raise ValueError(error_msg)
|
|
2170
|
+
configs[analysis] = {}
|
|
2171
|
+
else:
|
|
2172
|
+
name = analysis.pop("name")
|
|
2173
|
+
error_msg = f"Invalid analysis name: {name}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
|
|
2174
|
+
if name not in valid_analysis_names:
|
|
2175
|
+
raise ValueError(error_msg)
|
|
2176
|
+
configs[name] = analysis
|
|
2177
|
+
|
|
2178
|
+
client = OrcaClient._resolve_client()
|
|
2179
|
+
analysis = client.POST(
|
|
2180
|
+
"/memoryset/{name_or_id}/analysis",
|
|
2181
|
+
params={"name_or_id": self.id},
|
|
2182
|
+
json={
|
|
2183
|
+
"configs": configs,
|
|
2184
|
+
"lookup_count": lookup_count,
|
|
2185
|
+
"clear_metrics": clear_metrics,
|
|
2186
|
+
},
|
|
2187
|
+
)
|
|
2188
|
+
|
|
2189
|
+
def get_analysis_result():
|
|
2190
|
+
client = OrcaClient._resolve_client()
|
|
2191
|
+
return client.GET(
|
|
2192
|
+
"/memoryset/{name_or_id}/analysis/{analysis_task_id}",
|
|
2193
|
+
params={"name_or_id": self.id, "analysis_task_id": analysis["task_id"]},
|
|
2194
|
+
)["results"]
|
|
2195
|
+
|
|
2196
|
+
job = Job(analysis["task_id"], get_analysis_result)
|
|
2197
|
+
return job if background else job.result()
|
|
2198
|
+
|
|
2199
|
+
def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
|
|
2200
|
+
"""Group potential duplicates in the memoryset"""
|
|
2201
|
+
client = OrcaClient._resolve_client()
|
|
2202
|
+
response = client.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
|
|
2203
|
+
return [
|
|
2204
|
+
[cast(MemoryT, LabeledMemory(self.id, m) if "label" in m else ScoredMemory(self.id, m)) for m in ms]
|
|
2205
|
+
for ms in response
|
|
2206
|
+
]
|
|
2207
|
+
|
|
2208
|
+
|
|
2209
|
+
class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
2210
|
+
"""
|
|
2211
|
+
A Handle to a collection of memories with labels in the OrcaCloud
|
|
2212
|
+
|
|
2213
|
+
Attributes:
|
|
2214
|
+
id: Unique identifier for the memoryset
|
|
2215
|
+
name: Unique name of the memoryset
|
|
2216
|
+
description: Description of the memoryset
|
|
2217
|
+
label_names: Names for the class labels in the memoryset
|
|
2218
|
+
length: Number of memories in the memoryset
|
|
2219
|
+
embedding_model: Embedding model used to embed the memory values for semantic search
|
|
2220
|
+
created_at: When the memoryset was created, automatically generated on create
|
|
2221
|
+
updated_at: When the memoryset was last updated, automatically updated on updates
|
|
2222
|
+
"""
|
|
2223
|
+
|
|
2224
|
+
label_names: list[str]
|
|
2225
|
+
memory_type: MemoryType = "LABELED"
|
|
2226
|
+
|
|
2227
|
+
def __init__(self, metadata: MemorysetMetadata):
|
|
2228
|
+
super().__init__(metadata)
|
|
2229
|
+
assert metadata["label_names"] is not None
|
|
2230
|
+
self.label_names = metadata["label_names"]
|
|
2231
|
+
|
|
2232
|
+
def __eq__(self, other) -> bool:
|
|
2233
|
+
return isinstance(other, LabeledMemoryset) and self.id == other.id
|
|
2234
|
+
|
|
2235
|
+
@overload
|
|
2236
|
+
@classmethod
|
|
2237
|
+
def create(
|
|
2238
|
+
cls,
|
|
2239
|
+
name: str,
|
|
2240
|
+
datasource: Datasource,
|
|
2241
|
+
*,
|
|
2242
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2243
|
+
value_column: str = "value",
|
|
2244
|
+
label_column: str = "label",
|
|
2245
|
+
source_id_column: str | None = None,
|
|
2246
|
+
description: str | None = None,
|
|
2247
|
+
label_names: list[str] | None = None,
|
|
2248
|
+
max_seq_length_override: int | None = None,
|
|
2249
|
+
prompt: str | None = None,
|
|
2250
|
+
remove_duplicates: bool = True,
|
|
2251
|
+
index_type: IndexType = "FLAT",
|
|
2252
|
+
index_params: dict[str, Any] = {},
|
|
2253
|
+
if_exists: CreateMode = "error",
|
|
2254
|
+
background: Literal[True],
|
|
2255
|
+
hidden: bool = False,
|
|
2256
|
+
) -> Job[Self]:
|
|
2257
|
+
pass
|
|
2258
|
+
|
|
2259
|
+
@overload
|
|
2260
|
+
@classmethod
|
|
2261
|
+
def create(
|
|
2262
|
+
cls,
|
|
2263
|
+
name: str,
|
|
2264
|
+
datasource: Datasource,
|
|
2265
|
+
*,
|
|
2266
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2267
|
+
value_column: str = "value",
|
|
2268
|
+
label_column: str = "label",
|
|
2269
|
+
source_id_column: str | None = None,
|
|
2270
|
+
description: str | None = None,
|
|
2271
|
+
label_names: list[str] | None = None,
|
|
2272
|
+
max_seq_length_override: int | None = None,
|
|
2273
|
+
prompt: str | None = None,
|
|
2274
|
+
remove_duplicates: bool = True,
|
|
2275
|
+
index_type: IndexType = "FLAT",
|
|
2276
|
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[False] = False,
+        hidden: bool = False,
+    ) -> Self:
+        pass
+
+    @classmethod
+    def create( # type: ignore[override]
+        cls,
+        name: str,
+        datasource: Datasource,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str = "label",
+        source_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: bool = False,
+        hidden: bool = False,
+    ) -> Self | Job[Self]:
+        """
+        Create a new labeled memoryset in the OrcaCloud
+
+        All columns from the datasource that are not specified in the `value_column`,
+        `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
+
+        Params:
+            name: Name for the new memoryset (must be unique)
+            datasource: Source data to populate the memories in the memoryset
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
+            value_column: Name of the column in the datasource that contains the memory values
+            label_column: Name of the column in the datasource that contains the memory labels,
+                these must be contiguous integers starting from 0
+            source_id_column: Optional name of the column in the datasource that contains the ids in
+                the system of reference
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset, not the
+                datasource or the embedding model.
+            label_names: List of human-readable names for the labels in the memoryset, must match
+                the number of labels in the `label_column`. Will be automatically inferred if a
+                [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
+                labels is used as the datasource
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. The other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation non-blocking and return a job handle
+            hidden: Whether the memoryset should be hidden
+
+        Returns:
+            Handle to the new memoryset in the OrcaCloud
+
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"`, or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        return super().create(
+            name,
+            datasource,
+            label_column=label_column,
+            score_column=None,
+            embedding_model=embedding_model,
+            value_column=value_column,
+            source_id_column=source_id_column,
+            description=description,
+            label_names=label_names,
+            max_seq_length_override=max_seq_length_override,
+            prompt=prompt,
+            remove_duplicates=remove_duplicates,
+            index_type=index_type,
+            index_params=index_params,
+            if_exists=if_exists,
+            background=background,
+            hidden=hidden,
         )
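For orientation, here is a minimal usage sketch of the `create` classmethod added above. It assumes the class is the package's `LabeledMemoryset` and is importable from `orca_sdk`, and that a `Datasource` handle already exists; the dataset name, label names, and HNSW index parameters are illustrative only and not taken from the diff.

    from orca_sdk import LabeledMemoryset

    datasource = ...  # placeholder: an existing orca_sdk.Datasource with "value" and "label" columns

    # Blocking form: returns a handle to the new memoryset once the memories are inserted.
    memoryset = LabeledMemoryset.create(
        "sentiment-memories",                        # must be unique
        datasource,
        label_names=["negative", "positive"],        # names for the contiguous integer labels 0, 1
        index_type="HNSW",                           # one of the documented index types
        index_params={"M": 16, "efConstruction": 200},  # illustrative parameters, not from the diff
        if_exists="open",                            # reopen instead of raising if the name exists
    )

Passing `background=True` instead would, per the `Literal[True]` overload, return a `Job` handle rather than blocking until the memoryset is populated.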
-        wait_for_task(analysis.task_id, description="Analyzing labels")
-        analysis = get_analysis(self.id, analysis.task_id)
-        assert isinstance(analysis.result, AnalyzeNeighborLabelsResult)
-        # TODO: return a custom label analysis class instance with helper methods
-        return analysis.result.to_dict()

     def display_label_analysis(self):
-        """
+        """
+        Display an interactive UI to review and act upon the label analysis results
+
+        Note:
+            This method is only available in Jupyter notebooks.
+        """
         from ._utils.analysis_ui import display_suggested_memory_relabels

         display_suggested_memory_relabels(self)
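The docstring added above makes the notebook-only nature of the relabeling UI explicit; usage is a single call on an existing handle (sketch only, assuming a label analysis has already been run on the memoryset):

    # In a Jupyter notebook, renders the interactive relabeling UI inline.
    memoryset.display_label_analysis()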
+
+
+class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
+    """
+    A Handle to a collection of memories with scores in the OrcaCloud
+
+    Attributes:
+        id: Unique identifier for the memoryset
+        name: Unique name of the memoryset
+        description: Description of the memoryset
+        length: Number of memories in the memoryset
+        embedding_model: Embedding model used to embed the memory values for semantic search
+        created_at: When the memoryset was created, automatically generated on create
+        updated_at: When the memoryset was last updated, automatically updated on updates
+    """
+
+    memory_type: MemoryType = "SCORED"
+
+    def __eq__(self, other) -> bool:
+        return isinstance(other, ScoredMemoryset) and self.id == other.id
+
+    @overload
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        datasource: Datasource,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        score_column: str = "score",
+        source_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[True],
+        hidden: bool = False,
+    ) -> Job[Self]:
+        pass
+
+    @overload
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        datasource: Datasource,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        score_column: str = "score",
+        value_column: str = "value",
+        source_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[False] = False,
+        hidden: bool = False,
+    ) -> Self:
+        pass
+
+    @classmethod
+    def create( # type: ignore[override]
+        cls,
+        name: str,
+        datasource: Datasource,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        score_column: str = "score",
+        source_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: bool = False,
+        hidden: bool = False,
+    ) -> Self | Job[Self]:
+        """
+        Create a new scored memoryset in the OrcaCloud
+
+        All columns from the datasource that are not specified in the `value_column`,
+        `score_column`, or `source_id_column` will be stored as metadata in the memoryset.
+
+        Params:
+            name: Name for the new memoryset (must be unique)
+            datasource: Source data to populate the memories in the memoryset
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
+            value_column: Name of the column in the datasource that contains the memory values
+            score_column: Name of the column in the datasource that contains the memory scores
+            source_id_column: Optional name of the column in the datasource that contains the ids in
+                the system of reference
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset, not the
+                datasource or the embedding model.
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. The other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation non-blocking and return a job handle
+            hidden: Whether the memoryset should be hidden
+
+        Returns:
+            Handle to the new memoryset in the OrcaCloud
+
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"`, or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        return super().create(
+            name,
+            datasource,
+            embedding_model=embedding_model,
+            value_column=value_column,
+            score_column=score_column,
+            source_id_column=source_id_column,
+            description=description,
+            max_seq_length_override=max_seq_length_override,
+            prompt=prompt,
+            remove_duplicates=remove_duplicates,
+            index_type=index_type,
+            index_params=index_params,
+            if_exists=if_exists,
+            background=background,
+            hidden=hidden,
+        )
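As with the labeled variant, a minimal sketch of the new `ScoredMemoryset.create` overloads added above; it assumes `ScoredMemoryset` is importable from `orca_sdk` (otherwise from `orca_sdk.memoryset`) and that a suitable `Datasource` handle exists. Names are illustrative, and only the parameters shown in the diff are used.

    from orca_sdk import ScoredMemoryset

    datasource = ...  # placeholder: an existing orca_sdk.Datasource with "value" and "score" columns

    # Blocking form: returns a ScoredMemoryset handle once the memories are inserted.
    memoryset = ScoredMemoryset.create("review-scores", datasource)

    # Non-blocking form: per the `Literal[True]` overload, returns a Job[ScoredMemoryset]
    # handle instead of waiting for the insert to finish.
    job = ScoredMemoryset.create("review-scores-async", datasource, background=True)

The typed overloads let static checkers infer the return type from the literal `background` argument, so callers get either `ScoredMemoryset` or `Job[ScoredMemoryset]` without casts.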