orca-sdk 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +10 -4
- orca_sdk/_shared/__init__.py +10 -0
- orca_sdk/_shared/metrics.py +393 -0
- orca_sdk/_shared/metrics_test.py +273 -0
- orca_sdk/_utils/analysis_ui.py +12 -10
- orca_sdk/_utils/analysis_ui_style.css +0 -3
- orca_sdk/_utils/auth.py +27 -29
- orca_sdk/_utils/data_parsing.py +28 -2
- orca_sdk/_utils/data_parsing_test.py +15 -15
- orca_sdk/_utils/pagination.py +126 -0
- orca_sdk/_utils/pagination_test.py +132 -0
- orca_sdk/_utils/prediction_result_ui.py +67 -21
- orca_sdk/_utils/tqdm_file_reader.py +12 -0
- orca_sdk/_utils/value_parser.py +45 -0
- orca_sdk/_utils/value_parser_test.py +39 -0
- orca_sdk/classification_model.py +439 -129
- orca_sdk/classification_model_test.py +334 -104
- orca_sdk/client.py +3747 -0
- orca_sdk/conftest.py +164 -19
- orca_sdk/credentials.py +120 -18
- orca_sdk/credentials_test.py +20 -0
- orca_sdk/datasource.py +259 -68
- orca_sdk/datasource_test.py +242 -0
- orca_sdk/embedding_model.py +425 -82
- orca_sdk/embedding_model_test.py +39 -13
- orca_sdk/job.py +337 -0
- orca_sdk/job_test.py +108 -0
- orca_sdk/memoryset.py +1341 -305
- orca_sdk/memoryset_test.py +350 -111
- orca_sdk/regression_model.py +684 -0
- orca_sdk/regression_model_test.py +369 -0
- orca_sdk/telemetry.py +449 -143
- orca_sdk/telemetry_test.py +43 -24
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/METADATA +34 -16
- orca_sdk-0.1.2.dist-info/RECORD +40 -0
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/WHEEL +1 -1
- orca_sdk/_generated_api_client/__init__.py +0 -3
- orca_sdk/_generated_api_client/api/__init__.py +0 -193
- orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +0 -128
- orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +0 -170
- orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +0 -130
- orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +0 -183
- orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +0 -168
- orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +0 -154
- orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +0 -161
- orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +0 -190
- orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +0 -167
- orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +0 -127
- orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_get.py +0 -118
- orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +0 -118
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +0 -168
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +0 -189
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +0 -183
- orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +0 -168
- orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +0 -167
- orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +0 -184
- orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +0 -260
- orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +0 -193
- orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +0 -191
- orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +0 -187
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +0 -188
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +0 -157
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +0 -154
- orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +0 -156
- orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +0 -243
- orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +0 -162
- orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +0 -157
- orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +0 -127
- orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +0 -175
- orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +0 -171
- orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +0 -181
- orca_sdk/_generated_api_client/client.py +0 -216
- orca_sdk/_generated_api_client/errors.py +0 -38
- orca_sdk/_generated_api_client/models/__init__.py +0 -159
- orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +0 -84
- orca_sdk/_generated_api_client/models/api_key_metadata.py +0 -118
- orca_sdk/_generated_api_client/models/base_model.py +0 -55
- orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +0 -176
- orca_sdk/_generated_api_client/models/classification_evaluation_result.py +0 -114
- orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +0 -150
- orca_sdk/_generated_api_client/models/column_info.py +0 -114
- orca_sdk/_generated_api_client/models/column_type.py +0 -14
- orca_sdk/_generated_api_client/models/conflict_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/create_api_key_request.py +0 -99
- orca_sdk/_generated_api_client/models/create_api_key_response.py +0 -126
- orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +0 -259
- orca_sdk/_generated_api_client/models/create_rac_model_request.py +0 -209
- orca_sdk/_generated_api_client/models/datasource_metadata.py +0 -142
- orca_sdk/_generated_api_client/models/delete_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/embed_request.py +0 -127
- orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +0 -9
- orca_sdk/_generated_api_client/models/evaluation_request.py +0 -180
- orca_sdk/_generated_api_client/models/evaluation_response.py +0 -140
- orca_sdk/_generated_api_client/models/feedback_type.py +0 -9
- orca_sdk/_generated_api_client/models/field_validation_error.py +0 -103
- orca_sdk/_generated_api_client/models/filter_item.py +0 -231
- orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +0 -15
- orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +0 -16
- orca_sdk/_generated_api_client/models/filter_item_op.py +0 -16
- orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +0 -70
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +0 -259
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +0 -66
- orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +0 -166
- orca_sdk/_generated_api_client/models/get_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/internal_server_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/label_class_metrics.py +0 -108
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +0 -274
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/label_prediction_result.py +0 -101
- orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +0 -232
- orca_sdk/_generated_api_client/models/labeled_memory.py +0 -197
- orca_sdk/_generated_api_client/models/labeled_memory_insert.py +0 -108
- orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +0 -258
- orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +0 -277
- orca_sdk/_generated_api_client/models/labeled_memory_update.py +0 -171
- orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +0 -195
- orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_memories_request.py +0 -104
- orca_sdk/_generated_api_client/models/list_predictions_request.py +0 -234
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +0 -9
- orca_sdk/_generated_api_client/models/lookup_request.py +0 -81
- orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +0 -83
- orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +0 -9
- orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +0 -180
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +0 -66
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +0 -9
- orca_sdk/_generated_api_client/models/not_found_error_response.py +0 -100
- orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +0 -20
- orca_sdk/_generated_api_client/models/prediction_feedback.py +0 -157
- orca_sdk/_generated_api_client/models/prediction_feedback_category.py +0 -115
- orca_sdk/_generated_api_client/models/prediction_feedback_request.py +0 -122
- orca_sdk/_generated_api_client/models/prediction_feedback_result.py +0 -102
- orca_sdk/_generated_api_client/models/prediction_request.py +0 -169
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +0 -97
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +0 -11
- orca_sdk/_generated_api_client/models/rac_head_type.py +0 -11
- orca_sdk/_generated_api_client/models/rac_model_metadata.py +0 -191
- orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/task.py +0 -198
- orca_sdk/_generated_api_client/models/task_status.py +0 -14
- orca_sdk/_generated_api_client/models/task_status_info.py +0 -133
- orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +0 -72
- orca_sdk/_generated_api_client/models/unauthorized_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +0 -94
- orca_sdk/_generated_api_client/models/update_prediction_request.py +0 -93
- orca_sdk/_generated_api_client/py.typed +0 -1
- orca_sdk/_generated_api_client/types.py +0 -56
- orca_sdk/_utils/task.py +0 -73
- orca_sdk-0.1.1.dist-info/RECORD +0 -175
orca_sdk/memoryset.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
from abc import ABC
|
|
4
5
|
from datetime import datetime, timedelta
|
|
5
6
|
from os import PathLike
|
|
6
|
-
from typing import Any, Iterable, Literal, cast, overload
|
|
7
|
+
from typing import Any, Generic, Iterable, Literal, Self, TypeVar, cast, overload
|
|
7
8
|
|
|
8
9
|
import pandas as pd
|
|
9
10
|
import pyarrow as pa
|
|
@@ -11,62 +12,62 @@ from datasets import Dataset
|
|
|
11
12
|
from torch.utils.data import DataLoader as TorchDataLoader
|
|
12
13
|
from torch.utils.data import Dataset as TorchDataset
|
|
13
14
|
|
|
14
|
-
from .
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
get_analysis,
|
|
21
|
-
get_memories,
|
|
22
|
-
get_memory,
|
|
23
|
-
get_memoryset,
|
|
24
|
-
insert_memories_gpu,
|
|
25
|
-
list_memorysets,
|
|
26
|
-
memoryset_lookup_gpu,
|
|
27
|
-
query_memoryset,
|
|
28
|
-
update_memories_gpu,
|
|
29
|
-
update_memory_gpu,
|
|
30
|
-
)
|
|
31
|
-
from ._generated_api_client.models import (
|
|
32
|
-
AnalyzeNeighborLabelsResult,
|
|
33
|
-
CloneLabeledMemorysetRequest,
|
|
34
|
-
CreateLabeledMemorysetRequest,
|
|
35
|
-
DeleteMemoriesRequest,
|
|
15
|
+
from ._utils.common import UNSET, CreateMode, DropMode
|
|
16
|
+
from .client import (
|
|
17
|
+
CascadingEditSuggestion,
|
|
18
|
+
CloneMemorysetRequest,
|
|
19
|
+
CreateMemorysetRequest,
|
|
20
|
+
EmbeddingModelResult,
|
|
36
21
|
FilterItem,
|
|
37
|
-
FilterItemOp,
|
|
38
|
-
FindDuplicatesAnalysisResult,
|
|
39
|
-
GetMemoriesRequest,
|
|
40
22
|
)
|
|
41
|
-
from .
|
|
42
|
-
from .
|
|
23
|
+
from .client import LabeledMemory as LabeledMemoryResponse
|
|
24
|
+
from .client import (
|
|
43
25
|
LabeledMemoryInsert,
|
|
44
|
-
LabeledMemoryInsertMetadata,
|
|
45
|
-
)
|
|
46
|
-
from ._generated_api_client.models import (
|
|
47
|
-
LabeledMemoryLookup as LabeledMemoryLookupResponse,
|
|
48
26
|
)
|
|
49
|
-
from .
|
|
50
|
-
|
|
51
|
-
LabeledMemorysetMetadata,
|
|
27
|
+
from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
|
|
28
|
+
from .client import (
|
|
52
29
|
LabeledMemoryUpdate,
|
|
53
|
-
|
|
30
|
+
LabeledMemoryWithFeedbackMetrics,
|
|
54
31
|
LabelPredictionMemoryLookup,
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
32
|
+
MemoryMetrics,
|
|
33
|
+
MemorysetAnalysisConfigs,
|
|
34
|
+
MemorysetMetadata,
|
|
35
|
+
MemorysetMetrics,
|
|
36
|
+
MemorysetUpdate,
|
|
37
|
+
MemoryType,
|
|
38
|
+
)
|
|
39
|
+
from .client import ScoredMemory as ScoredMemoryResponse
|
|
40
|
+
from .client import (
|
|
41
|
+
ScoredMemoryInsert,
|
|
42
|
+
)
|
|
43
|
+
from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
|
|
44
|
+
from .client import (
|
|
45
|
+
ScoredMemoryUpdate,
|
|
46
|
+
ScoredMemoryWithFeedbackMetrics,
|
|
47
|
+
ScorePredictionMemoryLookup,
|
|
48
|
+
TelemetryFilterItem,
|
|
49
|
+
TelemetrySortOptions,
|
|
50
|
+
orca_api,
|
|
60
51
|
)
|
|
61
|
-
from ._generated_api_client.types import UNSET as CLIENT_UNSET
|
|
62
|
-
from ._utils.common import UNSET, CreateMode, DropMode
|
|
63
|
-
from ._utils.task import wait_for_task
|
|
64
52
|
from .datasource import Datasource
|
|
65
53
|
from .embedding_model import (
|
|
54
|
+
EmbeddingModelBase,
|
|
66
55
|
FinetunedEmbeddingModel,
|
|
67
56
|
PretrainedEmbeddingModel,
|
|
68
|
-
_EmbeddingModel,
|
|
69
57
|
)
|
|
58
|
+
from .job import Job, Status
|
|
59
|
+
|
|
60
|
+
TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
|
|
61
|
+
"""
|
|
62
|
+
Sort expression for telemetry data consisting of a field and a direction.
|
|
63
|
+
|
|
64
|
+
* **`field`**: The field to sort on.
|
|
65
|
+
* **`direction`**: The direction to sort in.
|
|
66
|
+
|
|
67
|
+
Examples:
|
|
68
|
+
>>> ("feedback_metrics.accuracy.avg", "asc")
|
|
69
|
+
>>> ("lookup.count", "desc")
|
|
70
|
+
"""
|
|
70
71
|
|
|
71
72
|
FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"]
|
|
72
73
|
"""
|
|
@@ -90,62 +91,249 @@ Examples:
|
|
|
90
91
|
>>> ("label", "==", 0)
|
|
91
92
|
>>> ("metadata.author", "like", "John")
|
|
92
93
|
>>> ("source_id", "in", ["123", "456"])
|
|
94
|
+
>>> ("feedback_metrics.accuracy.avg", ">", 0.95)
|
|
93
95
|
"""
|
|
94
96
|
|
|
97
|
+
IndexType = Literal["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ", "HNSW", "DISKANN"]
|
|
98
|
+
|
|
99
|
+
DEFAULT_COLUMN_NAMES = {"value", "source_id"}
|
|
100
|
+
TYPE_SPECIFIC_COLUMN_NAMES = {"label", "score"}
|
|
101
|
+
FORBIDDEN_METADATA_COLUMN_NAMES = {
|
|
102
|
+
"memory_id",
|
|
103
|
+
"memory_version",
|
|
104
|
+
"embedding",
|
|
105
|
+
"created_at",
|
|
106
|
+
"updated_at",
|
|
107
|
+
"metrics",
|
|
108
|
+
"feedback_metrics",
|
|
109
|
+
"lookup",
|
|
110
|
+
}
|
|
95
111
|
|
|
96
|
-
DEFAULT_COLUMN_NAMES = {"value", "label", "source_id"}
|
|
97
|
-
FORBIDDEN_METADATA_COLUMN_NAMES = {"memory_id", "memory_version", "embedding", "created_at", "updated_at", "metrics"}
|
|
98
112
|
|
|
113
|
+
def _is_metric_column(column: str):
|
|
114
|
+
return column in ["feedback_metrics", "lookup"]
|
|
99
115
|
|
|
100
|
-
|
|
116
|
+
|
|
117
|
+
def _parse_filter_item_from_tuple(input: FilterItemTuple) -> FilterItem | TelemetryFilterItem:
|
|
101
118
|
field = input[0].split(".")
|
|
102
|
-
if
|
|
119
|
+
if (
|
|
120
|
+
len(field) == 1
|
|
121
|
+
and field[0] not in DEFAULT_COLUMN_NAMES | TYPE_SPECIFIC_COLUMN_NAMES | FORBIDDEN_METADATA_COLUMN_NAMES
|
|
122
|
+
):
|
|
103
123
|
field = ["metadata", field[0]]
|
|
104
|
-
op =
|
|
124
|
+
op = input[1]
|
|
105
125
|
value = input[2]
|
|
126
|
+
if isinstance(value, datetime):
|
|
127
|
+
value = value.isoformat()
|
|
128
|
+
if _is_metric_column(field[0]):
|
|
129
|
+
if not (
|
|
130
|
+
(isinstance(value, list) and all(isinstance(v, float) or isinstance(v, int) for v in value))
|
|
131
|
+
or isinstance(value, float)
|
|
132
|
+
or isinstance(value, int)
|
|
133
|
+
):
|
|
134
|
+
raise ValueError(f"Invalid value for {field[0]} filter: {value}")
|
|
135
|
+
if field[0] == "feedback_metrics" and (len(field) != 3 or field[2] not in ["avg", "count"]):
|
|
136
|
+
raise ValueError(
|
|
137
|
+
"Feedback metrics filters must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
|
|
138
|
+
)
|
|
139
|
+
elif field[0] == "lookup" and (len(field) != 2 or field[1] != "count"):
|
|
140
|
+
raise ValueError("Lookup filters must follow the format `lookup.count`")
|
|
141
|
+
if op == "like":
|
|
142
|
+
raise ValueError("Like filters are not supported on metric columns")
|
|
143
|
+
op = cast(Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in"], op)
|
|
144
|
+
value = cast(float | int | list[float] | list[int], value)
|
|
145
|
+
return TelemetryFilterItem(field=field, op=op, value=value)
|
|
146
|
+
|
|
106
147
|
return FilterItem(field=field, op=op, value=value)
|
|
107
148
|
|
|
108
149
|
|
|
109
|
-
def
|
|
150
|
+
def _parse_sort_item_from_tuple(
|
|
151
|
+
input: TelemetrySortItem,
|
|
152
|
+
) -> TelemetrySortOptions:
|
|
153
|
+
field = input[0].split(".")
|
|
154
|
+
|
|
155
|
+
if len(field) == 1:
|
|
156
|
+
raise ValueError("Sort field must be a telemetry field with an aggregate function name value")
|
|
157
|
+
if field[0] not in ["feedback_metrics", "lookup"]:
|
|
158
|
+
raise ValueError("Sort field must be one of telemetry fields: feedback_metrics or lookup")
|
|
159
|
+
if field[0] == "feedback_metrics":
|
|
160
|
+
if len(field) != 3:
|
|
161
|
+
raise ValueError(
|
|
162
|
+
"Feedback metrics must follow the format `feedback_metrics.<feedback_category_name>.<avg | count>`"
|
|
163
|
+
)
|
|
164
|
+
if field[2] not in ["avg", "count"]:
|
|
165
|
+
raise ValueError("Feedback metrics can only be sorted on avg or count")
|
|
166
|
+
if field[0] == "lookup":
|
|
167
|
+
if len(field) != 2:
|
|
168
|
+
raise ValueError("Lookup must follow the format `lookup.count`")
|
|
169
|
+
if field[1] != "count":
|
|
170
|
+
raise ValueError("Lookup can only be sorted on count")
|
|
171
|
+
return TelemetrySortOptions(field=field, direction=input[1])
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _parse_memory_insert(memory: dict[str, Any], type: MemoryType) -> LabeledMemoryInsert | ScoredMemoryInsert:
|
|
110
175
|
value = memory.get("value")
|
|
111
176
|
if not isinstance(value, str):
|
|
112
177
|
raise ValueError("Memory value must be a string")
|
|
113
|
-
label = memory.get("label")
|
|
114
|
-
if not isinstance(label, int):
|
|
115
|
-
raise ValueError("Memory label must be an integer")
|
|
116
178
|
source_id = memory.get("source_id")
|
|
117
179
|
if source_id and not isinstance(source_id, str):
|
|
118
180
|
raise ValueError("Memory source_id must be a string")
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
181
|
+
match type:
|
|
182
|
+
case "LABELED":
|
|
183
|
+
label = memory.get("label")
|
|
184
|
+
if label is not None and not isinstance(label, int):
|
|
185
|
+
raise ValueError("Memory label must be an integer")
|
|
186
|
+
metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"label"}}
|
|
187
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
188
|
+
raise ValueError(
|
|
189
|
+
f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
190
|
+
)
|
|
191
|
+
return {"value": value, "label": label, "source_id": source_id, "metadata": metadata}
|
|
192
|
+
case "SCORED":
|
|
193
|
+
score = memory.get("score")
|
|
194
|
+
if score is not None and not isinstance(score, (int, float)):
|
|
195
|
+
raise ValueError("Memory score must be a number")
|
|
196
|
+
metadata = {k: v for k, v in memory.items() if k not in DEFAULT_COLUMN_NAMES | {"score"}}
|
|
197
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
198
|
+
raise ValueError(
|
|
199
|
+
f"The following column names are reserved: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
200
|
+
)
|
|
201
|
+
return {"value": value, "score": score, "source_id": source_id, "metadata": metadata}
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _parse_memory_update(update: dict[str, Any], type: MemoryType) -> LabeledMemoryUpdate | ScoredMemoryUpdate:
|
|
126
205
|
if "memory_id" not in update:
|
|
127
206
|
raise ValueError("memory_id must be specified in the update dictionary")
|
|
128
207
|
memory_id = update["memory_id"]
|
|
129
208
|
if not isinstance(memory_id, str):
|
|
130
209
|
raise ValueError("memory_id must be a string")
|
|
131
|
-
|
|
132
|
-
if value
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
210
|
+
payload: LabeledMemoryUpdate | ScoredMemoryUpdate = {"memory_id": memory_id}
|
|
211
|
+
if "value" in update:
|
|
212
|
+
if not isinstance(update["value"], str):
|
|
213
|
+
raise ValueError("value must be a string or unset")
|
|
214
|
+
payload["value"] = update["value"]
|
|
215
|
+
if "source_id" in update:
|
|
216
|
+
if not isinstance(update["source_id"], str):
|
|
217
|
+
raise ValueError("source_id must be a string or unset")
|
|
218
|
+
payload["source_id"] = update["source_id"]
|
|
219
|
+
match type:
|
|
220
|
+
case "LABELED":
|
|
221
|
+
payload = cast(LabeledMemoryUpdate, payload)
|
|
222
|
+
if "label" in update:
|
|
223
|
+
if not isinstance(update["label"], int):
|
|
224
|
+
raise ValueError("label must be an integer or unset")
|
|
225
|
+
payload["label"] = update["label"]
|
|
226
|
+
metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "label"}}
|
|
227
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
228
|
+
raise ValueError(
|
|
229
|
+
f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
230
|
+
)
|
|
231
|
+
payload["metadata"] = metadata
|
|
232
|
+
return payload
|
|
233
|
+
case "SCORED":
|
|
234
|
+
payload = cast(ScoredMemoryUpdate, payload)
|
|
235
|
+
if "score" in update:
|
|
236
|
+
if not isinstance(update["score"], (int, float)):
|
|
237
|
+
raise ValueError("score must be a number or unset")
|
|
238
|
+
payload["score"] = update["score"]
|
|
239
|
+
metadata = {k: v for k, v in update.items() if k not in DEFAULT_COLUMN_NAMES | {"memory_id", "score"}}
|
|
240
|
+
if any(k in metadata for k in FORBIDDEN_METADATA_COLUMN_NAMES):
|
|
241
|
+
raise ValueError(
|
|
242
|
+
f"Cannot update the following metadata keys: {', '.join(FORBIDDEN_METADATA_COLUMN_NAMES)}"
|
|
243
|
+
)
|
|
244
|
+
payload["metadata"] = metadata
|
|
245
|
+
return cast(ScoredMemoryUpdate, payload)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class MemoryBase(ABC):
|
|
249
|
+
value: str
|
|
250
|
+
embedding: list[float]
|
|
251
|
+
source_id: str | None
|
|
252
|
+
created_at: datetime
|
|
253
|
+
updated_at: datetime
|
|
254
|
+
metadata: dict[str, str | float | int | bool | None]
|
|
255
|
+
metrics: MemoryMetrics
|
|
256
|
+
memory_id: str
|
|
257
|
+
memory_version: int
|
|
258
|
+
feedback_metrics: dict[str, Any]
|
|
259
|
+
lookup_count: int
|
|
260
|
+
memory_type: MemoryType # defined by subclasses
|
|
261
|
+
|
|
262
|
+
def __init__(
|
|
263
|
+
self,
|
|
264
|
+
memoryset_id: str,
|
|
265
|
+
memory: (
|
|
266
|
+
LabeledMemoryResponse
|
|
267
|
+
| LabeledMemoryLookupResponse
|
|
268
|
+
| LabeledMemoryWithFeedbackMetrics
|
|
269
|
+
| LabelPredictionMemoryLookup
|
|
270
|
+
| ScoredMemoryResponse
|
|
271
|
+
| ScoredMemoryLookupResponse
|
|
272
|
+
| ScoredMemoryWithFeedbackMetrics
|
|
273
|
+
| ScorePredictionMemoryLookup
|
|
274
|
+
),
|
|
275
|
+
):
|
|
276
|
+
# for internal use only, do not document
|
|
277
|
+
self.memoryset_id = memoryset_id
|
|
278
|
+
self.memory_id = memory["memory_id"]
|
|
279
|
+
self.memory_version = memory["memory_version"]
|
|
280
|
+
self.value = cast(str, memory["value"])
|
|
281
|
+
self.embedding = memory["embedding"]
|
|
282
|
+
self.source_id = memory["source_id"]
|
|
283
|
+
self.created_at = datetime.fromisoformat(memory["created_at"])
|
|
284
|
+
self.updated_at = datetime.fromisoformat(memory["updated_at"])
|
|
285
|
+
self.metadata = memory["metadata"]
|
|
286
|
+
self.metrics = memory["metrics"] if "metrics" in memory else {}
|
|
287
|
+
self.feedback_metrics = memory.get("feedback_metrics", {}) or {}
|
|
288
|
+
self.lookup_count = memory.get("lookup_count", 0)
|
|
289
|
+
|
|
290
|
+
def __getattr__(self, key: str) -> Any:
|
|
291
|
+
if key.startswith("__") or key not in self.metadata:
|
|
292
|
+
raise AttributeError(f"{key} is not a valid attribute")
|
|
293
|
+
return self.metadata[key]
|
|
294
|
+
|
|
295
|
+
def _update(
|
|
296
|
+
self,
|
|
297
|
+
*,
|
|
298
|
+
value: str = UNSET,
|
|
299
|
+
source_id: str | None = UNSET,
|
|
300
|
+
**metadata: None | bool | float | int | str,
|
|
301
|
+
) -> Self:
|
|
302
|
+
response = orca_api.PATCH(
|
|
303
|
+
"/gpu/memoryset/{name_or_id}/memory",
|
|
304
|
+
params={"name_or_id": self.memoryset_id},
|
|
305
|
+
json=_parse_memory_update(
|
|
306
|
+
{"memory_id": self.memory_id}
|
|
307
|
+
| ({"value": value} if value is not UNSET else {})
|
|
308
|
+
| ({"source_id": source_id} if source_id is not UNSET else {})
|
|
309
|
+
| {k: v for k, v in metadata.items() if v is not UNSET},
|
|
310
|
+
type=self.memory_type,
|
|
311
|
+
),
|
|
312
|
+
)
|
|
313
|
+
self.__dict__.update(self.__class__(self.memoryset_id, response).__dict__)
|
|
314
|
+
return self
|
|
315
|
+
|
|
316
|
+
def to_dict(self) -> dict[str, Any]:
|
|
317
|
+
"""
|
|
318
|
+
Convert the memory to a dictionary
|
|
319
|
+
"""
|
|
320
|
+
return {
|
|
321
|
+
"value": self.value,
|
|
322
|
+
"embedding": self.embedding,
|
|
323
|
+
"source_id": self.source_id,
|
|
324
|
+
"created_at": self.created_at,
|
|
325
|
+
"updated_at": self.updated_at,
|
|
326
|
+
"metadata": self.metadata,
|
|
327
|
+
"metrics": self.metrics,
|
|
328
|
+
"memory_id": self.memory_id,
|
|
329
|
+
"memory_version": self.memory_version,
|
|
330
|
+
"feedback_metrics": self.feedback_metrics,
|
|
331
|
+
"lookup_count": self.lookup_count,
|
|
332
|
+
"memory_type": self.memory_type,
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
class LabeledMemory(MemoryBase):
|
|
149
337
|
"""
|
|
150
338
|
A row of the [`LabeledMemoryset`][orca_sdk.LabeledMemoryset]
|
|
151
339
|
|
|
@@ -170,47 +358,30 @@ class LabeledMemory:
|
|
|
170
358
|
* **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
|
|
171
359
|
"""
|
|
172
360
|
|
|
173
|
-
|
|
174
|
-
embedding: list[float]
|
|
175
|
-
label: int
|
|
361
|
+
label: int | None
|
|
176
362
|
label_name: str | None
|
|
177
|
-
|
|
178
|
-
created_at: datetime
|
|
179
|
-
updated_at: datetime
|
|
180
|
-
metadata: dict[str, str | float | int | bool | None]
|
|
181
|
-
metrics: LabeledMemoryMetrics | None
|
|
182
|
-
memory_id: str
|
|
183
|
-
memory_version: int
|
|
363
|
+
memory_type = "LABELED"
|
|
184
364
|
|
|
185
365
|
def __init__(
|
|
186
366
|
self,
|
|
187
367
|
memoryset_id: str,
|
|
188
|
-
memory:
|
|
368
|
+
memory: (
|
|
369
|
+
LabeledMemoryResponse
|
|
370
|
+
| LabeledMemoryLookupResponse
|
|
371
|
+
| LabelPredictionMemoryLookup
|
|
372
|
+
| LabeledMemoryWithFeedbackMetrics
|
|
373
|
+
),
|
|
189
374
|
):
|
|
190
375
|
# for internal use only, do not document
|
|
191
|
-
|
|
192
|
-
self.
|
|
193
|
-
self.
|
|
194
|
-
self.value = memory.value
|
|
195
|
-
self.embedding = memory.embedding
|
|
196
|
-
self.label = memory.label
|
|
197
|
-
self.label_name = memory.label_name
|
|
198
|
-
self.source_id = memory.source_id
|
|
199
|
-
self.created_at = memory.created_at
|
|
200
|
-
self.updated_at = memory.updated_at
|
|
201
|
-
self.metadata = memory.metadata.to_dict()
|
|
202
|
-
self.metrics = memory.metrics
|
|
203
|
-
|
|
204
|
-
def __getattr__(self, key: str) -> Any:
|
|
205
|
-
if key.startswith("__") or key not in self.metadata:
|
|
206
|
-
raise AttributeError(f"{key} is not a valid attribute")
|
|
207
|
-
return self.metadata[key]
|
|
376
|
+
super().__init__(memoryset_id, memory)
|
|
377
|
+
self.label = memory["label"]
|
|
378
|
+
self.label_name = memory["label_name"]
|
|
208
379
|
|
|
209
380
|
def __repr__(self) -> str:
|
|
210
381
|
return (
|
|
211
382
|
"LabeledMemory({ "
|
|
212
383
|
+ f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
|
|
213
|
-
+ f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
|
|
384
|
+
+ f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
|
|
214
385
|
+ (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
|
|
215
386
|
+ " })"
|
|
216
387
|
)
|
|
@@ -222,7 +393,7 @@ class LabeledMemory:
|
|
|
222
393
|
self,
|
|
223
394
|
*,
|
|
224
395
|
value: str = UNSET,
|
|
225
|
-
label: int = UNSET,
|
|
396
|
+
label: int | None = UNSET,
|
|
226
397
|
source_id: str | None = UNSET,
|
|
227
398
|
**metadata: None | bool | float | int | str,
|
|
228
399
|
) -> LabeledMemory:
|
|
@@ -241,19 +412,18 @@ class LabeledMemory:
|
|
|
241
412
|
Returns:
|
|
242
413
|
The updated memory
|
|
243
414
|
"""
|
|
244
|
-
|
|
245
|
-
self.memoryset_id,
|
|
246
|
-
body=_parse_memory_update(
|
|
247
|
-
{"memory_id": self.memory_id}
|
|
248
|
-
| ({"value": value} if value is not UNSET else {})
|
|
249
|
-
| ({"label": label} if label is not UNSET else {})
|
|
250
|
-
| ({"source_id": source_id} if source_id is not UNSET else {})
|
|
251
|
-
| metadata
|
|
252
|
-
),
|
|
253
|
-
)
|
|
254
|
-
self.__dict__.update(LabeledMemory(self.memoryset_id, response).__dict__)
|
|
415
|
+
self._update(value=value, label=label, source_id=source_id, **metadata)
|
|
255
416
|
return self
|
|
256
417
|
|
|
418
|
+
def to_dict(self) -> dict[str, Any]:
|
|
419
|
+
"""
|
|
420
|
+
Convert the memory to a dictionary
|
|
421
|
+
"""
|
|
422
|
+
super_dict = super().to_dict()
|
|
423
|
+
super_dict["label"] = self.label
|
|
424
|
+
super_dict["label_name"] = self.label_name
|
|
425
|
+
return super_dict
|
|
426
|
+
|
|
257
427
|
|
|
258
428
|
class LabeledMemoryLookup(LabeledMemory):
|
|
259
429
|
"""
|
|
@@ -289,10 +459,8 @@ class LabeledMemoryLookup(LabeledMemory):
|
|
|
289
459
|
def __init__(self, memoryset_id: str, memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup):
|
|
290
460
|
# for internal use only, do not document
|
|
291
461
|
super().__init__(memoryset_id, memory_lookup)
|
|
292
|
-
self.lookup_score = memory_lookup
|
|
293
|
-
self.attention_weight =
|
|
294
|
-
memory_lookup.attention_weight if isinstance(memory_lookup, LabelPredictionMemoryLookup) else None
|
|
295
|
-
)
|
|
462
|
+
self.lookup_score = memory_lookup["lookup_score"]
|
|
463
|
+
self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
|
|
296
464
|
|
|
297
465
|
def __repr__(self) -> str:
|
|
298
466
|
return (
|
|
@@ -300,20 +468,155 @@ class LabeledMemoryLookup(LabeledMemory):
|
|
|
300
468
|
+ f"label: {('<' + self.label_name + ': ' + str(self.label) + '>') if self.label_name else str(self.label)}"
|
|
301
469
|
+ f", lookup_score: {self.lookup_score:.2f}"
|
|
302
470
|
+ (f", attention_weight: {self.attention_weight:.2f}" if self.attention_weight is not None else "")
|
|
303
|
-
+ f", value: '{self.value[:100] + '...' if len(self.value) > 100 else self.value}'"
|
|
471
|
+
+ f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
|
|
472
|
+
+ (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
|
|
473
|
+
+ " })"
|
|
474
|
+
)
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
class ScoredMemory(MemoryBase):
|
|
478
|
+
"""
|
|
479
|
+
A row of the [`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
|
|
480
|
+
|
|
481
|
+
Attributes:
|
|
482
|
+
value: Value represented by the row
|
|
483
|
+
embedding: Embedding of the value of the memory for semantic search, automatically generated
|
|
484
|
+
with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
|
|
485
|
+
score: Score of the memory
|
|
486
|
+
source_id: Optional unique identifier of the memory in a system of reference
|
|
487
|
+
metrics: Metrics about the memory, generated when running an analysis on the
|
|
488
|
+
[`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
|
|
489
|
+
metadata: Metadata associated with the memory that is not used in the model. Metadata
|
|
490
|
+
properties are also accessible as individual attributes on the instance.
|
|
491
|
+
memory_id: Unique identifier for the memory, automatically generated on insert
|
|
492
|
+
memory_version: Version of the memory, automatically updated when the score or value changes
|
|
493
|
+
created_at: When the memory was created, automatically generated on insert
|
|
494
|
+
updated_at: When the memory was last updated, automatically updated on update
|
|
495
|
+
|
|
496
|
+
## Other Attributes:
|
|
497
|
+
* **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
|
|
498
|
+
"""
|
|
499
|
+
|
|
500
|
+
score: float | None
|
|
501
|
+
memory_type = "SCORED"
|
|
502
|
+
|
|
503
|
+
def __init__(
|
|
504
|
+
self,
|
|
505
|
+
memoryset_id: str,
|
|
506
|
+
memory: (
|
|
507
|
+
ScoredMemoryResponse
|
|
508
|
+
| ScoredMemoryLookupResponse
|
|
509
|
+
| ScorePredictionMemoryLookup
|
|
510
|
+
| ScoredMemoryWithFeedbackMetrics
|
|
511
|
+
),
|
|
512
|
+
):
|
|
513
|
+
# for internal use only, do not document
|
|
514
|
+
super().__init__(memoryset_id, memory)
|
|
515
|
+
self.score = memory["score"]
|
|
516
|
+
|
|
517
|
+
def __repr__(self) -> str:
|
|
518
|
+
return (
|
|
519
|
+
"ScoredMemory({ "
|
|
520
|
+
+ f"score: {self.score:.2f}"
|
|
521
|
+
+ f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
|
|
304
522
|
+ (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
|
|
305
523
|
+ " })"
|
|
306
524
|
)
|
|
307
525
|
|
|
526
|
+
def __eq__(self, other: object) -> bool:
|
|
527
|
+
return isinstance(other, ScoredMemory) and self.memory_id == other.memory_id
|
|
528
|
+
|
|
529
|
+
def update(
|
|
530
|
+
self,
|
|
531
|
+
*,
|
|
532
|
+
value: str = UNSET,
|
|
533
|
+
score: float | None = UNSET,
|
|
534
|
+
source_id: str | None = UNSET,
|
|
535
|
+
**metadata: None | bool | float | int | str,
|
|
536
|
+
) -> ScoredMemory:
|
|
537
|
+
"""
|
|
538
|
+
Update the memory with new values
|
|
539
|
+
|
|
540
|
+
Note:
|
|
541
|
+
If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
|
|
542
|
+
|
|
543
|
+
Params:
|
|
544
|
+
value: New value of the memory
|
|
545
|
+
score: New score of the memory
|
|
546
|
+
source_id: New source ID of the memory
|
|
547
|
+
**metadata: New values for metadata properties
|
|
548
|
+
|
|
549
|
+
Returns:
|
|
550
|
+
The updated memory
|
|
551
|
+
"""
|
|
552
|
+
self._update(value=value, score=score, source_id=source_id, **metadata)
|
|
553
|
+
return self
|
|
554
|
+
|
|
555
|
+
def to_dict(self) -> dict[str, Any]:
|
|
556
|
+
"""
|
|
557
|
+
Convert the memory to a dictionary
|
|
558
|
+
"""
|
|
559
|
+
super_dict = super().to_dict()
|
|
560
|
+
super_dict["score"] = self.score
|
|
561
|
+
return super_dict
|
|
308
562
|
|
|
309
|
-
|
|
563
|
+
|
|
564
|
+
class ScoredMemoryLookup(ScoredMemory):
|
|
565
|
+
"""
|
|
566
|
+
Lookup result for a memory in a memoryset
|
|
567
|
+
|
|
568
|
+
Attributes:
|
|
569
|
+
lookup_score: Similarity between the memory embedding and search query embedding
|
|
570
|
+
attention_weight: Weight the model assigned to the memory during prediction if this lookup
|
|
571
|
+
happened as part of a prediction
|
|
572
|
+
value: Value represented by the row
|
|
573
|
+
embedding: Embedding of the value of the memory for semantic search, automatically generated
|
|
574
|
+
with the [`ScoredMemoryset.embedding_model`][orca_sdk.ScoredMemoryset]
|
|
575
|
+
score: Score of the memory
|
|
576
|
+
source_id: Optional unique identifier of the memory in a system of reference
|
|
577
|
+
metrics: Metrics about the memory, generated when running an analysis on the
|
|
578
|
+
[`ScoredMemoryset`][orca_sdk.ScoredMemoryset]
|
|
579
|
+
memory_id: The unique identifier for the memory, automatically generated on insert
|
|
580
|
+
memory_version: The version of the memory, automatically updated when the score or value changes
|
|
581
|
+
created_at: When the memory was created, automatically generated on insert
|
|
582
|
+
updated_at: When the memory was last updated, automatically updated on update
|
|
583
|
+
|
|
584
|
+
## Other Attributes:
|
|
585
|
+
* **`...`** (<code>[str][str] | [float][float] | [int][int] | [bool][bool] | None</code>): All metadata properties can be accessed as attributes
|
|
586
|
+
"""
|
|
587
|
+
|
|
588
|
+
lookup_score: float
|
|
589
|
+
attention_weight: float | None
|
|
590
|
+
|
|
591
|
+
def __init__(self, memoryset_id: str, memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup):
|
|
592
|
+
# for internal use only, do not document
|
|
593
|
+
super().__init__(memoryset_id, memory_lookup)
|
|
594
|
+
self.lookup_score = memory_lookup["lookup_score"]
|
|
595
|
+
self.attention_weight = memory_lookup["attention_weight"] if "attention_weight" in memory_lookup else None
|
|
596
|
+
|
|
597
|
+
def __repr__(self) -> str:
|
|
598
|
+
return (
|
|
599
|
+
"ScoredMemoryLookup({ "
|
|
600
|
+
+ f"score: {self.score:.2f}"
|
|
601
|
+
+ f", lookup_score: {self.lookup_score:.2f}"
|
|
602
|
+
+ f", value: '{self.value[:100] + '...' if isinstance(self.value, str) and len(self.value) > 100 else self.value}'"
|
|
603
|
+
+ (f", source_id: '{self.source_id}'" if self.source_id is not None else "")
|
|
604
|
+
+ " })"
|
|
605
|
+
)
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
MemoryT = TypeVar("MemoryT", bound=MemoryBase)
|
|
609
|
+
MemoryLookupT = TypeVar("MemoryLookupT", bound=MemoryBase)
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
310
613
|
"""
|
|
311
614
|
A Handle to a collection of memories with labels in the OrcaCloud
|
|
312
615
|
|
|
313
616
|
Attributes:
|
|
314
617
|
id: Unique identifier for the memoryset
|
|
315
618
|
name: Unique name of the memoryset
|
|
316
|
-
|
|
619
|
+
description: Description of the memoryset
|
|
317
620
|
length: Number of memories in the memoryset
|
|
318
621
|
embedding_model: Embedding model used to embed the memory values for semantic search
|
|
319
622
|
created_at: When the memoryset was created, automatically generated on create
|
|
@@ -322,43 +625,52 @@ class LabeledMemoryset:
|
|
|
322
625
|
|
|
323
626
|
id: str
|
|
324
627
|
name: str
|
|
325
|
-
|
|
628
|
+
description: str | None
|
|
629
|
+
memory_type: MemoryType # defined by subclasses
|
|
630
|
+
|
|
326
631
|
length: int
|
|
327
632
|
created_at: datetime
|
|
328
633
|
updated_at: datetime
|
|
329
|
-
insertion_status:
|
|
330
|
-
embedding_model:
|
|
634
|
+
insertion_status: Status
|
|
635
|
+
embedding_model: EmbeddingModelBase
|
|
636
|
+
index_type: IndexType
|
|
637
|
+
index_params: dict[str, Any]
|
|
638
|
+
hidden: bool
|
|
331
639
|
|
|
332
|
-
def __init__(self, metadata:
|
|
640
|
+
def __init__(self, metadata: MemorysetMetadata):
|
|
333
641
|
# for internal use only, do not document
|
|
334
|
-
if metadata
|
|
335
|
-
self.embedding_model = PretrainedEmbeddingModel._get(metadata
|
|
336
|
-
elif metadata
|
|
337
|
-
self.embedding_model = FinetunedEmbeddingModel.open(metadata
|
|
642
|
+
if metadata["pretrained_embedding_model_name"]:
|
|
643
|
+
self.embedding_model = PretrainedEmbeddingModel._get(metadata["pretrained_embedding_model_name"])
|
|
644
|
+
elif metadata["finetuned_embedding_model_id"]:
|
|
645
|
+
self.embedding_model = FinetunedEmbeddingModel.open(metadata["finetuned_embedding_model_id"])
|
|
338
646
|
else:
|
|
339
647
|
raise ValueError("Either pretrained_embedding_model_name or finetuned_embedding_model_id must be provided")
|
|
340
|
-
self.id = metadata
|
|
341
|
-
self.name = metadata
|
|
342
|
-
self.
|
|
343
|
-
self.length = metadata
|
|
344
|
-
self.created_at = metadata
|
|
345
|
-
self.updated_at = metadata
|
|
346
|
-
self.insertion_status = metadata
|
|
648
|
+
self.id = metadata["id"]
|
|
649
|
+
self.name = metadata["name"]
|
|
650
|
+
self.description = metadata["description"]
|
|
651
|
+
self.length = metadata["length"]
|
|
652
|
+
self.created_at = datetime.fromisoformat(metadata["created_at"])
|
|
653
|
+
self.updated_at = datetime.fromisoformat(metadata["updated_at"])
|
|
654
|
+
self.insertion_status = Status(metadata["insertion_status"])
|
|
347
655
|
self._last_refresh = datetime.now()
|
|
656
|
+
self.index_type = metadata["index_type"]
|
|
657
|
+
self.index_params = metadata["index_params"]
|
|
658
|
+
self.memory_type = metadata["memory_type"]
|
|
659
|
+
self.hidden = metadata["hidden"]
|
|
348
660
|
|
|
349
661
|
def __eq__(self, other) -> bool:
|
|
350
|
-
return isinstance(other,
|
|
662
|
+
return isinstance(other, MemorysetBase) and self.id == other.id
|
|
351
663
|
|
|
352
664
|
def __repr__(self) -> str:
|
|
353
665
|
return (
|
|
354
|
-
"
|
|
666
|
+
"Memoryset({\n"
|
|
355
667
|
f" name: '{self.name}',\n"
|
|
356
668
|
f" length: {self.length},\n"
|
|
357
|
-
f" label_names: {self.label_names},\n"
|
|
358
669
|
f" embedding_model: {self.embedding_model},\n"
|
|
359
670
|
"})"
|
|
360
671
|
)
|
|
361
672
|
|
|
673
|
+
@overload
|
|
362
674
|
@classmethod
|
|
363
675
|
def create(
|
|
364
676
|
cls,
|
|
@@ -367,12 +679,69 @@ class LabeledMemoryset:
|
|
|
367
679
|
*,
|
|
368
680
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
369
681
|
value_column: str = "value",
|
|
370
|
-
label_column: str =
|
|
682
|
+
label_column: str | None = None,
|
|
683
|
+
score_column: str | None = None,
|
|
684
|
+
source_id_column: str | None = None,
|
|
685
|
+
description: str | None = None,
|
|
686
|
+
label_names: list[str] | None = None,
|
|
687
|
+
max_seq_length_override: int | None = None,
|
|
688
|
+
prompt: str | None = None,
|
|
689
|
+
remove_duplicates: bool = True,
|
|
690
|
+
index_type: IndexType = "FLAT",
|
|
691
|
+
index_params: dict[str, Any] = {},
|
|
692
|
+
if_exists: CreateMode = "error",
|
|
693
|
+
background: Literal[True],
|
|
694
|
+
hidden: bool = False,
|
|
695
|
+
) -> Job[Self]:
|
|
696
|
+
pass
|
|
697
|
+
|
|
698
|
+
@overload
|
|
699
|
+
@classmethod
|
|
700
|
+
def create(
|
|
701
|
+
cls,
|
|
702
|
+
name: str,
|
|
703
|
+
datasource: Datasource,
|
|
704
|
+
*,
|
|
705
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
706
|
+
value_column: str = "value",
|
|
707
|
+
label_column: str | None = None,
|
|
708
|
+
score_column: str | None = None,
|
|
709
|
+
source_id_column: str | None = None,
|
|
710
|
+
description: str | None = None,
|
|
711
|
+
label_names: list[str] | None = None,
|
|
712
|
+
max_seq_length_override: int | None = None,
|
|
713
|
+
prompt: str | None = None,
|
|
714
|
+
remove_duplicates: bool = True,
|
|
715
|
+
index_type: IndexType = "FLAT",
|
|
716
|
+
index_params: dict[str, Any] = {},
|
|
717
|
+
if_exists: CreateMode = "error",
|
|
718
|
+
background: Literal[False] = False,
|
|
719
|
+
hidden: bool = False,
|
|
720
|
+
) -> Self:
|
|
721
|
+
pass
|
|
722
|
+
|
|
723
|
+
@classmethod
|
|
724
|
+
def create(
|
|
725
|
+
cls,
|
|
726
|
+
name: str,
|
|
727
|
+
datasource: Datasource,
|
|
728
|
+
*,
|
|
729
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
730
|
+
value_column: str = "value",
|
|
731
|
+
label_column: str | None = None,
|
|
732
|
+
score_column: str | None = None,
|
|
371
733
|
source_id_column: str | None = None,
|
|
734
|
+
description: str | None = None,
|
|
372
735
|
label_names: list[str] | None = None,
|
|
373
736
|
max_seq_length_override: int | None = None,
|
|
737
|
+
prompt: str | None = None,
|
|
738
|
+
remove_duplicates: bool = True,
|
|
739
|
+
index_type: IndexType = "FLAT",
|
|
740
|
+
index_params: dict[str, Any] = {},
|
|
374
741
|
if_exists: CreateMode = "error",
|
|
375
|
-
|
|
742
|
+
background: bool = False,
|
|
743
|
+
hidden: bool = False,
|
|
744
|
+
) -> Self | Job[Self]:
|
|
376
745
|
"""
|
|
377
746
|
Create a new memoryset in the OrcaCloud
|
|
378
747
|
|
|
@@ -387,8 +756,12 @@ class LabeledMemoryset:
|
|
|
387
756
|
value_column: Name of the column in the datasource that contains the memory values
|
|
388
757
|
label_column: Name of the column in the datasource that contains the memory labels,
|
|
389
758
|
these must be contiguous integers starting from 0
|
|
759
|
+
score_column: Name of the column in the datasource that contains the memory scores
|
|
390
760
|
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
391
761
|
the system of reference
|
|
762
|
+
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
763
|
+
so make sure it is concise and describes the contents of your memoryset not the
|
|
764
|
+
datasource or the embedding model.
|
|
392
765
|
label_names: List of human-readable names for the labels in the memoryset, must match
|
|
393
766
|
the number of labels in the `label_column`. Will be automatically inferred if a
|
|
394
767
|
[Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
|
|
@@ -396,8 +769,16 @@ class LabeledMemoryset:
|
|
|
396
769
|
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
397
770
|
value is longer than this it will be truncated, will default to the model's max
|
|
398
771
|
sequence length if not provided
|
|
772
|
+
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
773
|
+
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
774
|
+
into the memoryset
|
|
775
|
+
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
776
|
+
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
777
|
+
index_params: Parameters for the vector index, defaults to `{}`
|
|
399
778
|
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
400
779
|
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
780
|
+
background: Whether to run the operation none blocking and return a job handle
|
|
781
|
+
hidden: Whether the memoryset should be hidden
|
|
401
782
|
|
|
402
783
|
Returns:
|
|
403
784
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -407,9 +788,11 @@ class LabeledMemoryset:
|
|
|
407
788
|
`"open"` and the params do not match those of the existing memoryset.
|
|
408
789
|
"""
|
|
409
790
|
if embedding_model is None:
|
|
410
|
-
embedding_model = PretrainedEmbeddingModel.
|
|
791
|
+
embedding_model = PretrainedEmbeddingModel.GTE_BASE
|
|
792
|
+
|
|
793
|
+
if label_column is None and score_column is None:
|
|
794
|
+
raise ValueError("label_column or score_column must be provided")
|
|
411
795
|
|
|
412
|
-
logging.info(f"Checking if memoryset with name: {name} exists")
|
|
413
796
|
if cls.exists(name):
|
|
414
797
|
if if_exists == "error":
|
|
415
798
|
raise ValueError(f"Memoryset with name {name} already exists")
|
|
@@ -420,29 +803,47 @@ class LabeledMemoryset:
|
|
|
420
803
|
raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
|
|
421
804
|
return existing
|
|
422
805
|
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
)
|
|
441
|
-
|
|
442
|
-
|
|
806
|
+
payload: CreateMemorysetRequest = {
|
|
807
|
+
"name": name,
|
|
808
|
+
"description": description,
|
|
809
|
+
"datasource_name_or_id": datasource.id,
|
|
810
|
+
"datasource_label_column": label_column,
|
|
811
|
+
"datasource_score_column": score_column,
|
|
812
|
+
"datasource_value_column": value_column,
|
|
813
|
+
"datasource_source_id_column": source_id_column,
|
|
814
|
+
"label_names": label_names,
|
|
815
|
+
"max_seq_length_override": max_seq_length_override,
|
|
816
|
+
"remove_duplicates": remove_duplicates,
|
|
817
|
+
"index_type": index_type,
|
|
818
|
+
"index_params": index_params,
|
|
819
|
+
"hidden": hidden,
|
|
820
|
+
}
|
|
821
|
+
if prompt is not None:
|
|
822
|
+
payload["prompt"] = prompt
|
|
823
|
+
if isinstance(embedding_model, PretrainedEmbeddingModel):
|
|
824
|
+
payload["pretrained_embedding_model_name"] = embedding_model.name
|
|
825
|
+
elif isinstance(embedding_model, FinetunedEmbeddingModel):
|
|
826
|
+
payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
|
|
827
|
+
else:
|
|
828
|
+
raise ValueError("Invalid embedding model")
|
|
829
|
+
response = orca_api.POST("/memoryset", json=payload)
|
|
830
|
+
job = Job(response["insertion_task_id"], lambda: cls.open(response["id"]))
|
|
831
|
+
return job if background else job.result()
|
|
832
|
+
|
|
833
|
+
@overload
|
|
834
|
+
@classmethod
|
|
835
|
+
def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Self:
|
|
836
|
+
pass
|
|
443
837
|
|
|
838
|
+
@overload
|
|
444
839
|
@classmethod
|
|
445
|
-
def from_hf_dataset(cls, name: str, hf_dataset: Dataset, **kwargs: Any) ->
|
|
840
|
+
def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
|
|
841
|
+
pass
|
|
842
|
+
|
|
843
|
+
@classmethod
|
|
844
|
+
def from_hf_dataset(
|
|
845
|
+
cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
|
|
846
|
+
) -> Self | Job[Self]:
|
|
446
847
|
"""
|
|
447
848
|
Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
|
|
448
849
|
|
|
@@ -456,8 +857,7 @@ class LabeledMemoryset:
|
|
|
456
857
|
name: Name for the new memoryset (must be unique)
|
|
457
858
|
hf_dataset: Hugging Face dataset to create the memoryset from
|
|
458
859
|
kwargs: Additional parameters for creating the memoryset. See
|
|
459
|
-
[`create`][orca_sdk.
|
|
460
|
-
|
|
860
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
461
861
|
|
|
462
862
|
Returns:
|
|
463
863
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -465,9 +865,23 @@ class LabeledMemoryset:
|
|
|
465
865
|
datasource = Datasource.from_hf_dataset(
|
|
466
866
|
f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
|
|
467
867
|
)
|
|
468
|
-
|
|
868
|
+
kwargs["background"] = background
|
|
469
869
|
return cls.create(name, datasource, **kwargs)
|
|
470
870
|
|
|
871
|
+
@overload
|
|
872
|
+
@classmethod
|
|
873
|
+
def from_pytorch(
|
|
874
|
+
cls,
|
|
875
|
+
name: str,
|
|
876
|
+
torch_data: TorchDataLoader | TorchDataset,
|
|
877
|
+
*,
|
|
878
|
+
column_names: list[str] | None = None,
|
|
879
|
+
background: Literal[True],
|
|
880
|
+
**kwargs: Any,
|
|
881
|
+
) -> Job[Self]:
|
|
882
|
+
pass
|
|
883
|
+
|
|
884
|
+
@overload
|
|
471
885
|
@classmethod
|
|
472
886
|
def from_pytorch(
|
|
473
887
|
cls,
|
|
@@ -475,8 +889,21 @@ class LabeledMemoryset:
|
|
|
475
889
|
torch_data: TorchDataLoader | TorchDataset,
|
|
476
890
|
*,
|
|
477
891
|
column_names: list[str] | None = None,
|
|
892
|
+
background: Literal[False] = False,
|
|
478
893
|
**kwargs: Any,
|
|
479
|
-
) ->
|
|
894
|
+
) -> Self:
|
|
895
|
+
pass
|
|
896
|
+
|
|
897
|
+
@classmethod
|
|
898
|
+
def from_pytorch(
|
|
899
|
+
cls,
|
|
900
|
+
name: str,
|
|
901
|
+
torch_data: TorchDataLoader | TorchDataset,
|
|
902
|
+
*,
|
|
903
|
+
column_names: list[str] | None = None,
|
|
904
|
+
background: bool = False,
|
|
905
|
+
**kwargs: Any,
|
|
906
|
+
) -> Self | Job[Self]:
|
|
480
907
|
"""
|
|
481
908
|
Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
|
|
482
909
|
[`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
|
|
@@ -492,9 +919,9 @@ class LabeledMemoryset:
|
|
|
492
919
|
torch_data: PyTorch data loader or dataset to create the memoryset from
|
|
493
920
|
column_names: If the provided dataset or data loader returns unnamed tuples, this
|
|
494
921
|
argument must be provided to specify the names of the columns.
|
|
922
|
+
background: Whether to run the operation in the background
|
|
495
923
|
kwargs: Additional parameters for creating the memoryset. See
|
|
496
|
-
[`create`][orca_sdk.
|
|
497
|
-
|
|
924
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
498
925
|
|
|
499
926
|
Returns:
|
|
500
927
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -502,10 +929,42 @@ class LabeledMemoryset:
|
|
|
502
929
|
datasource = Datasource.from_pytorch(
|
|
503
930
|
f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
|
|
504
931
|
)
|
|
932
|
+
kwargs["background"] = background
|
|
505
933
|
return cls.create(name, datasource, **kwargs)
|
|
506
934
|
|
|
935
|
+
@overload
|
|
936
|
+
@classmethod
|
|
937
|
+
def from_list(
|
|
938
|
+
cls,
|
|
939
|
+
name: str,
|
|
940
|
+
data: list[dict],
|
|
941
|
+
*,
|
|
942
|
+
background: Literal[True],
|
|
943
|
+
**kwargs: Any,
|
|
944
|
+
) -> Job[Self]:
|
|
945
|
+
pass
|
|
946
|
+
|
|
947
|
+
@overload
|
|
507
948
|
@classmethod
|
|
508
|
-
def from_list(
|
|
949
|
+
def from_list(
|
|
950
|
+
cls,
|
|
951
|
+
name: str,
|
|
952
|
+
data: list[dict],
|
|
953
|
+
*,
|
|
954
|
+
background: Literal[False] = False,
|
|
955
|
+
**kwargs: Any,
|
|
956
|
+
) -> Self:
|
|
957
|
+
pass
|
|
958
|
+
|
|
959
|
+
@classmethod
|
|
960
|
+
def from_list(
|
|
961
|
+
cls,
|
|
962
|
+
name: str,
|
|
963
|
+
data: list[dict],
|
|
964
|
+
*,
|
|
965
|
+
background: bool = False,
|
|
966
|
+
**kwargs: Any,
|
|
967
|
+
) -> Self | Job[Self]:
|
|
509
968
|
"""
|
|
510
969
|
Create a new memoryset from a list of dictionaries in the OrcaCloud
|
|
511
970
|
|
|
@@ -518,8 +977,9 @@ class LabeledMemoryset:
|
|
|
518
977
|
Params:
|
|
519
978
|
name: Name for the new memoryset (must be unique)
|
|
520
979
|
data: List of dictionaries to create the memoryset from
|
|
980
|
+
background: Whether to run the operation in the background
|
|
521
981
|
kwargs: Additional parameters for creating the memoryset. See
|
|
522
|
-
[`create`][orca_sdk.
|
|
982
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
523
983
|
|
|
524
984
|
Returns:
|
|
525
985
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -531,10 +991,42 @@ class LabeledMemoryset:
|
|
|
531
991
|
... ])
|
|
532
992
|
"""
|
|
533
993
|
datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
|
|
994
|
+
kwargs["background"] = background
|
|
534
995
|
return cls.create(name, datasource, **kwargs)
|
|
535
996
|
|
|
997
|
+
@overload
|
|
998
|
+
@classmethod
|
|
999
|
+
def from_dict(
|
|
1000
|
+
cls,
|
|
1001
|
+
name: str,
|
|
1002
|
+
data: dict,
|
|
1003
|
+
*,
|
|
1004
|
+
background: Literal[True],
|
|
1005
|
+
**kwargs: Any,
|
|
1006
|
+
) -> Job[Self]:
|
|
1007
|
+
pass
|
|
1008
|
+
|
|
1009
|
+
@overload
|
|
536
1010
|
@classmethod
|
|
537
|
-
def from_dict(
|
|
1011
|
+
def from_dict(
|
|
1012
|
+
cls,
|
|
1013
|
+
name: str,
|
|
1014
|
+
data: dict,
|
|
1015
|
+
*,
|
|
1016
|
+
background: Literal[False] = False,
|
|
1017
|
+
**kwargs: Any,
|
|
1018
|
+
) -> Self:
|
|
1019
|
+
pass
|
|
1020
|
+
|
|
1021
|
+
@classmethod
|
|
1022
|
+
def from_dict(
|
|
1023
|
+
cls,
|
|
1024
|
+
name: str,
|
|
1025
|
+
data: dict,
|
|
1026
|
+
*,
|
|
1027
|
+
background: bool = False,
|
|
1028
|
+
**kwargs: Any,
|
|
1029
|
+
) -> Self | Job[Self]:
|
|
538
1030
|
"""
|
|
539
1031
|
Create a new memoryset from a dictionary of columns in the OrcaCloud
|
|
540
1032
|
|
|
@@ -547,8 +1039,9 @@ class LabeledMemoryset:
|
|
|
547
1039
|
Params:
|
|
548
1040
|
name: Name for the new memoryset (must be unique)
|
|
549
1041
|
data: Dictionary of columns to create the memoryset from
|
|
1042
|
+
background: Whether to run the operation in the background
|
|
550
1043
|
kwargs: Additional parameters for creating the memoryset. See
|
|
551
|
-
[`create`][orca_sdk.
|
|
1044
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
552
1045
|
|
|
553
1046
|
Returns:
|
|
554
1047
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -561,10 +1054,42 @@ class LabeledMemoryset:
|
|
|
561
1054
|
... })
|
|
562
1055
|
"""
|
|
563
1056
|
datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
|
|
1057
|
+
kwargs["background"] = background
|
|
564
1058
|
return cls.create(name, datasource, **kwargs)
|
|
565
1059
|
|
|
1060
|
+
@overload
|
|
1061
|
+
@classmethod
|
|
1062
|
+
def from_pandas(
|
|
1063
|
+
cls,
|
|
1064
|
+
name: str,
|
|
1065
|
+
dataframe: pd.DataFrame,
|
|
1066
|
+
*,
|
|
1067
|
+
background: Literal[True],
|
|
1068
|
+
**kwargs: Any,
|
|
1069
|
+
) -> Job[Self]:
|
|
1070
|
+
pass
|
|
1071
|
+
|
|
1072
|
+
@overload
|
|
566
1073
|
@classmethod
|
|
567
|
-
def from_pandas(
|
|
1074
|
+
def from_pandas(
|
|
1075
|
+
cls,
|
|
1076
|
+
name: str,
|
|
1077
|
+
dataframe: pd.DataFrame,
|
|
1078
|
+
*,
|
|
1079
|
+
background: Literal[False] = False,
|
|
1080
|
+
**kwargs: Any,
|
|
1081
|
+
) -> Self:
|
|
1082
|
+
pass
|
|
1083
|
+
|
|
1084
|
+
@classmethod
|
|
1085
|
+
def from_pandas(
|
|
1086
|
+
cls,
|
|
1087
|
+
name: str,
|
|
1088
|
+
dataframe: pd.DataFrame,
|
|
1089
|
+
*,
|
|
1090
|
+
background: bool = False,
|
|
1091
|
+
**kwargs: Any,
|
|
1092
|
+
) -> Self | Job[Self]:
|
|
568
1093
|
"""
|
|
569
1094
|
Create a new memoryset from a pandas [`DataFrame`][pandas.DataFrame] in the OrcaCloud
|
|
570
1095
|
|
|
@@ -577,17 +1102,50 @@ class LabeledMemoryset:
|
|
|
577
1102
|
Params:
|
|
578
1103
|
name: Name for the new memoryset (must be unique)
|
|
579
1104
|
dataframe: Dataframe to create the memoryset from
|
|
1105
|
+
background: Whether to run the operation in the background
|
|
580
1106
|
kwargs: Additional parameters for creating the memoryset. See
|
|
581
|
-
[`create`][orca_sdk.
|
|
1107
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
582
1108
|
|
|
583
1109
|
Returns:
|
|
584
1110
|
Handle to the new memoryset in the OrcaCloud
|
|
585
1111
|
"""
|
|
586
1112
|
datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
|
|
1113
|
+
kwargs["background"] = background
|
|
587
1114
|
return cls.create(name, datasource, **kwargs)
|
|
588
1115
|
|
|
1116
|
+
@overload
|
|
1117
|
+
@classmethod
|
|
1118
|
+
def from_arrow(
|
|
1119
|
+
cls,
|
|
1120
|
+
name: str,
|
|
1121
|
+
pyarrow_table: pa.Table,
|
|
1122
|
+
*,
|
|
1123
|
+
background: Literal[True],
|
|
1124
|
+
**kwargs: Any,
|
|
1125
|
+
) -> Job[Self]:
|
|
1126
|
+
pass
|
|
1127
|
+
|
|
1128
|
+
@overload
|
|
1129
|
+
@classmethod
|
|
1130
|
+
def from_arrow(
|
|
1131
|
+
cls,
|
|
1132
|
+
name: str,
|
|
1133
|
+
pyarrow_table: pa.Table,
|
|
1134
|
+
*,
|
|
1135
|
+
background: Literal[False] = False,
|
|
1136
|
+
**kwargs: Any,
|
|
1137
|
+
) -> Self:
|
|
1138
|
+
pass
|
|
1139
|
+
|
|
589
1140
|
@classmethod
|
|
590
|
-
def from_arrow(
|
|
1141
|
+
def from_arrow(
|
|
1142
|
+
cls,
|
|
1143
|
+
name: str,
|
|
1144
|
+
pyarrow_table: pa.Table,
|
|
1145
|
+
*,
|
|
1146
|
+
background: bool = False,
|
|
1147
|
+
**kwargs: Any,
|
|
1148
|
+
) -> Self | Job[Self]:
|
|
591
1149
|
"""
|
|
592
1150
|
Create a new memoryset from a PyArrow [`Table`][pyarrow.Table] in the OrcaCloud
|
|
593
1151
|
|
|
@@ -600,8 +1158,9 @@ class LabeledMemoryset:
|
|
|
600
1158
|
Params:
|
|
601
1159
|
name: Name for the new memoryset (must be unique)
|
|
602
1160
|
pyarrow_table: PyArrow table to create the memoryset from
|
|
1161
|
+
background: Whether to run the operation in the background
|
|
603
1162
|
kwargs: Additional parameters for creating the memoryset. See
|
|
604
|
-
[`create`][orca_sdk.
|
|
1163
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
605
1164
|
|
|
606
1165
|
Returns:
|
|
607
1166
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -609,10 +1168,42 @@ class LabeledMemoryset:
|
|
|
609
1168
|
datasource = Datasource.from_arrow(
|
|
610
1169
|
f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
|
|
611
1170
|
)
|
|
1171
|
+
kwargs["background"] = background
|
|
612
1172
|
return cls.create(name, datasource, **kwargs)
|
|
613
1173
|
|
|
1174
|
+
@overload
|
|
1175
|
+
@classmethod
|
|
1176
|
+
def from_disk(
|
|
1177
|
+
cls,
|
|
1178
|
+
name: str,
|
|
1179
|
+
file_path: str | PathLike,
|
|
1180
|
+
*,
|
|
1181
|
+
background: Literal[True],
|
|
1182
|
+
**kwargs: Any,
|
|
1183
|
+
) -> Job[Self]:
|
|
1184
|
+
pass
|
|
1185
|
+
|
|
1186
|
+
@overload
|
|
1187
|
+
@classmethod
|
|
1188
|
+
def from_disk(
|
|
1189
|
+
cls,
|
|
1190
|
+
name: str,
|
|
1191
|
+
file_path: str | PathLike,
|
|
1192
|
+
*,
|
|
1193
|
+
background: Literal[False] = False,
|
|
1194
|
+
**kwargs: Any,
|
|
1195
|
+
) -> Self:
|
|
1196
|
+
pass
|
|
1197
|
+
|
|
614
1198
|
@classmethod
|
|
615
|
-
def from_disk(
|
|
1199
|
+
def from_disk(
|
|
1200
|
+
cls,
|
|
1201
|
+
name: str,
|
|
1202
|
+
file_path: str | PathLike,
|
|
1203
|
+
*,
|
|
1204
|
+
background: bool = False,
|
|
1205
|
+
**kwargs: Any,
|
|
1206
|
+
) -> Self | Job[Self]:
|
|
616
1207
|
"""
|
|
617
1208
|
Create a new memoryset from a file on disk in the OrcaCloud
|
|
618
1209
|
|
|
@@ -632,17 +1223,19 @@ class LabeledMemoryset:
|
|
|
632
1223
|
- .csv: [`CSV`][csv] files
|
|
633
1224
|
- .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
|
|
634
1225
|
- dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
|
|
1226
|
+
background: Whether to run the operation in the background
|
|
635
1227
|
kwargs: Additional parameters for creating the memoryset. See
|
|
636
|
-
[`create`][orca_sdk.
|
|
1228
|
+
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
637
1229
|
|
|
638
1230
|
Returns:
|
|
639
1231
|
Handle to the new memoryset in the OrcaCloud
|
|
640
1232
|
"""
|
|
641
1233
|
datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
|
|
1234
|
+
kwargs["background"] = background
|
|
642
1235
|
return cls.create(name, datasource, **kwargs)
|
|
643
1236
|
|
|
644
1237
|
@classmethod
|
|
645
|
-
def open(cls, name: str) ->
|
|
1238
|
+
def open(cls, name: str) -> Self:
|
|
646
1239
|
"""
|
|
647
1240
|
Get a handle to a memoryset in the OrcaCloud
|
|
648
1241
|
|
|
@@ -655,7 +1248,7 @@ class LabeledMemoryset:
|
|
|
655
1248
|
Raises:
|
|
656
1249
|
LookupError: If the memoryset does not exist
|
|
657
1250
|
"""
|
|
658
|
-
metadata =
|
|
1251
|
+
metadata = orca_api.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
|
|
659
1252
|
return cls(metadata)
|
|
660
1253
|
|
|
661
1254
|
@classmethod
|
|
@@ -676,14 +1269,20 @@ class LabeledMemoryset:
|
|
|
676
1269
|
return False
|
|
677
1270
|
|
|
678
1271
|
@classmethod
|
|
679
|
-
def all(cls) -> list[
|
|
1272
|
+
def all(cls, show_hidden: bool = False) -> list[Self]:
|
|
680
1273
|
"""
|
|
681
1274
|
Get a list of handles to all memorysets in the OrcaCloud
|
|
682
1275
|
|
|
1276
|
+
Params:
|
|
1277
|
+
show_hidden: Whether to include hidden memorysets in results, defaults to `False`
|
|
1278
|
+
|
|
683
1279
|
Returns:
|
|
684
1280
|
List of handles to all memorysets in the OrcaCloud
|
|
685
1281
|
"""
|
|
686
|
-
return [
|
|
1282
|
+
return [
|
|
1283
|
+
cls(metadata)
|
|
1284
|
+
for metadata in orca_api.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
|
|
1285
|
+
]
|
|
687
1286
|
|
|
688
1287
|
@classmethod
|
|
689
1288
|
def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
|
|
@@ -699,29 +1298,92 @@ class LabeledMemoryset:
|
|
|
699
1298
|
LookupError: If the memoryset does not exist and if_not_exists is `"error"`
|
|
700
1299
|
"""
|
|
701
1300
|
try:
|
|
702
|
-
|
|
1301
|
+
orca_api.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
|
|
703
1302
|
logging.info(f"Deleted memoryset {name_or_id}")
|
|
704
1303
|
except LookupError:
|
|
705
1304
|
if if_not_exists == "error":
|
|
706
1305
|
raise
|
|
707
1306
|
|
|
1307
|
+
def set(
|
|
1308
|
+
self,
|
|
1309
|
+
*,
|
|
1310
|
+
name: str = UNSET,
|
|
1311
|
+
description: str | None = UNSET,
|
|
1312
|
+
label_names: list[str] = UNSET,
|
|
1313
|
+
hidden: bool = UNSET,
|
|
1314
|
+
):
|
|
1315
|
+
"""
|
|
1316
|
+
Update editable attributes of the memoryset
|
|
1317
|
+
|
|
1318
|
+
Note:
|
|
1319
|
+
If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
|
|
1320
|
+
|
|
1321
|
+
Params:
|
|
1322
|
+
description: Value to set for the description
|
|
1323
|
+
name: Value to set for the name
|
|
1324
|
+
label_names: Value to replace existing label names with
|
|
1325
|
+
"""
|
|
1326
|
+
payload: MemorysetUpdate = {}
|
|
1327
|
+
if name is not UNSET:
|
|
1328
|
+
payload["name"] = name
|
|
1329
|
+
if description is not UNSET:
|
|
1330
|
+
payload["description"] = description
|
|
1331
|
+
if label_names is not UNSET:
|
|
1332
|
+
payload["label_names"] = label_names
|
|
1333
|
+
if hidden is not UNSET:
|
|
1334
|
+
payload["hidden"] = hidden
|
|
1335
|
+
|
|
1336
|
+
orca_api.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
|
|
1337
|
+
self.refresh()
|
|
1338
|
+
|
|
1339
|
+
@overload
|
|
1340
|
+
def clone(
|
|
1341
|
+
self,
|
|
1342
|
+
name: str,
|
|
1343
|
+
*,
|
|
1344
|
+
embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
|
|
1345
|
+
max_seq_length_override: int | None = None,
|
|
1346
|
+
prompt: str | None = None,
|
|
1347
|
+
if_exists: CreateMode = "error",
|
|
1348
|
+
background: Literal[True],
|
|
1349
|
+
) -> Job[Self]:
|
|
1350
|
+
pass
|
|
1351
|
+
|
|
1352
|
+
@overload
|
|
708
1353
|
def clone(
|
|
709
1354
|
self,
|
|
710
1355
|
name: str,
|
|
711
1356
|
*,
|
|
712
1357
|
embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
|
|
713
1358
|
max_seq_length_override: int | None = None,
|
|
1359
|
+
prompt: str | None = None,
|
|
1360
|
+
if_exists: CreateMode = "error",
|
|
1361
|
+
background: Literal[False] = False,
|
|
1362
|
+
) -> Self:
|
|
1363
|
+
pass
|
|
1364
|
+
|
|
1365
|
+
def clone(
|
|
1366
|
+
self,
|
|
1367
|
+
name: str,
|
|
1368
|
+
*,
|
|
1369
|
+
embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None = None,
|
|
1370
|
+
max_seq_length_override: int | None = UNSET,
|
|
1371
|
+
prompt: str | None = None,
|
|
714
1372
|
if_exists: CreateMode = "error",
|
|
715
|
-
|
|
1373
|
+
background: bool = False,
|
|
1374
|
+
) -> Self | Job[Self]:
|
|
716
1375
|
"""
|
|
717
1376
|
Create a clone of the memoryset with a new name
|
|
718
1377
|
|
|
719
1378
|
Params:
|
|
720
1379
|
name: Name for the new memoryset (must be unique)
|
|
721
1380
|
embedding_model: Optional new embedding model to use for re-embedding the memory values
|
|
722
|
-
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
723
1381
|
value is longer than this it will be truncated, will default to the model's max
|
|
724
1382
|
sequence length if not provided
|
|
1383
|
+
max_seq_length_override: Optional custom max sequence length to use for the cloned memoryset.
|
|
1384
|
+
If not provided, will use the source memoryset's max sequence length.
|
|
1385
|
+
prompt: Optional custom prompt to use for the cloned memoryset.
|
|
1386
|
+
If not provided, will use the source memoryset's prompt.
|
|
725
1387
|
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
726
1388
|
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
727
1389
|
|
|
@@ -736,6 +1398,13 @@ class LabeledMemoryset:
|
|
|
736
1398
|
>>> new_memoryset = memoryset.clone(
|
|
737
1399
|
... "my_memoryset_finetuned", embedding_model=finetuned_embedding_model,
|
|
738
1400
|
... )
|
|
1401
|
+
|
|
1402
|
+
>>> # Clone with custom prompts
|
|
1403
|
+
>>> new_memoryset = memoryset.clone(
|
|
1404
|
+
... "my_memoryset_with_prompts",
|
|
1405
|
+
... document_prompt_override="Represent this document for retrieval:",
|
|
1406
|
+
... query_prompt_override="Represent this query for retrieval:",
|
|
1407
|
+
... )
|
|
739
1408
|
"""
|
|
740
1409
|
if self.exists(name):
|
|
741
1410
|
if if_exists == "error":
|
|
@@ -746,22 +1415,22 @@ class LabeledMemoryset:
|
|
|
746
1415
|
if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
|
|
747
1416
|
raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
|
|
748
1417
|
return existing
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
1418
|
+
payload: CloneMemorysetRequest = {"name": name}
|
|
1419
|
+
if max_seq_length_override is not UNSET:
|
|
1420
|
+
payload["max_seq_length_override"] = max_seq_length_override
|
|
1421
|
+
if prompt is not None:
|
|
1422
|
+
payload["prompt"] = prompt
|
|
1423
|
+
if isinstance(embedding_model, PretrainedEmbeddingModel):
|
|
1424
|
+
payload["pretrained_embedding_model_name"] = embedding_model.name
|
|
1425
|
+
elif isinstance(embedding_model, FinetunedEmbeddingModel):
|
|
1426
|
+
payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
|
|
1427
|
+
|
|
1428
|
+
metadata = orca_api.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
|
|
1429
|
+
job = Job(
|
|
1430
|
+
metadata["insertion_task_id"],
|
|
1431
|
+
lambda: self.open(metadata["id"]),
|
|
762
1432
|
)
|
|
763
|
-
|
|
764
|
-
return LabeledMemoryset.open(metadata.id)
|
|
1433
|
+
return job if background else job.result()
|
|
765
1434
|
|
|
766
1435
|
def refresh(self, throttle: float = 0):
|
|
767
1436
|
"""
|
|
@@ -775,7 +1444,7 @@ class LabeledMemoryset:
|
|
|
775
1444
|
if (current_time - self._last_refresh) < timedelta(seconds=throttle):
|
|
776
1445
|
return
|
|
777
1446
|
|
|
778
|
-
self.__dict__.update(
|
|
1447
|
+
self.__dict__.update(self.open(self.id).__dict__)
|
|
779
1448
|
self._last_refresh = current_time
|
|
780
1449
|
|
|
781
1450
|
def __len__(self) -> int:
|
|
@@ -784,14 +1453,14 @@ class LabeledMemoryset:
|
|
|
784
1453
|
return self.length
|
|
785
1454
|
|
|
786
1455
|
@overload
|
|
787
|
-
def __getitem__(self, index: int | str) ->
|
|
1456
|
+
def __getitem__(self, index: int | str) -> MemoryT:
|
|
788
1457
|
pass
|
|
789
1458
|
|
|
790
1459
|
@overload
|
|
791
|
-
def __getitem__(self, index: slice) -> list[
|
|
1460
|
+
def __getitem__(self, index: slice) -> list[MemoryT]:
|
|
792
1461
|
pass
|
|
793
1462
|
|
|
794
|
-
def __getitem__(self, index: int | slice | str) ->
|
|
1463
|
+
def __getitem__(self, index: int | slice | str) -> MemoryT | list[MemoryT]:
|
|
795
1464
|
"""
|
|
796
1465
|
Get memories from the memoryset by index or memory id
|
|
797
1466
|
|
|
@@ -837,22 +1506,24 @@ class LabeledMemoryset:
|
|
|
837
1506
|
raise ValueError(f"Invalid index type: {type(index)}")
|
|
838
1507
|
|
|
839
1508
|
@overload
|
|
840
|
-
def search(self, query: str, *, count: int = 1) -> list[
|
|
1509
|
+
def search(self, query: str, *, count: int = 1, prompt: str | None = None) -> list[MemoryLookupT]:
|
|
841
1510
|
pass
|
|
842
1511
|
|
|
843
1512
|
@overload
|
|
844
|
-
def search(self, query: list[str], *, count: int = 1) -> list[list[
|
|
1513
|
+
def search(self, query: list[str], *, count: int = 1, prompt: str | None = None) -> list[list[MemoryLookupT]]:
|
|
845
1514
|
pass
|
|
846
1515
|
|
|
847
1516
|
def search(
|
|
848
|
-
self, query: str | list[str], *, count: int = 1
|
|
849
|
-
) -> list[
|
|
1517
|
+
self, query: str | list[str], *, count: int = 1, prompt: str | None = None
|
|
1518
|
+
) -> list[MemoryLookupT] | list[list[MemoryLookupT]]:
|
|
850
1519
|
"""
|
|
851
1520
|
Search for memories that are semantically similar to the query
|
|
852
1521
|
|
|
853
1522
|
Params:
|
|
854
1523
|
query: Query to lookup memories in the memoryset, can be a single query or a list
|
|
855
1524
|
count: Number of memories to return for each query
|
|
1525
|
+
prompt: Optional prompt for query embedding during search.
|
|
1526
|
+
If not provided, the memoryset's default query prompt will be used if available.
|
|
856
1527
|
|
|
857
1528
|
Returns:
|
|
858
1529
|
List of memories from the memoryset that match the query. If a single query is provided,
|
|
@@ -867,6 +1538,13 @@ class LabeledMemoryset:
|
|
|
867
1538
|
LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
|
|
868
1539
|
]
|
|
869
1540
|
|
|
1541
|
+
Search with custom query prompt for instruction-following models:
|
|
1542
|
+
>>> memoryset.search("I am happy", count=2, query_prompt="Represent this query for sentiment retrieval:")
|
|
1543
|
+
[
|
|
1544
|
+
LabeledMemoryLookup({ label: <positive: 1>, value: 'I am happy' }),
|
|
1545
|
+
LabeledMemoryLookup({ label: <positive: 1>, value: 'I am content' }),
|
|
1546
|
+
]
|
|
1547
|
+
|
|
870
1548
|
Search for similar memories for multiple queries:
|
|
871
1549
|
>>> memoryset.search(["I am happy", "I am sad"], count=1)
|
|
872
1550
|
[
|
|
@@ -878,14 +1556,29 @@ class LabeledMemoryset:
|
|
|
878
1556
|
],
|
|
879
1557
|
]
|
|
880
1558
|
"""
|
|
881
|
-
response =
|
|
882
|
-
name_or_id
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
1559
|
+
response = orca_api.POST(
|
|
1560
|
+
"/gpu/memoryset/{name_or_id}/lookup",
|
|
1561
|
+
params={"name_or_id": self.id},
|
|
1562
|
+
json={
|
|
1563
|
+
"query": query if isinstance(query, list) else [query],
|
|
1564
|
+
"count": count,
|
|
1565
|
+
"prompt": prompt,
|
|
1566
|
+
},
|
|
887
1567
|
)
|
|
888
|
-
lookups = [
|
|
1568
|
+
lookups = [
|
|
1569
|
+
[
|
|
1570
|
+
cast(
|
|
1571
|
+
MemoryLookupT,
|
|
1572
|
+
(
|
|
1573
|
+
LabeledMemoryLookup(self.id, lookup_response)
|
|
1574
|
+
if "label" in lookup_response
|
|
1575
|
+
else ScoredMemoryLookup(self.id, lookup_response)
|
|
1576
|
+
),
|
|
1577
|
+
)
|
|
1578
|
+
for lookup_response in batch
|
|
1579
|
+
]
|
|
1580
|
+
for batch in response
|
|
1581
|
+
]
|
|
889
1582
|
return lookups if isinstance(query, list) else lookups[0]
|
|
890
1583
|
|
|
891
1584
|
def query(
|
|
@@ -893,7 +1586,9 @@ class LabeledMemoryset:
|
|
|
893
1586
|
offset: int = 0,
|
|
894
1587
|
limit: int = 100,
|
|
895
1588
|
filters: list[FilterItemTuple] = [],
|
|
896
|
-
|
|
1589
|
+
with_feedback_metrics: bool = False,
|
|
1590
|
+
sort: list[TelemetrySortItem] | None = None,
|
|
1591
|
+
) -> list[MemoryT]:
|
|
897
1592
|
"""
|
|
898
1593
|
Query the memoryset for memories that match the filters
|
|
899
1594
|
|
|
@@ -901,6 +1596,7 @@ class LabeledMemoryset:
|
|
|
901
1596
|
offset: The offset of the first memory to return
|
|
902
1597
|
limit: The maximum number of memories to return
|
|
903
1598
|
filters: List of filters to apply to the query.
|
|
1599
|
+
with_feedback_metrics: Whether to include feedback metrics in the response
|
|
904
1600
|
|
|
905
1601
|
Returns:
|
|
906
1602
|
List of memories from the memoryset that match the filters
|
|
@@ -912,21 +1608,76 @@ class LabeledMemoryset:
|
|
|
912
1608
|
LabeledMemory({ label: <negative: 0>, value: "I am sad" }),
|
|
913
1609
|
]
|
|
914
1610
|
"""
|
|
1611
|
+
parsed_filters = [
|
|
1612
|
+
_parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
|
|
1613
|
+
]
|
|
1614
|
+
|
|
1615
|
+
if with_feedback_metrics:
|
|
1616
|
+
response = orca_api.POST(
|
|
1617
|
+
"/telemetry/memories",
|
|
1618
|
+
json={
|
|
1619
|
+
"memoryset_id": self.id,
|
|
1620
|
+
"offset": offset,
|
|
1621
|
+
"limit": limit,
|
|
1622
|
+
"filters": parsed_filters,
|
|
1623
|
+
"sort": [_parse_sort_item_from_tuple(item) for item in sort] if sort else None,
|
|
1624
|
+
},
|
|
1625
|
+
)
|
|
1626
|
+
return [
|
|
1627
|
+
cast(
|
|
1628
|
+
MemoryT,
|
|
1629
|
+
(LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
|
|
1630
|
+
)
|
|
1631
|
+
for memory in response["items"]
|
|
1632
|
+
]
|
|
1633
|
+
|
|
1634
|
+
if any(_is_metric_column(filter[0]) for filter in filters):
|
|
1635
|
+
raise ValueError("Feedback metrics are only supported when the with_feedback_metrics flag is set to True")
|
|
1636
|
+
|
|
1637
|
+
if sort:
|
|
1638
|
+
logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")
|
|
1639
|
+
|
|
1640
|
+
response = orca_api.POST(
|
|
1641
|
+
"/memoryset/{name_or_id}/memories",
|
|
1642
|
+
params={"name_or_id": self.id},
|
|
1643
|
+
json={
|
|
1644
|
+
"offset": offset,
|
|
1645
|
+
"limit": limit,
|
|
1646
|
+
"filters": cast(list[FilterItem], parsed_filters),
|
|
1647
|
+
},
|
|
1648
|
+
)
|
|
915
1649
|
return [
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
self.id,
|
|
919
|
-
body=ListMemoriesRequest(
|
|
920
|
-
offset=offset,
|
|
921
|
-
limit=limit,
|
|
922
|
-
filters=[
|
|
923
|
-
_parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter
|
|
924
|
-
for filter in filters
|
|
925
|
-
],
|
|
926
|
-
),
|
|
1650
|
+
cast(
|
|
1651
|
+
MemoryT,
|
|
1652
|
+
(LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
|
|
927
1653
|
)
|
|
1654
|
+
for memory in response
|
|
928
1655
|
]
|
|
929
1656
|
|
|
1657
|
+
def to_pandas(
|
|
1658
|
+
self,
|
|
1659
|
+
offset: int = 0,
|
|
1660
|
+
limit: int = 100,
|
|
1661
|
+
filters: list[FilterItemTuple] = [],
|
|
1662
|
+
with_feedback_metrics: bool = False,
|
|
1663
|
+
sort: list[TelemetrySortItem] | None = None,
|
|
1664
|
+
) -> pd.DataFrame:
|
|
1665
|
+
"""
|
|
1666
|
+
Convert the memoryset to a pandas DataFrame
|
|
1667
|
+
"""
|
|
1668
|
+
return pd.DataFrame(
|
|
1669
|
+
[
|
|
1670
|
+
memory.to_dict()
|
|
1671
|
+
for memory in self.query(
|
|
1672
|
+
offset=offset,
|
|
1673
|
+
limit=limit,
|
|
1674
|
+
filters=filters,
|
|
1675
|
+
with_feedback_metrics=with_feedback_metrics,
|
|
1676
|
+
sort=sort,
|
|
1677
|
+
)
|
|
1678
|
+
]
|
|
1679
|
+
)
|
|
1680
|
+
|
|
930
1681
|
def insert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
|
|
931
1682
|
"""
|
|
932
1683
|
Insert memories into the memoryset
|
|
@@ -937,6 +1688,7 @@ class LabeledMemoryset:
|
|
|
937
1688
|
|
|
938
1689
|
- `value`: Value of the memory
|
|
939
1690
|
- `label`: Label of the memory
|
|
1691
|
+
- `score`: Score of the memory
|
|
940
1692
|
- `source_id`: Optional unique ID of the memory in a system of reference
|
|
941
1693
|
- `...`: Any other metadata to store for the memory
|
|
942
1694
|
|
|
@@ -946,26 +1698,28 @@ class LabeledMemoryset:
|
|
|
946
1698
|
... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
|
|
947
1699
|
... ])
|
|
948
1700
|
"""
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
1701
|
+
orca_api.POST(
|
|
1702
|
+
"/gpu/memoryset/{name_or_id}/memory",
|
|
1703
|
+
params={"name_or_id": self.id},
|
|
1704
|
+
json=cast(
|
|
1705
|
+
list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
|
|
952
1706
|
[
|
|
953
|
-
_parse_memory_insert(memory)
|
|
1707
|
+
_parse_memory_insert(memory, type=self.memory_type)
|
|
954
1708
|
for memory in (cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else items)
|
|
955
|
-
]
|
|
1709
|
+
],
|
|
956
1710
|
),
|
|
957
1711
|
)
|
|
958
1712
|
self.refresh()
|
|
959
1713
|
|
|
960
1714
|
@overload
|
|
961
|
-
def get(self, memory_id: str) ->
|
|
1715
|
+
def get(self, memory_id: str) -> MemoryT: # type: ignore -- this takes precedence
|
|
962
1716
|
pass
|
|
963
1717
|
|
|
964
1718
|
@overload
|
|
965
|
-
def get(self, memory_id: Iterable[str]) -> list[
|
|
1719
|
+
def get(self, memory_id: Iterable[str]) -> list[MemoryT]:
|
|
966
1720
|
pass
|
|
967
1721
|
|
|
968
|
-
def get(self, memory_id: str | Iterable[str]) ->
|
|
1722
|
+
def get(self, memory_id: str | Iterable[str]) -> MemoryT | list[MemoryT]:
|
|
969
1723
|
"""
|
|
970
1724
|
Fetch a memory or memories from the memoryset
|
|
971
1725
|
|
|
@@ -994,22 +1748,36 @@ class LabeledMemoryset:
|
|
|
994
1748
|
]
|
|
995
1749
|
"""
|
|
996
1750
|
if isinstance(memory_id, str):
|
|
997
|
-
|
|
1751
|
+
response = orca_api.GET(
|
|
1752
|
+
"/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
|
|
1753
|
+
)
|
|
1754
|
+
return cast(
|
|
1755
|
+
MemoryT,
|
|
1756
|
+
(LabeledMemory(self.id, response) if "label" in response else ScoredMemory(self.id, response)),
|
|
1757
|
+
)
|
|
998
1758
|
else:
|
|
1759
|
+
response = orca_api.POST(
|
|
1760
|
+
"/memoryset/{name_or_id}/memories/get",
|
|
1761
|
+
params={"name_or_id": self.id},
|
|
1762
|
+
json={"memory_ids": list(memory_id)},
|
|
1763
|
+
)
|
|
999
1764
|
return [
|
|
1000
|
-
|
|
1001
|
-
|
|
1765
|
+
cast(
|
|
1766
|
+
MemoryT,
|
|
1767
|
+
(LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
|
|
1768
|
+
)
|
|
1769
|
+
for memory in response
|
|
1002
1770
|
]
|
|
1003
1771
|
|
|
1004
1772
|
@overload
|
|
1005
|
-
def update(self, updates: dict[str, Any]) ->
|
|
1773
|
+
def update(self, updates: dict[str, Any]) -> MemoryT:
|
|
1006
1774
|
pass
|
|
1007
1775
|
|
|
1008
1776
|
@overload
|
|
1009
|
-
def update(self, updates: Iterable[dict[str, Any]]) -> list[
|
|
1777
|
+
def update(self, updates: Iterable[dict[str, Any]]) -> list[MemoryT]:
|
|
1010
1778
|
pass
|
|
1011
1779
|
|
|
1012
|
-
def update(self, updates: dict[str, Any] | Iterable[dict[str, Any]]) ->
|
|
1780
|
+
def update(self, updates: dict[str, Any] | Iterable[dict[str, Any]]) -> MemoryT | list[MemoryT]:
|
|
1013
1781
|
"""
|
|
1014
1782
|
Update one or multiple memories in the memoryset
|
|
1015
1783
|
|
|
@@ -1041,16 +1809,82 @@ class LabeledMemoryset:
|
|
|
1041
1809
|
... for m in memoryset.query(filters=[("tag", "==", "happy")])
|
|
1042
1810
|
... )
|
|
1043
1811
|
"""
|
|
1044
|
-
response =
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1812
|
+
response = orca_api.PATCH(
|
|
1813
|
+
"/gpu/memoryset/{name_or_id}/memories",
|
|
1814
|
+
params={"name_or_id": self.id},
|
|
1815
|
+
json=cast(
|
|
1816
|
+
list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
|
|
1817
|
+
[
|
|
1818
|
+
_parse_memory_update(update, type=self.memory_type)
|
|
1819
|
+
for update in (cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else updates)
|
|
1820
|
+
],
|
|
1821
|
+
),
|
|
1050
1822
|
)
|
|
1051
|
-
updated_memories = [
|
|
1823
|
+
updated_memories = [
|
|
1824
|
+
cast(
|
|
1825
|
+
MemoryT,
|
|
1826
|
+
(LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
|
|
1827
|
+
)
|
|
1828
|
+
for memory in response
|
|
1829
|
+
]
|
|
1052
1830
|
return updated_memories[0] if isinstance(updates, dict) else updated_memories
|
|
1053
1831
|
|
|
1832
|
+
def get_cascading_edits_suggestions(
|
|
1833
|
+
self,
|
|
1834
|
+
memory: MemoryT,
|
|
1835
|
+
*,
|
|
1836
|
+
old_label: int,
|
|
1837
|
+
new_label: int,
|
|
1838
|
+
max_neighbors: int = 50,
|
|
1839
|
+
max_validation_neighbors: int = 10,
|
|
1840
|
+
similarity_threshold: float | None = None,
|
|
1841
|
+
only_if_has_old_label: bool = True,
|
|
1842
|
+
exclude_if_new_label: bool = True,
|
|
1843
|
+
suggestion_cooldown_time: float = 3600.0 * 24.0, # 1 day
|
|
1844
|
+
label_confirmation_cooldown_time: float = 3600.0 * 24.0 * 7, # 1 week
|
|
1845
|
+
) -> list[CascadingEditSuggestion]:
|
|
1846
|
+
"""
|
|
1847
|
+
Suggests cascading edits for a given memory based on nearby points with similar labels.
|
|
1848
|
+
|
|
1849
|
+
This function is triggered after a user changes a memory's label. It looks for nearby
|
|
1850
|
+
candidates in embedding space that may be subject to similar relabeling and returns them
|
|
1851
|
+
as suggestions. The system uses scoring heuristics, label filters, and cooldown tracking
|
|
1852
|
+
to reduce noise and improve usability.
|
|
1853
|
+
|
|
1854
|
+
Params:
|
|
1855
|
+
memory: The memory whose label was just changed.
|
|
1856
|
+
old_label: The label this memory used to have.
|
|
1857
|
+
new_label: The label it was changed to.
|
|
1858
|
+
max_neighbors: Maximum number of neighbors to consider.
|
|
1859
|
+
max_validation_neighbors: Maximum number of neighbors to use for label suggestion.
|
|
1860
|
+
similarity_threshold: If set, only include neighbors with a lookup score above this threshold.
|
|
1861
|
+
only_if_has_old_label: If True, only consider neighbors that have the old label.
|
|
1862
|
+
exclude_if_new_label: If True, exclude neighbors that already have the new label.
|
|
1863
|
+
suggestion_cooldown_time: Minimum time (in seconds) since the last suggestion for a neighbor
|
|
1864
|
+
to be considered again.
|
|
1865
|
+
label_confirmation_cooldown_time: Minimum time (in seconds) since a neighbor's label was confirmed
|
|
1866
|
+
to be considered for suggestions.
|
|
1867
|
+
|
|
1868
|
+
Returns:
|
|
1869
|
+
A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
|
|
1870
|
+
"""
|
|
1871
|
+
# TODO: properly integrate this with memory edits and return something that can be applied
|
|
1872
|
+
return orca_api.POST(
|
|
1873
|
+
"/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
|
|
1874
|
+
params={"name_or_id": self.id, "memory_id": memory.memory_id},
|
|
1875
|
+
json={
|
|
1876
|
+
"old_label": old_label,
|
|
1877
|
+
"new_label": new_label,
|
|
1878
|
+
"max_neighbors": max_neighbors,
|
|
1879
|
+
"max_validation_neighbors": max_validation_neighbors,
|
|
1880
|
+
"similarity_threshold": similarity_threshold,
|
|
1881
|
+
"only_if_has_old_label": only_if_has_old_label,
|
|
1882
|
+
"exclude_if_new_label": exclude_if_new_label,
|
|
1883
|
+
"suggestion_cooldown_time": suggestion_cooldown_time,
|
|
1884
|
+
"label_confirmation_cooldown_time": label_confirmation_cooldown_time,
|
|
1885
|
+
},
|
|
1886
|
+
)
|
|
1887
|
+
|
|
1054
1888
|
def delete(self, memory_id: str | Iterable[str]) -> None:
|
|
1055
1889
|
"""
|
|
1056
1890
|
Delete memories from the memoryset
|
|
@@ -1070,56 +1904,70 @@ class LabeledMemoryset:
|
|
|
1070
1904
|
|
|
1071
1905
|
"""
|
|
1072
1906
|
memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
|
|
1073
|
-
|
|
1907
|
+
orca_api.POST(
|
|
1908
|
+
"/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": memory_ids}
|
|
1909
|
+
)
|
|
1074
1910
|
logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
|
|
1075
1911
|
self.refresh()
|
|
1076
1912
|
|
|
1077
|
-
|
|
1913
|
+
@overload
|
|
1914
|
+
def analyze(
|
|
1915
|
+
self,
|
|
1916
|
+
*analyses: dict[str, Any] | str,
|
|
1917
|
+
lookup_count: int = 15,
|
|
1918
|
+
clear_metrics: bool = False,
|
|
1919
|
+
background: Literal[True],
|
|
1920
|
+
) -> Job[MemorysetMetrics]:
|
|
1921
|
+
pass
|
|
1922
|
+
|
|
1923
|
+
@overload
|
|
1924
|
+
def analyze(
|
|
1925
|
+
self,
|
|
1926
|
+
*analyses: dict[str, Any] | str,
|
|
1927
|
+
lookup_count: int = 15,
|
|
1928
|
+
clear_metrics: bool = False,
|
|
1929
|
+
background: Literal[False] = False,
|
|
1930
|
+
) -> MemorysetMetrics:
|
|
1931
|
+
pass
|
|
1932
|
+
|
|
1933
|
+
def analyze(
|
|
1934
|
+
self,
|
|
1935
|
+
*analyses: dict[str, Any] | str,
|
|
1936
|
+
lookup_count: int = 15,
|
|
1937
|
+
clear_metrics: bool = False,
|
|
1938
|
+
background: bool = False,
|
|
1939
|
+
) -> Job[MemorysetMetrics] | MemorysetMetrics:
|
|
1078
1940
|
"""
|
|
1079
|
-
Run
|
|
1941
|
+
Run analyses on the memoryset to find duplicates, clusters, mislabelings, and more
|
|
1080
1942
|
|
|
1081
1943
|
The results of the analysis will be stored in the [`LabeledMemory.metrics`][orca_sdk.LabeledMemory]
|
|
1082
|
-
attribute of each memory in the memoryset.
|
|
1944
|
+
attribute of each memory in the memoryset. Overall memoryset metrics will be returned as a dictionary.
|
|
1083
1945
|
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
>>> memoryset.find_duplicate_memories()
|
|
1089
|
-
{ "num_duplicates": 10 }
|
|
1090
|
-
>>> memoryset.delete(
|
|
1091
|
-
... m.memory_id
|
|
1092
|
-
... for m in memoryset.query(
|
|
1093
|
-
... filters=[("metrics.is_duplicate", "==", True)]
|
|
1094
|
-
... )
|
|
1095
|
-
... )
|
|
1096
|
-
"""
|
|
1097
|
-
analysis = create_analysis(
|
|
1098
|
-
self.id,
|
|
1099
|
-
body=MemorysetAnalysisRequest(
|
|
1100
|
-
type=MemorysetAnalysisRequestType.ANALYZE_DUPLICATE_MEMORIES,
|
|
1101
|
-
),
|
|
1102
|
-
)
|
|
1103
|
-
wait_for_task(analysis.task_id, description="Analyzing duplicates")
|
|
1104
|
-
analysis = get_analysis(self.id, analysis.task_id)
|
|
1105
|
-
assert isinstance(analysis.result, FindDuplicatesAnalysisResult)
|
|
1106
|
-
# TODO: return a custom duplicate analysis class instance with helper methods
|
|
1107
|
-
return analysis.result.to_dict()
|
|
1946
|
+
Params:
|
|
1947
|
+
analyses: List of analysis to run on the memoryset, can either be just the name of an
|
|
1948
|
+
analysis or a dictionary with a name property and additional config. The available
|
|
1949
|
+
analyses are:
|
|
1108
1950
|
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1951
|
+
- **`"duplicate"`**: Find potentially duplicate memories in the memoryset
|
|
1952
|
+
- **`"cluster"`**: Cluster the memories in the memoryset
|
|
1953
|
+
- **`"label"`**: Analyze the labels to find potential mislabelings
|
|
1954
|
+
- **`"neighbor"`**: Analyze the neighbors to populate anomaly scores
|
|
1955
|
+
- **`"projection"`**: Create a 2D projection of the embeddings for visualization
|
|
1113
1956
|
|
|
1114
|
-
|
|
1115
|
-
|
|
1957
|
+
lookup_count: Number of memories to lookup for each memory in the memoryset
|
|
1958
|
+
clear_metrics: Whether to clear any existing metrics from the memories before running the analysis
|
|
1116
1959
|
|
|
1117
1960
|
Returns:
|
|
1118
|
-
|
|
1961
|
+
dictionary with aggregate metrics for each analysis that was run
|
|
1962
|
+
|
|
1963
|
+
Raises:
|
|
1964
|
+
ValueError: If an invalid analysis name is provided
|
|
1119
1965
|
|
|
1120
1966
|
Examples:
|
|
1121
|
-
|
|
1122
|
-
{
|
|
1967
|
+
Run label and duplicate analysis:
|
|
1968
|
+
>>> memoryset.analyze("label", {"name": "duplicate", "possible_duplicate_threshold": 0.99})
|
|
1969
|
+
{ "duplicate": { "num_duplicates": 10 },
|
|
1970
|
+
"label": {
|
|
1123
1971
|
"label_metrics": [{
|
|
1124
1972
|
"label": 0,
|
|
1125
1973
|
"label_name": "negative",
|
|
@@ -1131,24 +1979,212 @@ class LabeledMemoryset:
|
|
|
1131
1979
|
"average_lookup_score": 0.90,
|
|
1132
1980
|
"memory_count": 100,
|
|
1133
1981
|
}]
|
|
1982
|
+
"neighbor_prediction_accuracy": 0.95,
|
|
1983
|
+
"mean_neighbor_label_confidence": 0.95,
|
|
1984
|
+
"mean_neighbor_label_entropy": 0.95,
|
|
1985
|
+
"mean_neighbor_predicted_label_ambiguity": 0.95,
|
|
1986
|
+
}
|
|
1134
1987
|
}
|
|
1988
|
+
|
|
1989
|
+
Remove all exact duplicates:
|
|
1990
|
+
>>> memoryset.delete(
|
|
1991
|
+
... m.memory_id
|
|
1992
|
+
... for m in memoryset.query(
|
|
1993
|
+
... filters=[("metrics.is_duplicate", "==", True)]
|
|
1994
|
+
... )
|
|
1995
|
+
... )
|
|
1996
|
+
|
|
1997
|
+
Display label analysis to review potential mislabelings:
|
|
1135
1998
|
>>> memoryset.display_label_analysis()
|
|
1136
1999
|
"""
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
2000
|
+
|
|
2001
|
+
# Get valid analysis names from MemorysetAnalysisConfigs
|
|
2002
|
+
valid_analysis_names = set(MemorysetAnalysisConfigs.__annotations__)
|
|
2003
|
+
|
|
2004
|
+
configs: MemorysetAnalysisConfigs = {}
|
|
2005
|
+
for analysis in analyses:
|
|
2006
|
+
if isinstance(analysis, str):
|
|
2007
|
+
error_msg = (
|
|
2008
|
+
f"Invalid analysis name: {analysis}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
|
|
2009
|
+
)
|
|
2010
|
+
if analysis not in valid_analysis_names:
|
|
2011
|
+
raise ValueError(error_msg)
|
|
2012
|
+
configs[analysis] = {}
|
|
2013
|
+
else:
|
|
2014
|
+
name = analysis.pop("name")
|
|
2015
|
+
error_msg = f"Invalid analysis name: {name}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
|
|
2016
|
+
if name not in valid_analysis_names:
|
|
2017
|
+
raise ValueError(error_msg)
|
|
2018
|
+
configs[name] = analysis
|
|
2019
|
+
|
|
2020
|
+
analysis = orca_api.POST(
|
|
2021
|
+
"/memoryset/{name_or_id}/analysis",
|
|
2022
|
+
params={"name_or_id": self.id},
|
|
2023
|
+
json={
|
|
2024
|
+
"configs": configs,
|
|
2025
|
+
"lookup_count": lookup_count,
|
|
2026
|
+
"clear_metrics": clear_metrics,
|
|
2027
|
+
},
|
|
2028
|
+
)
|
|
2029
|
+
job = Job(
|
|
2030
|
+
analysis["task_id"],
|
|
2031
|
+
lambda: orca_api.GET(
|
|
2032
|
+
"/memoryset/{name_or_id}/analysis/{analysis_task_id}",
|
|
2033
|
+
params={"name_or_id": self.id, "analysis_task_id": analysis["task_id"]},
|
|
2034
|
+
)["results"],
|
|
2035
|
+
)
|
|
2036
|
+
return job if background else job.result()
|
|
2037
|
+
|
|
2038
|
+
def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
|
|
2039
|
+
"""Group potential duplicates in the memoryset"""
|
|
2040
|
+
response = orca_api.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
|
|
2041
|
+
return [
|
|
2042
|
+
[cast(MemoryT, LabeledMemory(self.id, m) if "label" in m else ScoredMemory(self.id, m)) for m in ms]
|
|
2043
|
+
for ms in response
|
|
2044
|
+
]
|
|
2045
|
+
|
|
2046
|
+
@overload
|
|
2047
|
+
@staticmethod
|
|
2048
|
+
def run_embedding_evaluation(
|
|
2049
|
+
datasource: Datasource,
|
|
2050
|
+
*,
|
|
2051
|
+
value_column: str = "value",
|
|
2052
|
+
label_column: str = "label",
|
|
2053
|
+
source_id_column: str | None = None,
|
|
2054
|
+
neighbor_count: int = 5,
|
|
2055
|
+
embedding_models: list[str] | None = None,
|
|
2056
|
+
background: Literal[True],
|
|
2057
|
+
) -> Job[list[EmbeddingModelResult]]:
|
|
2058
|
+
pass
|
|
2059
|
+
|
|
2060
|
+
@overload
|
|
2061
|
+
@staticmethod
|
|
2062
|
+
def run_embedding_evaluation(
|
|
2063
|
+
datasource: Datasource,
|
|
2064
|
+
*,
|
|
2065
|
+
value_column: str = "value",
|
|
2066
|
+
label_column: str = "label",
|
|
2067
|
+
source_id_column: str | None = None,
|
|
2068
|
+
neighbor_count: int = 5,
|
|
2069
|
+
embedding_models: list[str] | None = None,
|
|
2070
|
+
background: Literal[False] = False,
|
|
2071
|
+
) -> list[EmbeddingModelResult]:
|
|
2072
|
+
pass
|
|
2073
|
+
|
|
2074
|
+
@staticmethod
|
|
2075
|
+
def run_embedding_evaluation(
|
|
2076
|
+
datasource: Datasource,
|
|
2077
|
+
*,
|
|
2078
|
+
value_column: str = "value",
|
|
2079
|
+
label_column: str = "label",
|
|
2080
|
+
source_id_column: str | None = None,
|
|
2081
|
+
neighbor_count: int = 5,
|
|
2082
|
+
embedding_models: list[str] | None = None,
|
|
2083
|
+
background: bool = False,
|
|
2084
|
+
) -> Job[list[EmbeddingModelResult]] | list[EmbeddingModelResult]:
|
|
2085
|
+
"""
|
|
2086
|
+
Test the quality of embeddings for the datasource by computing metrics such as prediction accuracy.
|
|
2087
|
+
|
|
2088
|
+
Params:
|
|
2089
|
+
datasource: The datasource to run the embedding evaluation on
|
|
2090
|
+
value_column: Name of the column in the datasource that contains the memory values
|
|
2091
|
+
label_column: Name of the column in the datasource that contains the memory labels,
|
|
2092
|
+
these must be contiguous integers starting from 0
|
|
2093
|
+
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
2094
|
+
the system of reference
|
|
2095
|
+
neighbor_count: The number of neighbors to select for prediction
|
|
2096
|
+
embedding_models: Optional list of embedding model keys to evaluate, if not provided all
|
|
2097
|
+
available embedding models will be used
|
|
2098
|
+
|
|
2099
|
+
Returns:
|
|
2100
|
+
A dictionary containing the results of the embedding evaluation
|
|
2101
|
+
"""
|
|
2102
|
+
|
|
2103
|
+
response = orca_api.POST(
|
|
2104
|
+
"/datasource/{name_or_id}/embedding_evaluation",
|
|
2105
|
+
params={"name_or_id": datasource.id},
|
|
2106
|
+
json={
|
|
2107
|
+
"value_column": value_column,
|
|
2108
|
+
"label_column": label_column,
|
|
2109
|
+
"source_id_column": source_id_column,
|
|
2110
|
+
"neighbor_count": neighbor_count,
|
|
2111
|
+
"embedding_models": embedding_models,
|
|
2112
|
+
},
|
|
1143
2113
|
)
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
2114
|
+
|
|
2115
|
+
def get_value() -> list[EmbeddingModelResult]:
|
|
2116
|
+
res = orca_api.GET(
|
|
2117
|
+
"/datasource/{name_or_id}/embedding_evaluation/{task_id}",
|
|
2118
|
+
params={"name_or_id": datasource.id, "task_id": response["task_id"]},
|
|
2119
|
+
)
|
|
2120
|
+
assert res["result"] is not None
|
|
2121
|
+
return res["result"]["evaluation_results"]
|
|
2122
|
+
|
|
2123
|
+
job = Job(response["task_id"], get_value)
|
|
2124
|
+
return job if background else job.result()
|
|
2125
|
+
|
|
2126
|
+
|
|
2127
|
+
class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
    """
    A Handle to a collection of memories with labels in the OrcaCloud

    Attributes:
        id: Unique identifier for the memoryset
        name: Unique name of the memoryset
        description: Description of the memoryset
        label_names: Names for the class labels in the memoryset
        length: Number of memories in the memoryset
        embedding_model: Embedding model used to embed the memory values for semantic search
        created_at: When the memoryset was created, automatically generated on create
        updated_at: When the memoryset was last updated, automatically updated on updates
    """

    # Names for the class labels in the memoryset
    label_names: list[str]
    memory_type: MemoryType = "LABELED"

    def __init__(self, metadata: MemorysetMetadata):
        super().__init__(metadata)
        # Labeled memorysets must always carry label names in their metadata.
        assert metadata["label_names"] is not None
        self.label_names = metadata["label_names"]

    def __eq__(self, other) -> bool:
        return isinstance(other, LabeledMemoryset) and self.id == other.id

    def __hash__(self) -> int:
        # Defining __eq__ alone would implicitly set __hash__ to None, making
        # handles unusable as dict keys or set members; hash the same identity
        # that __eq__ compares so equal handles hash equally.
        return hash(self.id)

    @classmethod
    def create(cls, name: str, datasource: Datasource, *, label_column: str | None = "label", **kwargs):
        # Delegates to the base create with score_column forced to None since
        # labeled memorysets carry labels, not scores.
        return super().create(name, datasource, label_column=label_column, score_column=None, **kwargs)

    def display_label_analysis(self):
        """
        Display an interactive UI to review and act upon the label analysis results

        Note:
            This method is only available in Jupyter notebooks.
        """
        from ._utils.analysis_ui import display_suggested_memory_relabels

        display_suggested_memory_relabels(self)
|
|
2167
|
+
|
|
2168
|
+
|
|
2169
|
+
class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
    """
    A Handle to a collection of memories with scores in the OrcaCloud

    Attributes:
        id: Unique identifier for the memoryset
        name: Unique name of the memoryset
        description: Description of the memoryset
        length: Number of memories in the memoryset
        embedding_model: Embedding model used to embed the memory values for semantic search
        created_at: When the memoryset was created, automatically generated on create
        updated_at: When the memoryset was last updated, automatically updated on updates
    """

    memory_type: MemoryType = "SCORED"

    def __eq__(self, other) -> bool:
        return isinstance(other, ScoredMemoryset) and self.id == other.id

    def __hash__(self) -> int:
        # Defining __eq__ alone would implicitly set __hash__ to None, making
        # handles unusable as dict keys or set members; hash the same identity
        # that __eq__ compares so equal handles hash equally.
        return hash(self.id)

    @classmethod
    def create(cls, name: str, datasource: Datasource, *, score_column: str | None = "score", **kwargs):
        # Delegates to the base create with label_column forced to None since
        # scored memorysets carry scores, not labels.
        return super().create(name, datasource, score_column=score_column, label_column=None, **kwargs)
|