orca-sdk 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/_shared/metrics.py +120 -18
- orca_sdk/_shared/metrics_test.py +204 -0
- orca_sdk/async_client.py +105 -25
- orca_sdk/classification_model.py +4 -5
- orca_sdk/client.py +105 -25
- orca_sdk/embedding_model.py +19 -14
- orca_sdk/embedding_model_test.py +1 -1
- orca_sdk/memoryset.py +1093 -231
- orca_sdk/memoryset_test.py +109 -2
- orca_sdk/regression_model.py +2 -3
- {orca_sdk-0.1.5.dist-info → orca_sdk-0.1.7.dist-info}/METADATA +1 -1
- {orca_sdk-0.1.5.dist-info → orca_sdk-0.1.7.dist-info}/RECORD +13 -13
- {orca_sdk-0.1.5.dist-info → orca_sdk-0.1.7.dist-info}/WHEEL +0 -0
orca_sdk/memoryset.py
CHANGED
|
@@ -27,13 +27,12 @@ from .async_client import OrcaAsyncClient
|
|
|
27
27
|
from .client import (
|
|
28
28
|
CascadingEditSuggestion,
|
|
29
29
|
CloneMemorysetRequest,
|
|
30
|
+
CreateMemorysetFromDatasourceRequest,
|
|
30
31
|
CreateMemorysetRequest,
|
|
31
32
|
FilterItem,
|
|
32
33
|
)
|
|
33
34
|
from .client import LabeledMemory as LabeledMemoryResponse
|
|
34
|
-
from .client import
|
|
35
|
-
LabeledMemoryInsert,
|
|
36
|
-
)
|
|
35
|
+
from .client import LabeledMemoryInsert
|
|
37
36
|
from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
|
|
38
37
|
from .client import (
|
|
39
38
|
LabeledMemoryUpdate,
|
|
@@ -50,9 +49,7 @@ from .client import (
|
|
|
50
49
|
PredictionFeedback,
|
|
51
50
|
)
|
|
52
51
|
from .client import ScoredMemory as ScoredMemoryResponse
|
|
53
|
-
from .client import
|
|
54
|
-
ScoredMemoryInsert,
|
|
55
|
-
)
|
|
52
|
+
from .client import ScoredMemoryInsert
|
|
56
53
|
from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
|
|
57
54
|
from .client import (
|
|
58
55
|
ScoredMemoryUpdate,
|
|
@@ -937,7 +934,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
937
934
|
length: int
|
|
938
935
|
created_at: datetime
|
|
939
936
|
updated_at: datetime
|
|
940
|
-
insertion_status: Status
|
|
937
|
+
insertion_status: Status | None
|
|
941
938
|
embedding_model: EmbeddingModelBase
|
|
942
939
|
index_type: IndexType
|
|
943
940
|
index_params: dict[str, Any]
|
|
@@ -959,7 +956,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
959
956
|
self.length = metadata["length"]
|
|
960
957
|
self.created_at = datetime.fromisoformat(metadata["created_at"])
|
|
961
958
|
self.updated_at = datetime.fromisoformat(metadata["updated_at"])
|
|
962
|
-
self.insertion_status =
|
|
959
|
+
self.insertion_status = (
|
|
960
|
+
Status(metadata["insertion_status"]) if metadata["insertion_status"] is not None else None
|
|
961
|
+
)
|
|
963
962
|
self._last_refresh = datetime.now()
|
|
964
963
|
self.index_type = metadata["index_type"]
|
|
965
964
|
self.index_params = metadata["index_params"]
|
|
@@ -971,7 +970,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
971
970
|
|
|
972
971
|
def __repr__(self) -> str:
|
|
973
972
|
return (
|
|
974
|
-
"Memoryset({\n"
|
|
973
|
+
f"{self.memory_type.capitalize()}Memoryset(" + "{\n"
|
|
975
974
|
f" name: '{self.name}',\n"
|
|
976
975
|
f" length: {self.length},\n"
|
|
977
976
|
f" embedding_model: {self.embedding_model},\n"
|
|
@@ -1022,11 +1021,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1022
1021
|
return existing
|
|
1023
1022
|
|
|
1024
1023
|
@classmethod
|
|
1025
|
-
def
|
|
1024
|
+
def _create_from_datasource(
|
|
1026
1025
|
cls,
|
|
1027
1026
|
name: str,
|
|
1028
|
-
datasource: Datasource,
|
|
1029
1027
|
*,
|
|
1028
|
+
datasource: Datasource,
|
|
1030
1029
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1031
1030
|
value_column: str = "value",
|
|
1032
1031
|
label_column: str | None = None,
|
|
@@ -1047,54 +1046,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1047
1046
|
memory_type: MemoryType | None = None,
|
|
1048
1047
|
) -> Self | Job[Self]:
|
|
1049
1048
|
"""
|
|
1050
|
-
Create a
|
|
1051
|
-
|
|
1052
|
-
All columns from the datasource that are not specified in the `value_column`,
|
|
1053
|
-
`label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
|
|
1054
|
-
|
|
1055
|
-
Params:
|
|
1056
|
-
name: Name for the new memoryset (must be unique)
|
|
1057
|
-
datasource: Source data to populate the memories in the memoryset
|
|
1058
|
-
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
1059
|
-
If not provided, a default embedding model for the memoryset will be used.
|
|
1060
|
-
value_column: Name of the column in the datasource that contains the memory values
|
|
1061
|
-
label_column: Name of the column in the datasource that contains the memory labels.
|
|
1062
|
-
Must contain categorical values as integers or strings. String labels will be
|
|
1063
|
-
converted to integers with the unique strings extracted as `label_names`
|
|
1064
|
-
score_column: Name of the column in the datasource that contains the memory scores
|
|
1065
|
-
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
1066
|
-
the system of reference
|
|
1067
|
-
partition_id_column: Optional name of the column in the datasource that contains the partition ids
|
|
1068
|
-
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
1069
|
-
so make sure it is concise and describes the contents of your memoryset not the
|
|
1070
|
-
datasource or the embedding model.
|
|
1071
|
-
label_names: List of human-readable names for the labels in the memoryset, must match
|
|
1072
|
-
the number of labels in the `label_column`. Will be automatically inferred if string
|
|
1073
|
-
labels are provided or if a [Dataset][datasets.Dataset] with a
|
|
1074
|
-
[`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
|
|
1075
|
-
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
1076
|
-
value is longer than this it will be truncated, will default to the model's max
|
|
1077
|
-
sequence length if not provided
|
|
1078
|
-
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
1079
|
-
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
1080
|
-
into the memoryset
|
|
1081
|
-
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
1082
|
-
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
1083
|
-
index_params: Parameters for the vector index, defaults to `{}`
|
|
1084
|
-
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
1085
|
-
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
1086
|
-
background: Whether to run the operation none blocking and return a job handle
|
|
1087
|
-
hidden: Whether the memoryset should be hidden
|
|
1088
|
-
subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
|
|
1089
|
-
datasource to insert. Use to limit the size of the initial memoryset.
|
|
1090
|
-
memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
|
|
1091
|
-
and `"SCORED"` if `score_column` is provided, must be specified for other cases.
|
|
1092
|
-
Returns:
|
|
1093
|
-
Handle to the new memoryset in the OrcaCloud
|
|
1049
|
+
Create a memoryset from a datasource by calling the API.
|
|
1094
1050
|
|
|
1095
|
-
|
|
1096
|
-
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
1097
|
-
`"open"` and the params do not match those of the existing memoryset.
|
|
1051
|
+
This is a private method that performs the actual API call to create a memoryset from a datasource.
|
|
1098
1052
|
"""
|
|
1099
1053
|
if embedding_model is None:
|
|
1100
1054
|
embedding_model = PretrainedEmbeddingModel.GTE_BASE
|
|
@@ -1108,7 +1062,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1108
1062
|
if existing is not None:
|
|
1109
1063
|
return existing
|
|
1110
1064
|
|
|
1111
|
-
payload:
|
|
1065
|
+
payload: CreateMemorysetFromDatasourceRequest = {
|
|
1112
1066
|
"name": name,
|
|
1113
1067
|
"description": description,
|
|
1114
1068
|
"datasource_name_or_id": datasource.id,
|
|
@@ -1138,141 +1092,582 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1138
1092
|
raise ValueError("Invalid embedding model")
|
|
1139
1093
|
client = OrcaClient._resolve_client()
|
|
1140
1094
|
response = client.POST("/memoryset", json=payload)
|
|
1095
|
+
|
|
1096
|
+
if response["insertion_job_id"] is None:
|
|
1097
|
+
raise ValueError("Create memoryset operation failed to produce an insertion job")
|
|
1098
|
+
|
|
1141
1099
|
job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
|
|
1142
1100
|
return job if background else job.result()
|
|
1143
1101
|
|
|
1144
1102
|
@overload
|
|
1145
1103
|
@classmethod
|
|
1146
|
-
def
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1104
|
+
def create(
|
|
1105
|
+
cls,
|
|
1106
|
+
name: str,
|
|
1107
|
+
*,
|
|
1108
|
+
datasource: None = None,
|
|
1109
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1110
|
+
description: str | None = None,
|
|
1111
|
+
label_names: list[str] | None = None,
|
|
1112
|
+
max_seq_length_override: int | None = None,
|
|
1113
|
+
prompt: str | None = None,
|
|
1114
|
+
index_type: IndexType = "FLAT",
|
|
1115
|
+
index_params: dict[str, Any] = {},
|
|
1116
|
+
if_exists: CreateMode = "error",
|
|
1117
|
+
hidden: bool = False,
|
|
1118
|
+
memory_type: MemoryType | None = None,
|
|
1119
|
+
) -> Self:
|
|
1152
1120
|
pass
|
|
1153
1121
|
|
|
1154
|
-
@classmethod
|
|
1155
|
-
def from_hf_dataset(
|
|
1156
|
-
cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
|
|
1157
|
-
) -> Self | Job[Self]:
|
|
1158
|
-
"""
|
|
1159
|
-
Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
|
|
1160
|
-
|
|
1161
|
-
This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
|
|
1162
|
-
appended with `_datasource` and use that as the datasource for the memoryset.
|
|
1163
|
-
|
|
1164
|
-
All features that are not specified to be used as `value_column`, `label_column`, or
|
|
1165
|
-
`source_id_column` will be stored as metadata in the memoryset.
|
|
1166
|
-
|
|
1167
|
-
Params:
|
|
1168
|
-
name: Name for the new memoryset (must be unique)
|
|
1169
|
-
hf_dataset: Hugging Face dataset to create the memoryset from
|
|
1170
|
-
kwargs: Additional parameters for creating the memoryset. See
|
|
1171
|
-
[`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
|
|
1172
|
-
|
|
1173
|
-
Returns:
|
|
1174
|
-
Handle to the new memoryset in the OrcaCloud
|
|
1175
|
-
"""
|
|
1176
|
-
if_exists = kwargs.get("if_exists", "error")
|
|
1177
|
-
existing = cls._handle_if_exists(
|
|
1178
|
-
name,
|
|
1179
|
-
if_exists=if_exists,
|
|
1180
|
-
label_names=kwargs.get("label_names"),
|
|
1181
|
-
embedding_model=kwargs.get("embedding_model"),
|
|
1182
|
-
)
|
|
1183
|
-
if existing is not None:
|
|
1184
|
-
return existing
|
|
1185
|
-
|
|
1186
|
-
datasource = Datasource.from_hf_dataset(
|
|
1187
|
-
f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
|
|
1188
|
-
)
|
|
1189
|
-
kwargs["background"] = background
|
|
1190
|
-
return cls.create(name, datasource, **kwargs)
|
|
1191
|
-
|
|
1192
1122
|
@overload
|
|
1193
1123
|
@classmethod
|
|
1194
|
-
def
|
|
1124
|
+
def create(
|
|
1195
1125
|
cls,
|
|
1196
1126
|
name: str,
|
|
1197
|
-
torch_data: TorchDataLoader | TorchDataset,
|
|
1198
1127
|
*,
|
|
1199
|
-
|
|
1128
|
+
datasource: Datasource,
|
|
1129
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1130
|
+
value_column: str = "value",
|
|
1131
|
+
label_column: str | None = None,
|
|
1132
|
+
score_column: str | None = None,
|
|
1133
|
+
source_id_column: str | None = None,
|
|
1134
|
+
partition_id_column: str | None = None,
|
|
1135
|
+
description: str | None = None,
|
|
1136
|
+
label_names: list[str] | None = None,
|
|
1137
|
+
max_seq_length_override: int | None = None,
|
|
1138
|
+
prompt: str | None = None,
|
|
1139
|
+
remove_duplicates: bool = True,
|
|
1140
|
+
index_type: IndexType = "FLAT",
|
|
1141
|
+
index_params: dict[str, Any] = {},
|
|
1142
|
+
if_exists: CreateMode = "error",
|
|
1200
1143
|
background: Literal[True],
|
|
1201
|
-
|
|
1144
|
+
hidden: bool = False,
|
|
1145
|
+
subsample: int | float | None = None,
|
|
1146
|
+
memory_type: MemoryType | None = None,
|
|
1202
1147
|
) -> Job[Self]:
|
|
1203
1148
|
pass
|
|
1204
1149
|
|
|
1205
1150
|
@overload
|
|
1206
1151
|
@classmethod
|
|
1207
|
-
def
|
|
1152
|
+
def create(
|
|
1208
1153
|
cls,
|
|
1209
1154
|
name: str,
|
|
1210
|
-
torch_data: TorchDataLoader | TorchDataset,
|
|
1211
1155
|
*,
|
|
1212
|
-
|
|
1156
|
+
datasource: Datasource,
|
|
1157
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1158
|
+
value_column: str = "value",
|
|
1159
|
+
label_column: str | None = None,
|
|
1160
|
+
score_column: str | None = None,
|
|
1161
|
+
source_id_column: str | None = None,
|
|
1162
|
+
partition_id_column: str | None = None,
|
|
1163
|
+
description: str | None = None,
|
|
1164
|
+
label_names: list[str] | None = None,
|
|
1165
|
+
max_seq_length_override: int | None = None,
|
|
1166
|
+
prompt: str | None = None,
|
|
1167
|
+
remove_duplicates: bool = True,
|
|
1168
|
+
index_type: IndexType = "FLAT",
|
|
1169
|
+
index_params: dict[str, Any] = {},
|
|
1170
|
+
if_exists: CreateMode = "error",
|
|
1213
1171
|
background: Literal[False] = False,
|
|
1214
|
-
|
|
1172
|
+
hidden: bool = False,
|
|
1173
|
+
subsample: int | float | None = None,
|
|
1174
|
+
memory_type: MemoryType | None = None,
|
|
1215
1175
|
) -> Self:
|
|
1216
1176
|
pass
|
|
1217
1177
|
|
|
1218
1178
|
@classmethod
|
|
1219
|
-
def
|
|
1179
|
+
def create(
|
|
1220
1180
|
cls,
|
|
1221
1181
|
name: str,
|
|
1222
|
-
torch_data: TorchDataLoader | TorchDataset,
|
|
1223
1182
|
*,
|
|
1224
|
-
|
|
1183
|
+
datasource: Datasource | None = None,
|
|
1184
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1185
|
+
value_column: str = "value",
|
|
1186
|
+
label_column: str | None = None,
|
|
1187
|
+
score_column: str | None = None,
|
|
1188
|
+
source_id_column: str | None = None,
|
|
1189
|
+
partition_id_column: str | None = None,
|
|
1190
|
+
description: str | None = None,
|
|
1191
|
+
label_names: list[str] | None = None,
|
|
1192
|
+
max_seq_length_override: int | None = None,
|
|
1193
|
+
prompt: str | None = None,
|
|
1194
|
+
remove_duplicates: bool = True,
|
|
1195
|
+
index_type: IndexType = "FLAT",
|
|
1196
|
+
index_params: dict[str, Any] = {},
|
|
1197
|
+
if_exists: CreateMode = "error",
|
|
1225
1198
|
background: bool = False,
|
|
1226
|
-
|
|
1199
|
+
hidden: bool = False,
|
|
1200
|
+
subsample: int | float | None = None,
|
|
1201
|
+
memory_type: MemoryType | None = None,
|
|
1227
1202
|
) -> Self | Job[Self]:
|
|
1228
1203
|
"""
|
|
1229
|
-
Create a new memoryset
|
|
1230
|
-
[`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
|
|
1204
|
+
Create a new memoryset in the OrcaCloud
|
|
1231
1205
|
|
|
1232
|
-
|
|
1233
|
-
|
|
1206
|
+
If `datasource` is provided, all columns from the datasource that are not specified in the
|
|
1207
|
+
`value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
|
|
1208
|
+
as metadata in the memoryset.
|
|
1234
1209
|
|
|
1235
|
-
|
|
1236
|
-
|
|
1210
|
+
If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
|
|
1211
|
+
You can add memories later using the `insert` method.
|
|
1237
1212
|
|
|
1238
1213
|
Params:
|
|
1239
1214
|
name: Name for the new memoryset (must be unique)
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1215
|
+
datasource: Optional source data to populate the memories in the memoryset. If omitted,
|
|
1216
|
+
an empty memoryset will be created.
|
|
1217
|
+
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
1218
|
+
If not provided, a default embedding model for the memoryset will be used.
|
|
1219
|
+
value_column: Name of the column in the datasource that contains the memory values
|
|
1220
|
+
label_column: Name of the column in the datasource that contains the memory labels.
|
|
1221
|
+
Must contain categorical values as integers or strings. String labels will be
|
|
1222
|
+
converted to integers with the unique strings extracted as `label_names`
|
|
1223
|
+
score_column: Name of the column in the datasource that contains the memory scores
|
|
1224
|
+
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
1225
|
+
the system of reference
|
|
1226
|
+
partition_id_column: Optional name of the column in the datasource that contains the partition ids
|
|
1227
|
+
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
1228
|
+
so make sure it is concise and describes the contents of your memoryset not the
|
|
1229
|
+
datasource or the embedding model.
|
|
1230
|
+
label_names: List of human-readable names for the labels in the memoryset, must match
|
|
1231
|
+
the number of labels in the `label_column`. Will be automatically inferred if string
|
|
1232
|
+
labels are provided or if a [Dataset][datasets.Dataset] with a
|
|
1233
|
+
[`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
|
|
1234
|
+
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
1235
|
+
value is longer than this it will be truncated, will default to the model's max
|
|
1236
|
+
sequence length if not provided
|
|
1237
|
+
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
1238
|
+
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
1239
|
+
into the memoryset
|
|
1240
|
+
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
1241
|
+
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
1242
|
+
index_params: Parameters for the vector index, defaults to `{}`
|
|
1243
|
+
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
1244
|
+
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
1245
|
+
background: Whether to run the operation none blocking and return a job handle.
|
|
1246
|
+
Note: This parameter is ignored when creating an empty memoryset (when datasource is None).
|
|
1247
|
+
hidden: Whether the memoryset should be hidden
|
|
1248
|
+
subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
|
|
1249
|
+
datasource to insert. Use to limit the size of the initial memoryset.
|
|
1250
|
+
memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
|
|
1251
|
+
and `"SCORED"` if `score_column` is provided, must be specified for other cases.
|
|
1247
1252
|
Returns:
|
|
1248
1253
|
Handle to the new memoryset in the OrcaCloud
|
|
1249
|
-
"""
|
|
1250
|
-
if_exists = kwargs.get("if_exists", "error")
|
|
1251
|
-
existing = cls._handle_if_exists(
|
|
1252
|
-
name,
|
|
1253
|
-
if_exists=if_exists,
|
|
1254
|
-
label_names=kwargs.get("label_names"),
|
|
1255
|
-
embedding_model=kwargs.get("embedding_model"),
|
|
1256
|
-
)
|
|
1257
|
-
if existing is not None:
|
|
1258
|
-
return existing
|
|
1259
1254
|
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1255
|
+
Raises:
|
|
1256
|
+
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
1257
|
+
`"open"` and the params do not match those of the existing memoryset.
|
|
1258
|
+
"""
|
|
1259
|
+
if datasource is None:
|
|
1260
|
+
return cls._create_empty(
|
|
1261
|
+
name,
|
|
1262
|
+
embedding_model=embedding_model,
|
|
1263
|
+
description=description,
|
|
1264
|
+
label_names=label_names,
|
|
1265
|
+
max_seq_length_override=max_seq_length_override,
|
|
1266
|
+
prompt=prompt,
|
|
1267
|
+
index_type=index_type,
|
|
1268
|
+
index_params=index_params,
|
|
1269
|
+
if_exists=if_exists,
|
|
1270
|
+
hidden=hidden,
|
|
1271
|
+
memory_type=memory_type,
|
|
1272
|
+
)
|
|
1273
|
+
else:
|
|
1274
|
+
return cls._create_from_datasource(
|
|
1275
|
+
name,
|
|
1276
|
+
datasource=datasource,
|
|
1277
|
+
embedding_model=embedding_model,
|
|
1278
|
+
value_column=value_column,
|
|
1279
|
+
label_column=label_column,
|
|
1280
|
+
score_column=score_column,
|
|
1281
|
+
source_id_column=source_id_column,
|
|
1282
|
+
partition_id_column=partition_id_column,
|
|
1283
|
+
description=description,
|
|
1284
|
+
label_names=label_names,
|
|
1285
|
+
max_seq_length_override=max_seq_length_override,
|
|
1286
|
+
prompt=prompt,
|
|
1287
|
+
remove_duplicates=remove_duplicates,
|
|
1288
|
+
index_type=index_type,
|
|
1289
|
+
index_params=index_params,
|
|
1290
|
+
if_exists=if_exists,
|
|
1291
|
+
background=background,
|
|
1292
|
+
hidden=hidden,
|
|
1293
|
+
subsample=subsample,
|
|
1294
|
+
memory_type=memory_type,
|
|
1295
|
+
)
|
|
1265
1296
|
|
|
1266
1297
|
@overload
|
|
1267
1298
|
@classmethod
|
|
1268
|
-
def
|
|
1299
|
+
def from_datasource(
|
|
1269
1300
|
cls,
|
|
1270
1301
|
name: str,
|
|
1271
|
-
data: list[dict],
|
|
1272
1302
|
*,
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1303
|
+
datasource: Datasource,
|
|
1304
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1305
|
+
value_column: str = "value",
|
|
1306
|
+
label_column: str | None = None,
|
|
1307
|
+
score_column: str | None = None,
|
|
1308
|
+
source_id_column: str | None = None,
|
|
1309
|
+
partition_id_column: str | None = None,
|
|
1310
|
+
description: str | None = None,
|
|
1311
|
+
label_names: list[str] | None = None,
|
|
1312
|
+
max_seq_length_override: int | None = None,
|
|
1313
|
+
prompt: str | None = None,
|
|
1314
|
+
remove_duplicates: bool = True,
|
|
1315
|
+
index_type: IndexType = "FLAT",
|
|
1316
|
+
index_params: dict[str, Any] = {},
|
|
1317
|
+
if_exists: CreateMode = "error",
|
|
1318
|
+
background: Literal[True],
|
|
1319
|
+
hidden: bool = False,
|
|
1320
|
+
subsample: int | float | None = None,
|
|
1321
|
+
memory_type: MemoryType | None = None,
|
|
1322
|
+
) -> Job[Self]:
|
|
1323
|
+
pass
|
|
1324
|
+
|
|
1325
|
+
@overload
|
|
1326
|
+
@classmethod
|
|
1327
|
+
def from_datasource(
|
|
1328
|
+
cls,
|
|
1329
|
+
name: str,
|
|
1330
|
+
*,
|
|
1331
|
+
datasource: Datasource,
|
|
1332
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1333
|
+
value_column: str = "value",
|
|
1334
|
+
label_column: str | None = None,
|
|
1335
|
+
score_column: str | None = None,
|
|
1336
|
+
source_id_column: str | None = None,
|
|
1337
|
+
partition_id_column: str | None = None,
|
|
1338
|
+
description: str | None = None,
|
|
1339
|
+
label_names: list[str] | None = None,
|
|
1340
|
+
max_seq_length_override: int | None = None,
|
|
1341
|
+
prompt: str | None = None,
|
|
1342
|
+
remove_duplicates: bool = True,
|
|
1343
|
+
index_type: IndexType = "FLAT",
|
|
1344
|
+
index_params: dict[str, Any] = {},
|
|
1345
|
+
if_exists: CreateMode = "error",
|
|
1346
|
+
background: Literal[False] = False,
|
|
1347
|
+
hidden: bool = False,
|
|
1348
|
+
subsample: int | float | None = None,
|
|
1349
|
+
memory_type: MemoryType | None = None,
|
|
1350
|
+
) -> Self:
|
|
1351
|
+
pass
|
|
1352
|
+
|
|
1353
|
+
@classmethod
|
|
1354
|
+
def from_datasource(
|
|
1355
|
+
cls,
|
|
1356
|
+
name: str,
|
|
1357
|
+
*,
|
|
1358
|
+
datasource: Datasource,
|
|
1359
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
1360
|
+
value_column: str = "value",
|
|
1361
|
+
label_column: str | None = None,
|
|
1362
|
+
score_column: str | None = None,
|
|
1363
|
+
source_id_column: str | None = None,
|
|
1364
|
+
partition_id_column: str | None = None,
|
|
1365
|
+
description: str | None = None,
|
|
1366
|
+
label_names: list[str] | None = None,
|
|
1367
|
+
max_seq_length_override: int | None = None,
|
|
1368
|
+
prompt: str | None = None,
|
|
1369
|
+
remove_duplicates: bool = True,
|
|
1370
|
+
index_type: IndexType = "FLAT",
|
|
1371
|
+
index_params: dict[str, Any] = {},
|
|
1372
|
+
if_exists: CreateMode = "error",
|
|
1373
|
+
background: bool = False,
|
|
1374
|
+
hidden: bool = False,
|
|
1375
|
+
subsample: int | float | None = None,
|
|
1376
|
+
memory_type: MemoryType | None = None,
|
|
1377
|
+
) -> Self | Job[Self]:
|
|
1378
|
+
"""
|
|
1379
|
+
Create a new memoryset in the OrcaCloud from a datasource.
|
|
1380
|
+
|
|
1381
|
+
This is a convenience method that is equivalent to calling `create` with a datasource.
|
|
1382
|
+
All columns from the datasource that are not specified in the `value_column`,
|
|
1383
|
+
`label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
|
|
1384
|
+
in the memoryset.
|
|
1385
|
+
|
|
1386
|
+
Params:
|
|
1387
|
+
name: Name for the new memoryset (must be unique)
|
|
1388
|
+
datasource: Source data to populate the memories in the memoryset.
|
|
1389
|
+
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
1390
|
+
If not provided, a default embedding model for the memoryset will be used.
|
|
1391
|
+
value_column: Name of the column in the datasource that contains the memory values
|
|
1392
|
+
label_column: Name of the column in the datasource that contains the memory labels.
|
|
1393
|
+
Must contain categorical values as integers or strings. String labels will be
|
|
1394
|
+
converted to integers with the unique strings extracted as `label_names`
|
|
1395
|
+
score_column: Name of the column in the datasource that contains the memory scores
|
|
1396
|
+
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
1397
|
+
the system of reference
|
|
1398
|
+
partition_id_column: Optional name of the column in the datasource that contains the partition ids
|
|
1399
|
+
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
1400
|
+
so make sure it is concise and describes the contents of your memoryset not the
|
|
1401
|
+
datasource or the embedding model.
|
|
1402
|
+
label_names: List of human-readable names for the labels in the memoryset, must match
|
|
1403
|
+
the number of labels in the `label_column`. Will be automatically inferred if string
|
|
1404
|
+
labels are provided or if a [Dataset][datasets.Dataset] with a
|
|
1405
|
+
[`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
|
|
1406
|
+
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
1407
|
+
value is longer than this it will be truncated, will default to the model's max
|
|
1408
|
+
sequence length if not provided
|
|
1409
|
+
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
1410
|
+
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
1411
|
+
into the memoryset
|
|
1412
|
+
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
1413
|
+
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
1414
|
+
index_params: Parameters for the vector index, defaults to `{}`
|
|
1415
|
+
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
1416
|
+
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
1417
|
+
background: Whether to run the operation none blocking and return a job handle.
|
|
1418
|
+
hidden: Whether the memoryset should be hidden
|
|
1419
|
+
subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
|
|
1420
|
+
datasource to insert. Use to limit the size of the initial memoryset.
|
|
1421
|
+
memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
|
|
1422
|
+
and `"SCORED"` if `score_column` is provided, must be specified for other cases.
|
|
1423
|
+
Returns:
|
|
1424
|
+
Handle to the new memoryset in the OrcaCloud
|
|
1425
|
+
|
|
1426
|
+
Raises:
|
|
1427
|
+
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
1428
|
+
`"open"` and the params do not match those of the existing memoryset.
|
|
1429
|
+
"""
|
|
1430
|
+
return cls._create_from_datasource(
|
|
1431
|
+
name,
|
|
1432
|
+
datasource=datasource,
|
|
1433
|
+
embedding_model=embedding_model,
|
|
1434
|
+
value_column=value_column,
|
|
1435
|
+
label_column=label_column,
|
|
1436
|
+
score_column=score_column,
|
|
1437
|
+
source_id_column=source_id_column,
|
|
1438
|
+
partition_id_column=partition_id_column,
|
|
1439
|
+
description=description,
|
|
1440
|
+
label_names=label_names,
|
|
1441
|
+
max_seq_length_override=max_seq_length_override,
|
|
1442
|
+
prompt=prompt,
|
|
1443
|
+
remove_duplicates=remove_duplicates,
|
|
1444
|
+
index_type=index_type,
|
|
1445
|
+
index_params=index_params,
|
|
1446
|
+
if_exists=if_exists,
|
|
1447
|
+
background=background,
|
|
1448
|
+
hidden=hidden,
|
|
1449
|
+
subsample=subsample,
|
|
1450
|
+
memory_type=memory_type,
|
|
1451
|
+
)
|
|
1452
|
+
|
|
1453
|
+
@classmethod
def _create_empty(
    cls,
    name: str,
    *,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    description: str | None = None,
    label_names: list[str] | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] = {},
    if_exists: CreateMode = "error",
    hidden: bool = False,
    memory_type: MemoryType | None = None,
) -> Self:
    """
    Create an empty memoryset in the OrcaCloud

    This creates a memoryset with no initial memories. You can add memories later using
    the `insert` method.

    Params:
        name: Name for the new memoryset (must be unique)
        embedding_model: Embedding model to use for embedding memory values for semantic search.
            If not provided, `PretrainedEmbeddingModel.GTE_BASE` is used.
        description: Optional description for the memoryset, this will be used in agentic flows,
            so make sure it is concise and describes the contents of your memoryset not the
            datasource or the embedding model.
        label_names: List of human-readable names for the labels in the memoryset
        max_seq_length_override: Maximum sequence length of values in the memoryset, if the
            value is longer than this it will be truncated, will default to the model's max
            sequence length if not provided
        prompt: Optional prompt to use when embedding documents/memories for storage
        index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
            values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
        index_params: Parameters for the vector index, defaults to `{}`
        if_exists: What to do if a memoryset with the same name already exists, defaults to
            `"error"`. Other option is `"open"` to open the existing memoryset.
        hidden: Whether the memoryset should be hidden
        memory_type: Type of memoryset to create, defaults to `"LABELED"` if called from
            `LabeledMemoryset` and `"SCORED"` if called from `ScoredMemoryset`.

    Returns:
        Handle to the new memoryset in the OrcaCloud

    Raises:
        ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
            `"open"` and the params do not match those of the existing memoryset, or if
            `embedding_model` is neither a pretrained nor a finetuned embedding model.
    """
    if embedding_model is None:
        embedding_model = PretrainedEmbeddingModel.GTE_BASE

    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=label_names,
        embedding_model=embedding_model,
    )
    if existing is not None:
        return existing

    payload: CreateMemorysetRequest = {
        "name": name,
        "description": description,
        "label_names": label_names,
        "max_seq_length_override": max_seq_length_override,
        "index_type": index_type,
        # Copy so the shared `{}` default (or the caller's dict) is never aliased into
        # the request payload, where downstream code could mutate it across calls.
        "index_params": dict(index_params),
        "hidden": hidden,
    }
    # Optional fields are only sent when explicitly provided.
    if memory_type is not None:
        payload["memory_type"] = memory_type
    if prompt is not None:
        payload["prompt"] = prompt

    # Pretrained models are referenced by name, finetuned models by id.
    if isinstance(embedding_model, PretrainedEmbeddingModel):
        payload["pretrained_embedding_model_name"] = embedding_model.name
    elif isinstance(embedding_model, FinetunedEmbeddingModel):
        payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
    else:
        raise ValueError("Invalid embedding model")

    client = OrcaClient._resolve_client()
    response = client.POST("/memoryset/empty", json=payload)
    return cls.open(response["id"])
|
|
1538
|
+
|
|
1539
|
+
@overload
@classmethod
def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Job[Self]:
    pass

@overload
@classmethod
def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
    pass

@classmethod
def from_hf_dataset(
    cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
) -> Self | Job[Self]:
    """
    Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud

    This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
    appended with `_datasource` and use that as the datasource for the memoryset.

    All features that are not specified to be used as `value_column`, `label_column`,
    `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        hf_dataset: Hugging Face dataset to create the memoryset from
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud
    """
    if_exists = kwargs.get("if_exists", "error")
    # NOTE(review): when a memoryset is found here it is returned directly, even if
    # `background=True` was requested - confirm callers do not expect a job in that case.
    existing = cls._handle_if_exists(
        name,
        if_exists=if_exists,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if existing is not None:
        return existing

    # The datasource is created with the same `if_exists` mode as the memoryset.
    datasource = Datasource.from_hf_dataset(f"{name}_datasource", hf_dataset, if_exists=if_exists)
    kwargs["background"] = background
    return cls.create(name, datasource=datasource, **kwargs)
|
|
1586
|
+
|
|
1587
|
+
@overload
@classmethod
def from_pytorch(
    cls,
    name: str,
    torch_data: TorchDataLoader | TorchDataset,
    *,
    column_names: list[str] | None = None,
    background: Literal[True],
    **kwargs: Any,
) -> Job[Self]: ...

@overload
@classmethod
def from_pytorch(
    cls,
    name: str,
    torch_data: TorchDataLoader | TorchDataset,
    *,
    column_names: list[str] | None = None,
    background: Literal[False] = False,
    **kwargs: Any,
) -> Self: ...

@classmethod
def from_pytorch(
    cls,
    name: str,
    torch_data: TorchDataLoader | TorchDataset,
    *,
    column_names: list[str] | None = None,
    background: bool = False,
    **kwargs: Any,
) -> Self | Job[Self]:
    """
    Create a new memoryset in the OrcaCloud from a PyTorch
    [`DataLoader`][torch.utils.data.DataLoader] or [`Dataset`][torch.utils.data.Dataset]

    A [`Datasource`][orca_sdk.Datasource] named after the memoryset with `_datasource`
    appended is created automatically and used as the datasource for the memoryset.

    Every property that is not selected as `value_column`, `label_column`,
    `source_id_column`, or `partition_id_column` is stored as metadata in the memoryset.

    Params:
        name: Name for the new memoryset (must be unique)
        torch_data: PyTorch data loader or dataset to create the memoryset from
        column_names: Names of the columns; must be provided when the dataset or data
            loader returns unnamed tuples.
        background: Whether to run the operation in the background
        kwargs: Additional parameters for creating the memoryset. See
            [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.

    Returns:
        Handle to the new memoryset in the OrcaCloud
    """
    create_mode = kwargs.get("if_exists", "error")
    already_there = cls._handle_if_exists(
        name,
        if_exists=create_mode,
        label_names=kwargs.get("label_names"),
        embedding_model=kwargs.get("embedding_model"),
    )
    if already_there is not None:
        return already_there

    source = Datasource.from_pytorch(
        f"{name}_datasource",
        torch_data,
        column_names=column_names,
        if_exists=create_mode,
    )
    kwargs.update(background=background)
    return cls.create(name, datasource=source, **kwargs)
|
|
1660
|
+
|
|
1661
|
+
@overload
|
|
1662
|
+
@classmethod
|
|
1663
|
+
def from_list(
|
|
1664
|
+
cls,
|
|
1665
|
+
name: str,
|
|
1666
|
+
data: list[dict],
|
|
1667
|
+
*,
|
|
1668
|
+
background: Literal[True],
|
|
1669
|
+
**kwargs: Any,
|
|
1670
|
+
) -> Job[Self]:
|
|
1276
1671
|
pass
|
|
1277
1672
|
|
|
1278
1673
|
@overload
|
|
@@ -1333,7 +1728,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1333
1728
|
|
|
1334
1729
|
datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
|
|
1335
1730
|
kwargs["background"] = background
|
|
1336
|
-
return cls.create(name, datasource, **kwargs)
|
|
1731
|
+
return cls.create(name, datasource=datasource, **kwargs)
|
|
1337
1732
|
|
|
1338
1733
|
@overload
|
|
1339
1734
|
@classmethod
|
|
@@ -1406,7 +1801,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1406
1801
|
|
|
1407
1802
|
datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
|
|
1408
1803
|
kwargs["background"] = background
|
|
1409
|
-
return cls.create(name, datasource, **kwargs)
|
|
1804
|
+
return cls.create(name, datasource=datasource, **kwargs)
|
|
1410
1805
|
|
|
1411
1806
|
@overload
|
|
1412
1807
|
@classmethod
|
|
@@ -1472,7 +1867,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1472
1867
|
|
|
1473
1868
|
datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
|
|
1474
1869
|
kwargs["background"] = background
|
|
1475
|
-
return cls.create(name, datasource, **kwargs)
|
|
1870
|
+
return cls.create(name, datasource=datasource, **kwargs)
|
|
1476
1871
|
|
|
1477
1872
|
@overload
|
|
1478
1873
|
@classmethod
|
|
@@ -1540,7 +1935,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1540
1935
|
f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
|
|
1541
1936
|
)
|
|
1542
1937
|
kwargs["background"] = background
|
|
1543
|
-
return cls.create(name, datasource, **kwargs)
|
|
1938
|
+
return cls.create(name, datasource=datasource, **kwargs)
|
|
1544
1939
|
|
|
1545
1940
|
@overload
|
|
1546
1941
|
@classmethod
|
|
@@ -1613,7 +2008,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1613
2008
|
|
|
1614
2009
|
datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
|
|
1615
2010
|
kwargs["background"] = background
|
|
1616
|
-
return cls.create(name, datasource, **kwargs)
|
|
2011
|
+
return cls.create(name, datasource=datasource, **kwargs)
|
|
1617
2012
|
|
|
1618
2013
|
@classmethod
|
|
1619
2014
|
def open(cls, name: str) -> Self:
|
|
@@ -1830,6 +2225,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1830
2225
|
|
|
1831
2226
|
client = OrcaClient._resolve_client()
|
|
1832
2227
|
metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
|
|
2228
|
+
|
|
2229
|
+
if metadata["insertion_job_id"] is None:
|
|
2230
|
+
raise ValueError("Create memoryset operation failed to produce an insertion job")
|
|
2231
|
+
|
|
1833
2232
|
job = Job(
|
|
1834
2233
|
metadata["insertion_job_id"],
|
|
1835
2234
|
lambda: self.open(metadata["id"]),
|
|
@@ -2482,9 +2881,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
2482
2881
|
|
|
2483
2882
|
- **`"duplicate"`**: Find potentially duplicate memories in the memoryset
|
|
2484
2883
|
- **`"cluster"`**: Cluster the memories in the memoryset
|
|
2485
|
-
- **`"
|
|
2486
|
-
- **`"distribution"`**: Analyze the embedding distribution to populate
|
|
2884
|
+
- **`"distribution"`**: Analyze the embedding distribution
|
|
2487
2885
|
- **`"projection"`**: Create a 2D projection of the embeddings for visualization
|
|
2886
|
+
- **`"label"`**: Analyze the labels to find potential mislabelings (labeled memorysets only)
|
|
2887
|
+
- **`"class_patterns"`**: Analyze class patterns and find representative memories (labeled memorysets only)
|
|
2888
|
+
- **`"concepts"`**: Discover and name conceptual clusters in the memoryset (labeled memorysets only)
|
|
2488
2889
|
|
|
2489
2890
|
lookup_count: Number of memories to lookup for each memory in the memoryset
|
|
2490
2891
|
clear_metrics: Whether to clear any existing metrics from the memories before running the analysis
|
|
@@ -2590,35 +2991,246 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
|
2590
2991
|
"""
|
|
2591
2992
|
A Handle to a collection of memories with labels in the OrcaCloud
|
|
2592
2993
|
|
|
2593
|
-
Attributes:
|
|
2594
|
-
id: Unique identifier for the memoryset
|
|
2595
|
-
name: Unique name of the memoryset
|
|
2596
|
-
description: Description of the memoryset
|
|
2597
|
-
label_names: Names for the class labels in the memoryset
|
|
2598
|
-
length: Number of memories in the memoryset
|
|
2599
|
-
embedding_model: Embedding model used to embed the memory values for semantic search
|
|
2600
|
-
created_at: When the memoryset was created, automatically generated on create
|
|
2601
|
-
updated_at: When the memoryset was last updated, automatically updated on updates
|
|
2602
|
-
"""
|
|
2994
|
+
Attributes:
|
|
2995
|
+
id: Unique identifier for the memoryset
|
|
2996
|
+
name: Unique name of the memoryset
|
|
2997
|
+
description: Description of the memoryset
|
|
2998
|
+
label_names: Names for the class labels in the memoryset
|
|
2999
|
+
length: Number of memories in the memoryset
|
|
3000
|
+
embedding_model: Embedding model used to embed the memory values for semantic search
|
|
3001
|
+
created_at: When the memoryset was created, automatically generated on create
|
|
3002
|
+
updated_at: When the memoryset was last updated, automatically updated on updates
|
|
3003
|
+
"""
|
|
3004
|
+
|
|
3005
|
+
label_names: list[str]
|
|
3006
|
+
memory_type: MemoryType = "LABELED"
|
|
3007
|
+
|
|
3008
|
+
def __init__(self, metadata: MemorysetMetadata):
    # Base initialisation first, then capture the label names from the metadata.
    super().__init__(metadata)
    names = metadata["label_names"]
    assert names is not None
    self.label_names = names
|
|
3012
|
+
|
|
3013
|
+
def __eq__(self, other) -> bool:
    # Only another LabeledMemoryset can compare equal; equality is decided by id.
    if not isinstance(other, LabeledMemoryset):
        return False
    return self.id == other.id
|
|
3015
|
+
|
|
3016
|
+
@overload
@classmethod
def create(
    cls,
    name: str,
    *,
    datasource: None = None,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    description: str | None = None,
    label_names: list[str],
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] = {},
    if_exists: CreateMode = "error",
    hidden: bool = False,
) -> Self:
    pass

@overload
@classmethod
def create(
    cls,
    name: str,
    *,
    datasource: Datasource,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    value_column: str = "value",
    label_column: str | None = "label",
    source_id_column: str | None = None,
    partition_id_column: str | None = None,
    description: str | None = None,
    label_names: list[str] | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    remove_duplicates: bool = True,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] = {},
    if_exists: CreateMode = "error",
    background: Literal[True],
    hidden: bool = False,
    subsample: int | float | None = None,
) -> Job[Self]:
    pass

@overload
@classmethod
def create(
    cls,
    name: str,
    *,
    datasource: Datasource,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    value_column: str = "value",
    label_column: str | None = "label",
    source_id_column: str | None = None,
    partition_id_column: str | None = None,
    description: str | None = None,
    label_names: list[str] | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    remove_duplicates: bool = True,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] = {},
    if_exists: CreateMode = "error",
    background: Literal[False] = False,
    hidden: bool = False,
    subsample: int | float | None = None,
) -> Self:
    pass

@classmethod
def create(  # type: ignore[override]
    cls,
    name: str,
    *,
    datasource: Datasource | None = None,
    embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
    value_column: str = "value",
    label_column: str | None = "label",
    source_id_column: str | None = None,
    partition_id_column: str | None = None,
    description: str | None = None,
    label_names: list[str] | None = None,
    max_seq_length_override: int | None = None,
    prompt: str | None = None,
    remove_duplicates: bool = True,
    index_type: IndexType = "FLAT",
    index_params: dict[str, Any] = {},
    if_exists: CreateMode = "error",
    background: bool = False,
    hidden: bool = False,
    subsample: int | float | None = None,
) -> Self | Job[Self]:
    """
    Create a new labeled memoryset in the OrcaCloud

    If `datasource` is provided, all columns from the datasource that are not specified in the
    `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
    as metadata in the memoryset.

    If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
    You can add memories later using the `insert` method. In that case `value_column`,
    `label_column`, `source_id_column`, `partition_id_column`, `remove_duplicates`,
    `background`, and `subsample` are not used.

    Params:
        name: Name for the new memoryset (must be unique)
        datasource: Optional source data to populate the memories in the memoryset. If omitted,
            an empty memoryset will be created.
        embedding_model: Embedding model to use for embedding memory values for semantic search.
            If not provided, a default embedding model for the memoryset will be used.
        value_column: Name of the column in the datasource that contains the memory values
        label_column: Name of the column in the datasource that contains the memory labels.
            Must contain categorical values as integers or strings. String labels will be
            converted to integers with the unique strings extracted as `label_names`. To create
            a memoryset with all none labels, set to `None`.
        source_id_column: Optional name of the column in the datasource that contains the ids in
            the system of reference
        partition_id_column: Optional name of the column in the datasource that contains the partition ids
        description: Optional description for the memoryset, this will be used in agentic flows,
            so make sure it is concise and describes the contents of your memoryset not the
            datasource or the embedding model.
        label_names: List of human-readable names for the labels in the memoryset, must match
            the number of labels in the `label_column`. Will be automatically inferred if string
            labels are provided or if a [Dataset][datasets.Dataset] with a
            [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
        max_seq_length_override: Maximum sequence length of values in the memoryset, if the
            value is longer than this it will be truncated, will default to the model's max
            sequence length if not provided
        prompt: Optional prompt to use when embedding documents/memories for storage
        remove_duplicates: Whether to remove duplicates from the datasource before inserting
            into the memoryset
        index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
            values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
        index_params: Parameters for the vector index, defaults to `{}`
        if_exists: What to do if a memoryset with the same name already exists, defaults to
            `"error"`. Other option is `"open"` to open the existing memoryset.
        background: Whether to run the operation non-blocking and return a job handle
        hidden: Whether the memoryset should be hidden
        subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
            datasource to insert. Use to limit the size of the initial memoryset.

    Returns:
        Handle to the new memoryset in the OrcaCloud

    Raises:
        ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
            `"open"` and the params do not match those of the existing memoryset.
    """
    # No datasource: create an empty labeled memoryset; datasource-only options are not forwarded.
    if datasource is None:
        return super().create(
            name,
            datasource=None,
            embedding_model=embedding_model,
            description=description,
            label_names=label_names,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            hidden=hidden,
            memory_type="LABELED",
        )

    # The two calls below differ only in the literal passed for `background`, so that each
    # one matches a distinct overload of the base `create`.
    if background:
        return super().create(
            name,
            datasource=datasource,
            label_column=label_column,
            score_column=None,
            embedding_model=embedding_model,
            value_column=value_column,
            source_id_column=source_id_column,
            partition_id_column=partition_id_column,
            description=description,
            label_names=label_names,
            max_seq_length_override=max_seq_length_override,
            prompt=prompt,
            remove_duplicates=remove_duplicates,
            index_type=index_type,
            index_params=index_params,
            if_exists=if_exists,
            background=True,
            hidden=hidden,
            subsample=subsample,
            memory_type="LABELED",
        )

    return super().create(
        name,
        datasource=datasource,
        label_column=label_column,
        score_column=None,
        embedding_model=embedding_model,
        value_column=value_column,
        source_id_column=source_id_column,
        partition_id_column=partition_id_column,
        description=description,
        label_names=label_names,
        max_seq_length_override=max_seq_length_override,
        prompt=prompt,
        remove_duplicates=remove_duplicates,
        index_type=index_type,
        index_params=index_params,
        if_exists=if_exists,
        background=False,
        hidden=hidden,
        subsample=subsample,
        memory_type="LABELED",
    )
|
|
2614
3226
|
|
|
2615
3227
|
@overload
|
|
2616
3228
|
@classmethod
|
|
2617
|
-
def
|
|
3229
|
+
def from_datasource(
|
|
2618
3230
|
cls,
|
|
2619
3231
|
name: str,
|
|
2620
|
-
datasource: Datasource,
|
|
2621
3232
|
*,
|
|
3233
|
+
datasource: Datasource,
|
|
2622
3234
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2623
3235
|
value_column: str = "value",
|
|
2624
3236
|
label_column: str | None = "label",
|
|
@@ -2640,11 +3252,11 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
|
2640
3252
|
|
|
2641
3253
|
@overload
|
|
2642
3254
|
@classmethod
|
|
2643
|
-
def
|
|
3255
|
+
def from_datasource(
|
|
2644
3256
|
cls,
|
|
2645
3257
|
name: str,
|
|
2646
|
-
datasource: Datasource,
|
|
2647
3258
|
*,
|
|
3259
|
+
datasource: Datasource,
|
|
2648
3260
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2649
3261
|
value_column: str = "value",
|
|
2650
3262
|
label_column: str | None = "label",
|
|
@@ -2665,11 +3277,11 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
|
2665
3277
|
pass
|
|
2666
3278
|
|
|
2667
3279
|
@classmethod
|
|
2668
|
-
def
|
|
3280
|
+
def from_datasource( # type: ignore[override]
|
|
2669
3281
|
cls,
|
|
2670
3282
|
name: str,
|
|
2671
|
-
datasource: Datasource,
|
|
2672
3283
|
*,
|
|
3284
|
+
datasource: Datasource,
|
|
2673
3285
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2674
3286
|
value_column: str = "value",
|
|
2675
3287
|
label_column: str | None = "label",
|
|
@@ -2688,14 +3300,16 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
|
2688
3300
|
subsample: int | float | None = None,
|
|
2689
3301
|
) -> Self | Job[Self]:
|
|
2690
3302
|
"""
|
|
2691
|
-
Create a new labeled memoryset in the OrcaCloud
|
|
3303
|
+
Create a new labeled memoryset in the OrcaCloud from a datasource.
|
|
2692
3304
|
|
|
3305
|
+
This is a convenience method that is equivalent to calling `create` with a datasource.
|
|
2693
3306
|
All columns from the datasource that are not specified in the `value_column`,
|
|
2694
|
-
`label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
|
|
3307
|
+
`label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
|
|
3308
|
+
in the memoryset.
|
|
2695
3309
|
|
|
2696
3310
|
Params:
|
|
2697
3311
|
name: Name for the new memoryset (must be unique)
|
|
2698
|
-
datasource: Source data to populate the memories in the memoryset
|
|
3312
|
+
datasource: Source data to populate the memories in the memoryset.
|
|
2699
3313
|
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
2700
3314
|
If not provided, a default embedding model for the memoryset will be used.
|
|
2701
3315
|
value_column: Name of the column in the datasource that contains the memory values
|
|
@@ -2724,8 +3338,10 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
|
2724
3338
|
index_params: Parameters for the vector index, defaults to `{}`
|
|
2725
3339
|
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
2726
3340
|
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
2727
|
-
background: Whether to run the operation none blocking and return a job handle
|
|
3341
|
+
background: Whether to run the operation non-blocking and return a job handle.
|
|
2728
3342
|
hidden: Whether the memoryset should be hidden
|
|
3343
|
+
subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
|
|
3344
|
+
datasource to insert. Use to limit the size of the initial memoryset.
|
|
2729
3345
|
|
|
2730
3346
|
Returns:
|
|
2731
3347
|
Handle to the new memoryset in the OrcaCloud
|
|
@@ -2734,28 +3350,52 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
|
2734
3350
|
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
2735
3351
|
`"open"` and the params do not match those of the existing memoryset.
|
|
2736
3352
|
"""
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2743
|
-
|
|
2744
|
-
|
|
2745
|
-
|
|
2746
|
-
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
|
|
2753
|
-
|
|
2754
|
-
|
|
2755
|
-
|
|
2756
|
-
|
|
2757
|
-
|
|
2758
|
-
|
|
3353
|
+
if background:
|
|
3354
|
+
return super().create(
|
|
3355
|
+
name,
|
|
3356
|
+
datasource=datasource,
|
|
3357
|
+
label_column=label_column,
|
|
3358
|
+
score_column=None,
|
|
3359
|
+
embedding_model=embedding_model,
|
|
3360
|
+
value_column=value_column,
|
|
3361
|
+
source_id_column=source_id_column,
|
|
3362
|
+
partition_id_column=partition_id_column,
|
|
3363
|
+
description=description,
|
|
3364
|
+
label_names=label_names,
|
|
3365
|
+
max_seq_length_override=max_seq_length_override,
|
|
3366
|
+
prompt=prompt,
|
|
3367
|
+
remove_duplicates=remove_duplicates,
|
|
3368
|
+
index_type=index_type,
|
|
3369
|
+
index_params=index_params,
|
|
3370
|
+
if_exists=if_exists,
|
|
3371
|
+
background=True,
|
|
3372
|
+
hidden=hidden,
|
|
3373
|
+
subsample=subsample,
|
|
3374
|
+
memory_type="LABELED",
|
|
3375
|
+
)
|
|
3376
|
+
else:
|
|
3377
|
+
return super().create(
|
|
3378
|
+
name,
|
|
3379
|
+
datasource=datasource,
|
|
3380
|
+
label_column=label_column,
|
|
3381
|
+
score_column=None,
|
|
3382
|
+
embedding_model=embedding_model,
|
|
3383
|
+
value_column=value_column,
|
|
3384
|
+
source_id_column=source_id_column,
|
|
3385
|
+
partition_id_column=partition_id_column,
|
|
3386
|
+
description=description,
|
|
3387
|
+
label_names=label_names,
|
|
3388
|
+
max_seq_length_override=max_seq_length_override,
|
|
3389
|
+
prompt=prompt,
|
|
3390
|
+
remove_duplicates=remove_duplicates,
|
|
3391
|
+
index_type=index_type,
|
|
3392
|
+
index_params=index_params,
|
|
3393
|
+
if_exists=if_exists,
|
|
3394
|
+
background=False,
|
|
3395
|
+
hidden=hidden,
|
|
3396
|
+
subsample=subsample,
|
|
3397
|
+
memory_type="LABELED",
|
|
3398
|
+
)
|
|
2759
3399
|
|
|
2760
3400
|
def display_label_analysis(self):
|
|
2761
3401
|
"""
|
|
@@ -2793,8 +3433,26 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
|
|
|
2793
3433
|
def create(
|
|
2794
3434
|
cls,
|
|
2795
3435
|
name: str,
|
|
2796
|
-
datasource: Datasource,
|
|
2797
3436
|
*,
|
|
3437
|
+
datasource: None = None,
|
|
3438
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
3439
|
+
description: str | None = None,
|
|
3440
|
+
max_seq_length_override: int | None = None,
|
|
3441
|
+
prompt: str | None = None,
|
|
3442
|
+
index_type: IndexType = "FLAT",
|
|
3443
|
+
index_params: dict[str, Any] = {},
|
|
3444
|
+
if_exists: CreateMode = "error",
|
|
3445
|
+
hidden: bool = False,
|
|
3446
|
+
) -> Self:
|
|
3447
|
+
pass
|
|
3448
|
+
|
|
3449
|
+
@overload
|
|
3450
|
+
@classmethod
|
|
3451
|
+
def create(
|
|
3452
|
+
cls,
|
|
3453
|
+
name: str,
|
|
3454
|
+
*,
|
|
3455
|
+
datasource: Datasource,
|
|
2798
3456
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2799
3457
|
value_column: str = "value",
|
|
2800
3458
|
score_column: str | None = "score",
|
|
@@ -2818,8 +3476,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
|
|
|
2818
3476
|
def create(
|
|
2819
3477
|
cls,
|
|
2820
3478
|
name: str,
|
|
2821
|
-
datasource: Datasource,
|
|
2822
3479
|
*,
|
|
3480
|
+
datasource: Datasource,
|
|
2823
3481
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2824
3482
|
score_column: str | None = "score",
|
|
2825
3483
|
value_column: str = "value",
|
|
@@ -2842,8 +3500,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
|
|
|
2842
3500
|
def create( # type: ignore[override]
|
|
2843
3501
|
cls,
|
|
2844
3502
|
name: str,
|
|
2845
|
-
datasource: Datasource,
|
|
2846
3503
|
*,
|
|
3504
|
+
datasource: Datasource | None = None,
|
|
2847
3505
|
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2848
3506
|
value_column: str = "value",
|
|
2849
3507
|
score_column: str | None = "score",
|
|
@@ -2863,12 +3521,17 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
|
|
|
2863
3521
|
"""
|
|
2864
3522
|
Create a new scored memoryset in the OrcaCloud
|
|
2865
3523
|
|
|
2866
|
-
|
|
2867
|
-
`score_column`, `source_id_column`, or `partition_id_column` will be stored
|
|
3524
|
+
If `datasource` is provided, all columns from the datasource that are not specified in the
|
|
3525
|
+
`value_column`, `score_column`, `source_id_column`, or `partition_id_column` will be stored
|
|
3526
|
+
as metadata in the memoryset.
|
|
3527
|
+
|
|
3528
|
+
If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
|
|
3529
|
+
You can add memories later using the `insert` method.
|
|
2868
3530
|
|
|
2869
3531
|
Params:
|
|
2870
3532
|
name: Name for the new memoryset (must be unique)
|
|
2871
|
-
datasource:
|
|
3533
|
+
datasource: Optional source data to populate the memories in the memoryset. If omitted,
|
|
3534
|
+
an empty memoryset will be created.
|
|
2872
3535
|
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
2873
3536
|
If not provided, a default embedding model for the memoryset will be used.
|
|
2874
3537
|
value_column: Name of the column in the datasource that contains the memory values
|
|
@@ -2901,23 +3564,222 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
|
|
|
2901
3564
|
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
2902
3565
|
`"open"` and the params do not match those of the existing memoryset.
|
|
2903
3566
|
"""
|
|
2904
|
-
|
|
2905
|
-
|
|
2906
|
-
|
|
2907
|
-
|
|
2908
|
-
|
|
2909
|
-
|
|
2910
|
-
|
|
2911
|
-
|
|
2912
|
-
|
|
2913
|
-
|
|
2914
|
-
|
|
2915
|
-
|
|
2916
|
-
|
|
2917
|
-
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
3567
|
+
if datasource is None:
|
|
3568
|
+
return super().create(
|
|
3569
|
+
name,
|
|
3570
|
+
datasource=None,
|
|
3571
|
+
embedding_model=embedding_model,
|
|
3572
|
+
description=description,
|
|
3573
|
+
max_seq_length_override=max_seq_length_override,
|
|
3574
|
+
prompt=prompt,
|
|
3575
|
+
index_type=index_type,
|
|
3576
|
+
index_params=index_params,
|
|
3577
|
+
if_exists=if_exists,
|
|
3578
|
+
hidden=hidden,
|
|
3579
|
+
memory_type="SCORED",
|
|
3580
|
+
)
|
|
3581
|
+
else:
|
|
3582
|
+
# Type narrowing: datasource is definitely Datasource here
|
|
3583
|
+
assert datasource is not None
|
|
3584
|
+
if background:
|
|
3585
|
+
return super().create(
|
|
3586
|
+
name,
|
|
3587
|
+
datasource=datasource,
|
|
3588
|
+
embedding_model=embedding_model,
|
|
3589
|
+
value_column=value_column,
|
|
3590
|
+
score_column=score_column,
|
|
3591
|
+
source_id_column=source_id_column,
|
|
3592
|
+
partition_id_column=partition_id_column,
|
|
3593
|
+
description=description,
|
|
3594
|
+
max_seq_length_override=max_seq_length_override,
|
|
3595
|
+
prompt=prompt,
|
|
3596
|
+
remove_duplicates=remove_duplicates,
|
|
3597
|
+
index_type=index_type,
|
|
3598
|
+
index_params=index_params,
|
|
3599
|
+
if_exists=if_exists,
|
|
3600
|
+
background=True,
|
|
3601
|
+
hidden=hidden,
|
|
3602
|
+
subsample=subsample,
|
|
3603
|
+
memory_type="SCORED",
|
|
3604
|
+
)
|
|
3605
|
+
else:
|
|
3606
|
+
return super().create(
|
|
3607
|
+
name,
|
|
3608
|
+
datasource=datasource,
|
|
3609
|
+
embedding_model=embedding_model,
|
|
3610
|
+
value_column=value_column,
|
|
3611
|
+
score_column=score_column,
|
|
3612
|
+
source_id_column=source_id_column,
|
|
3613
|
+
partition_id_column=partition_id_column,
|
|
3614
|
+
description=description,
|
|
3615
|
+
max_seq_length_override=max_seq_length_override,
|
|
3616
|
+
prompt=prompt,
|
|
3617
|
+
remove_duplicates=remove_duplicates,
|
|
3618
|
+
index_type=index_type,
|
|
3619
|
+
index_params=index_params,
|
|
3620
|
+
if_exists=if_exists,
|
|
3621
|
+
background=False,
|
|
3622
|
+
hidden=hidden,
|
|
3623
|
+
subsample=subsample,
|
|
3624
|
+
memory_type="SCORED",
|
|
3625
|
+
)
|
|
3626
|
+
|
|
3627
|
+
@overload
|
|
3628
|
+
@classmethod
|
|
3629
|
+
def from_datasource(
|
|
3630
|
+
cls,
|
|
3631
|
+
name: str,
|
|
3632
|
+
*,
|
|
3633
|
+
datasource: Datasource,
|
|
3634
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
3635
|
+
value_column: str = "value",
|
|
3636
|
+
score_column: str | None = "score",
|
|
3637
|
+
source_id_column: str | None = None,
|
|
3638
|
+
partition_id_column: str | None = None,
|
|
3639
|
+
description: str | None = None,
|
|
3640
|
+
max_seq_length_override: int | None = None,
|
|
3641
|
+
prompt: str | None = None,
|
|
3642
|
+
remove_duplicates: bool = True,
|
|
3643
|
+
index_type: IndexType = "FLAT",
|
|
3644
|
+
index_params: dict[str, Any] = {},
|
|
3645
|
+
if_exists: CreateMode = "error",
|
|
3646
|
+
background: Literal[True],
|
|
3647
|
+
hidden: bool = False,
|
|
3648
|
+
subsample: int | float | None = None,
|
|
3649
|
+
) -> Job[Self]:
|
|
3650
|
+
pass
|
|
3651
|
+
|
|
3652
|
+
@overload
|
|
3653
|
+
@classmethod
|
|
3654
|
+
def from_datasource(
|
|
3655
|
+
cls,
|
|
3656
|
+
name: str,
|
|
3657
|
+
*,
|
|
3658
|
+
datasource: Datasource,
|
|
3659
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
3660
|
+
score_column: str | None = "score",
|
|
3661
|
+
value_column: str = "value",
|
|
3662
|
+
source_id_column: str | None = None,
|
|
3663
|
+
partition_id_column: str | None = None,
|
|
3664
|
+
description: str | None = None,
|
|
3665
|
+
max_seq_length_override: int | None = None,
|
|
3666
|
+
prompt: str | None = None,
|
|
3667
|
+
remove_duplicates: bool = True,
|
|
3668
|
+
index_type: IndexType = "FLAT",
|
|
3669
|
+
index_params: dict[str, Any] = {},
|
|
3670
|
+
if_exists: CreateMode = "error",
|
|
3671
|
+
background: Literal[False] = False,
|
|
3672
|
+
hidden: bool = False,
|
|
3673
|
+
subsample: int | float | None = None,
|
|
3674
|
+
) -> Self:
|
|
3675
|
+
pass
|
|
3676
|
+
|
|
3677
|
+
@classmethod
|
|
3678
|
+
def from_datasource( # type: ignore[override]
|
|
3679
|
+
cls,
|
|
3680
|
+
name: str,
|
|
3681
|
+
*,
|
|
3682
|
+
datasource: Datasource,
|
|
3683
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
3684
|
+
value_column: str = "value",
|
|
3685
|
+
score_column: str | None = "score",
|
|
3686
|
+
source_id_column: str | None = None,
|
|
3687
|
+
partition_id_column: str | None = None,
|
|
3688
|
+
description: str | None = None,
|
|
3689
|
+
max_seq_length_override: int | None = None,
|
|
3690
|
+
prompt: str | None = None,
|
|
3691
|
+
remove_duplicates: bool = True,
|
|
3692
|
+
index_type: IndexType = "FLAT",
|
|
3693
|
+
index_params: dict[str, Any] = {},
|
|
3694
|
+
if_exists: CreateMode = "error",
|
|
3695
|
+
background: bool = False,
|
|
3696
|
+
hidden: bool = False,
|
|
3697
|
+
subsample: int | float | None = None,
|
|
3698
|
+
) -> Self | Job[Self]:
|
|
3699
|
+
"""
|
|
3700
|
+
Create a new scored memoryset in the OrcaCloud from a datasource.
|
|
3701
|
+
|
|
3702
|
+
This is a convenience method that is equivalent to calling `create` with a datasource.
|
|
3703
|
+
All columns from the datasource that are not specified in the `value_column`,
|
|
3704
|
+
`score_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
|
|
3705
|
+
in the memoryset.
|
|
3706
|
+
|
|
3707
|
+
Params:
|
|
3708
|
+
name: Name for the new memoryset (must be unique)
|
|
3709
|
+
datasource: Source data to populate the memories in the memoryset.
|
|
3710
|
+
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
3711
|
+
If not provided, a default embedding model for the memoryset will be used.
|
|
3712
|
+
value_column: Name of the column in the datasource that contains the memory values
|
|
3713
|
+
score_column: Name of the column in the datasource that contains the memory scores. Must
|
|
3714
|
+
contain numerical values. To create a memoryset in which every score is `None`, set this to `None`.
|
|
3715
|
+
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
3716
|
+
the system of reference
|
|
3717
|
+
partition_id_column: Optional name of the column in the datasource that contains the partition ids
|
|
3718
|
+
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
3719
|
+
so make sure it is concise and describes the contents of your memoryset not the
|
|
3720
|
+
datasource or the embedding model.
|
|
3721
|
+
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
3722
|
+
value is longer than this it will be truncated. Defaults to the model's max
|
|
3723
|
+
sequence length if not provided
|
|
3724
|
+
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
3725
|
+
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
3726
|
+
into the memoryset
|
|
3727
|
+
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
3728
|
+
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
3729
|
+
index_params: Parameters for the vector index, defaults to `{}`
|
|
3730
|
+
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
3731
|
+
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
3732
|
+
background: Whether to run the operation in a non-blocking manner and return a job handle.
|
|
3733
|
+
hidden: Whether the memoryset should be hidden
|
|
3734
|
+
subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
|
|
3735
|
+
datasource to insert. Use to limit the size of the initial memoryset.
|
|
3736
|
+
|
|
3737
|
+
Returns:
|
|
3738
|
+
Handle to the new memoryset in the OrcaCloud
|
|
3739
|
+
|
|
3740
|
+
Raises:
|
|
3741
|
+
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
3742
|
+
`"open"` and the params do not match those of the existing memoryset.
|
|
3743
|
+
"""
|
|
3744
|
+
if background:
|
|
3745
|
+
return super().create(
|
|
3746
|
+
name,
|
|
3747
|
+
datasource=datasource,
|
|
3748
|
+
embedding_model=embedding_model,
|
|
3749
|
+
value_column=value_column,
|
|
3750
|
+
score_column=score_column,
|
|
3751
|
+
source_id_column=source_id_column,
|
|
3752
|
+
partition_id_column=partition_id_column,
|
|
3753
|
+
description=description,
|
|
3754
|
+
max_seq_length_override=max_seq_length_override,
|
|
3755
|
+
prompt=prompt,
|
|
3756
|
+
remove_duplicates=remove_duplicates,
|
|
3757
|
+
index_type=index_type,
|
|
3758
|
+
index_params=index_params,
|
|
3759
|
+
if_exists=if_exists,
|
|
3760
|
+
background=True,
|
|
3761
|
+
hidden=hidden,
|
|
3762
|
+
subsample=subsample,
|
|
3763
|
+
memory_type="SCORED",
|
|
3764
|
+
)
|
|
3765
|
+
else:
|
|
3766
|
+
return super().create(
|
|
3767
|
+
name,
|
|
3768
|
+
datasource=datasource,
|
|
3769
|
+
embedding_model=embedding_model,
|
|
3770
|
+
value_column=value_column,
|
|
3771
|
+
score_column=score_column,
|
|
3772
|
+
source_id_column=source_id_column,
|
|
3773
|
+
partition_id_column=partition_id_column,
|
|
3774
|
+
description=description,
|
|
3775
|
+
max_seq_length_override=max_seq_length_override,
|
|
3776
|
+
prompt=prompt,
|
|
3777
|
+
remove_duplicates=remove_duplicates,
|
|
3778
|
+
index_type=index_type,
|
|
3779
|
+
index_params=index_params,
|
|
3780
|
+
if_exists=if_exists,
|
|
3781
|
+
background=False,
|
|
3782
|
+
hidden=hidden,
|
|
3783
|
+
subsample=subsample,
|
|
3784
|
+
memory_type="SCORED",
|
|
3785
|
+
)
|