orca-sdk 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
orca_sdk/memoryset.py CHANGED
@@ -27,13 +27,12 @@ from .async_client import OrcaAsyncClient
27
27
  from .client import (
28
28
  CascadingEditSuggestion,
29
29
  CloneMemorysetRequest,
30
+ CreateMemorysetFromDatasourceRequest,
30
31
  CreateMemorysetRequest,
31
32
  FilterItem,
32
33
  )
33
34
  from .client import LabeledMemory as LabeledMemoryResponse
34
- from .client import (
35
- LabeledMemoryInsert,
36
- )
35
+ from .client import LabeledMemoryInsert
37
36
  from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
38
37
  from .client import (
39
38
  LabeledMemoryUpdate,
@@ -50,9 +49,7 @@ from .client import (
50
49
  PredictionFeedback,
51
50
  )
52
51
  from .client import ScoredMemory as ScoredMemoryResponse
53
- from .client import (
54
- ScoredMemoryInsert,
55
- )
52
+ from .client import ScoredMemoryInsert
56
53
  from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
57
54
  from .client import (
58
55
  ScoredMemoryUpdate,
@@ -937,7 +934,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
937
934
  length: int
938
935
  created_at: datetime
939
936
  updated_at: datetime
940
- insertion_status: Status
937
+ insertion_status: Status | None
941
938
  embedding_model: EmbeddingModelBase
942
939
  index_type: IndexType
943
940
  index_params: dict[str, Any]
@@ -959,7 +956,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
959
956
  self.length = metadata["length"]
960
957
  self.created_at = datetime.fromisoformat(metadata["created_at"])
961
958
  self.updated_at = datetime.fromisoformat(metadata["updated_at"])
962
- self.insertion_status = Status(metadata["insertion_status"])
959
+ self.insertion_status = (
960
+ Status(metadata["insertion_status"]) if metadata["insertion_status"] is not None else None
961
+ )
963
962
  self._last_refresh = datetime.now()
964
963
  self.index_type = metadata["index_type"]
965
964
  self.index_params = metadata["index_params"]
@@ -971,7 +970,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
971
970
 
972
971
  def __repr__(self) -> str:
973
972
  return (
974
- "Memoryset({\n"
973
+ f"{self.memory_type.capitalize()}Memoryset(" + "{\n"
975
974
  f" name: '{self.name}',\n"
976
975
  f" length: {self.length},\n"
977
976
  f" embedding_model: {self.embedding_model},\n"
@@ -1022,11 +1021,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1022
1021
  return existing
1023
1022
 
1024
1023
  @classmethod
1025
- def create(
1024
+ def _create_from_datasource(
1026
1025
  cls,
1027
1026
  name: str,
1028
- datasource: Datasource,
1029
1027
  *,
1028
+ datasource: Datasource,
1030
1029
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1031
1030
  value_column: str = "value",
1032
1031
  label_column: str | None = None,
@@ -1047,54 +1046,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1047
1046
  memory_type: MemoryType | None = None,
1048
1047
  ) -> Self | Job[Self]:
1049
1048
  """
1050
- Create a new memoryset in the OrcaCloud
1051
-
1052
- All columns from the datasource that are not specified in the `value_column`,
1053
- `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1054
-
1055
- Params:
1056
- name: Name for the new memoryset (must be unique)
1057
- datasource: Source data to populate the memories in the memoryset
1058
- embedding_model: Embedding model to use for embedding memory values for semantic search.
1059
- If not provided, a default embedding model for the memoryset will be used.
1060
- value_column: Name of the column in the datasource that contains the memory values
1061
- label_column: Name of the column in the datasource that contains the memory labels.
1062
- Must contain categorical values as integers or strings. String labels will be
1063
- converted to integers with the unique strings extracted as `label_names`
1064
- score_column: Name of the column in the datasource that contains the memory scores
1065
- source_id_column: Optional name of the column in the datasource that contains the ids in
1066
- the system of reference
1067
- partition_id_column: Optional name of the column in the datasource that contains the partition ids
1068
- description: Optional description for the memoryset, this will be used in agentic flows,
1069
- so make sure it is concise and describes the contents of your memoryset not the
1070
- datasource or the embedding model.
1071
- label_names: List of human-readable names for the labels in the memoryset, must match
1072
- the number of labels in the `label_column`. Will be automatically inferred if string
1073
- labels are provided or if a [Dataset][datasets.Dataset] with a
1074
- [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
1075
- max_seq_length_override: Maximum sequence length of values in the memoryset, if the
1076
- value is longer than this it will be truncated, will default to the model's max
1077
- sequence length if not provided
1078
- prompt: Optional prompt to use when embedding documents/memories for storage
1079
- remove_duplicates: Whether to remove duplicates from the datasource before inserting
1080
- into the memoryset
1081
- index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
1082
- values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
1083
- index_params: Parameters for the vector index, defaults to `{}`
1084
- if_exists: What to do if a memoryset with the same name already exists, defaults to
1085
- `"error"`. Other option is `"open"` to open the existing memoryset.
1086
- background: Whether to run the operation none blocking and return a job handle
1087
- hidden: Whether the memoryset should be hidden
1088
- subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
1089
- datasource to insert. Use to limit the size of the initial memoryset.
1090
- memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
1091
- and `"SCORED"` if `score_column` is provided, must be specified for other cases.
1092
- Returns:
1093
- Handle to the new memoryset in the OrcaCloud
1049
+ Create a memoryset from a datasource by calling the API.
1094
1050
 
1095
- Raises:
1096
- ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
1097
- `"open"` and the params do not match those of the existing memoryset.
1051
+ This is a private method that performs the actual API call to create a memoryset from a datasource.
1098
1052
  """
1099
1053
  if embedding_model is None:
1100
1054
  embedding_model = PretrainedEmbeddingModel.GTE_BASE
@@ -1108,7 +1062,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1108
1062
  if existing is not None:
1109
1063
  return existing
1110
1064
 
1111
- payload: CreateMemorysetRequest = {
1065
+ payload: CreateMemorysetFromDatasourceRequest = {
1112
1066
  "name": name,
1113
1067
  "description": description,
1114
1068
  "datasource_name_or_id": datasource.id,
@@ -1138,141 +1092,582 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1138
1092
  raise ValueError("Invalid embedding model")
1139
1093
  client = OrcaClient._resolve_client()
1140
1094
  response = client.POST("/memoryset", json=payload)
1095
+
1096
+ if response["insertion_job_id"] is None:
1097
+ raise ValueError("Create memoryset operation failed to produce an insertion job")
1098
+
1141
1099
  job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
1142
1100
  return job if background else job.result()
1143
1101
 
1144
1102
  @overload
1145
1103
  @classmethod
1146
- def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Self:
1147
- pass
1148
-
1149
- @overload
1150
- @classmethod
1151
- def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
1104
+ def create(
1105
+ cls,
1106
+ name: str,
1107
+ *,
1108
+ datasource: None = None,
1109
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1110
+ description: str | None = None,
1111
+ label_names: list[str] | None = None,
1112
+ max_seq_length_override: int | None = None,
1113
+ prompt: str | None = None,
1114
+ index_type: IndexType = "FLAT",
1115
+ index_params: dict[str, Any] = {},
1116
+ if_exists: CreateMode = "error",
1117
+ hidden: bool = False,
1118
+ memory_type: MemoryType | None = None,
1119
+ ) -> Self:
1152
1120
  pass
1153
1121
 
1154
- @classmethod
1155
- def from_hf_dataset(
1156
- cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
1157
- ) -> Self | Job[Self]:
1158
- """
1159
- Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
1160
-
1161
- This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1162
- appended with `_datasource` and use that as the datasource for the memoryset.
1163
-
1164
- All features that are not specified to be used as `value_column`, `label_column`, or
1165
- `source_id_column` will be stored as metadata in the memoryset.
1166
-
1167
- Params:
1168
- name: Name for the new memoryset (must be unique)
1169
- hf_dataset: Hugging Face dataset to create the memoryset from
1170
- kwargs: Additional parameters for creating the memoryset. See
1171
- [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1172
-
1173
- Returns:
1174
- Handle to the new memoryset in the OrcaCloud
1175
- """
1176
- if_exists = kwargs.get("if_exists", "error")
1177
- existing = cls._handle_if_exists(
1178
- name,
1179
- if_exists=if_exists,
1180
- label_names=kwargs.get("label_names"),
1181
- embedding_model=kwargs.get("embedding_model"),
1182
- )
1183
- if existing is not None:
1184
- return existing
1185
-
1186
- datasource = Datasource.from_hf_dataset(
1187
- f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
1188
- )
1189
- kwargs["background"] = background
1190
- return cls.create(name, datasource, **kwargs)
1191
-
1192
1122
  @overload
1193
1123
  @classmethod
1194
- def from_pytorch(
1124
+ def create(
1195
1125
  cls,
1196
1126
  name: str,
1197
- torch_data: TorchDataLoader | TorchDataset,
1198
1127
  *,
1199
- column_names: list[str] | None = None,
1128
+ datasource: Datasource,
1129
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1130
+ value_column: str = "value",
1131
+ label_column: str | None = None,
1132
+ score_column: str | None = None,
1133
+ source_id_column: str | None = None,
1134
+ partition_id_column: str | None = None,
1135
+ description: str | None = None,
1136
+ label_names: list[str] | None = None,
1137
+ max_seq_length_override: int | None = None,
1138
+ prompt: str | None = None,
1139
+ remove_duplicates: bool = True,
1140
+ index_type: IndexType = "FLAT",
1141
+ index_params: dict[str, Any] = {},
1142
+ if_exists: CreateMode = "error",
1200
1143
  background: Literal[True],
1201
- **kwargs: Any,
1144
+ hidden: bool = False,
1145
+ subsample: int | float | None = None,
1146
+ memory_type: MemoryType | None = None,
1202
1147
  ) -> Job[Self]:
1203
1148
  pass
1204
1149
 
1205
1150
  @overload
1206
1151
  @classmethod
1207
- def from_pytorch(
1152
+ def create(
1208
1153
  cls,
1209
1154
  name: str,
1210
- torch_data: TorchDataLoader | TorchDataset,
1211
1155
  *,
1212
- column_names: list[str] | None = None,
1156
+ datasource: Datasource,
1157
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1158
+ value_column: str = "value",
1159
+ label_column: str | None = None,
1160
+ score_column: str | None = None,
1161
+ source_id_column: str | None = None,
1162
+ partition_id_column: str | None = None,
1163
+ description: str | None = None,
1164
+ label_names: list[str] | None = None,
1165
+ max_seq_length_override: int | None = None,
1166
+ prompt: str | None = None,
1167
+ remove_duplicates: bool = True,
1168
+ index_type: IndexType = "FLAT",
1169
+ index_params: dict[str, Any] = {},
1170
+ if_exists: CreateMode = "error",
1213
1171
  background: Literal[False] = False,
1214
- **kwargs: Any,
1172
+ hidden: bool = False,
1173
+ subsample: int | float | None = None,
1174
+ memory_type: MemoryType | None = None,
1215
1175
  ) -> Self:
1216
1176
  pass
1217
1177
 
1218
1178
  @classmethod
1219
- def from_pytorch(
1179
+ def create(
1220
1180
  cls,
1221
1181
  name: str,
1222
- torch_data: TorchDataLoader | TorchDataset,
1223
1182
  *,
1224
- column_names: list[str] | None = None,
1183
+ datasource: Datasource | None = None,
1184
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1185
+ value_column: str = "value",
1186
+ label_column: str | None = None,
1187
+ score_column: str | None = None,
1188
+ source_id_column: str | None = None,
1189
+ partition_id_column: str | None = None,
1190
+ description: str | None = None,
1191
+ label_names: list[str] | None = None,
1192
+ max_seq_length_override: int | None = None,
1193
+ prompt: str | None = None,
1194
+ remove_duplicates: bool = True,
1195
+ index_type: IndexType = "FLAT",
1196
+ index_params: dict[str, Any] = {},
1197
+ if_exists: CreateMode = "error",
1225
1198
  background: bool = False,
1226
- **kwargs: Any,
1199
+ hidden: bool = False,
1200
+ subsample: int | float | None = None,
1201
+ memory_type: MemoryType | None = None,
1227
1202
  ) -> Self | Job[Self]:
1228
1203
  """
1229
- Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
1230
- [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
1204
+ Create a new memoryset in the OrcaCloud
1231
1205
 
1232
- This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1233
- appended with `_datasource` and use that as the datasource for the memoryset.
1206
+ If `datasource` is provided, all columns from the datasource that are not specified in the
1207
+ `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
1208
+ as metadata in the memoryset.
1234
1209
 
1235
- All properties that are not specified to be used as `value_column`, `label_column`, or
1236
- `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1210
+ If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
1211
+ You can add memories later using the `insert` method.
1237
1212
 
1238
1213
  Params:
1239
1214
  name: Name for the new memoryset (must be unique)
1240
- torch_data: PyTorch data loader or dataset to create the memoryset from
1241
- column_names: If the provided dataset or data loader returns unnamed tuples, this
1242
- argument must be provided to specify the names of the columns.
1243
- background: Whether to run the operation in the background
1244
- kwargs: Additional parameters for creating the memoryset. See
1245
- [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1246
-
1215
+ datasource: Optional source data to populate the memories in the memoryset. If omitted,
1216
+ an empty memoryset will be created.
1217
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
1218
+ If not provided, a default embedding model for the memoryset will be used.
1219
+ value_column: Name of the column in the datasource that contains the memory values
1220
+ label_column: Name of the column in the datasource that contains the memory labels.
1221
+ Must contain categorical values as integers or strings. String labels will be
1222
+ converted to integers with the unique strings extracted as `label_names`
1223
+ score_column: Name of the column in the datasource that contains the memory scores
1224
+ source_id_column: Optional name of the column in the datasource that contains the ids in
1225
+ the system of reference
1226
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
1227
+ description: Optional description for the memoryset, this will be used in agentic flows,
1228
+ so make sure it is concise and describes the contents of your memoryset not the
1229
+ datasource or the embedding model.
1230
+ label_names: List of human-readable names for the labels in the memoryset, must match
1231
+ the number of labels in the `label_column`. Will be automatically inferred if string
1232
+ labels are provided or if a [Dataset][datasets.Dataset] with a
1233
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
1234
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
1235
+ value is longer than this it will be truncated, will default to the model's max
1236
+ sequence length if not provided
1237
+ prompt: Optional prompt to use when embedding documents/memories for storage
1238
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
1239
+ into the memoryset
1240
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
1241
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
1242
+ index_params: Parameters for the vector index, defaults to `{}`
1243
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
1244
+ `"error"`. Other option is `"open"` to open the existing memoryset.
1245
+ background: Whether to run the operation non-blocking and return a job handle.
1246
+ Note: This parameter is ignored when creating an empty memoryset (when datasource is None).
1247
+ hidden: Whether the memoryset should be hidden
1248
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
1249
+ datasource to insert. Use to limit the size of the initial memoryset.
1250
+ memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
1251
+ and `"SCORED"` if `score_column` is provided, must be specified for other cases.
1247
1252
  Returns:
1248
1253
  Handle to the new memoryset in the OrcaCloud
1249
- """
1250
- if_exists = kwargs.get("if_exists", "error")
1251
- existing = cls._handle_if_exists(
1252
- name,
1253
- if_exists=if_exists,
1254
- label_names=kwargs.get("label_names"),
1255
- embedding_model=kwargs.get("embedding_model"),
1256
- )
1257
- if existing is not None:
1258
- return existing
1259
1254
 
1260
- datasource = Datasource.from_pytorch(
1261
- f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
1262
- )
1263
- kwargs["background"] = background
1264
- return cls.create(name, datasource, **kwargs)
1255
+ Raises:
1256
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
1257
+ `"open"` and the params do not match those of the existing memoryset.
1258
+ """
1259
+ if datasource is None:
1260
+ return cls._create_empty(
1261
+ name,
1262
+ embedding_model=embedding_model,
1263
+ description=description,
1264
+ label_names=label_names,
1265
+ max_seq_length_override=max_seq_length_override,
1266
+ prompt=prompt,
1267
+ index_type=index_type,
1268
+ index_params=index_params,
1269
+ if_exists=if_exists,
1270
+ hidden=hidden,
1271
+ memory_type=memory_type,
1272
+ )
1273
+ else:
1274
+ return cls._create_from_datasource(
1275
+ name,
1276
+ datasource=datasource,
1277
+ embedding_model=embedding_model,
1278
+ value_column=value_column,
1279
+ label_column=label_column,
1280
+ score_column=score_column,
1281
+ source_id_column=source_id_column,
1282
+ partition_id_column=partition_id_column,
1283
+ description=description,
1284
+ label_names=label_names,
1285
+ max_seq_length_override=max_seq_length_override,
1286
+ prompt=prompt,
1287
+ remove_duplicates=remove_duplicates,
1288
+ index_type=index_type,
1289
+ index_params=index_params,
1290
+ if_exists=if_exists,
1291
+ background=background,
1292
+ hidden=hidden,
1293
+ subsample=subsample,
1294
+ memory_type=memory_type,
1295
+ )
1265
1296
 
1266
1297
  @overload
1267
1298
  @classmethod
1268
- def from_list(
1299
+ def from_datasource(
1269
1300
  cls,
1270
1301
  name: str,
1271
- data: list[dict],
1272
1302
  *,
1273
- background: Literal[True],
1274
- **kwargs: Any,
1275
- ) -> Job[Self]:
1303
+ datasource: Datasource,
1304
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1305
+ value_column: str = "value",
1306
+ label_column: str | None = None,
1307
+ score_column: str | None = None,
1308
+ source_id_column: str | None = None,
1309
+ partition_id_column: str | None = None,
1310
+ description: str | None = None,
1311
+ label_names: list[str] | None = None,
1312
+ max_seq_length_override: int | None = None,
1313
+ prompt: str | None = None,
1314
+ remove_duplicates: bool = True,
1315
+ index_type: IndexType = "FLAT",
1316
+ index_params: dict[str, Any] = {},
1317
+ if_exists: CreateMode = "error",
1318
+ background: Literal[True],
1319
+ hidden: bool = False,
1320
+ subsample: int | float | None = None,
1321
+ memory_type: MemoryType | None = None,
1322
+ ) -> Job[Self]:
1323
+ pass
1324
+
1325
+ @overload
1326
+ @classmethod
1327
+ def from_datasource(
1328
+ cls,
1329
+ name: str,
1330
+ *,
1331
+ datasource: Datasource,
1332
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1333
+ value_column: str = "value",
1334
+ label_column: str | None = None,
1335
+ score_column: str | None = None,
1336
+ source_id_column: str | None = None,
1337
+ partition_id_column: str | None = None,
1338
+ description: str | None = None,
1339
+ label_names: list[str] | None = None,
1340
+ max_seq_length_override: int | None = None,
1341
+ prompt: str | None = None,
1342
+ remove_duplicates: bool = True,
1343
+ index_type: IndexType = "FLAT",
1344
+ index_params: dict[str, Any] = {},
1345
+ if_exists: CreateMode = "error",
1346
+ background: Literal[False] = False,
1347
+ hidden: bool = False,
1348
+ subsample: int | float | None = None,
1349
+ memory_type: MemoryType | None = None,
1350
+ ) -> Self:
1351
+ pass
1352
+
1353
+ @classmethod
1354
+ def from_datasource(
1355
+ cls,
1356
+ name: str,
1357
+ *,
1358
+ datasource: Datasource,
1359
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1360
+ value_column: str = "value",
1361
+ label_column: str | None = None,
1362
+ score_column: str | None = None,
1363
+ source_id_column: str | None = None,
1364
+ partition_id_column: str | None = None,
1365
+ description: str | None = None,
1366
+ label_names: list[str] | None = None,
1367
+ max_seq_length_override: int | None = None,
1368
+ prompt: str | None = None,
1369
+ remove_duplicates: bool = True,
1370
+ index_type: IndexType = "FLAT",
1371
+ index_params: dict[str, Any] = {},
1372
+ if_exists: CreateMode = "error",
1373
+ background: bool = False,
1374
+ hidden: bool = False,
1375
+ subsample: int | float | None = None,
1376
+ memory_type: MemoryType | None = None,
1377
+ ) -> Self | Job[Self]:
1378
+ """
1379
+ Create a new memoryset in the OrcaCloud from a datasource.
1380
+
1381
+ This is a convenience method that is equivalent to calling `create` with a datasource.
1382
+ All columns from the datasource that are not specified in the `value_column`,
1383
+ `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
1384
+ in the memoryset.
1385
+
1386
+ Params:
1387
+ name: Name for the new memoryset (must be unique)
1388
+ datasource: Source data to populate the memories in the memoryset.
1389
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
1390
+ If not provided, a default embedding model for the memoryset will be used.
1391
+ value_column: Name of the column in the datasource that contains the memory values
1392
+ label_column: Name of the column in the datasource that contains the memory labels.
1393
+ Must contain categorical values as integers or strings. String labels will be
1394
+ converted to integers with the unique strings extracted as `label_names`
1395
+ score_column: Name of the column in the datasource that contains the memory scores
1396
+ source_id_column: Optional name of the column in the datasource that contains the ids in
1397
+ the system of reference
1398
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
1399
+ description: Optional description for the memoryset, this will be used in agentic flows,
1400
+ so make sure it is concise and describes the contents of your memoryset not the
1401
+ datasource or the embedding model.
1402
+ label_names: List of human-readable names for the labels in the memoryset, must match
1403
+ the number of labels in the `label_column`. Will be automatically inferred if string
1404
+ labels are provided or if a [Dataset][datasets.Dataset] with a
1405
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
1406
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
1407
+ value is longer than this it will be truncated, will default to the model's max
1408
+ sequence length if not provided
1409
+ prompt: Optional prompt to use when embedding documents/memories for storage
1410
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
1411
+ into the memoryset
1412
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
1413
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
1414
+ index_params: Parameters for the vector index, defaults to `{}`
1415
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
1416
+ `"error"`. Other option is `"open"` to open the existing memoryset.
1417
+ background: Whether to run the operation non-blocking and return a job handle.
1418
+ hidden: Whether the memoryset should be hidden
1419
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
1420
+ datasource to insert. Use to limit the size of the initial memoryset.
1421
+ memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
1422
+ and `"SCORED"` if `score_column` is provided, must be specified for other cases.
1423
+ Returns:
1424
+ Handle to the new memoryset in the OrcaCloud
1425
+
1426
+ Raises:
1427
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
1428
+ `"open"` and the params do not match those of the existing memoryset.
1429
+ """
1430
+ return cls._create_from_datasource(
1431
+ name,
1432
+ datasource=datasource,
1433
+ embedding_model=embedding_model,
1434
+ value_column=value_column,
1435
+ label_column=label_column,
1436
+ score_column=score_column,
1437
+ source_id_column=source_id_column,
1438
+ partition_id_column=partition_id_column,
1439
+ description=description,
1440
+ label_names=label_names,
1441
+ max_seq_length_override=max_seq_length_override,
1442
+ prompt=prompt,
1443
+ remove_duplicates=remove_duplicates,
1444
+ index_type=index_type,
1445
+ index_params=index_params,
1446
+ if_exists=if_exists,
1447
+ background=background,
1448
+ hidden=hidden,
1449
+ subsample=subsample,
1450
+ memory_type=memory_type,
1451
+ )
1452
+
1453
+ @classmethod
1454
+ def _create_empty(
1455
+ cls,
1456
+ name: str,
1457
+ *,
1458
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
1459
+ description: str | None = None,
1460
+ label_names: list[str] | None = None,
1461
+ max_seq_length_override: int | None = None,
1462
+ prompt: str | None = None,
1463
+ index_type: IndexType = "FLAT",
1464
+ index_params: dict[str, Any] = {},
1465
+ if_exists: CreateMode = "error",
1466
+ hidden: bool = False,
1467
+ memory_type: MemoryType | None = None,
1468
+ ) -> Self:
1469
+ """
1470
+ Create an empty memoryset in the OrcaCloud
1471
+
1472
+ This creates a memoryset with no initial memories. You can add memories later using
1473
+ the `insert` method.
1474
+
1475
+ Params:
1476
+ name: Name for the new memoryset (must be unique)
1477
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
1478
+ If not provided, a default embedding model for the memoryset will be used.
1479
+ description: Optional description for the memoryset, this will be used in agentic flows,
1480
+ so make sure it is concise and describes the contents of your memoryset not the
1481
+ datasource or the embedding model.
1482
+ label_names: List of human-readable names for the labels in the memoryset
1483
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
1484
+ value is longer than this it will be truncated, will default to the model's max
1485
+ sequence length if not provided
1486
+ prompt: Optional prompt to use when embedding documents/memories for storage
1487
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
1488
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
1489
+ index_params: Parameters for the vector index, defaults to `{}`
1490
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
1491
+ `"error"`. Other option is `"open"` to open the existing memoryset.
1492
+ hidden: Whether the memoryset should be hidden
1493
+ memory_type: Type of memoryset to create, defaults to `"LABELED"` if called from
1494
+ `LabeledMemoryset` and `"SCORED"` if called from `ScoredMemoryset`.
1495
+
1496
+ Returns:
1497
+ Handle to the new memoryset in the OrcaCloud
1498
+
1499
+ Raises:
1500
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
1501
+ `"open"` and the params do not match those of the existing memoryset.
1502
+ """
1503
+ if embedding_model is None:
1504
+ embedding_model = PretrainedEmbeddingModel.GTE_BASE
1505
+
1506
+ existing = cls._handle_if_exists(
1507
+ name,
1508
+ if_exists=if_exists,
1509
+ label_names=label_names,
1510
+ embedding_model=embedding_model,
1511
+ )
1512
+ if existing is not None:
1513
+ return existing
1514
+
1515
+ payload: CreateMemorysetRequest = {
1516
+ "name": name,
1517
+ "description": description,
1518
+ "label_names": label_names,
1519
+ "max_seq_length_override": max_seq_length_override,
1520
+ "index_type": index_type,
1521
+ "index_params": index_params,
1522
+ "hidden": hidden,
1523
+ }
1524
+ if memory_type is not None:
1525
+ payload["memory_type"] = memory_type
1526
+ if prompt is not None:
1527
+ payload["prompt"] = prompt
1528
+ if isinstance(embedding_model, PretrainedEmbeddingModel):
1529
+ payload["pretrained_embedding_model_name"] = embedding_model.name
1530
+ elif isinstance(embedding_model, FinetunedEmbeddingModel):
1531
+ payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
1532
+ else:
1533
+ raise ValueError("Invalid embedding model")
1534
+
1535
+ client = OrcaClient._resolve_client()
1536
+ response = client.POST("/memoryset/empty", json=payload)
1537
+ return cls.open(response["id"])
1538
+
1539
+ @overload
1540
+ @classmethod
1541
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Self:
1542
+ pass
1543
+
1544
+ @overload
1545
+ @classmethod
1546
+ def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
1547
+ pass
1548
+
1549
+ @classmethod
1550
+ def from_hf_dataset(
1551
+ cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
1552
+ ) -> Self | Job[Self]:
1553
+ """
1554
+ Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
1555
+
1556
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1557
+ appended with `_datasource` and use that as the datasource for the memoryset.
1558
+
1559
+ All features that are not specified to be used as `value_column`, `label_column`, or
1560
+ `source_id_column` will be stored as metadata in the memoryset.
1561
+
1562
+ Params:
1563
+ name: Name for the new memoryset (must be unique)
1564
+ hf_dataset: Hugging Face dataset to create the memoryset from
1565
+ kwargs: Additional parameters for creating the memoryset. See
1566
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1567
+
1568
+ Returns:
1569
+ Handle to the new memoryset in the OrcaCloud
1570
+ """
1571
+ if_exists = kwargs.get("if_exists", "error")
1572
+ existing = cls._handle_if_exists(
1573
+ name,
1574
+ if_exists=if_exists,
1575
+ label_names=kwargs.get("label_names"),
1576
+ embedding_model=kwargs.get("embedding_model"),
1577
+ )
1578
+ if existing is not None:
1579
+ return existing
1580
+
1581
+ datasource = Datasource.from_hf_dataset(
1582
+ f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
1583
+ )
1584
+ kwargs["background"] = background
1585
+ return cls.create(name, datasource=datasource, **kwargs)
1586
+
1587
+ @overload
1588
+ @classmethod
1589
+ def from_pytorch(
1590
+ cls,
1591
+ name: str,
1592
+ torch_data: TorchDataLoader | TorchDataset,
1593
+ *,
1594
+ column_names: list[str] | None = None,
1595
+ background: Literal[True],
1596
+ **kwargs: Any,
1597
+ ) -> Job[Self]:
1598
+ pass
1599
+
1600
+ @overload
1601
+ @classmethod
1602
+ def from_pytorch(
1603
+ cls,
1604
+ name: str,
1605
+ torch_data: TorchDataLoader | TorchDataset,
1606
+ *,
1607
+ column_names: list[str] | None = None,
1608
+ background: Literal[False] = False,
1609
+ **kwargs: Any,
1610
+ ) -> Self:
1611
+ pass
1612
+
1613
+ @classmethod
1614
+ def from_pytorch(
1615
+ cls,
1616
+ name: str,
1617
+ torch_data: TorchDataLoader | TorchDataset,
1618
+ *,
1619
+ column_names: list[str] | None = None,
1620
+ background: bool = False,
1621
+ **kwargs: Any,
1622
+ ) -> Self | Job[Self]:
1623
+ """
1624
+ Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
1625
+ [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
1626
+
1627
+ This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
1628
+ appended with `_datasource` and use that as the datasource for the memoryset.
1629
+
1630
+ All properties that are not specified to be used as `value_column`, `label_column`,
1631
+ `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
1632
+
1633
+ Params:
1634
+ name: Name for the new memoryset (must be unique)
1635
+ torch_data: PyTorch data loader or dataset to create the memoryset from
1636
+ column_names: If the provided dataset or data loader returns unnamed tuples, this
1637
+ argument must be provided to specify the names of the columns.
1638
+ background: Whether to run the operation in the background
1639
+ kwargs: Additional parameters for creating the memoryset. See
1640
+ [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
1641
+
1642
+ Returns:
1643
+ Handle to the new memoryset in the OrcaCloud
1644
+ """
1645
+ if_exists = kwargs.get("if_exists", "error")
1646
+ existing = cls._handle_if_exists(
1647
+ name,
1648
+ if_exists=if_exists,
1649
+ label_names=kwargs.get("label_names"),
1650
+ embedding_model=kwargs.get("embedding_model"),
1651
+ )
1652
+ if existing is not None:
1653
+ return existing
1654
+
1655
+ datasource = Datasource.from_pytorch(
1656
+ f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
1657
+ )
1658
+ kwargs["background"] = background
1659
+ return cls.create(name, datasource=datasource, **kwargs)
1660
+
1661
+ @overload
1662
+ @classmethod
1663
+ def from_list(
1664
+ cls,
1665
+ name: str,
1666
+ data: list[dict],
1667
+ *,
1668
+ background: Literal[True],
1669
+ **kwargs: Any,
1670
+ ) -> Job[Self]:
1276
1671
  pass
1277
1672
 
1278
1673
  @overload
@@ -1333,7 +1728,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1333
1728
 
1334
1729
  datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
1335
1730
  kwargs["background"] = background
1336
- return cls.create(name, datasource, **kwargs)
1731
+ return cls.create(name, datasource=datasource, **kwargs)
1337
1732
 
1338
1733
  @overload
1339
1734
  @classmethod
@@ -1406,7 +1801,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1406
1801
 
1407
1802
  datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
1408
1803
  kwargs["background"] = background
1409
- return cls.create(name, datasource, **kwargs)
1804
+ return cls.create(name, datasource=datasource, **kwargs)
1410
1805
 
1411
1806
  @overload
1412
1807
  @classmethod
@@ -1472,7 +1867,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1472
1867
 
1473
1868
  datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
1474
1869
  kwargs["background"] = background
1475
- return cls.create(name, datasource, **kwargs)
1870
+ return cls.create(name, datasource=datasource, **kwargs)
1476
1871
 
1477
1872
  @overload
1478
1873
  @classmethod
@@ -1540,7 +1935,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1540
1935
  f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
1541
1936
  )
1542
1937
  kwargs["background"] = background
1543
- return cls.create(name, datasource, **kwargs)
1938
+ return cls.create(name, datasource=datasource, **kwargs)
1544
1939
 
1545
1940
  @overload
1546
1941
  @classmethod
@@ -1613,7 +2008,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1613
2008
 
1614
2009
  datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
1615
2010
  kwargs["background"] = background
1616
- return cls.create(name, datasource, **kwargs)
2011
+ return cls.create(name, datasource=datasource, **kwargs)
1617
2012
 
1618
2013
  @classmethod
1619
2014
  def open(cls, name: str) -> Self:
@@ -1830,6 +2225,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1830
2225
 
1831
2226
  client = OrcaClient._resolve_client()
1832
2227
  metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
2228
+
2229
+ if metadata["insertion_job_id"] is None:
2230
+ raise ValueError("Create memoryset operation failed to produce an insertion job")
2231
+
1833
2232
  job = Job(
1834
2233
  metadata["insertion_job_id"],
1835
2234
  lambda: self.open(metadata["id"]),
@@ -2482,9 +2881,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
2482
2881
 
2483
2882
  - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
2484
2883
  - **`"cluster"`**: Cluster the memories in the memoryset
2485
- - **`"label"`**: Analyze the labels to find potential mislabelings
2486
- - **`"distribution"`**: Analyze the embedding distribution to populate
2884
+ - **`"distribution"`**: Analyze the embedding distribution
2487
2885
  - **`"projection"`**: Create a 2D projection of the embeddings for visualization
2886
+ - **`"label"`**: Analyze the labels to find potential mislabelings (labeled memorysets only)
2887
+ - **`"class_patterns"`**: Analyze class patterns and find representative memories (labeled memorysets only)
2888
+ - **`"concepts"`**: Discover and name conceptual clusters in the memoryset (labeled memorysets only)
2488
2889
 
2489
2890
  lookup_count: Number of memories to lookup for each memory in the memoryset
2490
2891
  clear_metrics: Whether to clear any existing metrics from the memories before running the analysis
@@ -2590,35 +2991,246 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2590
2991
  """
2591
2992
  A Handle to a collection of memories with labels in the OrcaCloud
2592
2993
 
2593
- Attributes:
2594
- id: Unique identifier for the memoryset
2595
- name: Unique name of the memoryset
2596
- description: Description of the memoryset
2597
- label_names: Names for the class labels in the memoryset
2598
- length: Number of memories in the memoryset
2599
- embedding_model: Embedding model used to embed the memory values for semantic search
2600
- created_at: When the memoryset was created, automatically generated on create
2601
- updated_at: When the memoryset was last updated, automatically updated on updates
2602
- """
2994
+ Attributes:
2995
+ id: Unique identifier for the memoryset
2996
+ name: Unique name of the memoryset
2997
+ description: Description of the memoryset
2998
+ label_names: Names for the class labels in the memoryset
2999
+ length: Number of memories in the memoryset
3000
+ embedding_model: Embedding model used to embed the memory values for semantic search
3001
+ created_at: When the memoryset was created, automatically generated on create
3002
+ updated_at: When the memoryset was last updated, automatically updated on updates
3003
+ """
3004
+
3005
+ label_names: list[str]
3006
+ memory_type: MemoryType = "LABELED"
3007
+
3008
+ def __init__(self, metadata: MemorysetMetadata):
3009
+ super().__init__(metadata)
3010
+ assert metadata["label_names"] is not None
3011
+ self.label_names = metadata["label_names"]
3012
+
3013
+ def __eq__(self, other) -> bool:
3014
+ return isinstance(other, LabeledMemoryset) and self.id == other.id
3015
+
3016
+ @overload
3017
+ @classmethod
3018
+ def create(
3019
+ cls,
3020
+ name: str,
3021
+ *,
3022
+ datasource: None = None,
3023
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3024
+ description: str | None = None,
3025
+ label_names: list[str],
3026
+ max_seq_length_override: int | None = None,
3027
+ prompt: str | None = None,
3028
+ index_type: IndexType = "FLAT",
3029
+ index_params: dict[str, Any] = {},
3030
+ if_exists: CreateMode = "error",
3031
+ hidden: bool = False,
3032
+ ) -> Self:
3033
+ pass
3034
+
3035
+ @overload
3036
+ @classmethod
3037
+ def create(
3038
+ cls,
3039
+ name: str,
3040
+ *,
3041
+ datasource: Datasource,
3042
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3043
+ value_column: str = "value",
3044
+ label_column: str | None = "label",
3045
+ source_id_column: str | None = None,
3046
+ partition_id_column: str | None = None,
3047
+ description: str | None = None,
3048
+ label_names: list[str] | None = None,
3049
+ max_seq_length_override: int | None = None,
3050
+ prompt: str | None = None,
3051
+ remove_duplicates: bool = True,
3052
+ index_type: IndexType = "FLAT",
3053
+ index_params: dict[str, Any] = {},
3054
+ if_exists: CreateMode = "error",
3055
+ background: Literal[True],
3056
+ hidden: bool = False,
3057
+ subsample: int | float | None = None,
3058
+ ) -> Job[Self]:
3059
+ pass
3060
+
3061
+ @overload
3062
+ @classmethod
3063
+ def create(
3064
+ cls,
3065
+ name: str,
3066
+ *,
3067
+ datasource: Datasource,
3068
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3069
+ value_column: str = "value",
3070
+ label_column: str | None = "label",
3071
+ source_id_column: str | None = None,
3072
+ partition_id_column: str | None = None,
3073
+ description: str | None = None,
3074
+ label_names: list[str] | None = None,
3075
+ max_seq_length_override: int | None = None,
3076
+ prompt: str | None = None,
3077
+ remove_duplicates: bool = True,
3078
+ index_type: IndexType = "FLAT",
3079
+ index_params: dict[str, Any] = {},
3080
+ if_exists: CreateMode = "error",
3081
+ background: Literal[False] = False,
3082
+ hidden: bool = False,
3083
+ subsample: int | float | None = None,
3084
+ ) -> Self:
3085
+ pass
3086
+
3087
+ @classmethod
3088
+ def create( # type: ignore[override]
3089
+ cls,
3090
+ name: str,
3091
+ *,
3092
+ datasource: Datasource | None = None,
3093
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3094
+ value_column: str = "value",
3095
+ label_column: str | None = "label",
3096
+ source_id_column: str | None = None,
3097
+ partition_id_column: str | None = None,
3098
+ description: str | None = None,
3099
+ label_names: list[str] | None = None,
3100
+ max_seq_length_override: int | None = None,
3101
+ prompt: str | None = None,
3102
+ remove_duplicates: bool = True,
3103
+ index_type: IndexType = "FLAT",
3104
+ index_params: dict[str, Any] = {},
3105
+ if_exists: CreateMode = "error",
3106
+ background: bool = False,
3107
+ hidden: bool = False,
3108
+ subsample: int | float | None = None,
3109
+ ) -> Self | Job[Self]:
3110
+ """
3111
+ Create a new labeled memoryset in the OrcaCloud
3112
+
3113
+ If `datasource` is provided, all columns from the datasource that are not specified in the
3114
+ `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
3115
+ as metadata in the memoryset.
2603
3116
 
2604
- label_names: list[str]
2605
- memory_type: MemoryType = "LABELED"
3117
+ If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
3118
+ You can add memories later using the `insert` method.
2606
3119
 
2607
- def __init__(self, metadata: MemorysetMetadata):
2608
- super().__init__(metadata)
2609
- assert metadata["label_names"] is not None
2610
- self.label_names = metadata["label_names"]
3120
+ Params:
3121
+ name: Name for the new memoryset (must be unique)
3122
+ datasource: Optional source data to populate the memories in the memoryset. If omitted,
3123
+ an empty memoryset will be created.
3124
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
3125
+ If not provided, a default embedding model for the memoryset will be used.
3126
+ value_column: Name of the column in the datasource that contains the memory values
3127
+ label_column: Name of the column in the datasource that contains the memory labels.
3128
+ Must contain categorical values as integers or strings. String labels will be
3129
+ converted to integers with the unique strings extracted as `label_names`. To create
3130
+ a memoryset in which every label is `None`, set this to `None`.
3131
+ source_id_column: Optional name of the column in the datasource that contains the ids in
3132
+ the system of reference
3133
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
3134
+ description: Optional description for the memoryset, this will be used in agentic flows,
3135
+ so make sure it is concise and describes the contents of your memoryset not the
3136
+ datasource or the embedding model.
3137
+ label_names: List of human-readable names for the labels in the memoryset, must match
3138
+ the number of labels in the `label_column`. Will be automatically inferred if string
3139
+ labels are provided or if a [`Dataset`][datasets.Dataset] with a
3140
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
3141
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
3142
+ value is longer than this it will be truncated, will default to the model's max
3143
+ sequence length if not provided
3144
+ prompt: Optional prompt to use when embedding documents/memories for storage
3145
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
3146
+ into the memoryset
3147
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
3148
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
3149
+ index_params: Parameters for the vector index, defaults to `{}`
3150
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
3151
+ `"error"`. Other option is `"open"` to open the existing memoryset.
3152
+ background: Whether to run the operation non-blocking and return a job handle
3153
+ hidden: Whether the memoryset should be hidden
2611
3154
 
2612
- def __eq__(self, other) -> bool:
2613
- return isinstance(other, LabeledMemoryset) and self.id == other.id
3155
+ Returns:
3156
+ Handle to the new memoryset in the OrcaCloud
3157
+
3158
+ Raises:
3159
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
3160
+ `"open"` and the params do not match those of the existing memoryset.
3161
+ """
3162
+ if datasource is None:
3163
+ return super().create(
3164
+ name,
3165
+ datasource=None,
3166
+ embedding_model=embedding_model,
3167
+ description=description,
3168
+ label_names=label_names,
3169
+ max_seq_length_override=max_seq_length_override,
3170
+ prompt=prompt,
3171
+ index_type=index_type,
3172
+ index_params=index_params,
3173
+ if_exists=if_exists,
3174
+ hidden=hidden,
3175
+ memory_type="LABELED",
3176
+ )
3177
+ else:
3178
+ # Type narrowing: datasource is definitely Datasource here
3179
+ assert datasource is not None
3180
+ if background:
3181
+ return super().create(
3182
+ name,
3183
+ datasource=datasource,
3184
+ label_column=label_column,
3185
+ score_column=None,
3186
+ embedding_model=embedding_model,
3187
+ value_column=value_column,
3188
+ source_id_column=source_id_column,
3189
+ partition_id_column=partition_id_column,
3190
+ description=description,
3191
+ label_names=label_names,
3192
+ max_seq_length_override=max_seq_length_override,
3193
+ prompt=prompt,
3194
+ remove_duplicates=remove_duplicates,
3195
+ index_type=index_type,
3196
+ index_params=index_params,
3197
+ if_exists=if_exists,
3198
+ background=True,
3199
+ hidden=hidden,
3200
+ subsample=subsample,
3201
+ memory_type="LABELED",
3202
+ )
3203
+ else:
3204
+ return super().create(
3205
+ name,
3206
+ datasource=datasource,
3207
+ label_column=label_column,
3208
+ score_column=None,
3209
+ embedding_model=embedding_model,
3210
+ value_column=value_column,
3211
+ source_id_column=source_id_column,
3212
+ partition_id_column=partition_id_column,
3213
+ description=description,
3214
+ label_names=label_names,
3215
+ max_seq_length_override=max_seq_length_override,
3216
+ prompt=prompt,
3217
+ remove_duplicates=remove_duplicates,
3218
+ index_type=index_type,
3219
+ index_params=index_params,
3220
+ if_exists=if_exists,
3221
+ background=False,
3222
+ hidden=hidden,
3223
+ subsample=subsample,
3224
+ memory_type="LABELED",
3225
+ )
2614
3226
 
2615
3227
  @overload
2616
3228
  @classmethod
2617
- def create(
3229
+ def from_datasource(
2618
3230
  cls,
2619
3231
  name: str,
2620
- datasource: Datasource,
2621
3232
  *,
3233
+ datasource: Datasource,
2622
3234
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2623
3235
  value_column: str = "value",
2624
3236
  label_column: str | None = "label",
@@ -2640,11 +3252,11 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2640
3252
 
2641
3253
  @overload
2642
3254
  @classmethod
2643
- def create(
3255
+ def from_datasource(
2644
3256
  cls,
2645
3257
  name: str,
2646
- datasource: Datasource,
2647
3258
  *,
3259
+ datasource: Datasource,
2648
3260
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2649
3261
  value_column: str = "value",
2650
3262
  label_column: str | None = "label",
@@ -2665,11 +3277,11 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2665
3277
  pass
2666
3278
 
2667
3279
  @classmethod
2668
- def create( # type: ignore[override]
3280
+ def from_datasource( # type: ignore[override]
2669
3281
  cls,
2670
3282
  name: str,
2671
- datasource: Datasource,
2672
3283
  *,
3284
+ datasource: Datasource,
2673
3285
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2674
3286
  value_column: str = "value",
2675
3287
  label_column: str | None = "label",
@@ -2688,14 +3300,16 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2688
3300
  subsample: int | float | None = None,
2689
3301
  ) -> Self | Job[Self]:
2690
3302
  """
2691
- Create a new labeled memoryset in the OrcaCloud
3303
+ Create a new labeled memoryset in the OrcaCloud from a datasource.
2692
3304
 
3305
+ This is a convenience method that is equivalent to calling `create` with a datasource.
2693
3306
  All columns from the datasource that are not specified in the `value_column`,
2694
- `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
3307
+ `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
3308
+ in the memoryset.
2695
3309
 
2696
3310
  Params:
2697
3311
  name: Name for the new memoryset (must be unique)
2698
- datasource: Source data to populate the memories in the memoryset
3312
+ datasource: Source data to populate the memories in the memoryset.
2699
3313
  embedding_model: Embedding model to use for embedding memory values for semantic search.
2700
3314
  If not provided, a default embedding model for the memoryset will be used.
2701
3315
  value_column: Name of the column in the datasource that contains the memory values
@@ -2724,8 +3338,10 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2724
3338
  index_params: Parameters for the vector index, defaults to `{}`
2725
3339
  if_exists: What to do if a memoryset with the same name already exists, defaults to
2726
3340
  `"error"`. Other option is `"open"` to open the existing memoryset.
2727
- background: Whether to run the operation none blocking and return a job handle
3341
+ background: Whether to run the operation non-blocking and return a job handle.
2728
3342
  hidden: Whether the memoryset should be hidden
3343
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
3344
+ datasource to insert. Use to limit the size of the initial memoryset.
2729
3345
 
2730
3346
  Returns:
2731
3347
  Handle to the new memoryset in the OrcaCloud
@@ -2734,28 +3350,52 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2734
3350
  ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
2735
3351
  `"open"` and the params do not match those of the existing memoryset.
2736
3352
  """
2737
- return super().create(
2738
- name,
2739
- datasource,
2740
- label_column=label_column,
2741
- score_column=None,
2742
- embedding_model=embedding_model,
2743
- value_column=value_column,
2744
- source_id_column=source_id_column,
2745
- partition_id_column=partition_id_column,
2746
- description=description,
2747
- label_names=label_names,
2748
- max_seq_length_override=max_seq_length_override,
2749
- prompt=prompt,
2750
- remove_duplicates=remove_duplicates,
2751
- index_type=index_type,
2752
- index_params=index_params,
2753
- if_exists=if_exists,
2754
- background=background,
2755
- hidden=hidden,
2756
- subsample=subsample,
2757
- memory_type="LABELED",
2758
- )
3353
+ if background:
3354
+ return super().create(
3355
+ name,
3356
+ datasource=datasource,
3357
+ label_column=label_column,
3358
+ score_column=None,
3359
+ embedding_model=embedding_model,
3360
+ value_column=value_column,
3361
+ source_id_column=source_id_column,
3362
+ partition_id_column=partition_id_column,
3363
+ description=description,
3364
+ label_names=label_names,
3365
+ max_seq_length_override=max_seq_length_override,
3366
+ prompt=prompt,
3367
+ remove_duplicates=remove_duplicates,
3368
+ index_type=index_type,
3369
+ index_params=index_params,
3370
+ if_exists=if_exists,
3371
+ background=True,
3372
+ hidden=hidden,
3373
+ subsample=subsample,
3374
+ memory_type="LABELED",
3375
+ )
3376
+ else:
3377
+ return super().create(
3378
+ name,
3379
+ datasource=datasource,
3380
+ label_column=label_column,
3381
+ score_column=None,
3382
+ embedding_model=embedding_model,
3383
+ value_column=value_column,
3384
+ source_id_column=source_id_column,
3385
+ partition_id_column=partition_id_column,
3386
+ description=description,
3387
+ label_names=label_names,
3388
+ max_seq_length_override=max_seq_length_override,
3389
+ prompt=prompt,
3390
+ remove_duplicates=remove_duplicates,
3391
+ index_type=index_type,
3392
+ index_params=index_params,
3393
+ if_exists=if_exists,
3394
+ background=False,
3395
+ hidden=hidden,
3396
+ subsample=subsample,
3397
+ memory_type="LABELED",
3398
+ )
2759
3399
 
2760
3400
  def display_label_analysis(self):
2761
3401
  """
@@ -2793,8 +3433,26 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2793
3433
  def create(
2794
3434
  cls,
2795
3435
  name: str,
2796
- datasource: Datasource,
2797
3436
  *,
3437
+ datasource: None = None,
3438
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3439
+ description: str | None = None,
3440
+ max_seq_length_override: int | None = None,
3441
+ prompt: str | None = None,
3442
+ index_type: IndexType = "FLAT",
3443
+ index_params: dict[str, Any] = {},
3444
+ if_exists: CreateMode = "error",
3445
+ hidden: bool = False,
3446
+ ) -> Self:
3447
+ pass
3448
+
3449
+ @overload
3450
+ @classmethod
3451
+ def create(
3452
+ cls,
3453
+ name: str,
3454
+ *,
3455
+ datasource: Datasource,
2798
3456
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2799
3457
  value_column: str = "value",
2800
3458
  score_column: str | None = "score",
@@ -2818,8 +3476,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2818
3476
  def create(
2819
3477
  cls,
2820
3478
  name: str,
2821
- datasource: Datasource,
2822
3479
  *,
3480
+ datasource: Datasource,
2823
3481
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2824
3482
  score_column: str | None = "score",
2825
3483
  value_column: str = "value",
@@ -2842,8 +3500,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2842
3500
  def create( # type: ignore[override]
2843
3501
  cls,
2844
3502
  name: str,
2845
- datasource: Datasource,
2846
3503
  *,
3504
+ datasource: Datasource | None = None,
2847
3505
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2848
3506
  value_column: str = "value",
2849
3507
  score_column: str | None = "score",
@@ -2863,12 +3521,17 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2863
3521
  """
2864
3522
  Create a new scored memoryset in the OrcaCloud
2865
3523
 
2866
- All columns from the datasource that are not specified in the `value_column`,
2867
- `score_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
3524
+ If `datasource` is provided, all columns from the datasource that are not specified in the
3525
+ `value_column`, `score_column`, `source_id_column`, or `partition_id_column` will be stored
3526
+ as metadata in the memoryset.
3527
+
3528
+ If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
3529
+ You can add memories later using the `insert` method.
2868
3530
 
2869
3531
  Params:
2870
3532
  name: Name for the new memoryset (must be unique)
2871
- datasource: Source data to populate the memories in the memoryset
3533
+ datasource: Optional source data to populate the memories in the memoryset. If omitted,
3534
+ an empty memoryset will be created.
2872
3535
  embedding_model: Embedding model to use for embedding memory values for semantic search.
2873
3536
  If not provided, a default embedding model for the memoryset will be used.
2874
3537
  value_column: Name of the column in the datasource that contains the memory values
@@ -2901,23 +3564,222 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2901
3564
  ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
2902
3565
  `"open"` and the params do not match those of the existing memoryset.
2903
3566
  """
2904
- return super().create(
2905
- name,
2906
- datasource,
2907
- embedding_model=embedding_model,
2908
- value_column=value_column,
2909
- score_column=score_column,
2910
- source_id_column=source_id_column,
2911
- partition_id_column=partition_id_column,
2912
- description=description,
2913
- max_seq_length_override=max_seq_length_override,
2914
- prompt=prompt,
2915
- remove_duplicates=remove_duplicates,
2916
- index_type=index_type,
2917
- index_params=index_params,
2918
- if_exists=if_exists,
2919
- background=background,
2920
- hidden=hidden,
2921
- subsample=subsample,
2922
- memory_type="SCORED",
2923
- )
3567
+ if datasource is None:
3568
+ return super().create(
3569
+ name,
3570
+ datasource=None,
3571
+ embedding_model=embedding_model,
3572
+ description=description,
3573
+ max_seq_length_override=max_seq_length_override,
3574
+ prompt=prompt,
3575
+ index_type=index_type,
3576
+ index_params=index_params,
3577
+ if_exists=if_exists,
3578
+ hidden=hidden,
3579
+ memory_type="SCORED",
3580
+ )
3581
+ else:
3582
+ # Type narrowing: datasource is definitely Datasource here
3583
+ assert datasource is not None
3584
+ if background:
3585
+ return super().create(
3586
+ name,
3587
+ datasource=datasource,
3588
+ embedding_model=embedding_model,
3589
+ value_column=value_column,
3590
+ score_column=score_column,
3591
+ source_id_column=source_id_column,
3592
+ partition_id_column=partition_id_column,
3593
+ description=description,
3594
+ max_seq_length_override=max_seq_length_override,
3595
+ prompt=prompt,
3596
+ remove_duplicates=remove_duplicates,
3597
+ index_type=index_type,
3598
+ index_params=index_params,
3599
+ if_exists=if_exists,
3600
+ background=True,
3601
+ hidden=hidden,
3602
+ subsample=subsample,
3603
+ memory_type="SCORED",
3604
+ )
3605
+ else:
3606
+ return super().create(
3607
+ name,
3608
+ datasource=datasource,
3609
+ embedding_model=embedding_model,
3610
+ value_column=value_column,
3611
+ score_column=score_column,
3612
+ source_id_column=source_id_column,
3613
+ partition_id_column=partition_id_column,
3614
+ description=description,
3615
+ max_seq_length_override=max_seq_length_override,
3616
+ prompt=prompt,
3617
+ remove_duplicates=remove_duplicates,
3618
+ index_type=index_type,
3619
+ index_params=index_params,
3620
+ if_exists=if_exists,
3621
+ background=False,
3622
+ hidden=hidden,
3623
+ subsample=subsample,
3624
+ memory_type="SCORED",
3625
+ )
3626
+
3627
+ @overload
3628
+ @classmethod
3629
+ def from_datasource(
3630
+ cls,
3631
+ name: str,
3632
+ *,
3633
+ datasource: Datasource,
3634
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3635
+ value_column: str = "value",
3636
+ score_column: str | None = "score",
3637
+ source_id_column: str | None = None,
3638
+ partition_id_column: str | None = None,
3639
+ description: str | None = None,
3640
+ max_seq_length_override: int | None = None,
3641
+ prompt: str | None = None,
3642
+ remove_duplicates: bool = True,
3643
+ index_type: IndexType = "FLAT",
3644
+ index_params: dict[str, Any] = {},
3645
+ if_exists: CreateMode = "error",
3646
+ background: Literal[True],
3647
+ hidden: bool = False,
3648
+ subsample: int | float | None = None,
3649
+ ) -> Job[Self]:
3650
+ pass
3651
+
3652
+ @overload
3653
+ @classmethod
3654
+ def from_datasource(
3655
+ cls,
3656
+ name: str,
3657
+ *,
3658
+ datasource: Datasource,
3659
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3660
+ score_column: str | None = "score",
3661
+ value_column: str = "value",
3662
+ source_id_column: str | None = None,
3663
+ partition_id_column: str | None = None,
3664
+ description: str | None = None,
3665
+ max_seq_length_override: int | None = None,
3666
+ prompt: str | None = None,
3667
+ remove_duplicates: bool = True,
3668
+ index_type: IndexType = "FLAT",
3669
+ index_params: dict[str, Any] = {},
3670
+ if_exists: CreateMode = "error",
3671
+ background: Literal[False] = False,
3672
+ hidden: bool = False,
3673
+ subsample: int | float | None = None,
3674
+ ) -> Self:
3675
+ pass
3676
+
3677
+ @classmethod
3678
+ def from_datasource( # type: ignore[override]
3679
+ cls,
3680
+ name: str,
3681
+ *,
3682
+ datasource: Datasource,
3683
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
3684
+ value_column: str = "value",
3685
+ score_column: str | None = "score",
3686
+ source_id_column: str | None = None,
3687
+ partition_id_column: str | None = None,
3688
+ description: str | None = None,
3689
+ max_seq_length_override: int | None = None,
3690
+ prompt: str | None = None,
3691
+ remove_duplicates: bool = True,
3692
+ index_type: IndexType = "FLAT",
3693
+ index_params: dict[str, Any] = {},
3694
+ if_exists: CreateMode = "error",
3695
+ background: bool = False,
3696
+ hidden: bool = False,
3697
+ subsample: int | float | None = None,
3698
+ ) -> Self | Job[Self]:
3699
+ """
3700
+ Create a new scored memoryset in the OrcaCloud from a datasource.
3701
+
3702
+ This is a convenience method that is equivalent to calling `create` with a datasource.
3703
+ All columns from the datasource that are not specified in the `value_column`,
3704
+ `score_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
3705
+ in the memoryset.
3706
+
3707
+ Params:
3708
+ name: Name for the new memoryset (must be unique)
3709
+ datasource: Source data to populate the memories in the memoryset.
3710
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
3711
+ If not provided, a default embedding model for the memoryset will be used.
3712
+ value_column: Name of the column in the datasource that contains the memory values
3713
+ score_column: Name of the column in the datasource that contains the memory scores. Must
3714
+ contain numerical values. To create a memoryset in which every score is `None`, set this to `None`.
3715
+ source_id_column: Optional name of the column in the datasource that contains the ids in
3716
+ the system of reference
3717
+ partition_id_column: Optional name of the column in the datasource that contains the partition ids
3718
+ description: Optional description for the memoryset, this will be used in agentic flows,
3719
+ so make sure it is concise and describes the contents of your memoryset not the
3720
+ datasource or the embedding model.
3721
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
3722
+ value is longer than this it will be truncated, will default to the model's max
3723
+ sequence length if not provided
3724
+ prompt: Optional prompt to use when embedding documents/memories for storage
3725
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
3726
+ into the memoryset
3727
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
3728
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
3729
+ index_params: Parameters for the vector index, defaults to `{}`
3730
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
3731
+ `"error"`. Other option is `"open"` to open the existing memoryset.
3732
+ background: Whether to run the operation in a non-blocking manner and return a job handle.
3733
+ hidden: Whether the memoryset should be hidden
3734
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
3735
+ datasource to insert. Use to limit the size of the initial memoryset.
3736
+
3737
+ Returns:
3738
+ Handle to the new memoryset in the OrcaCloud
3739
+
3740
+ Raises:
3741
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
3742
+ `"open"` and the params do not match those of the existing memoryset.
3743
+ """
3744
+ if background:
3745
+ return super().create(
3746
+ name,
3747
+ datasource=datasource,
3748
+ embedding_model=embedding_model,
3749
+ value_column=value_column,
3750
+ score_column=score_column,
3751
+ source_id_column=source_id_column,
3752
+ partition_id_column=partition_id_column,
3753
+ description=description,
3754
+ max_seq_length_override=max_seq_length_override,
3755
+ prompt=prompt,
3756
+ remove_duplicates=remove_duplicates,
3757
+ index_type=index_type,
3758
+ index_params=index_params,
3759
+ if_exists=if_exists,
3760
+ background=True,
3761
+ hidden=hidden,
3762
+ subsample=subsample,
3763
+ memory_type="SCORED",
3764
+ )
3765
+ else:
3766
+ return super().create(
3767
+ name,
3768
+ datasource=datasource,
3769
+ embedding_model=embedding_model,
3770
+ value_column=value_column,
3771
+ score_column=score_column,
3772
+ source_id_column=source_id_column,
3773
+ partition_id_column=partition_id_column,
3774
+ description=description,
3775
+ max_seq_length_override=max_seq_length_override,
3776
+ prompt=prompt,
3777
+ remove_duplicates=remove_duplicates,
3778
+ index_type=index_type,
3779
+ index_params=index_params,
3780
+ if_exists=if_exists,
3781
+ background=False,
3782
+ hidden=hidden,
3783
+ subsample=subsample,
3784
+ memory_type="SCORED",
3785
+ )