orca-sdk 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +1 -1
- orca_sdk/_utils/auth.py +12 -8
- orca_sdk/async_client.py +3795 -0
- orca_sdk/classification_model.py +176 -14
- orca_sdk/classification_model_test.py +96 -28
- orca_sdk/client.py +515 -475
- orca_sdk/conftest.py +37 -36
- orca_sdk/credentials.py +54 -14
- orca_sdk/credentials_test.py +92 -28
- orca_sdk/datasource.py +19 -10
- orca_sdk/datasource_test.py +24 -18
- orca_sdk/embedding_model.py +22 -13
- orca_sdk/embedding_model_test.py +27 -20
- orca_sdk/job.py +14 -8
- orca_sdk/memoryset.py +513 -183
- orca_sdk/memoryset_test.py +130 -32
- orca_sdk/regression_model.py +21 -11
- orca_sdk/regression_model_test.py +35 -26
- orca_sdk/telemetry.py +24 -13
- {orca_sdk-0.1.2.dist-info → orca_sdk-0.1.3.dist-info}/METADATA +1 -1
- orca_sdk-0.1.3.dist-info/RECORD +41 -0
- orca_sdk-0.1.2.dist-info/RECORD +0 -40
- {orca_sdk-0.1.2.dist-info → orca_sdk-0.1.3.dist-info}/WHEEL +0 -0
orca_sdk/memoryset.py
CHANGED
|
@@ -13,11 +13,11 @@ from torch.utils.data import DataLoader as TorchDataLoader
|
|
|
13
13
|
from torch.utils.data import Dataset as TorchDataset
|
|
14
14
|
|
|
15
15
|
from ._utils.common import UNSET, CreateMode, DropMode
|
|
16
|
+
from .async_client import OrcaAsyncClient
|
|
16
17
|
from .client import (
|
|
17
18
|
CascadingEditSuggestion,
|
|
18
19
|
CloneMemorysetRequest,
|
|
19
20
|
CreateMemorysetRequest,
|
|
20
|
-
EmbeddingModelResult,
|
|
21
21
|
FilterItem,
|
|
22
22
|
)
|
|
23
23
|
from .client import LabeledMemory as LabeledMemoryResponse
|
|
@@ -35,6 +35,7 @@ from .client import (
|
|
|
35
35
|
MemorysetMetrics,
|
|
36
36
|
MemorysetUpdate,
|
|
37
37
|
MemoryType,
|
|
38
|
+
OrcaClient,
|
|
38
39
|
)
|
|
39
40
|
from .client import ScoredMemory as ScoredMemoryResponse
|
|
40
41
|
from .client import (
|
|
@@ -47,7 +48,6 @@ from .client import (
|
|
|
47
48
|
ScorePredictionMemoryLookup,
|
|
48
49
|
TelemetryFilterItem,
|
|
49
50
|
TelemetrySortOptions,
|
|
50
|
-
orca_api,
|
|
51
51
|
)
|
|
52
52
|
from .datasource import Datasource
|
|
53
53
|
from .embedding_model import (
|
|
@@ -299,7 +299,8 @@ class MemoryBase(ABC):
|
|
|
299
299
|
source_id: str | None = UNSET,
|
|
300
300
|
**metadata: None | bool | float | int | str,
|
|
301
301
|
) -> Self:
|
|
302
|
-
|
|
302
|
+
client = OrcaClient._resolve_client()
|
|
303
|
+
response = client.PATCH(
|
|
303
304
|
"/gpu/memoryset/{name_or_id}/memory",
|
|
304
305
|
params={"name_or_id": self.memoryset_id},
|
|
305
306
|
json=_parse_memory_update(
|
|
@@ -637,6 +638,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
637
638
|
index_params: dict[str, Any]
|
|
638
639
|
hidden: bool
|
|
639
640
|
|
|
641
|
+
_batch_size = 32 # max number of memories to insert/update/delete in a single API call
|
|
642
|
+
|
|
640
643
|
def __init__(self, metadata: MemorysetMetadata):
|
|
641
644
|
# for internal use only, do not document
|
|
642
645
|
if metadata["pretrained_embedding_model_name"]:
|
|
@@ -670,55 +673,48 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
670
673
|
"})"
|
|
671
674
|
)
|
|
672
675
|
|
|
673
|
-
@overload
|
|
674
676
|
@classmethod
|
|
675
|
-
def
|
|
677
|
+
def _handle_if_exists(
|
|
676
678
|
cls,
|
|
677
679
|
name: str,
|
|
678
|
-
datasource: Datasource,
|
|
679
680
|
*,
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
label_names: list[str] | None = None,
|
|
687
|
-
max_seq_length_override: int | None = None,
|
|
688
|
-
prompt: str | None = None,
|
|
689
|
-
remove_duplicates: bool = True,
|
|
690
|
-
index_type: IndexType = "FLAT",
|
|
691
|
-
index_params: dict[str, Any] = {},
|
|
692
|
-
if_exists: CreateMode = "error",
|
|
693
|
-
background: Literal[True],
|
|
694
|
-
hidden: bool = False,
|
|
695
|
-
) -> Job[Self]:
|
|
696
|
-
pass
|
|
681
|
+
if_exists: CreateMode,
|
|
682
|
+
label_names: list[str] | None,
|
|
683
|
+
embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None,
|
|
684
|
+
) -> Self | None:
|
|
685
|
+
"""
|
|
686
|
+
Handle common `if_exists` logic shared by all creator-style helpers.
|
|
697
687
|
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
cls
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
688
|
+
Returns the already-existing memoryset when `if_exists == "open"`, raises for `"error"`,
|
|
689
|
+
and returns `None` when the memoryset does not yet exist.
|
|
690
|
+
"""
|
|
691
|
+
if not cls.exists(name):
|
|
692
|
+
return None
|
|
693
|
+
if if_exists == "error":
|
|
694
|
+
raise ValueError(f"Memoryset with name {name} already exists")
|
|
695
|
+
|
|
696
|
+
existing = cls.open(name)
|
|
697
|
+
|
|
698
|
+
if label_names is not None and hasattr(existing, "label_names"):
|
|
699
|
+
existing_label_names = getattr(existing, "label_names")
|
|
700
|
+
if label_names != existing_label_names:
|
|
701
|
+
requested = ", ".join(label_names)
|
|
702
|
+
existing_joined = ", ".join(existing_label_names)
|
|
703
|
+
raise ValueError(
|
|
704
|
+
f"Memoryset {name} already exists with label names [{existing_joined}] "
|
|
705
|
+
f"(requested: [{requested}])."
|
|
706
|
+
)
|
|
707
|
+
|
|
708
|
+
if embedding_model is not None and embedding_model != existing.embedding_model:
|
|
709
|
+
existing_model = existing.embedding_model
|
|
710
|
+
existing_model_name = getattr(existing_model, "name", getattr(existing_model, "path", str(existing_model)))
|
|
711
|
+
requested_name = getattr(embedding_model, "name", getattr(embedding_model, "path", str(embedding_model)))
|
|
712
|
+
raise ValueError(
|
|
713
|
+
f"Memoryset {name} already exists with embedding_model {existing_model_name} "
|
|
714
|
+
f"(requested: {requested_name})."
|
|
715
|
+
)
|
|
716
|
+
|
|
717
|
+
return existing
|
|
722
718
|
|
|
723
719
|
@classmethod
|
|
724
720
|
def create(
|
|
@@ -793,15 +789,14 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
793
789
|
if label_column is None and score_column is None:
|
|
794
790
|
raise ValueError("label_column or score_column must be provided")
|
|
795
791
|
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
return existing
|
|
792
|
+
existing = cls._handle_if_exists(
|
|
793
|
+
name,
|
|
794
|
+
if_exists=if_exists,
|
|
795
|
+
label_names=label_names,
|
|
796
|
+
embedding_model=embedding_model,
|
|
797
|
+
)
|
|
798
|
+
if existing is not None:
|
|
799
|
+
return existing
|
|
805
800
|
|
|
806
801
|
payload: CreateMemorysetRequest = {
|
|
807
802
|
"name": name,
|
|
@@ -826,7 +821,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
826
821
|
payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
|
|
827
822
|
else:
|
|
828
823
|
raise ValueError("Invalid embedding model")
|
|
829
|
-
|
|
824
|
+
client = OrcaClient._resolve_client()
|
|
825
|
+
response = client.POST("/memoryset", json=payload)
|
|
830
826
|
job = Job(response["insertion_task_id"], lambda: cls.open(response["id"]))
|
|
831
827
|
return job if background else job.result()
|
|
832
828
|
|
|
@@ -862,6 +858,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
862
858
|
Returns:
|
|
863
859
|
Handle to the new memoryset in the OrcaCloud
|
|
864
860
|
"""
|
|
861
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
862
|
+
existing = cls._handle_if_exists(
|
|
863
|
+
name,
|
|
864
|
+
if_exists=if_exists,
|
|
865
|
+
label_names=kwargs.get("label_names"),
|
|
866
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
867
|
+
)
|
|
868
|
+
if existing is not None:
|
|
869
|
+
return existing
|
|
870
|
+
|
|
865
871
|
datasource = Datasource.from_hf_dataset(
|
|
866
872
|
f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
|
|
867
873
|
)
|
|
@@ -926,6 +932,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
926
932
|
Returns:
|
|
927
933
|
Handle to the new memoryset in the OrcaCloud
|
|
928
934
|
"""
|
|
935
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
936
|
+
existing = cls._handle_if_exists(
|
|
937
|
+
name,
|
|
938
|
+
if_exists=if_exists,
|
|
939
|
+
label_names=kwargs.get("label_names"),
|
|
940
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
941
|
+
)
|
|
942
|
+
if existing is not None:
|
|
943
|
+
return existing
|
|
944
|
+
|
|
929
945
|
datasource = Datasource.from_pytorch(
|
|
930
946
|
f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
|
|
931
947
|
)
|
|
@@ -990,6 +1006,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
990
1006
|
... {"value": "world", "label": 1, "tag": "tag2"},
|
|
991
1007
|
... ])
|
|
992
1008
|
"""
|
|
1009
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
1010
|
+
existing = cls._handle_if_exists(
|
|
1011
|
+
name,
|
|
1012
|
+
if_exists=if_exists,
|
|
1013
|
+
label_names=kwargs.get("label_names"),
|
|
1014
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
1015
|
+
)
|
|
1016
|
+
if existing is not None:
|
|
1017
|
+
return existing
|
|
1018
|
+
|
|
993
1019
|
datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
|
|
994
1020
|
kwargs["background"] = background
|
|
995
1021
|
return cls.create(name, datasource, **kwargs)
|
|
@@ -1053,6 +1079,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1053
1079
|
... "tag": ["tag1", "tag2"],
|
|
1054
1080
|
... })
|
|
1055
1081
|
"""
|
|
1082
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
1083
|
+
existing = cls._handle_if_exists(
|
|
1084
|
+
name,
|
|
1085
|
+
if_exists=if_exists,
|
|
1086
|
+
label_names=kwargs.get("label_names"),
|
|
1087
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
1088
|
+
)
|
|
1089
|
+
if existing is not None:
|
|
1090
|
+
return existing
|
|
1091
|
+
|
|
1056
1092
|
datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
|
|
1057
1093
|
kwargs["background"] = background
|
|
1058
1094
|
return cls.create(name, datasource, **kwargs)
|
|
@@ -1109,6 +1145,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1109
1145
|
Returns:
|
|
1110
1146
|
Handle to the new memoryset in the OrcaCloud
|
|
1111
1147
|
"""
|
|
1148
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
1149
|
+
existing = cls._handle_if_exists(
|
|
1150
|
+
name,
|
|
1151
|
+
if_exists=if_exists,
|
|
1152
|
+
label_names=kwargs.get("label_names"),
|
|
1153
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
1154
|
+
)
|
|
1155
|
+
if existing is not None:
|
|
1156
|
+
return existing
|
|
1157
|
+
|
|
1112
1158
|
datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
|
|
1113
1159
|
kwargs["background"] = background
|
|
1114
1160
|
return cls.create(name, datasource, **kwargs)
|
|
@@ -1165,6 +1211,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1165
1211
|
Returns:
|
|
1166
1212
|
Handle to the new memoryset in the OrcaCloud
|
|
1167
1213
|
"""
|
|
1214
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
1215
|
+
existing = cls._handle_if_exists(
|
|
1216
|
+
name,
|
|
1217
|
+
if_exists=if_exists,
|
|
1218
|
+
label_names=kwargs.get("label_names"),
|
|
1219
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
1220
|
+
)
|
|
1221
|
+
if existing is not None:
|
|
1222
|
+
return existing
|
|
1223
|
+
|
|
1168
1224
|
datasource = Datasource.from_arrow(
|
|
1169
1225
|
f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
|
|
1170
1226
|
)
|
|
@@ -1230,6 +1286,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1230
1286
|
Returns:
|
|
1231
1287
|
Handle to the new memoryset in the OrcaCloud
|
|
1232
1288
|
"""
|
|
1289
|
+
if_exists = kwargs.get("if_exists", "error")
|
|
1290
|
+
existing = cls._handle_if_exists(
|
|
1291
|
+
name,
|
|
1292
|
+
if_exists=if_exists,
|
|
1293
|
+
label_names=kwargs.get("label_names"),
|
|
1294
|
+
embedding_model=kwargs.get("embedding_model"),
|
|
1295
|
+
)
|
|
1296
|
+
if existing is not None:
|
|
1297
|
+
return existing
|
|
1298
|
+
|
|
1233
1299
|
datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
|
|
1234
1300
|
kwargs["background"] = background
|
|
1235
1301
|
return cls.create(name, datasource, **kwargs)
|
|
@@ -1248,7 +1314,26 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1248
1314
|
Raises:
|
|
1249
1315
|
LookupError: If the memoryset does not exist
|
|
1250
1316
|
"""
|
|
1251
|
-
|
|
1317
|
+
client = OrcaClient._resolve_client()
|
|
1318
|
+
metadata = client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
|
|
1319
|
+
return cls(metadata)
|
|
1320
|
+
|
|
1321
|
+
@classmethod
|
|
1322
|
+
async def aopen(cls, name: str) -> Self:
|
|
1323
|
+
"""
|
|
1324
|
+
Asynchronously get a handle to a memoryset in the OrcaCloud
|
|
1325
|
+
|
|
1326
|
+
Params:
|
|
1327
|
+
name: Name or unique identifier of the memoryset
|
|
1328
|
+
|
|
1329
|
+
Returns:
|
|
1330
|
+
Handle to the existing memoryset in the OrcaCloud
|
|
1331
|
+
|
|
1332
|
+
Raises:
|
|
1333
|
+
LookupError: If the memoryset does not exist
|
|
1334
|
+
"""
|
|
1335
|
+
client = OrcaAsyncClient._resolve_client()
|
|
1336
|
+
metadata = await client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
|
|
1252
1337
|
return cls(metadata)
|
|
1253
1338
|
|
|
1254
1339
|
@classmethod
|
|
@@ -1279,9 +1364,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1279
1364
|
Returns:
|
|
1280
1365
|
List of handles to all memorysets in the OrcaCloud
|
|
1281
1366
|
"""
|
|
1367
|
+
client = OrcaClient._resolve_client()
|
|
1282
1368
|
return [
|
|
1283
1369
|
cls(metadata)
|
|
1284
|
-
for metadata in
|
|
1370
|
+
for metadata in client.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
|
|
1285
1371
|
]
|
|
1286
1372
|
|
|
1287
1373
|
@classmethod
|
|
@@ -1298,7 +1384,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1298
1384
|
LookupError: If the memoryset does not exist and if_not_exists is `"error"`
|
|
1299
1385
|
"""
|
|
1300
1386
|
try:
|
|
1301
|
-
|
|
1387
|
+
client = OrcaClient._resolve_client()
|
|
1388
|
+
client.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
|
|
1302
1389
|
logging.info(f"Deleted memoryset {name_or_id}")
|
|
1303
1390
|
except LookupError:
|
|
1304
1391
|
if if_not_exists == "error":
|
|
@@ -1333,7 +1420,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1333
1420
|
if hidden is not UNSET:
|
|
1334
1421
|
payload["hidden"] = hidden
|
|
1335
1422
|
|
|
1336
|
-
|
|
1423
|
+
client = OrcaClient._resolve_client()
|
|
1424
|
+
client.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
|
|
1337
1425
|
self.refresh()
|
|
1338
1426
|
|
|
1339
1427
|
@overload
|
|
@@ -1425,7 +1513,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1425
1513
|
elif isinstance(embedding_model, FinetunedEmbeddingModel):
|
|
1426
1514
|
payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
|
|
1427
1515
|
|
|
1428
|
-
|
|
1516
|
+
client = OrcaClient._resolve_client()
|
|
1517
|
+
metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
|
|
1429
1518
|
job = Job(
|
|
1430
1519
|
metadata["insertion_task_id"],
|
|
1431
1520
|
lambda: self.open(metadata["id"]),
|
|
@@ -1556,7 +1645,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1556
1645
|
],
|
|
1557
1646
|
]
|
|
1558
1647
|
"""
|
|
1559
|
-
|
|
1648
|
+
client = OrcaClient._resolve_client()
|
|
1649
|
+
response = client.POST(
|
|
1560
1650
|
"/gpu/memoryset/{name_or_id}/lookup",
|
|
1561
1651
|
params={"name_or_id": self.id},
|
|
1562
1652
|
json={
|
|
@@ -1613,7 +1703,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1613
1703
|
]
|
|
1614
1704
|
|
|
1615
1705
|
if with_feedback_metrics:
|
|
1616
|
-
|
|
1706
|
+
client = OrcaClient._resolve_client()
|
|
1707
|
+
response = client.POST(
|
|
1617
1708
|
"/telemetry/memories",
|
|
1618
1709
|
json={
|
|
1619
1710
|
"memoryset_id": self.id,
|
|
@@ -1637,7 +1728,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1637
1728
|
if sort:
|
|
1638
1729
|
logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")
|
|
1639
1730
|
|
|
1640
|
-
|
|
1731
|
+
client = OrcaClient._resolve_client()
|
|
1732
|
+
response = client.POST(
|
|
1641
1733
|
"/memoryset/{name_or_id}/memories",
|
|
1642
1734
|
params={"name_or_id": self.id},
|
|
1643
1735
|
json={
|
|
@@ -1698,19 +1790,74 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1698
1790
|
... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
|
|
1699
1791
|
... ])
|
|
1700
1792
|
"""
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1793
|
+
client = OrcaClient._resolve_client()
|
|
1794
|
+
items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
|
|
1795
|
+
# insert memories in batches to avoid API timeouts
|
|
1796
|
+
for i in range(0, len(items), self._batch_size):
|
|
1797
|
+
batch = items[i : i + self._batch_size]
|
|
1798
|
+
client.POST(
|
|
1799
|
+
"/gpu/memoryset/{name_or_id}/memory",
|
|
1800
|
+
params={"name_or_id": self.id},
|
|
1801
|
+
json=cast(
|
|
1802
|
+
list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
|
|
1803
|
+
[_parse_memory_insert(item, type=self.memory_type) for item in batch],
|
|
1804
|
+
),
|
|
1805
|
+
)
|
|
1806
|
+
|
|
1712
1807
|
self.refresh()
|
|
1713
1808
|
|
|
1809
|
+
async def ainsert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
|
|
1810
|
+
"""
|
|
1811
|
+
Asynchronously insert memories into the memoryset
|
|
1812
|
+
|
|
1813
|
+
Params:
|
|
1814
|
+
items: List of memories to insert into the memoryset. This should be a list of
|
|
1815
|
+
dictionaries with the following keys:
|
|
1816
|
+
|
|
1817
|
+
- `value`: Value of the memory
|
|
1818
|
+
- `label`: Label of the memory
|
|
1819
|
+
- `score`: Score of the memory
|
|
1820
|
+
- `source_id`: Optional unique ID of the memory in a system of reference
|
|
1821
|
+
- `...`: Any other metadata to store for the memory
|
|
1822
|
+
|
|
1823
|
+
Examples:
|
|
1824
|
+
>>> await memoryset.ainsert([
|
|
1825
|
+
... {"value": "I am happy", "label": 1, "source_id": "user_123", "tag": "happy"},
|
|
1826
|
+
... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
|
|
1827
|
+
... ])
|
|
1828
|
+
"""
|
|
1829
|
+
client = OrcaAsyncClient._resolve_client()
|
|
1830
|
+
items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
|
|
1831
|
+
# insert memories in batches to avoid API timeouts
|
|
1832
|
+
for i in range(0, len(items), self._batch_size):
|
|
1833
|
+
batch = items[i : i + self._batch_size]
|
|
1834
|
+
await client.POST(
|
|
1835
|
+
"/gpu/memoryset/{name_or_id}/memory",
|
|
1836
|
+
params={"name_or_id": self.id},
|
|
1837
|
+
json=cast(
|
|
1838
|
+
list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
|
|
1839
|
+
[_parse_memory_insert(item, type=self.memory_type) for item in batch],
|
|
1840
|
+
),
|
|
1841
|
+
)
|
|
1842
|
+
|
|
1843
|
+
await self.arefresh()
|
|
1844
|
+
|
|
1845
|
+
async def arefresh(self, throttle: float = 0):
|
|
1846
|
+
"""
|
|
1847
|
+
Asynchronously refresh the information about the memoryset from the OrcaCloud
|
|
1848
|
+
|
|
1849
|
+
Params:
|
|
1850
|
+
throttle: Minimum time in seconds between refreshes
|
|
1851
|
+
"""
|
|
1852
|
+
current_time = datetime.now()
|
|
1853
|
+
# Skip refresh if last refresh was too recent
|
|
1854
|
+
if (current_time - self._last_refresh) < timedelta(seconds=throttle):
|
|
1855
|
+
return
|
|
1856
|
+
|
|
1857
|
+
refreshed_memoryset = await type(self).aopen(self.id)
|
|
1858
|
+
self.__dict__.update(refreshed_memoryset.__dict__)
|
|
1859
|
+
self._last_refresh = current_time
|
|
1860
|
+
|
|
1714
1861
|
@overload
|
|
1715
1862
|
def get(self, memory_id: str) -> MemoryT: # type: ignore -- this takes precedence
|
|
1716
1863
|
pass
|
|
@@ -1748,7 +1895,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1748
1895
|
]
|
|
1749
1896
|
"""
|
|
1750
1897
|
if isinstance(memory_id, str):
|
|
1751
|
-
|
|
1898
|
+
client = OrcaClient._resolve_client()
|
|
1899
|
+
response = client.GET(
|
|
1752
1900
|
"/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
|
|
1753
1901
|
)
|
|
1754
1902
|
return cast(
|
|
@@ -1756,7 +1904,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1756
1904
|
(LabeledMemory(self.id, response) if "label" in response else ScoredMemory(self.id, response)),
|
|
1757
1905
|
)
|
|
1758
1906
|
else:
|
|
1759
|
-
|
|
1907
|
+
client = OrcaClient._resolve_client()
|
|
1908
|
+
response = client.POST(
|
|
1760
1909
|
"/memoryset/{name_or_id}/memories/get",
|
|
1761
1910
|
params={"name_or_id": self.id},
|
|
1762
1911
|
json={"memory_ids": list(memory_id)},
|
|
@@ -1809,24 +1958,28 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1809
1958
|
... for m in memoryset.query(filters=[("tag", "==", "happy")])
|
|
1810
1959
|
... )
|
|
1811
1960
|
"""
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
MemoryT,
|
|
1826
|
-
(LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
|
|
1961
|
+
client = OrcaClient._resolve_client()
|
|
1962
|
+
updates_list = cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else list(updates)
|
|
1963
|
+
# update memories in batches to avoid API timeouts
|
|
1964
|
+
updated_memories: list[MemoryT] = []
|
|
1965
|
+
for i in range(0, len(updates_list), self._batch_size):
|
|
1966
|
+
batch = updates_list[i : i + self._batch_size]
|
|
1967
|
+
response = client.PATCH(
|
|
1968
|
+
"/gpu/memoryset/{name_or_id}/memories",
|
|
1969
|
+
params={"name_or_id": self.id},
|
|
1970
|
+
json=cast(
|
|
1971
|
+
list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
|
|
1972
|
+
[_parse_memory_update(update, type=self.memory_type) for update in batch],
|
|
1973
|
+
),
|
|
1827
1974
|
)
|
|
1828
|
-
|
|
1829
|
-
|
|
1975
|
+
updated_memories.extend(
|
|
1976
|
+
cast(
|
|
1977
|
+
MemoryT,
|
|
1978
|
+
(LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
|
|
1979
|
+
)
|
|
1980
|
+
for memory in response
|
|
1981
|
+
)
|
|
1982
|
+
|
|
1830
1983
|
return updated_memories[0] if isinstance(updates, dict) else updated_memories
|
|
1831
1984
|
|
|
1832
1985
|
def get_cascading_edits_suggestions(
|
|
@@ -1869,7 +2022,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1869
2022
|
A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
|
|
1870
2023
|
"""
|
|
1871
2024
|
# TODO: properly integrate this with memory edits and return something that can be applied
|
|
1872
|
-
|
|
2025
|
+
client = OrcaClient._resolve_client()
|
|
2026
|
+
return client.POST(
|
|
1873
2027
|
"/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
|
|
1874
2028
|
params={"name_or_id": self.id, "memory_id": memory.memory_id},
|
|
1875
2029
|
json={
|
|
@@ -1903,10 +2057,14 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1903
2057
|
... )
|
|
1904
2058
|
|
|
1905
2059
|
"""
|
|
2060
|
+
client = OrcaClient._resolve_client()
|
|
1906
2061
|
memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
2062
|
+
# delete memories in batches to avoid API timeouts
|
|
2063
|
+
for i in range(0, len(memory_ids), self._batch_size):
|
|
2064
|
+
batch = memory_ids[i : i + self._batch_size]
|
|
2065
|
+
client.POST(
|
|
2066
|
+
"/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": batch}
|
|
2067
|
+
)
|
|
1910
2068
|
logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
|
|
1911
2069
|
self.refresh()
|
|
1912
2070
|
|
|
@@ -1951,7 +2109,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
1951
2109
|
- **`"duplicate"`**: Find potentially duplicate memories in the memoryset
|
|
1952
2110
|
- **`"cluster"`**: Cluster the memories in the memoryset
|
|
1953
2111
|
- **`"label"`**: Analyze the labels to find potential mislabelings
|
|
1954
|
-
- **`"
|
|
2112
|
+
- **`"distribution"`**: Analyze the embedding distribution to populate
|
|
1955
2113
|
- **`"projection"`**: Create a 2D projection of the embeddings for visualization
|
|
1956
2114
|
|
|
1957
2115
|
lookup_count: Number of memories to lookup for each memory in the memoryset
|
|
@@ -2017,7 +2175,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
2017
2175
|
raise ValueError(error_msg)
|
|
2018
2176
|
configs[name] = analysis
|
|
2019
2177
|
|
|
2020
|
-
|
|
2178
|
+
client = OrcaClient._resolve_client()
|
|
2179
|
+
analysis = client.POST(
|
|
2021
2180
|
"/memoryset/{name_or_id}/analysis",
|
|
2022
2181
|
params={"name_or_id": self.id},
|
|
2023
2182
|
json={
|
|
@@ -2026,134 +2185,186 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
|
|
|
2026
2185
|
"clear_metrics": clear_metrics,
|
|
2027
2186
|
},
|
|
2028
2187
|
)
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
|
|
2188
|
+
|
|
2189
|
+
def get_analysis_result():
|
|
2190
|
+
client = OrcaClient._resolve_client()
|
|
2191
|
+
return client.GET(
|
|
2032
2192
|
"/memoryset/{name_or_id}/analysis/{analysis_task_id}",
|
|
2033
2193
|
params={"name_or_id": self.id, "analysis_task_id": analysis["task_id"]},
|
|
2034
|
-
)["results"]
|
|
2035
|
-
|
|
2194
|
+
)["results"]
|
|
2195
|
+
|
|
2196
|
+
job = Job(analysis["task_id"], get_analysis_result)
|
|
2036
2197
|
return job if background else job.result()
|
|
2037
2198
|
|
|
2038
2199
|
def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
|
|
2039
2200
|
"""Group potential duplicates in the memoryset"""
|
|
2040
|
-
|
|
2201
|
+
client = OrcaClient._resolve_client()
|
|
2202
|
+
response = client.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
|
|
2041
2203
|
return [
|
|
2042
2204
|
[cast(MemoryT, LabeledMemory(self.id, m) if "label" in m else ScoredMemory(self.id, m)) for m in ms]
|
|
2043
2205
|
for ms in response
|
|
2044
2206
|
]
|
|
2045
2207
|
|
|
2208
|
+
|
|
2209
|
+
class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
2210
|
+
"""
|
|
2211
|
+
A Handle to a collection of memories with labels in the OrcaCloud
|
|
2212
|
+
|
|
2213
|
+
Attributes:
|
|
2214
|
+
id: Unique identifier for the memoryset
|
|
2215
|
+
name: Unique name of the memoryset
|
|
2216
|
+
description: Description of the memoryset
|
|
2217
|
+
label_names: Names for the class labels in the memoryset
|
|
2218
|
+
length: Number of memories in the memoryset
|
|
2219
|
+
embedding_model: Embedding model used to embed the memory values for semantic search
|
|
2220
|
+
created_at: When the memoryset was created, automatically generated on create
|
|
2221
|
+
updated_at: When the memoryset was last updated, automatically updated on updates
|
|
2222
|
+
"""
|
|
2223
|
+
|
|
2224
|
+
label_names: list[str]
|
|
2225
|
+
memory_type: MemoryType = "LABELED"
|
|
2226
|
+
|
|
2227
|
+
def __init__(self, metadata: MemorysetMetadata):
|
|
2228
|
+
super().__init__(metadata)
|
|
2229
|
+
assert metadata["label_names"] is not None
|
|
2230
|
+
self.label_names = metadata["label_names"]
|
|
2231
|
+
|
|
2232
|
+
def __eq__(self, other) -> bool:
|
|
2233
|
+
return isinstance(other, LabeledMemoryset) and self.id == other.id
|
|
2234
|
+
|
|
2046
2235
|
@overload
|
|
2047
|
-
@
|
|
2048
|
-
def
|
|
2236
|
+
@classmethod
|
|
2237
|
+
def create(
|
|
2238
|
+
cls,
|
|
2239
|
+
name: str,
|
|
2049
2240
|
datasource: Datasource,
|
|
2050
2241
|
*,
|
|
2242
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2051
2243
|
value_column: str = "value",
|
|
2052
2244
|
label_column: str = "label",
|
|
2053
2245
|
source_id_column: str | None = None,
|
|
2054
|
-
|
|
2055
|
-
|
|
2246
|
+
description: str | None = None,
|
|
2247
|
+
label_names: list[str] | None = None,
|
|
2248
|
+
max_seq_length_override: int | None = None,
|
|
2249
|
+
prompt: str | None = None,
|
|
2250
|
+
remove_duplicates: bool = True,
|
|
2251
|
+
index_type: IndexType = "FLAT",
|
|
2252
|
+
index_params: dict[str, Any] = {},
|
|
2253
|
+
if_exists: CreateMode = "error",
|
|
2056
2254
|
background: Literal[True],
|
|
2057
|
-
|
|
2255
|
+
hidden: bool = False,
|
|
2256
|
+
) -> Job[Self]:
|
|
2058
2257
|
pass
|
|
2059
2258
|
|
|
2060
2259
|
@overload
|
|
2061
|
-
@
|
|
2062
|
-
def
|
|
2260
|
+
@classmethod
|
|
2261
|
+
def create(
|
|
2262
|
+
cls,
|
|
2263
|
+
name: str,
|
|
2063
2264
|
datasource: Datasource,
|
|
2064
2265
|
*,
|
|
2266
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2065
2267
|
value_column: str = "value",
|
|
2066
2268
|
label_column: str = "label",
|
|
2067
2269
|
source_id_column: str | None = None,
|
|
2068
|
-
|
|
2069
|
-
|
|
2270
|
+
description: str | None = None,
|
|
2271
|
+
label_names: list[str] | None = None,
|
|
2272
|
+
max_seq_length_override: int | None = None,
|
|
2273
|
+
prompt: str | None = None,
|
|
2274
|
+
remove_duplicates: bool = True,
|
|
2275
|
+
index_type: IndexType = "FLAT",
|
|
2276
|
+
index_params: dict[str, Any] = {},
|
|
2277
|
+
if_exists: CreateMode = "error",
|
|
2070
2278
|
background: Literal[False] = False,
|
|
2071
|
-
|
|
2279
|
+
hidden: bool = False,
|
|
2280
|
+
) -> Self:
|
|
2072
2281
|
pass
|
|
2073
2282
|
|
|
2074
|
-
@
|
|
2075
|
-
def
|
|
2283
|
+
@classmethod
|
|
2284
|
+
def create( # type: ignore[override]
|
|
2285
|
+
cls,
|
|
2286
|
+
name: str,
|
|
2076
2287
|
datasource: Datasource,
|
|
2077
2288
|
*,
|
|
2289
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2078
2290
|
value_column: str = "value",
|
|
2079
2291
|
label_column: str = "label",
|
|
2080
2292
|
source_id_column: str | None = None,
|
|
2081
|
-
|
|
2082
|
-
|
|
2293
|
+
description: str | None = None,
|
|
2294
|
+
label_names: list[str] | None = None,
|
|
2295
|
+
max_seq_length_override: int | None = None,
|
|
2296
|
+
prompt: str | None = None,
|
|
2297
|
+
remove_duplicates: bool = True,
|
|
2298
|
+
index_type: IndexType = "FLAT",
|
|
2299
|
+
index_params: dict[str, Any] = {},
|
|
2300
|
+
if_exists: CreateMode = "error",
|
|
2083
2301
|
background: bool = False,
|
|
2084
|
-
|
|
2302
|
+
hidden: bool = False,
|
|
2303
|
+
) -> Self | Job[Self]:
|
|
2085
2304
|
"""
|
|
2086
|
-
|
|
2305
|
+
Create a new labeled memoryset in the OrcaCloud
|
|
2306
|
+
|
|
2307
|
+
All columns from the datasource that are not specified in the `value_column`,
|
|
2308
|
+
`label_column`, or `source_id_column` will be stored as metadata in the memoryset.
|
|
2087
2309
|
|
|
2088
2310
|
Params:
|
|
2089
|
-
|
|
2311
|
+
name: Name for the new memoryset (must be unique)
|
|
2312
|
+
datasource: Source data to populate the memories in the memoryset
|
|
2313
|
+
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
2314
|
+
If not provided, a default embedding model for the memoryset will be used.
|
|
2090
2315
|
value_column: Name of the column in the datasource that contains the memory values
|
|
2091
2316
|
label_column: Name of the column in the datasource that contains the memory labels,
|
|
2092
2317
|
these must be contiguous integers starting from 0
|
|
2093
2318
|
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
2094
2319
|
the system of reference
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2320
|
+
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
2321
|
+
so make sure it is concise and describes the contents of your memoryset not the
|
|
2322
|
+
datasource or the embedding model.
|
|
2323
|
+
label_names: List of human-readable names for the labels in the memoryset, must match
|
|
2324
|
+
the number of labels in the `label_column`. Will be automatically inferred if a
|
|
2325
|
+
[Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
|
|
2326
|
+
labels is used as the datasource
|
|
2327
|
+
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
2328
|
+
value is longer than this it will be truncated, will default to the model's max
|
|
2329
|
+
sequence length if not provided
|
|
2330
|
+
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
2331
|
+
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
2332
|
+
into the memoryset
|
|
2333
|
+
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
2334
|
+
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
2335
|
+
index_params: Parameters for the vector index, defaults to `{}`
|
|
2336
|
+
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
2337
|
+
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
2338
|
+
background: Whether to run the operation none blocking and return a job handle
|
|
2339
|
+
hidden: Whether the memoryset should be hidden
|
|
2098
2340
|
|
|
2099
2341
|
Returns:
|
|
2100
|
-
|
|
2101
|
-
"""
|
|
2342
|
+
Handle to the new memoryset in the OrcaCloud
|
|
2102
2343
|
|
|
2103
|
-
|
|
2104
|
-
"
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2344
|
+
Raises:
|
|
2345
|
+
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
2346
|
+
`"open"` and the params do not match those of the existing memoryset.
|
|
2347
|
+
"""
|
|
2348
|
+
return super().create(
|
|
2349
|
+
name,
|
|
2350
|
+
datasource,
|
|
2351
|
+
label_column=label_column,
|
|
2352
|
+
score_column=None,
|
|
2353
|
+
embedding_model=embedding_model,
|
|
2354
|
+
value_column=value_column,
|
|
2355
|
+
source_id_column=source_id_column,
|
|
2356
|
+
description=description,
|
|
2357
|
+
label_names=label_names,
|
|
2358
|
+
max_seq_length_override=max_seq_length_override,
|
|
2359
|
+
prompt=prompt,
|
|
2360
|
+
remove_duplicates=remove_duplicates,
|
|
2361
|
+
index_type=index_type,
|
|
2362
|
+
index_params=index_params,
|
|
2363
|
+
if_exists=if_exists,
|
|
2364
|
+
background=background,
|
|
2365
|
+
hidden=hidden,
|
|
2113
2366
|
)
|
|
2114
2367
|
|
|
2115
|
-
def get_value() -> list[EmbeddingModelResult]:
|
|
2116
|
-
res = orca_api.GET(
|
|
2117
|
-
"/datasource/{name_or_id}/embedding_evaluation/{task_id}",
|
|
2118
|
-
params={"name_or_id": datasource.id, "task_id": response["task_id"]},
|
|
2119
|
-
)
|
|
2120
|
-
assert res["result"] is not None
|
|
2121
|
-
return res["result"]["evaluation_results"]
|
|
2122
|
-
|
|
2123
|
-
job = Job(response["task_id"], get_value)
|
|
2124
|
-
return job if background else job.result()
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
|
|
2128
|
-
"""
|
|
2129
|
-
A Handle to a collection of memories with labels in the OrcaCloud
|
|
2130
|
-
|
|
2131
|
-
Attributes:
|
|
2132
|
-
id: Unique identifier for the memoryset
|
|
2133
|
-
name: Unique name of the memoryset
|
|
2134
|
-
description: Description of the memoryset
|
|
2135
|
-
label_names: Names for the class labels in the memoryset
|
|
2136
|
-
length: Number of memories in the memoryset
|
|
2137
|
-
embedding_model: Embedding model used to embed the memory values for semantic search
|
|
2138
|
-
created_at: When the memoryset was created, automatically generated on create
|
|
2139
|
-
updated_at: When the memoryset was last updated, automatically updated on updates
|
|
2140
|
-
"""
|
|
2141
|
-
|
|
2142
|
-
label_names: list[str]
|
|
2143
|
-
memory_type: MemoryType = "LABELED"
|
|
2144
|
-
|
|
2145
|
-
def __init__(self, metadata: MemorysetMetadata):
|
|
2146
|
-
super().__init__(metadata)
|
|
2147
|
-
assert metadata["label_names"] is not None
|
|
2148
|
-
self.label_names = metadata["label_names"]
|
|
2149
|
-
|
|
2150
|
-
def __eq__(self, other) -> bool:
|
|
2151
|
-
return isinstance(other, LabeledMemoryset) and self.id == other.id
|
|
2152
|
-
|
|
2153
|
-
@classmethod
|
|
2154
|
-
def create(cls, name: str, datasource: Datasource, *, label_column: str | None = "label", **kwargs):
|
|
2155
|
-
return super().create(name, datasource, label_column=label_column, score_column=None, **kwargs)
|
|
2156
|
-
|
|
2157
2368
|
def display_label_analysis(self):
|
|
2158
2369
|
"""
|
|
2159
2370
|
Display an interactive UI to review and act upon the label analysis results
|
|
@@ -2185,6 +2396,125 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
|
|
|
2185
2396
|
def __eq__(self, other) -> bool:
|
|
2186
2397
|
return isinstance(other, ScoredMemoryset) and self.id == other.id
|
|
2187
2398
|
|
|
2399
|
+
    # Overload: background=True returns a Job handle for the non-blocking create.
    @overload
    @classmethod
    def create(
        cls,
        name: str,
        datasource: Datasource,
        *,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        value_column: str = "value",
        score_column: str = "score",
        source_id_column: str | None = None,
        description: str | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[True],
        hidden: bool = False,
    ) -> Job[Self]:
        pass
|
|
2421
|
+
|
|
2422
|
+
    # Overload: blocking call (background=False, the default) returns the
    # created memoryset handle directly instead of a Job.
    @overload
    @classmethod
    def create(
        cls,
        name: str,
        datasource: Datasource,
        *,
        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
        score_column: str = "score",
        value_column: str = "value",
        source_id_column: str | None = None,
        description: str | None = None,
        max_seq_length_override: int | None = None,
        prompt: str | None = None,
        remove_duplicates: bool = True,
        index_type: IndexType = "FLAT",
        index_params: dict[str, Any] = {},
        if_exists: CreateMode = "error",
        background: Literal[False] = False,
        hidden: bool = False,
    ) -> Self:
        pass
|
|
2444
|
+
|
|
2445
|
+
@classmethod
|
|
2446
|
+
def create( # type: ignore[override]
|
|
2447
|
+
cls,
|
|
2448
|
+
name: str,
|
|
2449
|
+
datasource: Datasource,
|
|
2450
|
+
*,
|
|
2451
|
+
embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
|
|
2452
|
+
value_column: str = "value",
|
|
2453
|
+
score_column: str = "score",
|
|
2454
|
+
source_id_column: str | None = None,
|
|
2455
|
+
description: str | None = None,
|
|
2456
|
+
max_seq_length_override: int | None = None,
|
|
2457
|
+
prompt: str | None = None,
|
|
2458
|
+
remove_duplicates: bool = True,
|
|
2459
|
+
index_type: IndexType = "FLAT",
|
|
2460
|
+
index_params: dict[str, Any] = {},
|
|
2461
|
+
if_exists: CreateMode = "error",
|
|
2462
|
+
background: bool = False,
|
|
2463
|
+
hidden: bool = False,
|
|
2464
|
+
) -> Self | Job[Self]:
|
|
2465
|
+
"""
|
|
2466
|
+
Create a new scored memoryset in the OrcaCloud
|
|
2467
|
+
|
|
2468
|
+
All columns from the datasource that are not specified in the `value_column`,
|
|
2469
|
+
`score_column`, or `source_id_column` will be stored as metadata in the memoryset.
|
|
2470
|
+
|
|
2471
|
+
Params:
|
|
2472
|
+
name: Name for the new memoryset (must be unique)
|
|
2473
|
+
datasource: Source data to populate the memories in the memoryset
|
|
2474
|
+
embedding_model: Embedding model to use for embedding memory values for semantic search.
|
|
2475
|
+
If not provided, a default embedding model for the memoryset will be used.
|
|
2476
|
+
value_column: Name of the column in the datasource that contains the memory values
|
|
2477
|
+
score_column: Name of the column in the datasource that contains the memory scores
|
|
2478
|
+
source_id_column: Optional name of the column in the datasource that contains the ids in
|
|
2479
|
+
the system of reference
|
|
2480
|
+
description: Optional description for the memoryset, this will be used in agentic flows,
|
|
2481
|
+
so make sure it is concise and describes the contents of your memoryset not the
|
|
2482
|
+
datasource or the embedding model.
|
|
2483
|
+
max_seq_length_override: Maximum sequence length of values in the memoryset, if the
|
|
2484
|
+
value is longer than this it will be truncated, will default to the model's max
|
|
2485
|
+
sequence length if not provided
|
|
2486
|
+
prompt: Optional prompt to use when embedding documents/memories for storage
|
|
2487
|
+
remove_duplicates: Whether to remove duplicates from the datasource before inserting
|
|
2488
|
+
into the memoryset
|
|
2489
|
+
index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
|
|
2490
|
+
values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
|
|
2491
|
+
index_params: Parameters for the vector index, defaults to `{}`
|
|
2492
|
+
if_exists: What to do if a memoryset with the same name already exists, defaults to
|
|
2493
|
+
`"error"`. Other option is `"open"` to open the existing memoryset.
|
|
2494
|
+
background: Whether to run the operation none blocking and return a job handle
|
|
2495
|
+
hidden: Whether the memoryset should be hidden
|
|
2496
|
+
|
|
2497
|
+
Returns:
|
|
2498
|
+
Handle to the new memoryset in the OrcaCloud
|
|
2499
|
+
|
|
2500
|
+
Raises:
|
|
2501
|
+
ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
|
|
2502
|
+
`"open"` and the params do not match those of the existing memoryset.
|
|
2503
|
+
"""
|
|
2504
|
+
return super().create(
|
|
2505
|
+
name,
|
|
2506
|
+
datasource,
|
|
2507
|
+
embedding_model=embedding_model,
|
|
2508
|
+
value_column=value_column,
|
|
2509
|
+
score_column=score_column,
|
|
2510
|
+
source_id_column=source_id_column,
|
|
2511
|
+
description=description,
|
|
2512
|
+
max_seq_length_override=max_seq_length_override,
|
|
2513
|
+
prompt=prompt,
|
|
2514
|
+
remove_duplicates=remove_duplicates,
|
|
2515
|
+
index_type=index_type,
|
|
2516
|
+
index_params=index_params,
|
|
2517
|
+
if_exists=if_exists,
|
|
2518
|
+
background=background,
|
|
2519
|
+
hidden=hidden,
|
|
2520
|
+
)
|