PyPI - orca-sdk - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

orca-sdk 0.1.2 (py3-none-any.whl) → 0.1.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

orca_sdk/__init__.py +1 -1
orca_sdk/_utils/auth.py +12 -8
orca_sdk/async_client.py +3795 -0
orca_sdk/classification_model.py +176 -14
orca_sdk/classification_model_test.py +96 -28
orca_sdk/client.py +515 -475
orca_sdk/conftest.py +37 -36
orca_sdk/credentials.py +54 -14
orca_sdk/credentials_test.py +92 -28
orca_sdk/datasource.py +19 -10
orca_sdk/datasource_test.py +24 -18
orca_sdk/embedding_model.py +22 -13
orca_sdk/embedding_model_test.py +27 -20
orca_sdk/job.py +14 -8
orca_sdk/memoryset.py +513 -183
orca_sdk/memoryset_test.py +130 -32
orca_sdk/regression_model.py +21 -11
orca_sdk/regression_model_test.py +35 -26
orca_sdk/telemetry.py +24 -13
{orca_sdk-0.1.2.dist-info → orca_sdk-0.1.3.dist-info}/METADATA +1 -1
orca_sdk-0.1.3.dist-info/RECORD +41 -0
orca_sdk-0.1.2.dist-info/RECORD +0 -40
{orca_sdk-0.1.2.dist-info → orca_sdk-0.1.3.dist-info}/WHEEL +0 -0

orca_sdk/memoryset.py (changed)

@@ -13,11 +13,11 @@ from torch.utils.data import DataLoader as TorchDataLoader
 from torch.utils.data import Dataset as TorchDataset
 from ._utils.common import UNSET, CreateMode, DropMode
+from .async_client import OrcaAsyncClient
 from .client import (
     CascadingEditSuggestion,
     CloneMemorysetRequest,
     CreateMemorysetRequest,
-    EmbeddingModelResult,
     FilterItem,
 )
 from .client import LabeledMemory as LabeledMemoryResponse
@@ -35,6 +35,7 @@ from .client import (
     MemorysetMetrics,
     MemorysetUpdate,
     MemoryType,
+    OrcaClient,
 )
 from .client import ScoredMemory as ScoredMemoryResponse
 from .client import (
@@ -47,7 +48,6 @@ from .client import (
     ScorePredictionMemoryLookup,
     TelemetryFilterItem,
     TelemetrySortOptions,
-    orca_api,
 )
 from .datasource import Datasource
 from .embedding_model import (
@@ -299,7 +299,8 @@ class MemoryBase(ABC):
         source_id: str | None = UNSET,
         **metadata: None | bool | float | int | str,
     ) -> Self:
-        response = orca_api.PATCH(
+        client = OrcaClient._resolve_client()
+        response = client.PATCH(
             "/gpu/memoryset/{name_or_id}/memory",
             params={"name_or_id": self.memoryset_id},
             json=_parse_memory_update(
@@ -637,6 +638,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
     index_params: dict[str, Any]
     hidden: bool
+    _batch_size = 32  # max number of memories to insert/update/delete in a single API call
     def __init__(self, metadata: MemorysetMetadata):
         # for internal use only, do not document
         if metadata["pretrained_embedding_model_name"]:
@@ -670,55 +673,48 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             "})"
         )
-    @overload
     @classmethod
-    def create(
+    def _handle_if_exists(
         cls,
         name: str,
-        datasource: Datasource,
         *,
-        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
-        value_column: str = "value",
-        label_column: str | None = None,
-        score_column: str | None = None,
-        source_id_column: str | None = None,
-        description: str | None = None,
-        label_names: list[str] | None = None,
-        max_seq_length_override: int | None = None,
-        prompt: str | None = None,
-        remove_duplicates: bool = True,
-        index_type: IndexType = "FLAT",
-        index_params: dict[str, Any] = {},
-        if_exists: CreateMode = "error",
-        background: Literal[True],
-        hidden: bool = False,
-    ) -> Job[Self]:
-        pass
+        if_exists: CreateMode,
+        label_names: list[str] | None,
+        embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None,
+    ) -> Self | None:
+        """
+        Handle common `if_exists` logic shared by all creator-style helpers.
-    @overload
-    @classmethod
-    def create(
-        cls,
-        name: str,
-        datasource: Datasource,
-        *,
-        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
-        value_column: str = "value",
-        label_column: str | None = None,
-        score_column: str | None = None,
-        source_id_column: str | None = None,
-        description: str | None = None,
-        label_names: list[str] | None = None,
-        max_seq_length_override: int | None = None,
-        prompt: str | None = None,
-        remove_duplicates: bool = True,
-        index_type: IndexType = "FLAT",
-        index_params: dict[str, Any] = {},
-        if_exists: CreateMode = "error",
-        background: Literal[False] = False,
-        hidden: bool = False,
-    ) -> Self:
-        pass
+        Returns the already-existing memoryset when `if_exists == "open"`, raises for `"error"`,
+        and returns `None` when the memoryset does not yet exist.
+        """
+        if not cls.exists(name):
+            return None
+        if if_exists == "error":
+            raise ValueError(f"Memoryset with name {name} already exists")
+        existing = cls.open(name)
+        if label_names is not None and hasattr(existing, "label_names"):
+            existing_label_names = getattr(existing, "label_names")
+            if label_names != existing_label_names:
+                requested = ", ".join(label_names)
+                existing_joined = ", ".join(existing_label_names)
+                raise ValueError(
+                    f"Memoryset {name} already exists with label names [{existing_joined}] "
+                    f"(requested: [{requested}])."
+                )
+        if embedding_model is not None and embedding_model != existing.embedding_model:
+            existing_model = existing.embedding_model
+            existing_model_name = getattr(existing_model, "name", getattr(existing_model, "path", str(existing_model)))
+            requested_name = getattr(embedding_model, "name", getattr(embedding_model, "path", str(embedding_model)))
+            raise ValueError(
+                f"Memoryset {name} already exists with embedding_model {existing_model_name} "
+                f"(requested: {requested_name})."
+            )
+        return existing
     @classmethod
     def create(
@@ -793,15 +789,14 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         if label_column is None and score_column is None:
             raise ValueError("label_column or score_column must be provided")
-        if cls.exists(name):
-            if if_exists == "error":
-                raise ValueError(f"Memoryset with name {name} already exists")
-            elif if_exists == "open":
-                existing = cls.open(name)
-                for attribute in {"label_names", "embedding_model"}:
-                    if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
-                        raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
-                return existing
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=label_names,
+            embedding_model=embedding_model,
+        )
+        if existing is not None:
+            return existing
         payload: CreateMemorysetRequest = {
             "name": name,
@@ -826,7 +821,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
         else:
             raise ValueError("Invalid embedding model")
-        response = orca_api.POST("/memoryset", json=payload)
+        client = OrcaClient._resolve_client()
+        response = client.POST("/memoryset", json=payload)
         job = Job(response["insertion_task_id"], lambda: cls.open(response["id"]))
         return job if background else job.result()
@@ -862,6 +858,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
         datasource = Datasource.from_hf_dataset(
             f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
         )
@@ -926,6 +932,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
         datasource = Datasource.from_pytorch(
             f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
         )
@@ -990,6 +1006,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             ...     {"value": "world", "label": 1, "tag": "tag2"},
             ... ])
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
         datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)
@@ -1053,6 +1079,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             ...     "tag": ["tag1", "tag2"],
             ... })
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
         datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)
@@ -1109,6 +1145,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
         datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)
@@ -1165,6 +1211,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
         datasource = Datasource.from_arrow(
             f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
         )
@@ -1230,6 +1286,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         Returns:
             Handle to the new memoryset in the OrcaCloud
         """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
         datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
         return cls.create(name, datasource, **kwargs)
@@ -1248,7 +1314,26 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         Raises:
             LookupError: If the memoryset does not exist
         """
-        metadata = orca_api.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
+        client = OrcaClient._resolve_client()
+        metadata = client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
+        return cls(metadata)
+    @classmethod
+    async def aopen(cls, name: str) -> Self:
+        """
+        Asynchronously get a handle to a memoryset in the OrcaCloud
+        Params:
+            name: Name or unique identifier of the memoryset
+        Returns:
+            Handle to the existing memoryset in the OrcaCloud
+        Raises:
+            LookupError: If the memoryset does not exist
+        """
+        client = OrcaAsyncClient._resolve_client()
+        metadata = await client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
         return cls(metadata)
     @classmethod
@@ -1279,9 +1364,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         Returns:
             List of handles to all memorysets in the OrcaCloud
         """
+        client = OrcaClient._resolve_client()
         return [
             cls(metadata)
-            for metadata in orca_api.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
+            for metadata in client.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
         ]
     @classmethod
@@ -1298,7 +1384,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             LookupError: If the memoryset does not exist and if_not_exists is `"error"`
         """
         try:
-            orca_api.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
+            client = OrcaClient._resolve_client()
+            client.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
             logging.info(f"Deleted memoryset {name_or_id}")
         except LookupError:
             if if_not_exists == "error":
@@ -1333,7 +1420,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         if hidden is not UNSET:
             payload["hidden"] = hidden
-        orca_api.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
+        client = OrcaClient._resolve_client()
+        client.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
         self.refresh()
     @overload
@@ -1425,7 +1513,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         elif isinstance(embedding_model, FinetunedEmbeddingModel):
             payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
-        metadata = orca_api.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
+        client = OrcaClient._resolve_client()
+        metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
         job = Job(
             metadata["insertion_task_id"],
             lambda: self.open(metadata["id"]),
@@ -1556,7 +1645,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                 ],
             ]
         """
-        response = orca_api.POST(
+        client = OrcaClient._resolve_client()
+        response = client.POST(
             "/gpu/memoryset/{name_or_id}/lookup",
             params={"name_or_id": self.id},
             json={
@@ -1613,7 +1703,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         ]
         if with_feedback_metrics:
-            response = orca_api.POST(
+            client = OrcaClient._resolve_client()
+            response = client.POST(
                 "/telemetry/memories",
                 json={
                     "memoryset_id": self.id,
@@ -1637,7 +1728,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         if sort:
             logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")
-        response = orca_api.POST(
+        client = OrcaClient._resolve_client()
+        response = client.POST(
             "/memoryset/{name_or_id}/memories",
             params={"name_or_id": self.id},
             json={
@@ -1698,19 +1790,74 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             ...     {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
             ... ])
         """
-        orca_api.POST(
-            "/gpu/memoryset/{name_or_id}/memory",
-            params={"name_or_id": self.id},
-            json=cast(
-                list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
-                [
-                    _parse_memory_insert(memory, type=self.memory_type)
-                    for memory in (cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else items)
-                ],
-            ),
-        )
+        client = OrcaClient._resolve_client()
+        items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
+        # insert memories in batches to avoid API timeouts
+        for i in range(0, len(items), self._batch_size):
+            batch = items[i : i + self._batch_size]
+            client.POST(
+                "/gpu/memoryset/{name_or_id}/memory",
+                params={"name_or_id": self.id},
+                json=cast(
+                    list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
+                    [_parse_memory_insert(item, type=self.memory_type) for item in batch],
+                ),
+            )
         self.refresh()
+    async def ainsert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
+        """
+        Asynchronously insert memories into the memoryset
+        Params:
+            items: List of memories to insert into the memoryset. This should be a list of
+                dictionaries with the following keys:
+                - `value`: Value of the memory
+                - `label`: Label of the memory
+                - `score`: Score of the memory
+                - `source_id`: Optional unique ID of the memory in a system of reference
+                - `...`: Any other metadata to store for the memory
+        Examples:
+            >>> await memoryset.ainsert([
+            ...     {"value": "I am happy", "label": 1, "source_id": "user_123", "tag": "happy"},
+            ...     {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
+            ... ])
+        """
+        client = OrcaAsyncClient._resolve_client()
+        items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
+        # insert memories in batches to avoid API timeouts
+        for i in range(0, len(items), self._batch_size):
+            batch = items[i : i + self._batch_size]
+            await client.POST(
+                "/gpu/memoryset/{name_or_id}/memory",
+                params={"name_or_id": self.id},
+                json=cast(
+                    list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
+                    [_parse_memory_insert(item, type=self.memory_type) for item in batch],
+                ),
+            )
+        await self.arefresh()
+    async def arefresh(self, throttle: float = 0):
+        """
+        Asynchronously refresh the information about the memoryset from the OrcaCloud
+        Params:
+            throttle: Minimum time in seconds between refreshes
+        """
+        current_time = datetime.now()
+        # Skip refresh if last refresh was too recent
+        if (current_time - self._last_refresh) < timedelta(seconds=throttle):
+            return
+        refreshed_memoryset = await type(self).aopen(self.id)
+        self.__dict__.update(refreshed_memoryset.__dict__)
+        self._last_refresh = current_time
     @overload
     def get(self, memory_id: str) -> MemoryT:  # type: ignore -- this takes precedence
         pass
@@ -1748,7 +1895,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             ]
         """
         if isinstance(memory_id, str):
-            response = orca_api.GET(
+            client = OrcaClient._resolve_client()
+            response = client.GET(
                 "/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
             )
             return cast(
@@ -1756,7 +1904,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                 (LabeledMemory(self.id, response) if "label" in response else ScoredMemory(self.id, response)),
             )
         else:
-            response = orca_api.POST(
+            client = OrcaClient._resolve_client()
+            response = client.POST(
                 "/memoryset/{name_or_id}/memories/get",
                 params={"name_or_id": self.id},
                 json={"memory_ids": list(memory_id)},
@@ -1809,24 +1958,28 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             ...     for m in memoryset.query(filters=[("tag", "==", "happy")])
             ... )
         """
-        response = orca_api.PATCH(
-            "/gpu/memoryset/{name_or_id}/memories",
-            params={"name_or_id": self.id},
-            json=cast(
-                list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
-                [
-                    _parse_memory_update(update, type=self.memory_type)
-                    for update in (cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else updates)
-                ],
-            ),
-        )
-        updated_memories = [
-            cast(
-                MemoryT,
-                (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
+        client = OrcaClient._resolve_client()
+        updates_list = cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else list(updates)
+        # update memories in batches to avoid API timeouts
+        updated_memories: list[MemoryT] = []
+        for i in range(0, len(updates_list), self._batch_size):
+            batch = updates_list[i : i + self._batch_size]
+            response = client.PATCH(
+                "/gpu/memoryset/{name_or_id}/memories",
+                params={"name_or_id": self.id},
+                json=cast(
+                    list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
+                    [_parse_memory_update(update, type=self.memory_type) for update in batch],
+                ),
             )
-            for memory in response
-        ]
+            updated_memories.extend(
+                cast(
+                    MemoryT,
+                    (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
+                )
+                for memory in response
+            )
         return updated_memories[0] if isinstance(updates, dict) else updated_memories
     def get_cascading_edits_suggestions(
@@ -1869,7 +2022,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
         """
         # TODO: properly integrate this with memory edits and return something that can be applied
-        return orca_api.POST(
+        client = OrcaClient._resolve_client()
+        return client.POST(
             "/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
             params={"name_or_id": self.id, "memory_id": memory.memory_id},
             json={
@@ -1903,10 +2057,14 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             ... )
         """
+        client = OrcaClient._resolve_client()
         memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
-        orca_api.POST(
-            "/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": memory_ids}
-        )
+        # delete memories in batches to avoid API timeouts
+        for i in range(0, len(memory_ids), self._batch_size):
+            batch = memory_ids[i : i + self._batch_size]
+            client.POST(
+                "/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": batch}
+            )
         logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
         self.refresh()
@@ -1951,7 +2109,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                 - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
                 - **`"cluster"`**: Cluster the memories in the memoryset
                 - **`"label"`**: Analyze the labels to find potential mislabelings
-                - **`"neighbor"`**: Analyze the neighbors to populate anomaly scores
+                - **`"distribution"`**: Analyze the embedding distribution to populate
                 - **`"projection"`**: Create a 2D projection of the embeddings for visualization
             lookup_count: Number of memories to lookup for each memory in the memoryset
@@ -2017,7 +2175,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                     raise ValueError(error_msg)
                 configs[name] = analysis
-        analysis = orca_api.POST(
+        client = OrcaClient._resolve_client()
+        analysis = client.POST(
             "/memoryset/{name_or_id}/analysis",
             params={"name_or_id": self.id},
             json={
@@ -2026,134 +2185,186 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                 "clear_metrics": clear_metrics,
             },
         )
-        job = Job(
-            analysis["task_id"],
-            lambda: orca_api.GET(
+        def get_analysis_result():
+            client = OrcaClient._resolve_client()
+            return client.GET(
                 "/memoryset/{name_or_id}/analysis/{analysis_task_id}",
                 params={"name_or_id": self.id, "analysis_task_id": analysis["task_id"]},
-            )["results"],
-        )
+            )["results"]
+        job = Job(analysis["task_id"], get_analysis_result)
         return job if background else job.result()
     def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
         """Group potential duplicates in the memoryset"""
-        response = orca_api.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
+        client = OrcaClient._resolve_client()
+        response = client.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
         return [
             [cast(MemoryT, LabeledMemory(self.id, m) if "label" in m else ScoredMemory(self.id, m)) for m in ms]
             for ms in response
         ]
+class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
+    """
+    A Handle to a collection of memories with labels in the OrcaCloud
+    Attributes:
+        id: Unique identifier for the memoryset
+        name: Unique name of the memoryset
+        description: Description of the memoryset
+        label_names: Names for the class labels in the memoryset
+        length: Number of memories in the memoryset
+        embedding_model: Embedding model used to embed the memory values for semantic search
+        created_at: When the memoryset was created, automatically generated on create
+        updated_at: When the memoryset was last updated, automatically updated on updates
+    """
+    label_names: list[str]
+    memory_type: MemoryType = "LABELED"
+    def __init__(self, metadata: MemorysetMetadata):
+        super().__init__(metadata)
+        assert metadata["label_names"] is not None
+        self.label_names = metadata["label_names"]
+    def __eq__(self, other) -> bool:
+        return isinstance(other, LabeledMemoryset) and self.id == other.id
     @overload
-    @staticmethod
-    def run_embedding_evaluation(
+    @classmethod
+    def create(
+        cls,
+        name: str,
         datasource: Datasource,
         *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         label_column: str = "label",
         source_id_column: str | None = None,
-        neighbor_count: int = 5,
-        embedding_models: list[str] | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
         background: Literal[True],
-    ) -> Job[list[EmbeddingModelResult]]:
+        hidden: bool = False,
+    ) -> Job[Self]:
         pass
     @overload
-    @staticmethod
-    def run_embedding_evaluation(
+    @classmethod
+    def create(
+        cls,
+        name: str,
         datasource: Datasource,
         *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         label_column: str = "label",
         source_id_column: str | None = None,
-        neighbor_count: int = 5,
-        embedding_models: list[str] | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
         background: Literal[False] = False,
-    ) -> list[EmbeddingModelResult]:
+        hidden: bool = False,
+    ) -> Self:
         pass
-    @staticmethod
-    def run_embedding_evaluation(
+    @classmethod
+    def create(  # type: ignore[override]
+        cls,
+        name: str,
         datasource: Datasource,
         *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         label_column: str = "label",
         source_id_column: str | None = None,
-        neighbor_count: int = 5,
-        embedding_models: list[str] | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
         background: bool = False,
-    ) -> Job[list[EmbeddingModelResult]] | list[EmbeddingModelResult]:
+        hidden: bool = False,
+    ) -> Self | Job[Self]:
         """
-        Test the quality of embeddings for the datasource by computing metrics such as prediction accuracy.
+        Create a new labeled memoryset in the OrcaCloud
+        All columns from the datasource that are not specified in the `value_column`,
+        `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
         Params:
-            datasource: The datasource to run the embedding evaluation on
+            name: Name for the new memoryset (must be unique)
+            datasource: Source data to populate the memories in the memoryset
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
             value_column: Name of the column in the datasource that contains the memory values
             label_column: Name of the column in the datasource that contains the memory labels,
                 these must be contiguous integers starting from 0
             source_id_column: Optional name of the column in the datasource that contains the ids in
                 the system of reference
-            neighbor_count: The number of neighbors to select for prediction
-            embedding_models: Optional list of embedding model keys to evaluate, if not provided all
-                available embedding models will be used
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset not the
+                datasource or the embedding model.
+            label_names: List of human-readable names for the labels in the memoryset, must match
+                the number of labels in the `label_column`. Will be automatically inferred if a
+                [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
+                labels is used as the datasource
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation without blocking and return a job handle
+            hidden: Whether the memoryset should be hidden
         Returns:
-            A dictionary containing the results of the embedding evaluation
-        """
+            Handle to the new memoryset in the OrcaCloud
-        response = orca_api.POST(
-            "/datasource/{name_or_id}/embedding_evaluation",
-            params={"name_or_id": datasource.id},
-            json={
-                "value_column": value_column,
-                "label_column": label_column,
-                "source_id_column": source_id_column,
-                "neighbor_count": neighbor_count,
-                "embedding_models": embedding_models,
-            },
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        return super().create(
+            name,
+            datasource,
+            label_column=label_column,
+            score_column=None,
+            embedding_model=embedding_model,
+            value_column=value_column,
+            source_id_column=source_id_column,
+            description=description,
+            label_names=label_names,
+            max_seq_length_override=max_seq_length_override,
+            prompt=prompt,
+            remove_duplicates=remove_duplicates,
+            index_type=index_type,
+            index_params=index_params,
+            if_exists=if_exists,
+            background=background,
+            hidden=hidden,
         )
-        def get_value() -> list[EmbeddingModelResult]:
-            res = orca_api.GET(
-                "/datasource/{name_or_id}/embedding_evaluation/{task_id}",
-                params={"name_or_id": datasource.id, "task_id": response["task_id"]},
-            )
-            assert res["result"] is not None
-            return res["result"]["evaluation_results"]
-        job = Job(response["task_id"], get_value)
-        return job if background else job.result()
-class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
-    """
-    A Handle to a collection of memories with labels in the OrcaCloud
-    Attributes:
-        id: Unique identifier for the memoryset
-        name: Unique name of the memoryset
-        description: Description of the memoryset
-        label_names: Names for the class labels in the memoryset
-        length: Number of memories in the memoryset
-        embedding_model: Embedding model used to embed the memory values for semantic search
-        created_at: When the memoryset was created, automatically generated on create
-        updated_at: When the memoryset was last updated, automatically updated on updates
-    """
-    label_names: list[str]
-    memory_type: MemoryType = "LABELED"
-    def __init__(self, metadata: MemorysetMetadata):
-        super().__init__(metadata)
-        assert metadata["label_names"] is not None
-        self.label_names = metadata["label_names"]
-    def __eq__(self, other) -> bool:
-        return isinstance(other, LabeledMemoryset) and self.id == other.id
-    @classmethod
-    def create(cls, name: str, datasource: Datasource, *, label_column: str | None = "label", **kwargs):
-        return super().create(name, datasource, label_column=label_column, score_column=None, **kwargs)
     def display_label_analysis(self):
         """
         Display an interactive UI to review and act upon the label analysis results
@@ -2185,6 +2396,125 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
     def __eq__(self, other) -> bool:
         return isinstance(other, ScoredMemoryset) and self.id == other.id
+    @overload
     @classmethod
-    def create(cls, name: str, datasource: Datasource, *, score_column: str | None = "score", **kwargs):
-        return super().create(name, datasource, score_column=score_column, label_column=None, **kwargs)
+    def create(
+        cls,
+        name: str,
+        datasource: Datasource,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        score_column: str = "score",
+        source_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[True],
+        hidden: bool = False,
+    ) -> Job[Self]:
+        pass
+    @overload
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        datasource: Datasource,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        score_column: str = "score",
+        value_column: str = "value",
+        source_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[False] = False,
+        hidden: bool = False,
+    ) -> Self:
+        pass
+    @classmethod
+    def create(  # type: ignore[override]
+        cls,
+        name: str,
+        datasource: Datasource,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        score_column: str = "score",
+        source_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: bool = False,
+        hidden: bool = False,
+    ) -> Self | Job[Self]:
+        """
+        Create a new scored memoryset in the OrcaCloud
+        All columns from the datasource that are not specified in the `value_column`,
+        `score_column`, or `source_id_column` will be stored as metadata in the memoryset.
+        Params:
+            name: Name for the new memoryset (must be unique)
+            datasource: Source data to populate the memories in the memoryset
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
+            value_column: Name of the column in the datasource that contains the memory values
+            score_column: Name of the column in the datasource that contains the memory scores
+            source_id_column: Optional name of the column in the datasource that contains the ids in
+                the system of reference
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset not the
+                datasource or the embedding model.
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation without blocking and return a job handle
+            hidden: Whether the memoryset should be hidden
+        Returns:
+            Handle to the new memoryset in the OrcaCloud
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        return super().create(
+            name,
+            datasource,
+            embedding_model=embedding_model,
+            value_column=value_column,
+            score_column=score_column,
+            source_id_column=source_id_column,
+            description=description,
+            max_seq_length_override=max_seq_length_override,
+            prompt=prompt,
+            remove_duplicates=remove_duplicates,
+            index_type=index_type,
+            index_params=index_params,
+            if_exists=if_exists,
+            background=background,
+            hidden=hidden,
+        )

orca-sdk 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

orca-sdk 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl