orca-sdk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
orca_sdk/memoryset.py CHANGED
@@ -4,7 +4,17 @@ import logging
4
4
  from abc import ABC
5
5
  from datetime import datetime, timedelta
6
6
  from os import PathLike
7
- from typing import Any, Generic, Iterable, Literal, Self, TypeVar, cast, overload
7
+ from typing import (
8
+ TYPE_CHECKING,
9
+ Any,
10
+ Generic,
11
+ Iterable,
12
+ Literal,
13
+ Self,
14
+ TypeVar,
15
+ cast,
16
+ overload,
17
+ )
8
18
 
9
19
  import pandas as pd
10
20
  import pyarrow as pa
@@ -13,11 +23,11 @@ from torch.utils.data import DataLoader as TorchDataLoader
13
23
  from torch.utils.data import Dataset as TorchDataset
14
24
 
15
25
  from ._utils.common import UNSET, CreateMode, DropMode
26
+ from .async_client import OrcaAsyncClient
16
27
  from .client import (
17
28
  CascadingEditSuggestion,
18
29
  CloneMemorysetRequest,
19
30
  CreateMemorysetRequest,
20
- EmbeddingModelResult,
21
31
  FilterItem,
22
32
  )
23
33
  from .client import LabeledMemory as LabeledMemoryResponse
@@ -29,12 +39,15 @@ from .client import (
29
39
  LabeledMemoryUpdate,
30
40
  LabeledMemoryWithFeedbackMetrics,
31
41
  LabelPredictionMemoryLookup,
42
+ LabelPredictionWithMemoriesAndFeedback,
32
43
  MemoryMetrics,
33
44
  MemorysetAnalysisConfigs,
34
45
  MemorysetMetadata,
35
46
  MemorysetMetrics,
36
47
  MemorysetUpdate,
37
48
  MemoryType,
49
+ OrcaClient,
50
+ PredictionFeedback,
38
51
  )
39
52
  from .client import ScoredMemory as ScoredMemoryResponse
40
53
  from .client import (
@@ -45,9 +58,9 @@ from .client import (
45
58
  ScoredMemoryUpdate,
46
59
  ScoredMemoryWithFeedbackMetrics,
47
60
  ScorePredictionMemoryLookup,
61
+ ScorePredictionWithMemoriesAndFeedback,
48
62
  TelemetryFilterItem,
49
63
  TelemetrySortOptions,
50
- orca_api,
51
64
  )
52
65
  from .datasource import Datasource
53
66
  from .embedding_model import (
@@ -56,6 +69,11 @@ from .embedding_model import (
56
69
  PretrainedEmbeddingModel,
57
70
  )
58
71
  from .job import Job, Status
72
+ from .telemetry import ClassificationPrediction, RegressionPrediction
73
+
74
+ if TYPE_CHECKING:
75
+ from .classification_model import ClassificationModel
76
+ from .regression_model import RegressionModel
59
77
 
60
78
  TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
61
79
  """
@@ -74,7 +92,7 @@ FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "lik
74
92
  Operations that can be used in a filter expression.
75
93
  """
76
94
 
77
- FilterValue = str | int | float | bool | datetime | None | list[str] | list[int] | list[float] | list[bool]
95
+ FilterValue = str | int | float | bool | datetime | None | list[str | None] | list[int] | list[float] | list[bool]
78
96
  """
79
97
  Values that can be used in a filter expression.
80
98
  """
@@ -292,6 +310,110 @@ class MemoryBase(ABC):
292
310
  raise AttributeError(f"{key} is not a valid attribute")
293
311
  return self.metadata[key]
294
312
 
313
+ def _convert_to_classification_prediction(
314
+ self,
315
+ prediction: LabelPredictionWithMemoriesAndFeedback,
316
+ *,
317
+ memoryset: LabeledMemoryset,
318
+ model: ClassificationModel,
319
+ ) -> ClassificationPrediction:
320
+ """
321
+ Convert internal prediction TypedDict to ClassificationPrediction object.
322
+ """
323
+ input_value = prediction.get("input_value")
324
+ input_value_str: str | None = None
325
+ if input_value is not None:
326
+ input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value
327
+
328
+ return ClassificationPrediction(
329
+ prediction_id=prediction["prediction_id"],
330
+ label=prediction.get("label"),
331
+ label_name=prediction.get("label_name"),
332
+ score=None,
333
+ confidence=prediction["confidence"],
334
+ anomaly_score=prediction["anomaly_score"],
335
+ memoryset=memoryset,
336
+ model=model,
337
+ telemetry=prediction,
338
+ logits=prediction.get("logits"),
339
+ input_value=input_value_str,
340
+ )
341
+
342
+ def _convert_to_regression_prediction(
343
+ self,
344
+ prediction: ScorePredictionWithMemoriesAndFeedback,
345
+ *,
346
+ memoryset: ScoredMemoryset,
347
+ model: RegressionModel,
348
+ ) -> RegressionPrediction:
349
+ """
350
+ Convert internal prediction TypedDict to RegressionPrediction object.
351
+ """
352
+ input_value = prediction.get("input_value")
353
+ input_value_str: str | None = None
354
+ if input_value is not None:
355
+ input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value
356
+
357
+ return RegressionPrediction(
358
+ prediction_id=prediction["prediction_id"],
359
+ label=None,
360
+ label_name=None,
361
+ score=prediction.get("score"),
362
+ confidence=prediction["confidence"],
363
+ anomaly_score=prediction["anomaly_score"],
364
+ memoryset=memoryset,
365
+ model=model,
366
+ telemetry=prediction,
367
+ logits=None,
368
+ input_value=input_value_str,
369
+ )
370
+
371
+ def feedback(self) -> dict[str, list[bool] | list[float]]:
372
+ """
373
+ Get feedback metrics computed from predictions that used this memory.
374
+
375
+ Returns a dictionary where:
376
+ - Keys are feedback category names
377
+ - Values are lists of feedback values (you may want to look at mean on the raw data)
378
+ """
379
+ # Collect all feedbacks by category, paginating through all predictions
380
+ feedback_by_category: dict[str, list[bool] | list[float]] = {}
381
+ batch_size = 500
382
+ offset = 0
383
+
384
+ while True:
385
+ predictions_batch = self.predictions(limit=batch_size, offset=offset)
386
+
387
+ if not predictions_batch:
388
+ break
389
+
390
+ for prediction in predictions_batch:
391
+ telemetry = prediction._telemetry
392
+ if "feedbacks" not in telemetry:
393
+ continue
394
+
395
+ for fb in telemetry["feedbacks"]:
396
+ category_name = fb["category_name"]
397
+ value = fb["value"]
398
+ # Convert BINARY (1/0) to boolean, CONTINUOUS to float
399
+ if fb["category_type"] == "BINARY":
400
+ value = bool(value)
401
+ if category_name not in feedback_by_category:
402
+ feedback_by_category[category_name] = []
403
+ cast(list[bool], feedback_by_category[category_name]).append(value)
404
+ else:
405
+ value = float(value)
406
+ if category_name not in feedback_by_category:
407
+ feedback_by_category[category_name] = []
408
+ cast(list[float], feedback_by_category[category_name]).append(value)
409
+
410
+ if len(predictions_batch) < batch_size:
411
+ break
412
+
413
+ offset += batch_size
414
+
415
+ return feedback_by_category
416
+
295
417
  def _update(
296
418
  self,
297
419
  *,
@@ -299,7 +421,8 @@ class MemoryBase(ABC):
299
421
  source_id: str | None = UNSET,
300
422
  **metadata: None | bool | float | int | str,
301
423
  ) -> Self:
302
- response = orca_api.PATCH(
424
+ client = OrcaClient._resolve_client()
425
+ response = client.PATCH(
303
426
  "/gpu/memoryset/{name_or_id}/memory",
304
427
  params={"name_or_id": self.memoryset_id},
305
428
  json=_parse_memory_update(
@@ -415,6 +538,75 @@ class LabeledMemory(MemoryBase):
415
538
  self._update(value=value, label=label, source_id=source_id, **metadata)
416
539
  return self
417
540
 
541
+ def predictions(
542
+ self,
543
+ limit: int = 100,
544
+ offset: int = 0,
545
+ tag: str | None = None,
546
+ sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
547
+ expected_label_match: bool | None = None,
548
+ ) -> list[ClassificationPrediction]:
549
+ """
550
+ Get classification predictions that used this memory.
551
+
552
+ Args:
553
+ limit: Maximum number of predictions to return (default: 100)
554
+ offset: Number of predictions to skip for pagination (default: 0)
555
+ tag: Optional tag filter to only include predictions with this tag
556
+ sort: List of (field, direction) tuples for sorting results.
557
+ Valid fields: "anomaly_score", "confidence", "timestamp".
558
+ Valid directions: "asc", "desc"
559
+ expected_label_match: Filter by prediction correctness:
560
+ - True: only return correct predictions (label == expected_label)
561
+ - False: only return incorrect predictions (label != expected_label)
562
+ - None: return all predictions (default)
563
+
564
+ Returns:
565
+ List of ClassificationPrediction objects that used this memory
566
+ """
567
+
568
+ client = OrcaClient._resolve_client()
569
+ predictions_data = client.POST(
570
+ "/telemetry/prediction",
571
+ json={
572
+ "memory_id": self.memory_id,
573
+ "limit": limit,
574
+ "offset": offset,
575
+ "sort": [list(sort_item) for sort_item in sort],
576
+ "tag": tag,
577
+ "expected_label_match": expected_label_match,
578
+ },
579
+ )
580
+
581
+ # Filter to only classification predictions and convert to ClassificationPrediction objects
582
+ classification_predictions = [
583
+ cast(LabelPredictionWithMemoriesAndFeedback, p) for p in predictions_data if "label" in p
584
+ ]
585
+
586
+ from .classification_model import ClassificationModel
587
+
588
+ memorysets: dict[str, LabeledMemoryset] = {}
589
+ models: dict[str, ClassificationModel] = {}
590
+
591
+ def resolve_memoryset(memoryset_id: str) -> LabeledMemoryset:
592
+ if memoryset_id not in memorysets:
593
+ memorysets[memoryset_id] = LabeledMemoryset.open(memoryset_id)
594
+ return memorysets[memoryset_id]
595
+
596
+ def resolve_model(model_id: str) -> ClassificationModel:
597
+ if model_id not in models:
598
+ models[model_id] = ClassificationModel.open(model_id)
599
+ return models[model_id]
600
+
601
+ return [
602
+ self._convert_to_classification_prediction(
603
+ p,
604
+ memoryset=resolve_memoryset(p["memoryset_id"]),
605
+ model=resolve_model(p["model_id"]),
606
+ )
607
+ for p in classification_predictions
608
+ ]
609
+
418
610
  def to_dict(self) -> dict[str, Any]:
419
611
  """
420
612
  Convert the memory to a dictionary
@@ -456,7 +648,11 @@ class LabeledMemoryLookup(LabeledMemory):
456
648
  lookup_score: float
457
649
  attention_weight: float | None
458
650
 
459
- def __init__(self, memoryset_id: str, memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup):
651
+ def __init__(
652
+ self,
653
+ memoryset_id: str,
654
+ memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
655
+ ):
460
656
  # for internal use only, do not document
461
657
  super().__init__(memoryset_id, memory_lookup)
462
658
  self.lookup_score = memory_lookup["lookup_score"]
@@ -552,6 +748,75 @@ class ScoredMemory(MemoryBase):
552
748
  self._update(value=value, score=score, source_id=source_id, **metadata)
553
749
  return self
554
750
 
751
+ def predictions(
752
+ self,
753
+ limit: int = 100,
754
+ offset: int = 0,
755
+ tag: str | None = None,
756
+ sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
757
+ expected_label_match: bool | None = None,
758
+ ) -> list[RegressionPrediction]:
759
+ """
760
+ Get regression predictions that used this memory.
761
+
762
+ Args:
763
+ limit: Maximum number of predictions to return (default: 100)
764
+ offset: Number of predictions to skip for pagination (default: 0)
765
+ tag: Optional tag filter to only include predictions with this tag
766
+ sort: List of (field, direction) tuples for sorting results.
767
+ Valid fields: "anomaly_score", "confidence", "timestamp".
768
+ Valid directions: "asc", "desc"
769
+ expected_label_match: Filter by prediction correctness:
770
+ - True: only return correct predictions (score close to expected_score)
771
+ - False: only return incorrect predictions (score differs from expected_score)
772
+ - None: return all predictions (default)
773
+ Note: For regression, "correctness" is based on score proximity to expected_score.
774
+
775
+ Returns:
776
+ List of RegressionPrediction objects that used this memory
777
+ """
778
+ client = OrcaClient._resolve_client()
779
+ predictions_data = client.POST(
780
+ "/telemetry/prediction",
781
+ json={
782
+ "memory_id": self.memory_id,
783
+ "limit": limit,
784
+ "offset": offset,
785
+ "sort": [list(sort_item) for sort_item in sort],
786
+ "tag": tag,
787
+ "expected_label_match": expected_label_match,
788
+ },
789
+ )
790
+
791
+ # Filter to only regression predictions and convert to RegressionPrediction objects
792
+ regression_predictions = [
793
+ cast(ScorePredictionWithMemoriesAndFeedback, p) for p in predictions_data if "score" in p
794
+ ]
795
+
796
+ from .regression_model import RegressionModel
797
+
798
+ memorysets: dict[str, ScoredMemoryset] = {}
799
+ models: dict[str, RegressionModel] = {}
800
+
801
+ def resolve_memoryset(memoryset_id: str) -> ScoredMemoryset:
802
+ if memoryset_id not in memorysets:
803
+ memorysets[memoryset_id] = ScoredMemoryset.open(memoryset_id)
804
+ return memorysets[memoryset_id]
805
+
806
+ def resolve_model(model_id: str) -> RegressionModel:
807
+ if model_id not in models:
808
+ models[model_id] = RegressionModel.open(model_id)
809
+ return models[model_id]
810
+
811
+ return [
812
+ self._convert_to_regression_prediction(
813
+ p,
814
+ memoryset=resolve_memoryset(p["memoryset_id"]),
815
+ model=resolve_model(p["model_id"]),
816
+ )
817
+ for p in regression_predictions
818
+ ]
819
+
555
820
  def to_dict(self) -> dict[str, Any]:
556
821
  """
557
822
  Convert the memory to a dictionary
@@ -588,7 +853,11 @@ class ScoredMemoryLookup(ScoredMemory):
588
853
  lookup_score: float
589
854
  attention_weight: float | None
590
855
 
591
- def __init__(self, memoryset_id: str, memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup):
856
+ def __init__(
857
+ self,
858
+ memoryset_id: str,
859
+ memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup,
860
+ ):
592
861
  # for internal use only, do not document
593
862
  super().__init__(memoryset_id, memory_lookup)
594
863
  self.lookup_score = memory_lookup["lookup_score"]
@@ -637,6 +906,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
637
906
  index_params: dict[str, Any]
638
907
  hidden: bool
639
908
 
909
+ _batch_size = 32 # max number of memories to insert/update/delete in a single API call
910
+
640
911
  def __init__(self, metadata: MemorysetMetadata):
641
912
  # for internal use only, do not document
642
913
  if metadata["pretrained_embedding_model_name"]:
@@ -670,55 +941,48 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
670
941
  "})"
671
942
  )
672
943
 
673
- @overload
674
944
  @classmethod
675
- def create(
945
+ def _handle_if_exists(
676
946
  cls,
677
947
  name: str,
678
- datasource: Datasource,
679
948
  *,
680
- embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
681
- value_column: str = "value",
682
- label_column: str | None = None,
683
- score_column: str | None = None,
684
- source_id_column: str | None = None,
685
- description: str | None = None,
686
- label_names: list[str] | None = None,
687
- max_seq_length_override: int | None = None,
688
- prompt: str | None = None,
689
- remove_duplicates: bool = True,
690
- index_type: IndexType = "FLAT",
691
- index_params: dict[str, Any] = {},
692
- if_exists: CreateMode = "error",
693
- background: Literal[True],
694
- hidden: bool = False,
695
- ) -> Job[Self]:
696
- pass
949
+ if_exists: CreateMode,
950
+ label_names: list[str] | None,
951
+ embedding_model: PretrainedEmbeddingModel | FinetunedEmbeddingModel | None,
952
+ ) -> Self | None:
953
+ """
954
+ Handle common `if_exists` logic shared by all creator-style helpers.
697
955
 
698
- @overload
699
- @classmethod
700
- def create(
701
- cls,
702
- name: str,
703
- datasource: Datasource,
704
- *,
705
- embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
706
- value_column: str = "value",
707
- label_column: str | None = None,
708
- score_column: str | None = None,
709
- source_id_column: str | None = None,
710
- description: str | None = None,
711
- label_names: list[str] | None = None,
712
- max_seq_length_override: int | None = None,
713
- prompt: str | None = None,
714
- remove_duplicates: bool = True,
715
- index_type: IndexType = "FLAT",
716
- index_params: dict[str, Any] = {},
717
- if_exists: CreateMode = "error",
718
- background: Literal[False] = False,
719
- hidden: bool = False,
720
- ) -> Self:
721
- pass
956
+ Returns the already-existing memoryset when `if_exists == "open"`, raises for `"error"`,
957
+ and returns `None` when the memoryset does not yet exist.
958
+ """
959
+ if not cls.exists(name):
960
+ return None
961
+ if if_exists == "error":
962
+ raise ValueError(f"Memoryset with name {name} already exists")
963
+
964
+ existing = cls.open(name)
965
+
966
+ if label_names is not None and hasattr(existing, "label_names"):
967
+ existing_label_names = getattr(existing, "label_names")
968
+ if label_names != existing_label_names:
969
+ requested = ", ".join(label_names)
970
+ existing_joined = ", ".join(existing_label_names)
971
+ raise ValueError(
972
+ f"Memoryset {name} already exists with label names [{existing_joined}] "
973
+ f"(requested: [{requested}])."
974
+ )
975
+
976
+ if embedding_model is not None and embedding_model != existing.embedding_model:
977
+ existing_model = existing.embedding_model
978
+ existing_model_name = getattr(existing_model, "name", getattr(existing_model, "path", str(existing_model)))
979
+ requested_name = getattr(embedding_model, "name", getattr(embedding_model, "path", str(embedding_model)))
980
+ raise ValueError(
981
+ f"Memoryset {name} already exists with embedding_model {existing_model_name} "
982
+ f"(requested: {requested_name})."
983
+ )
984
+
985
+ return existing
722
986
 
723
987
  @classmethod
724
988
  def create(
@@ -741,6 +1005,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
741
1005
  if_exists: CreateMode = "error",
742
1006
  background: bool = False,
743
1007
  hidden: bool = False,
1008
+ subsample: int | float | None = None,
1009
+ memory_type: MemoryType | None = None,
744
1010
  ) -> Self | Job[Self]:
745
1011
  """
746
1012
  Create a new memoryset in the OrcaCloud
@@ -754,8 +1020,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
754
1020
  embedding_model: Embedding model to use for embedding memory values for semantic search.
755
1021
  If not provided, a default embedding model for the memoryset will be used.
756
1022
  value_column: Name of the column in the datasource that contains the memory values
757
- label_column: Name of the column in the datasource that contains the memory labels,
758
- these must be contiguous integers starting from 0
1023
+ label_column: Name of the column in the datasource that contains the memory labels.
1024
+ Must contain categorical values as integers or strings. String labels will be
1025
+ converted to integers with the unique strings extracted as `label_names`
759
1026
  score_column: Name of the column in the datasource that contains the memory scores
760
1027
  source_id_column: Optional name of the column in the datasource that contains the ids in
761
1028
  the system of reference
@@ -763,9 +1030,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
763
1030
  so make sure it is concise and describes the contents of your memoryset not the
764
1031
  datasource or the embedding model.
765
1032
  label_names: List of human-readable names for the labels in the memoryset, must match
766
- the number of labels in the `label_column`. Will be automatically inferred if a
767
- [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
768
- labels is used as the datasource
1033
+ the number of labels in the `label_column`. Will be automatically inferred if string
1034
+ labels are provided or if a [Dataset][datasets.Dataset] with a
1035
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
769
1036
  max_seq_length_override: Maximum sequence length of values in the memoryset, if the
770
1037
  value is longer than this it will be truncated, will default to the model's max
771
1038
  sequence length if not provided
@@ -779,7 +1046,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
779
1046
  `"error"`. Other option is `"open"` to open the existing memoryset.
780
1047
  background: Whether to run the operation non-blocking and return a job handle
781
1048
  hidden: Whether the memoryset should be hidden
782
-
1049
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
1050
+ datasource to insert. Use to limit the size of the initial memoryset.
1051
+ memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
1052
+ and `"SCORED"` if `score_column` is provided, must be specified for other cases.
783
1053
  Returns:
784
1054
  Handle to the new memoryset in the OrcaCloud
785
1055
 
@@ -790,18 +1060,14 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
790
1060
  if embedding_model is None:
791
1061
  embedding_model = PretrainedEmbeddingModel.GTE_BASE
792
1062
 
793
- if label_column is None and score_column is None:
794
- raise ValueError("label_column or score_column must be provided")
795
-
796
- if cls.exists(name):
797
- if if_exists == "error":
798
- raise ValueError(f"Memoryset with name {name} already exists")
799
- elif if_exists == "open":
800
- existing = cls.open(name)
801
- for attribute in {"label_names", "embedding_model"}:
802
- if locals()[attribute] is not None and locals()[attribute] != getattr(existing, attribute):
803
- raise ValueError(f"Memoryset with name {name} already exists with a different {attribute}.")
804
- return existing
1063
+ existing = cls._handle_if_exists(
1064
+ name,
1065
+ if_exists=if_exists,
1066
+ label_names=label_names,
1067
+ embedding_model=embedding_model,
1068
+ )
1069
+ if existing is not None:
1070
+ return existing
805
1071
 
806
1072
  payload: CreateMemorysetRequest = {
807
1073
  "name": name,
@@ -818,6 +1084,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
818
1084
  "index_params": index_params,
819
1085
  "hidden": hidden,
820
1086
  }
1087
+ if memory_type is not None:
1088
+ payload["memory_type"] = memory_type
1089
+ if subsample is not None:
1090
+ payload["subsample"] = subsample
821
1091
  if prompt is not None:
822
1092
  payload["prompt"] = prompt
823
1093
  if isinstance(embedding_model, PretrainedEmbeddingModel):
@@ -826,8 +1096,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
826
1096
  payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
827
1097
  else:
828
1098
  raise ValueError("Invalid embedding model")
829
- response = orca_api.POST("/memoryset", json=payload)
830
- job = Job(response["insertion_task_id"], lambda: cls.open(response["id"]))
1099
+ client = OrcaClient._resolve_client()
1100
+ response = client.POST("/memoryset", json=payload)
1101
+ job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
831
1102
  return job if background else job.result()
832
1103
 
833
1104
  @overload
@@ -862,6 +1133,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
862
1133
  Returns:
863
1134
  Handle to the new memoryset in the OrcaCloud
864
1135
  """
1136
+ if_exists = kwargs.get("if_exists", "error")
1137
+ existing = cls._handle_if_exists(
1138
+ name,
1139
+ if_exists=if_exists,
1140
+ label_names=kwargs.get("label_names"),
1141
+ embedding_model=kwargs.get("embedding_model"),
1142
+ )
1143
+ if existing is not None:
1144
+ return existing
1145
+
865
1146
  datasource = Datasource.from_hf_dataset(
866
1147
  f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
867
1148
  )
@@ -926,6 +1207,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
926
1207
  Returns:
927
1208
  Handle to the new memoryset in the OrcaCloud
928
1209
  """
1210
+ if_exists = kwargs.get("if_exists", "error")
1211
+ existing = cls._handle_if_exists(
1212
+ name,
1213
+ if_exists=if_exists,
1214
+ label_names=kwargs.get("label_names"),
1215
+ embedding_model=kwargs.get("embedding_model"),
1216
+ )
1217
+ if existing is not None:
1218
+ return existing
1219
+
929
1220
  datasource = Datasource.from_pytorch(
930
1221
  f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
931
1222
  )
@@ -990,6 +1281,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
990
1281
  ... {"value": "world", "label": 1, "tag": "tag2"},
991
1282
  ... ])
992
1283
  """
1284
+ if_exists = kwargs.get("if_exists", "error")
1285
+ existing = cls._handle_if_exists(
1286
+ name,
1287
+ if_exists=if_exists,
1288
+ label_names=kwargs.get("label_names"),
1289
+ embedding_model=kwargs.get("embedding_model"),
1290
+ )
1291
+ if existing is not None:
1292
+ return existing
1293
+
993
1294
  datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
994
1295
  kwargs["background"] = background
995
1296
  return cls.create(name, datasource, **kwargs)
@@ -1053,6 +1354,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1053
1354
  ... "tag": ["tag1", "tag2"],
1054
1355
  ... })
1055
1356
  """
1357
+ if_exists = kwargs.get("if_exists", "error")
1358
+ existing = cls._handle_if_exists(
1359
+ name,
1360
+ if_exists=if_exists,
1361
+ label_names=kwargs.get("label_names"),
1362
+ embedding_model=kwargs.get("embedding_model"),
1363
+ )
1364
+ if existing is not None:
1365
+ return existing
1366
+
1056
1367
  datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
1057
1368
  kwargs["background"] = background
1058
1369
  return cls.create(name, datasource, **kwargs)
@@ -1109,6 +1420,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1109
1420
  Returns:
1110
1421
  Handle to the new memoryset in the OrcaCloud
1111
1422
  """
1423
+ if_exists = kwargs.get("if_exists", "error")
1424
+ existing = cls._handle_if_exists(
1425
+ name,
1426
+ if_exists=if_exists,
1427
+ label_names=kwargs.get("label_names"),
1428
+ embedding_model=kwargs.get("embedding_model"),
1429
+ )
1430
+ if existing is not None:
1431
+ return existing
1432
+
1112
1433
  datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
1113
1434
  kwargs["background"] = background
1114
1435
  return cls.create(name, datasource, **kwargs)
@@ -1165,6 +1486,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1165
1486
  Returns:
1166
1487
  Handle to the new memoryset in the OrcaCloud
1167
1488
  """
1489
+ if_exists = kwargs.get("if_exists", "error")
1490
+ existing = cls._handle_if_exists(
1491
+ name,
1492
+ if_exists=if_exists,
1493
+ label_names=kwargs.get("label_names"),
1494
+ embedding_model=kwargs.get("embedding_model"),
1495
+ )
1496
+ if existing is not None:
1497
+ return existing
1498
+
1168
1499
  datasource = Datasource.from_arrow(
1169
1500
  f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
1170
1501
  )
@@ -1230,6 +1561,16 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1230
1561
  Returns:
1231
1562
  Handle to the new memoryset in the OrcaCloud
1232
1563
  """
1564
+ if_exists = kwargs.get("if_exists", "error")
1565
+ existing = cls._handle_if_exists(
1566
+ name,
1567
+ if_exists=if_exists,
1568
+ label_names=kwargs.get("label_names"),
1569
+ embedding_model=kwargs.get("embedding_model"),
1570
+ )
1571
+ if existing is not None:
1572
+ return existing
1573
+
1233
1574
  datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
1234
1575
  kwargs["background"] = background
1235
1576
  return cls.create(name, datasource, **kwargs)
@@ -1248,7 +1589,26 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1248
1589
  Raises:
1249
1590
  LookupError: If the memoryset does not exist
1250
1591
  """
1251
- metadata = orca_api.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
1592
+ client = OrcaClient._resolve_client()
1593
+ metadata = client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
1594
+ return cls(metadata)
1595
+
1596
+ @classmethod
1597
+ async def aopen(cls, name: str) -> Self:
1598
+ """
1599
+ Asynchronously get a handle to a memoryset in the OrcaCloud
1600
+
1601
+ Params:
1602
+ name: Name or unique identifier of the memoryset
1603
+
1604
+ Returns:
1605
+ Handle to the existing memoryset in the OrcaCloud
1606
+
1607
+ Raises:
1608
+ LookupError: If the memoryset does not exist
1609
+ """
1610
+ client = OrcaAsyncClient._resolve_client()
1611
+ metadata = await client.GET("/memoryset/{name_or_id}", params={"name_or_id": name})
1252
1612
  return cls(metadata)
1253
1613
 
1254
1614
  @classmethod
@@ -1279,9 +1639,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1279
1639
  Returns:
1280
1640
  List of handles to all memorysets in the OrcaCloud
1281
1641
  """
1642
+ client = OrcaClient._resolve_client()
1282
1643
  return [
1283
1644
  cls(metadata)
1284
- for metadata in orca_api.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
1645
+ for metadata in client.GET("/memoryset", params={"type": cls.memory_type, "show_hidden": show_hidden})
1285
1646
  ]
1286
1647
 
1287
1648
  @classmethod
@@ -1298,7 +1659,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1298
1659
  LookupError: If the memoryset does not exist and if_not_exists is `"error"`
1299
1660
  """
1300
1661
  try:
1301
- orca_api.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
1662
+ client = OrcaClient._resolve_client()
1663
+ client.DELETE("/memoryset/{name_or_id}", params={"name_or_id": name_or_id})
1302
1664
  logging.info(f"Deleted memoryset {name_or_id}")
1303
1665
  except LookupError:
1304
1666
  if if_not_exists == "error":
@@ -1333,7 +1695,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1333
1695
  if hidden is not UNSET:
1334
1696
  payload["hidden"] = hidden
1335
1697
 
1336
- orca_api.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
1698
+ client = OrcaClient._resolve_client()
1699
+ client.PATCH("/memoryset/{name_or_id}", params={"name_or_id": self.id}, json=payload)
1337
1700
  self.refresh()
1338
1701
 
1339
1702
  @overload
@@ -1425,9 +1788,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1425
1788
  elif isinstance(embedding_model, FinetunedEmbeddingModel):
1426
1789
  payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
1427
1790
 
1428
- metadata = orca_api.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
1791
+ client = OrcaClient._resolve_client()
1792
+ metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
1429
1793
  job = Job(
1430
- metadata["insertion_task_id"],
1794
+ metadata["insertion_job_id"],
1431
1795
  lambda: self.open(metadata["id"]),
1432
1796
  )
1433
1797
  return job if background else job.result()
@@ -1556,7 +1920,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1556
1920
  ],
1557
1921
  ]
1558
1922
  """
1559
- response = orca_api.POST(
1923
+ client = OrcaClient._resolve_client()
1924
+ response = client.POST(
1560
1925
  "/gpu/memoryset/{name_or_id}/lookup",
1561
1926
  params={"name_or_id": self.id},
1562
1927
  json={
@@ -1613,7 +1978,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1613
1978
  ]
1614
1979
 
1615
1980
  if with_feedback_metrics:
1616
- response = orca_api.POST(
1981
+ client = OrcaClient._resolve_client()
1982
+ response = client.POST(
1617
1983
  "/telemetry/memories",
1618
1984
  json={
1619
1985
  "memoryset_id": self.id,
@@ -1637,7 +2003,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1637
2003
  if sort:
1638
2004
  logging.warning("Sorting is not supported when with_feedback_metrics is False. Sort value will be ignored.")
1639
2005
 
1640
- response = orca_api.POST(
2006
+ client = OrcaClient._resolve_client()
2007
+ response = client.POST(
1641
2008
  "/memoryset/{name_or_id}/memories",
1642
2009
  params={"name_or_id": self.id},
1643
2010
  json={
@@ -1698,19 +2065,74 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1698
2065
  ... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
1699
2066
  ... ])
1700
2067
  """
1701
- orca_api.POST(
1702
- "/gpu/memoryset/{name_or_id}/memory",
1703
- params={"name_or_id": self.id},
1704
- json=cast(
1705
- list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
1706
- [
1707
- _parse_memory_insert(memory, type=self.memory_type)
1708
- for memory in (cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else items)
1709
- ],
1710
- ),
1711
- )
2068
+ client = OrcaClient._resolve_client()
2069
+ items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
2070
+ # insert memories in batches to avoid API timeouts
2071
+ for i in range(0, len(items), self._batch_size):
2072
+ batch = items[i : i + self._batch_size]
2073
+ client.POST(
2074
+ "/gpu/memoryset/{name_or_id}/memory",
2075
+ params={"name_or_id": self.id},
2076
+ json=cast(
2077
+ list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
2078
+ [_parse_memory_insert(item, type=self.memory_type) for item in batch],
2079
+ ),
2080
+ )
2081
+
1712
2082
  self.refresh()
1713
2083
 
2084
+ async def ainsert(self, items: Iterable[dict[str, Any]] | dict[str, Any]) -> None:
2085
+ """
2086
+ Asynchronously insert memories into the memoryset
2087
+
2088
+ Params:
2089
+ items: List of memories to insert into the memoryset. This should be a list of
2090
+ dictionaries with the following keys:
2091
+
2092
+ - `value`: Value of the memory
2093
+ - `label`: Label of the memory
2094
+ - `score`: Score of the memory
2095
+ - `source_id`: Optional unique ID of the memory in a system of reference
2096
+ - `...`: Any other metadata to store for the memory
2097
+
2098
+ Examples:
2099
+ >>> await memoryset.ainsert([
2100
+ ... {"value": "I am happy", "label": 1, "source_id": "user_123", "tag": "happy"},
2101
+ ... {"value": "I am sad", "label": 0, "source_id": "user_124", "tag": "sad"},
2102
+ ... ])
2103
+ """
2104
+ client = OrcaAsyncClient._resolve_client()
2105
+ items = cast(list[dict[str, Any]], [items]) if isinstance(items, dict) else list(items)
2106
+ # insert memories in batches to avoid API timeouts
2107
+ for i in range(0, len(items), self._batch_size):
2108
+ batch = items[i : i + self._batch_size]
2109
+ await client.POST(
2110
+ "/gpu/memoryset/{name_or_id}/memory",
2111
+ params={"name_or_id": self.id},
2112
+ json=cast(
2113
+ list[LabeledMemoryInsert] | list[ScoredMemoryInsert],
2114
+ [_parse_memory_insert(item, type=self.memory_type) for item in batch],
2115
+ ),
2116
+ )
2117
+
2118
+ await self.arefresh()
2119
+
2120
+ async def arefresh(self, throttle: float = 0):
2121
+ """
2122
+ Asynchronously refresh the information about the memoryset from the OrcaCloud
2123
+
2124
+ Params:
2125
+ throttle: Minimum time in seconds between refreshes
2126
+ """
2127
+ current_time = datetime.now()
2128
+ # Skip refresh if last refresh was too recent
2129
+ if (current_time - self._last_refresh) < timedelta(seconds=throttle):
2130
+ return
2131
+
2132
+ refreshed_memoryset = await type(self).aopen(self.id)
2133
+ self.__dict__.update(refreshed_memoryset.__dict__)
2134
+ self._last_refresh = current_time
2135
+
1714
2136
  @overload
1715
2137
  def get(self, memory_id: str) -> MemoryT: # type: ignore -- this takes precedence
1716
2138
  pass
@@ -1748,7 +2170,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1748
2170
  ]
1749
2171
  """
1750
2172
  if isinstance(memory_id, str):
1751
- response = orca_api.GET(
2173
+ client = OrcaClient._resolve_client()
2174
+ response = client.GET(
1752
2175
  "/memoryset/{name_or_id}/memory/{memory_id}", params={"name_or_id": self.id, "memory_id": memory_id}
1753
2176
  )
1754
2177
  return cast(
@@ -1756,7 +2179,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1756
2179
  (LabeledMemory(self.id, response) if "label" in response else ScoredMemory(self.id, response)),
1757
2180
  )
1758
2181
  else:
1759
- response = orca_api.POST(
2182
+ client = OrcaClient._resolve_client()
2183
+ response = client.POST(
1760
2184
  "/memoryset/{name_or_id}/memories/get",
1761
2185
  params={"name_or_id": self.id},
1762
2186
  json={"memory_ids": list(memory_id)},
@@ -1809,24 +2233,28 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1809
2233
  ... for m in memoryset.query(filters=[("tag", "==", "happy")])
1810
2234
  ... )
1811
2235
  """
1812
- response = orca_api.PATCH(
1813
- "/gpu/memoryset/{name_or_id}/memories",
1814
- params={"name_or_id": self.id},
1815
- json=cast(
1816
- list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
1817
- [
1818
- _parse_memory_update(update, type=self.memory_type)
1819
- for update in (cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else updates)
1820
- ],
1821
- ),
1822
- )
1823
- updated_memories = [
1824
- cast(
1825
- MemoryT,
1826
- (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
2236
+ client = OrcaClient._resolve_client()
2237
+ updates_list = cast(list[dict[str, Any]], [updates]) if isinstance(updates, dict) else list(updates)
2238
+ # update memories in batches to avoid API timeouts
2239
+ updated_memories: list[MemoryT] = []
2240
+ for i in range(0, len(updates_list), self._batch_size):
2241
+ batch = updates_list[i : i + self._batch_size]
2242
+ response = client.PATCH(
2243
+ "/gpu/memoryset/{name_or_id}/memories",
2244
+ params={"name_or_id": self.id},
2245
+ json=cast(
2246
+ list[LabeledMemoryUpdate] | list[ScoredMemoryUpdate],
2247
+ [_parse_memory_update(update, type=self.memory_type) for update in batch],
2248
+ ),
1827
2249
  )
1828
- for memory in response
1829
- ]
2250
+ updated_memories.extend(
2251
+ cast(
2252
+ MemoryT,
2253
+ (LabeledMemory(self.id, memory) if "label" in memory else ScoredMemory(self.id, memory)),
2254
+ )
2255
+ for memory in response
2256
+ )
2257
+
1830
2258
  return updated_memories[0] if isinstance(updates, dict) else updated_memories
1831
2259
 
1832
2260
  def get_cascading_edits_suggestions(
@@ -1869,7 +2297,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1869
2297
  A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
1870
2298
  """
1871
2299
  # TODO: properly integrate this with memory edits and return something that can be applied
1872
- return orca_api.POST(
2300
+ client = OrcaClient._resolve_client()
2301
+ return client.POST(
1873
2302
  "/memoryset/{name_or_id}/memory/{memory_id}/cascading_edits",
1874
2303
  params={"name_or_id": self.id, "memory_id": memory.memory_id},
1875
2304
  json={
@@ -1903,10 +2332,14 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1903
2332
  ... )
1904
2333
 
1905
2334
  """
2335
+ client = OrcaClient._resolve_client()
1906
2336
  memory_ids = [memory_id] if isinstance(memory_id, str) else list(memory_id)
1907
- orca_api.POST(
1908
- "/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": memory_ids}
1909
- )
2337
+ # delete memories in batches to avoid API timeouts
2338
+ for i in range(0, len(memory_ids), self._batch_size):
2339
+ batch = memory_ids[i : i + self._batch_size]
2340
+ client.POST(
2341
+ "/memoryset/{name_or_id}/memories/delete", params={"name_or_id": self.id}, json={"memory_ids": batch}
2342
+ )
1910
2343
  logging.info(f"Deleted {len(memory_ids)} memories from memoryset.")
1911
2344
  self.refresh()
1912
2345
 
@@ -1951,7 +2384,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1951
2384
  - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
1952
2385
  - **`"cluster"`**: Cluster the memories in the memoryset
1953
2386
  - **`"label"`**: Analyze the labels to find potential mislabelings
1954
- - **`"neighbor"`**: Analyze the neighbors to populate anomaly scores
2387
+ - **`"distribution"`**: Analyze the embedding distribution to populate
1955
2388
  - **`"projection"`**: Create a 2D projection of the embeddings for visualization
1956
2389
 
1957
2390
  lookup_count: Number of memories to lookup for each memory in the memoryset
@@ -2017,7 +2450,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
2017
2450
  raise ValueError(error_msg)
2018
2451
  configs[name] = analysis
2019
2452
 
2020
- analysis = orca_api.POST(
2453
+ client = OrcaClient._resolve_client()
2454
+ analysis = client.POST(
2021
2455
  "/memoryset/{name_or_id}/analysis",
2022
2456
  params={"name_or_id": self.id},
2023
2457
  json={
@@ -2026,134 +2460,193 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
2026
2460
  "clear_metrics": clear_metrics,
2027
2461
  },
2028
2462
  )
2029
- job = Job(
2030
- analysis["task_id"],
2031
- lambda: orca_api.GET(
2032
- "/memoryset/{name_or_id}/analysis/{analysis_task_id}",
2033
- params={"name_or_id": self.id, "analysis_task_id": analysis["task_id"]},
2034
- )["results"],
2035
- )
2463
+
2464
+ def get_analysis_result():
2465
+ client = OrcaClient._resolve_client()
2466
+ return client.GET(
2467
+ "/memoryset/{name_or_id}/analysis/{analysis_job_id}",
2468
+ params={"name_or_id": self.id, "analysis_job_id": analysis["job_id"]},
2469
+ )["results"]
2470
+
2471
+ job = Job(analysis["job_id"], get_analysis_result)
2036
2472
  return job if background else job.result()
2037
2473
 
2038
2474
  def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
2039
2475
  """Group potential duplicates in the memoryset"""
2040
- response = orca_api.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
2476
+ client = OrcaClient._resolve_client()
2477
+ response = client.GET("/memoryset/{name_or_id}/potential_duplicate_groups", params={"name_or_id": self.id})
2041
2478
  return [
2042
2479
  [cast(MemoryT, LabeledMemory(self.id, m) if "label" in m else ScoredMemory(self.id, m)) for m in ms]
2043
2480
  for ms in response
2044
2481
  ]
2045
2482
 
2483
+
2484
+ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2485
+ """
2486
+ A Handle to a collection of memories with labels in the OrcaCloud
2487
+
2488
+ Attributes:
2489
+ id: Unique identifier for the memoryset
2490
+ name: Unique name of the memoryset
2491
+ description: Description of the memoryset
2492
+ label_names: Names for the class labels in the memoryset
2493
+ length: Number of memories in the memoryset
2494
+ embedding_model: Embedding model used to embed the memory values for semantic search
2495
+ created_at: When the memoryset was created, automatically generated on create
2496
+ updated_at: When the memoryset was last updated, automatically updated on updates
2497
+ """
2498
+
2499
+ label_names: list[str]
2500
+ memory_type: MemoryType = "LABELED"
2501
+
2502
+ def __init__(self, metadata: MemorysetMetadata):
2503
+ super().__init__(metadata)
2504
+ assert metadata["label_names"] is not None
2505
+ self.label_names = metadata["label_names"]
2506
+
2507
+ def __eq__(self, other) -> bool:
2508
+ return isinstance(other, LabeledMemoryset) and self.id == other.id
2509
+
2046
2510
  @overload
2047
- @staticmethod
2048
- def run_embedding_evaluation(
2511
+ @classmethod
2512
+ def create(
2513
+ cls,
2514
+ name: str,
2049
2515
  datasource: Datasource,
2050
2516
  *,
2517
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2051
2518
  value_column: str = "value",
2052
- label_column: str = "label",
2519
+ label_column: str | None = "label",
2053
2520
  source_id_column: str | None = None,
2054
- neighbor_count: int = 5,
2055
- embedding_models: list[str] | None = None,
2521
+ description: str | None = None,
2522
+ label_names: list[str] | None = None,
2523
+ max_seq_length_override: int | None = None,
2524
+ prompt: str | None = None,
2525
+ remove_duplicates: bool = True,
2526
+ index_type: IndexType = "FLAT",
2527
+ index_params: dict[str, Any] = {},
2528
+ if_exists: CreateMode = "error",
2056
2529
  background: Literal[True],
2057
- ) -> Job[list[EmbeddingModelResult]]:
2530
+ hidden: bool = False,
2531
+ subsample: int | float | None = None,
2532
+ ) -> Job[Self]:
2058
2533
  pass
2059
2534
 
2060
2535
  @overload
2061
- @staticmethod
2062
- def run_embedding_evaluation(
2536
+ @classmethod
2537
+ def create(
2538
+ cls,
2539
+ name: str,
2063
2540
  datasource: Datasource,
2064
2541
  *,
2542
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2065
2543
  value_column: str = "value",
2066
- label_column: str = "label",
2544
+ label_column: str | None = "label",
2067
2545
  source_id_column: str | None = None,
2068
- neighbor_count: int = 5,
2069
- embedding_models: list[str] | None = None,
2546
+ description: str | None = None,
2547
+ label_names: list[str] | None = None,
2548
+ max_seq_length_override: int | None = None,
2549
+ prompt: str | None = None,
2550
+ remove_duplicates: bool = True,
2551
+ index_type: IndexType = "FLAT",
2552
+ index_params: dict[str, Any] = {},
2553
+ if_exists: CreateMode = "error",
2070
2554
  background: Literal[False] = False,
2071
- ) -> list[EmbeddingModelResult]:
2555
+ hidden: bool = False,
2556
+ subsample: int | float | None = None,
2557
+ ) -> Self:
2072
2558
  pass
2073
2559
 
2074
- @staticmethod
2075
- def run_embedding_evaluation(
2560
+ @classmethod
2561
+ def create( # type: ignore[override]
2562
+ cls,
2563
+ name: str,
2076
2564
  datasource: Datasource,
2077
2565
  *,
2566
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2078
2567
  value_column: str = "value",
2079
- label_column: str = "label",
2568
+ label_column: str | None = "label",
2080
2569
  source_id_column: str | None = None,
2081
- neighbor_count: int = 5,
2082
- embedding_models: list[str] | None = None,
2570
+ description: str | None = None,
2571
+ label_names: list[str] | None = None,
2572
+ max_seq_length_override: int | None = None,
2573
+ prompt: str | None = None,
2574
+ remove_duplicates: bool = True,
2575
+ index_type: IndexType = "FLAT",
2576
+ index_params: dict[str, Any] = {},
2577
+ if_exists: CreateMode = "error",
2083
2578
  background: bool = False,
2084
- ) -> Job[list[EmbeddingModelResult]] | list[EmbeddingModelResult]:
2579
+ hidden: bool = False,
2580
+ subsample: int | float | None = None,
2581
+ ) -> Self | Job[Self]:
2085
2582
  """
2086
- Test the quality of embeddings for the datasource by computing metrics such as prediction accuracy.
2583
+ Create a new labeled memoryset in the OrcaCloud
2584
+
2585
+ All columns from the datasource that are not specified in the `value_column`,
2586
+ `label_column`, or `source_id_column` will be stored as metadata in the memoryset.
2087
2587
 
2088
2588
  Params:
2089
- datasource: The datasource to run the embedding evaluation on
2589
+ name: Name for the new memoryset (must be unique)
2590
+ datasource: Source data to populate the memories in the memoryset
2591
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
2592
+ If not provided, a default embedding model for the memoryset will be used.
2090
2593
  value_column: Name of the column in the datasource that contains the memory values
2091
- label_column: Name of the column in the datasource that contains the memory labels,
2092
- these must be contiguous integers starting from 0
2594
+ label_column: Name of the column in the datasource that contains the memory labels.
2595
+ Must contain categorical values as integers or strings. String labels will be
2596
+ converted to integers with the unique strings extracted as `label_names`. To create
2597
+ a memoryset with all none labels, set to `None`.
2093
2598
  source_id_column: Optional name of the column in the datasource that contains the ids in
2094
2599
  the system of reference
2095
- neighbor_count: The number of neighbors to select for prediction
2096
- embedding_models: Optional list of embedding model keys to evaluate, if not provided all
2097
- available embedding models will be used
2600
+ description: Optional description for the memoryset, this will be used in agentic flows,
2601
+ so make sure it is concise and describes the contents of your memoryset not the
2602
+ datasource or the embedding model.
2603
+ label_names: List of human-readable names for the labels in the memoryset, must match
2604
+ the number of labels in the `label_column`. Will be automatically inferred if string
2605
+ labels are provided or if a [Dataset][datasets.Dataset] with a
2606
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
2607
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
2608
+ value is longer than this it will be truncated, will default to the model's max
2609
+ sequence length if not provided
2610
+ prompt: Optional prompt to use when embedding documents/memories for storage
2611
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
2612
+ into the memoryset
2613
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
2614
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
2615
+ index_params: Parameters for the vector index, defaults to `{}`
2616
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
2617
+ `"error"`. Other option is `"open"` to open the existing memoryset.
2618
+ background: Whether to run the operation none blocking and return a job handle
2619
+ hidden: Whether the memoryset should be hidden
2098
2620
 
2099
2621
  Returns:
2100
- A dictionary containing the results of the embedding evaluation
2101
- """
2622
+ Handle to the new memoryset in the OrcaCloud
2102
2623
 
2103
- response = orca_api.POST(
2104
- "/datasource/{name_or_id}/embedding_evaluation",
2105
- params={"name_or_id": datasource.id},
2106
- json={
2107
- "value_column": value_column,
2108
- "label_column": label_column,
2109
- "source_id_column": source_id_column,
2110
- "neighbor_count": neighbor_count,
2111
- "embedding_models": embedding_models,
2112
- },
2624
+ Raises:
2625
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
2626
+ `"open"` and the params do not match those of the existing memoryset.
2627
+ """
2628
+ return super().create(
2629
+ name,
2630
+ datasource,
2631
+ label_column=label_column,
2632
+ score_column=None,
2633
+ embedding_model=embedding_model,
2634
+ value_column=value_column,
2635
+ source_id_column=source_id_column,
2636
+ description=description,
2637
+ label_names=label_names,
2638
+ max_seq_length_override=max_seq_length_override,
2639
+ prompt=prompt,
2640
+ remove_duplicates=remove_duplicates,
2641
+ index_type=index_type,
2642
+ index_params=index_params,
2643
+ if_exists=if_exists,
2644
+ background=background,
2645
+ hidden=hidden,
2646
+ subsample=subsample,
2647
+ memory_type="LABELED",
2113
2648
  )
2114
2649
 
2115
- def get_value() -> list[EmbeddingModelResult]:
2116
- res = orca_api.GET(
2117
- "/datasource/{name_or_id}/embedding_evaluation/{task_id}",
2118
- params={"name_or_id": datasource.id, "task_id": response["task_id"]},
2119
- )
2120
- assert res["result"] is not None
2121
- return res["result"]["evaluation_results"]
2122
-
2123
- job = Job(response["task_id"], get_value)
2124
- return job if background else job.result()
2125
-
2126
-
2127
- class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2128
- """
2129
- A Handle to a collection of memories with labels in the OrcaCloud
2130
-
2131
- Attributes:
2132
- id: Unique identifier for the memoryset
2133
- name: Unique name of the memoryset
2134
- description: Description of the memoryset
2135
- label_names: Names for the class labels in the memoryset
2136
- length: Number of memories in the memoryset
2137
- embedding_model: Embedding model used to embed the memory values for semantic search
2138
- created_at: When the memoryset was created, automatically generated on create
2139
- updated_at: When the memoryset was last updated, automatically updated on updates
2140
- """
2141
-
2142
- label_names: list[str]
2143
- memory_type: MemoryType = "LABELED"
2144
-
2145
- def __init__(self, metadata: MemorysetMetadata):
2146
- super().__init__(metadata)
2147
- assert metadata["label_names"] is not None
2148
- self.label_names = metadata["label_names"]
2149
-
2150
- def __eq__(self, other) -> bool:
2151
- return isinstance(other, LabeledMemoryset) and self.id == other.id
2152
-
2153
- @classmethod
2154
- def create(cls, name: str, datasource: Datasource, *, label_column: str | None = "label", **kwargs):
2155
- return super().create(name, datasource, label_column=label_column, score_column=None, **kwargs)
2156
-
2157
2650
  def display_label_analysis(self):
2158
2651
  """
2159
2652
  Display an interactive UI to review and act upon the label analysis results
@@ -2185,6 +2678,131 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2185
2678
  def __eq__(self, other) -> bool:
2186
2679
  return isinstance(other, ScoredMemoryset) and self.id == other.id
2187
2680
 
2681
+ @overload
2682
+ @classmethod
2683
+ def create(
2684
+ cls,
2685
+ name: str,
2686
+ datasource: Datasource,
2687
+ *,
2688
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2689
+ value_column: str = "value",
2690
+ score_column: str | None = "score",
2691
+ source_id_column: str | None = None,
2692
+ description: str | None = None,
2693
+ max_seq_length_override: int | None = None,
2694
+ prompt: str | None = None,
2695
+ remove_duplicates: bool = True,
2696
+ index_type: IndexType = "FLAT",
2697
+ index_params: dict[str, Any] = {},
2698
+ if_exists: CreateMode = "error",
2699
+ background: Literal[True],
2700
+ hidden: bool = False,
2701
+ subsample: int | float | None = None,
2702
+ ) -> Job[Self]:
2703
+ pass
2704
+
2705
+ @overload
2706
+ @classmethod
2707
+ def create(
2708
+ cls,
2709
+ name: str,
2710
+ datasource: Datasource,
2711
+ *,
2712
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2713
+ score_column: str | None = "score",
2714
+ value_column: str = "value",
2715
+ source_id_column: str | None = None,
2716
+ description: str | None = None,
2717
+ max_seq_length_override: int | None = None,
2718
+ prompt: str | None = None,
2719
+ remove_duplicates: bool = True,
2720
+ index_type: IndexType = "FLAT",
2721
+ index_params: dict[str, Any] = {},
2722
+ if_exists: CreateMode = "error",
2723
+ background: Literal[False] = False,
2724
+ hidden: bool = False,
2725
+ subsample: int | float | None = None,
2726
+ ) -> Self:
2727
+ pass
2728
+
2188
2729
  @classmethod
2189
- def create(cls, name: str, datasource: Datasource, *, score_column: str | None = "score", **kwargs):
2190
- return super().create(name, datasource, score_column=score_column, label_column=None, **kwargs)
2730
+ def create( # type: ignore[override]
2731
+ cls,
2732
+ name: str,
2733
+ datasource: Datasource,
2734
+ *,
2735
+ embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2736
+ value_column: str = "value",
2737
+ score_column: str | None = "score",
2738
+ source_id_column: str | None = None,
2739
+ description: str | None = None,
2740
+ max_seq_length_override: int | None = None,
2741
+ prompt: str | None = None,
2742
+ remove_duplicates: bool = True,
2743
+ index_type: IndexType = "FLAT",
2744
+ index_params: dict[str, Any] = {},
2745
+ if_exists: CreateMode = "error",
2746
+ background: bool = False,
2747
+ hidden: bool = False,
2748
+ subsample: int | float | None = None,
2749
+ ) -> Self | Job[Self]:
2750
+ """
2751
+ Create a new scored memoryset in the OrcaCloud
2752
+
2753
+ All columns from the datasource that are not specified in the `value_column`,
2754
+ `score_column`, or `source_id_column` will be stored as metadata in the memoryset.
2755
+
2756
+ Params:
2757
+ name: Name for the new memoryset (must be unique)
2758
+ datasource: Source data to populate the memories in the memoryset
2759
+ embedding_model: Embedding model to use for embedding memory values for semantic search.
2760
+ If not provided, a default embedding model for the memoryset will be used.
2761
+ value_column: Name of the column in the datasource that contains the memory values
2762
+ score_column: Name of the column in the datasource that contains the memory scores. Must
2763
+ contain numerical values. To create a memoryset with all none scores, set to `None`.
2764
+ source_id_column: Optional name of the column in the datasource that contains the ids in
2765
+ the system of reference
2766
+ description: Optional description for the memoryset, this will be used in agentic flows,
2767
+ so make sure it is concise and describes the contents of your memoryset not the
2768
+ datasource or the embedding model.
2769
+ max_seq_length_override: Maximum sequence length of values in the memoryset, if the
2770
+ value is longer than this it will be truncated, will default to the model's max
2771
+ sequence length if not provided
2772
+ prompt: Optional prompt to use when embedding documents/memories for storage
2773
+ remove_duplicates: Whether to remove duplicates from the datasource before inserting
2774
+ into the memoryset
2775
+ index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
2776
+ values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
2777
+ index_params: Parameters for the vector index, defaults to `{}`
2778
+ if_exists: What to do if a memoryset with the same name already exists, defaults to
2779
+ `"error"`. Other option is `"open"` to open the existing memoryset.
2780
+ background: Whether to run the operation none blocking and return a job handle
2781
+ hidden: Whether the memoryset should be hidden
2782
+
2783
+ Returns:
2784
+ Handle to the new memoryset in the OrcaCloud
2785
+
2786
+ Raises:
2787
+ ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
2788
+ `"open"` and the params do not match those of the existing memoryset.
2789
+ """
2790
+ return super().create(
2791
+ name,
2792
+ datasource,
2793
+ embedding_model=embedding_model,
2794
+ value_column=value_column,
2795
+ score_column=score_column,
2796
+ source_id_column=source_id_column,
2797
+ description=description,
2798
+ max_seq_length_override=max_seq_length_override,
2799
+ prompt=prompt,
2800
+ remove_duplicates=remove_duplicates,
2801
+ index_type=index_type,
2802
+ index_params=index_params,
2803
+ if_exists=if_exists,
2804
+ background=background,
2805
+ hidden=hidden,
2806
+ subsample=subsample,
2807
+ memory_type="SCORED",
2808
+ )