PyPI - orca-sdk - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

orca-sdk 0.1.5 (py3-none-any.whl) → 0.1.7 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

orca_sdk/_shared/metrics.py +120 -18
orca_sdk/_shared/metrics_test.py +204 -0
orca_sdk/async_client.py +105 -25
orca_sdk/classification_model.py +4 -5
orca_sdk/client.py +105 -25
orca_sdk/embedding_model.py +19 -14
orca_sdk/embedding_model_test.py +1 -1
orca_sdk/memoryset.py +1093 -231
orca_sdk/memoryset_test.py +109 -2
orca_sdk/regression_model.py +2 -3
{orca_sdk-0.1.5.dist-info → orca_sdk-0.1.7.dist-info}/METADATA +1 -1
{orca_sdk-0.1.5.dist-info → orca_sdk-0.1.7.dist-info}/RECORD +13 -13
{orca_sdk-0.1.5.dist-info → orca_sdk-0.1.7.dist-info}/WHEEL +0 -0

orca_sdk/memoryset.py CHANGED Viewed

@@ -27,13 +27,12 @@ from .async_client import OrcaAsyncClient
 from .client import (
     CascadingEditSuggestion,
     CloneMemorysetRequest,
+    CreateMemorysetFromDatasourceRequest,
     CreateMemorysetRequest,
     FilterItem,
 )
 from .client import LabeledMemory as LabeledMemoryResponse
-from .client import (
-    LabeledMemoryInsert,
-)
+from .client import LabeledMemoryInsert
 from .client import LabeledMemoryLookup as LabeledMemoryLookupResponse
 from .client import (
     LabeledMemoryUpdate,
@@ -50,9 +49,7 @@ from .client import (
     PredictionFeedback,
 )
 from .client import ScoredMemory as ScoredMemoryResponse
-from .client import (
-    ScoredMemoryInsert,
-)
+from .client import ScoredMemoryInsert
 from .client import ScoredMemoryLookup as ScoredMemoryLookupResponse
 from .client import (
     ScoredMemoryUpdate,
@@ -937,7 +934,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
     length: int
     created_at: datetime
     updated_at: datetime
-    insertion_status: Status
+    insertion_status: Status | None
     embedding_model: EmbeddingModelBase
     index_type: IndexType
     index_params: dict[str, Any]
@@ -959,7 +956,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         self.length = metadata["length"]
         self.created_at = datetime.fromisoformat(metadata["created_at"])
         self.updated_at = datetime.fromisoformat(metadata["updated_at"])
-        self.insertion_status = Status(metadata["insertion_status"])
+        self.insertion_status = (
+            Status(metadata["insertion_status"]) if metadata["insertion_status"] is not None else None
+        )
         self._last_refresh = datetime.now()
         self.index_type = metadata["index_type"]
         self.index_params = metadata["index_params"]
@@ -971,7 +970,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
     def __repr__(self) -> str:
         return (
-            "Memoryset({\n"
+            f"{self.memory_type.capitalize()}Memoryset(" + "{\n"
             f"    name: '{self.name}',\n"
             f"    length: {self.length},\n"
             f"    embedding_model: {self.embedding_model},\n"
@@ -1022,11 +1021,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         return existing
     @classmethod
-    def create(
+    def _create_from_datasource(
         cls,
         name: str,
-        datasource: Datasource,
         *,
+        datasource: Datasource,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         label_column: str | None = None,
@@ -1047,54 +1046,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         memory_type: MemoryType | None = None,
     ) -> Self | Job[Self]:
         """
-        Create a new memoryset in the OrcaCloud
-        All columns from the datasource that are not specified in the `value_column`,
-        `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
-        Params:
-            name: Name for the new memoryset (must be unique)
-            datasource: Source data to populate the memories in the memoryset
-            embedding_model: Embedding model to use for embedding memory values for semantic search.
-                If not provided, a default embedding model for the memoryset will be used.
-            value_column: Name of the column in the datasource that contains the memory values
-            label_column: Name of the column in the datasource that contains the memory labels.
-                Must contain categorical values as integers or strings. String labels will be
-                converted to integers with the unique strings extracted as `label_names`
-            score_column: Name of the column in the datasource that contains the memory scores
-            source_id_column: Optional name of the column in the datasource that contains the ids in
-                the system of reference
-            partition_id_column: Optional name of the column in the datasource that contains the partition ids
-            description: Optional description for the memoryset, this will be used in agentic flows,
-                so make sure it is concise and describes the contents of your memoryset not the
-                datasource or the embedding model.
-            label_names: List of human-readable names for the labels in the memoryset, must match
-                the number of labels in the `label_column`. Will be automatically inferred if string
-                labels are provided or if a [Dataset][datasets.Dataset] with a
-                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
-            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
-                value is longer than this it will be truncated, will default to the model's max
-                sequence length if not provided
-            prompt: Optional prompt to use when embedding documents/memories for storage
-            remove_duplicates: Whether to remove duplicates from the datasource before inserting
-                into the memoryset
-            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
-                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
-            index_params: Parameters for the vector index, defaults to `{}`
-            if_exists: What to do if a memoryset with the same name already exists, defaults to
-                `"error"`. Other option is `"open"` to open the existing memoryset.
-            background: Whether to run the operation none blocking and return a job handle
-            hidden: Whether the memoryset should be hidden
-            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
-                datasource to insert. Use to limit the size of the initial memoryset.
-            memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
-                and `"SCORED"` if `score_column` is provided, must be specified for other cases.
-        Returns:
-            Handle to the new memoryset in the OrcaCloud
+        Create a memoryset from a datasource by calling the API.
-        Raises:
-            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
-                `"open"` and the params do not match those of the existing memoryset.
+        This is a private method that performs the actual API call to create a memoryset from a datasource.
         """
         if embedding_model is None:
             embedding_model = PretrainedEmbeddingModel.GTE_BASE
@@ -1108,7 +1062,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         if existing is not None:
             return existing
-        payload: CreateMemorysetRequest = {
+        payload: CreateMemorysetFromDatasourceRequest = {
             "name": name,
             "description": description,
             "datasource_name_or_id": datasource.id,
@@ -1138,141 +1092,582 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             raise ValueError("Invalid embedding model")
         client = OrcaClient._resolve_client()
         response = client.POST("/memoryset", json=payload)
+        if response["insertion_job_id"] is None:
+            raise ValueError("Create memoryset operation failed to produce an insertion job")
         job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
         return job if background else job.result()
     @overload
     @classmethod
-    def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Self:
-        pass
-    @overload
-    @classmethod
-    def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
+    def create(
+        cls,
+        name: str,
+        *,
+        datasource: None = None,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        hidden: bool = False,
+        memory_type: MemoryType | None = None,
+    ) -> Self:
         pass
-    @classmethod
-    def from_hf_dataset(
-        cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
-    ) -> Self | Job[Self]:
-        """
-        Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
-        This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
-        appended with `_datasource` and use that as the datasource for the memoryset.
-        All features that are not specified to be used as `value_column`, `label_column`, or
-        `source_id_column` will be stored as metadata in the memoryset.
-        Params:
-            name: Name for the new memoryset (must be unique)
-            hf_dataset: Hugging Face dataset to create the memoryset from
-            kwargs: Additional parameters for creating the memoryset. See
-                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
-        Returns:
-            Handle to the new memoryset in the OrcaCloud
-        """
-        if_exists = kwargs.get("if_exists", "error")
-        existing = cls._handle_if_exists(
-            name,
-            if_exists=if_exists,
-            label_names=kwargs.get("label_names"),
-            embedding_model=kwargs.get("embedding_model"),
-        )
-        if existing is not None:
-            return existing
-        datasource = Datasource.from_hf_dataset(
-            f"{name}_datasource", hf_dataset, if_exists=kwargs.get("if_exists", "error")
-        )
-        kwargs["background"] = background
-        return cls.create(name, datasource, **kwargs)
     @overload
     @classmethod
-    def from_pytorch(
+    def create(
         cls,
         name: str,
-        torch_data: TorchDataLoader | TorchDataset,
         *,
-        column_names: list[str] | None = None,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = None,
+        score_column: str | None = None,
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
         background: Literal[True],
-        **kwargs: Any,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+        memory_type: MemoryType | None = None,
     ) -> Job[Self]:
         pass
     @overload
     @classmethod
-    def from_pytorch(
+    def create(
         cls,
         name: str,
-        torch_data: TorchDataLoader | TorchDataset,
         *,
-        column_names: list[str] | None = None,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = None,
+        score_column: str | None = None,
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
         background: Literal[False] = False,
-        **kwargs: Any,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+        memory_type: MemoryType | None = None,
     ) -> Self:
         pass
     @classmethod
-    def from_pytorch(
+    def create(
         cls,
         name: str,
-        torch_data: TorchDataLoader | TorchDataset,
         *,
-        column_names: list[str] | None = None,
+        datasource: Datasource | None = None,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = None,
+        score_column: str | None = None,
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
         background: bool = False,
-        **kwargs: Any,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+        memory_type: MemoryType | None = None,
     ) -> Self | Job[Self]:
         """
-        Create a new memoryset from a PyTorch [`DataLoader`][torch.utils.data.DataLoader] or
-        [`Dataset`][torch.utils.data.Dataset] in the OrcaCloud
+        Create a new memoryset in the OrcaCloud
-        This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
-        appended with `_datasource` and use that as the datasource for the memoryset.
+        If `datasource` is provided, all columns from the datasource that are not specified in the
+        `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
+        as metadata in the memoryset.
-        All properties that are not specified to be used as `value_column`, `label_column`, or
-        `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
+        If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
+        You can add memories later using the `insert` method.
         Params:
             name: Name for the new memoryset (must be unique)
-            torch_data: PyTorch data loader or dataset to create the memoryset from
-            column_names: If the provided dataset or data loader returns unnamed tuples, this
-                argument must be provided to specify the names of the columns.
-            background: Whether to run the operation in the background
-            kwargs: Additional parameters for creating the memoryset. See
-                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
+            datasource: Optional source data to populate the memories in the memoryset. If omitted,
+                an empty memoryset will be created.
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
+            value_column: Name of the column in the datasource that contains the memory values
+            label_column: Name of the column in the datasource that contains the memory labels.
+                Must contain categorical values as integers or strings. String labels will be
+                converted to integers with the unique strings extracted as `label_names`
+            score_column: Name of the column in the datasource that contains the memory scores
+            source_id_column: Optional name of the column in the datasource that contains the ids in
+                the system of reference
+            partition_id_column: Optional name of the column in the datasource that contains the partition ids
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset not the
+                datasource or the embedding model.
+            label_names: List of human-readable names for the labels in the memoryset, must match
+                the number of labels in the `label_column`. Will be automatically inferred if string
+                labels are provided or if a [Dataset][datasets.Dataset] with a
+                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation non-blocking and return a job handle.
+                Note: This parameter is ignored when creating an empty memoryset (when datasource is None).
+            hidden: Whether the memoryset should be hidden
+            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+                datasource to insert. Use to limit the size of the initial memoryset.
+            memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
+                and `"SCORED"` if `score_column` is provided, must be specified for other cases.
         Returns:
             Handle to the new memoryset in the OrcaCloud
-        """
-        if_exists = kwargs.get("if_exists", "error")
-        existing = cls._handle_if_exists(
-            name,
-            if_exists=if_exists,
-            label_names=kwargs.get("label_names"),
-            embedding_model=kwargs.get("embedding_model"),
-        )
-        if existing is not None:
-            return existing
-        datasource = Datasource.from_pytorch(
-            f"{name}_datasource", torch_data, column_names=column_names, if_exists=kwargs.get("if_exists", "error")
-        )
-        kwargs["background"] = background
-        return cls.create(name, datasource, **kwargs)
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        if datasource is None:
+            return cls._create_empty(
+                name,
+                embedding_model=embedding_model,
+                description=description,
+                label_names=label_names,
+                max_seq_length_override=max_seq_length_override,
+                prompt=prompt,
+                index_type=index_type,
+                index_params=index_params,
+                if_exists=if_exists,
+                hidden=hidden,
+                memory_type=memory_type,
+            )
+        else:
+            return cls._create_from_datasource(
+                name,
+                datasource=datasource,
+                embedding_model=embedding_model,
+                value_column=value_column,
+                label_column=label_column,
+                score_column=score_column,
+                source_id_column=source_id_column,
+                partition_id_column=partition_id_column,
+                description=description,
+                label_names=label_names,
+                max_seq_length_override=max_seq_length_override,
+                prompt=prompt,
+                remove_duplicates=remove_duplicates,
+                index_type=index_type,
+                index_params=index_params,
+                if_exists=if_exists,
+                background=background,
+                hidden=hidden,
+                subsample=subsample,
+                memory_type=memory_type,
+            )
     @overload
     @classmethod
-    def from_list(
+    def from_datasource(
         cls,
         name: str,
-        data: list[dict],
         *,
-        background: Literal[True],
-        **kwargs: Any,
-    ) -> Job[Self]:
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = None,
+        score_column: str | None = None,
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[True],
+        hidden: bool = False,
+        subsample: int | float | None = None,
+        memory_type: MemoryType | None = None,
+    ) -> Job[Self]:
+        pass
+    @overload
+    @classmethod
+    def from_datasource(
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = None,
+        score_column: str | None = None,
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[False] = False,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+        memory_type: MemoryType | None = None,
+    ) -> Self:
+        pass
+    @classmethod
+    def from_datasource(
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = None,
+        score_column: str | None = None,
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: bool = False,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+        memory_type: MemoryType | None = None,
+    ) -> Self | Job[Self]:
+        """
+        Create a new memoryset in the OrcaCloud from a datasource.
+        This is a convenience method that is equivalent to calling `create` with a datasource.
+        All columns from the datasource that are not specified in the `value_column`,
+        `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
+        in the memoryset.
+        Params:
+            name: Name for the new memoryset (must be unique)
+            datasource: Source data to populate the memories in the memoryset.
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
+            value_column: Name of the column in the datasource that contains the memory values
+            label_column: Name of the column in the datasource that contains the memory labels.
+                Must contain categorical values as integers or strings. String labels will be
+                converted to integers with the unique strings extracted as `label_names`
+            score_column: Name of the column in the datasource that contains the memory scores
+            source_id_column: Optional name of the column in the datasource that contains the ids in
+                the system of reference
+            partition_id_column: Optional name of the column in the datasource that contains the partition ids
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset not the
+                datasource or the embedding model.
+            label_names: List of human-readable names for the labels in the memoryset, must match
+                the number of labels in the `label_column`. Will be automatically inferred if string
+                labels are provided or if a [Dataset][datasets.Dataset] with a
+                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation non-blocking and return a job handle.
+            hidden: Whether the memoryset should be hidden
+            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+                datasource to insert. Use to limit the size of the initial memoryset.
+            memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
+                and `"SCORED"` if `score_column` is provided, must be specified for other cases.
+        Returns:
+            Handle to the new memoryset in the OrcaCloud
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        return cls._create_from_datasource(
+            name,
+            datasource=datasource,
+            embedding_model=embedding_model,
+            value_column=value_column,
+            label_column=label_column,
+            score_column=score_column,
+            source_id_column=source_id_column,
+            partition_id_column=partition_id_column,
+            description=description,
+            label_names=label_names,
+            max_seq_length_override=max_seq_length_override,
+            prompt=prompt,
+            remove_duplicates=remove_duplicates,
+            index_type=index_type,
+            index_params=index_params,
+            if_exists=if_exists,
+            background=background,
+            hidden=hidden,
+            subsample=subsample,
+            memory_type=memory_type,
+        )
+    @classmethod
+    def _create_empty(
+        cls,
+        name: str,
+        *,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        hidden: bool = False,
+        memory_type: MemoryType | None = None,
+    ) -> Self:
+        """
+        Create an empty memoryset in the OrcaCloud
+        This creates a memoryset with no initial memories. You can add memories later using
+        the `insert` method.
+        Params:
+            name: Name for the new memoryset (must be unique)
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset not the
+                datasource or the embedding model.
+            label_names: List of human-readable names for the labels in the memoryset
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing memoryset.
+            hidden: Whether the memoryset should be hidden
+            memory_type: Type of memoryset to create, defaults to `"LABELED"` if called from
+                `LabeledMemoryset` and `"SCORED"` if called from `ScoredMemoryset`.
+        Returns:
+            Handle to the new memoryset in the OrcaCloud
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        if embedding_model is None:
+            embedding_model = PretrainedEmbeddingModel.GTE_BASE
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=label_names,
+            embedding_model=embedding_model,
+        )
+        if existing is not None:
+            return existing
+        payload: CreateMemorysetRequest = {
+            "name": name,
+            "description": description,
+            "label_names": label_names,
+            "max_seq_length_override": max_seq_length_override,
+            "index_type": index_type,
+            "index_params": index_params,
+            "hidden": hidden,
+        }
+        if memory_type is not None:
+            payload["memory_type"] = memory_type
+        if prompt is not None:
+            payload["prompt"] = prompt
+        if isinstance(embedding_model, PretrainedEmbeddingModel):
+            payload["pretrained_embedding_model_name"] = embedding_model.name
+        elif isinstance(embedding_model, FinetunedEmbeddingModel):
+            payload["finetuned_embedding_model_name_or_id"] = embedding_model.id
+        else:
+            raise ValueError("Invalid embedding model")
+        client = OrcaClient._resolve_client()
+        response = client.POST("/memoryset/empty", json=payload)
+        return cls.open(response["id"])
+    # Overload: `background=True` hands back a job handle instead of the memoryset itself.
+    @overload
+    @classmethod
+    def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[True], **kwargs: Any) -> Job[Self]:
+        pass
+    # Overload: blocking call (the default) returns the memoryset handle.
+    @overload
+    @classmethod
+    def from_hf_dataset(cls, name: str, hf_dataset: Dataset, background: Literal[False] = False, **kwargs: Any) -> Self:
+        pass
+    @classmethod
+    def from_hf_dataset(
+        cls, name: str, hf_dataset: Dataset, background: bool = False, **kwargs: Any
+    ) -> Self | Job[Self]:
+        """
+        Create a new memoryset from a Hugging Face [`Dataset`][datasets.Dataset] in the OrcaCloud
+        This will automatically create a [`Datasource`][orca_sdk.Datasource] with the same name
+        appended with `_datasource` and use that as the datasource for the memoryset.
+        All features that are not specified to be used as `value_column`, `label_column`, or
+        `source_id_column` will be stored as metadata in the memoryset.
+        Params:
+            name: Name for the new memoryset (must be unique)
+            hf_dataset: Hugging Face dataset to create the memoryset from
+            background: Whether to run the operation in the background
+            kwargs: Additional parameters for creating the memoryset. See
+                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
+        Returns:
+            Handle to the new memoryset in the OrcaCloud, or a job handle when `background` is
+            `True`. If `_handle_if_exists` resolves an existing memoryset, that handle is
+            returned directly and no datasource is created.
+        """
+        if_exists = kwargs.get("if_exists", "error")
+        existing = cls._handle_if_exists(
+            name,
+            if_exists=if_exists,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if existing is not None:
+            return existing
+        # Reuse the mode resolved above so the datasource and the memoryset are created consistently.
+        datasource = Datasource.from_hf_dataset(f"{name}_datasource", hf_dataset, if_exists=if_exists)
+        # `background` is a named parameter here, so it can never already be present in `kwargs`.
+        return cls.create(name, datasource=datasource, background=background, **kwargs)
+    @overload
+    @classmethod
+    def from_pytorch(
+        cls,
+        name: str,
+        torch_data: TorchDataLoader | TorchDataset,
+        *,
+        column_names: list[str] | None = None,
+        background: Literal[True],
+        **kwargs: Any,
+    ) -> Job[Self]: ...
+    @overload
+    @classmethod
+    def from_pytorch(
+        cls,
+        name: str,
+        torch_data: TorchDataLoader | TorchDataset,
+        *,
+        column_names: list[str] | None = None,
+        background: Literal[False] = False,
+        **kwargs: Any,
+    ) -> Self: ...
+    @classmethod
+    def from_pytorch(
+        cls,
+        name: str,
+        torch_data: TorchDataLoader | TorchDataset,
+        *,
+        column_names: list[str] | None = None,
+        background: bool = False,
+        **kwargs: Any,
+    ) -> Self | Job[Self]:
+        """
+        Build a memoryset in the OrcaCloud from a PyTorch
+        [`DataLoader`][torch.utils.data.DataLoader] or [`Dataset`][torch.utils.data.Dataset]
+        A [`Datasource`][orca_sdk.Datasource] named `<name>_datasource` is created from the
+        PyTorch data first and then used as the datasource for the memoryset.
+        Every property that is not selected as `value_column`, `label_column`,
+        `source_id_column`, or `partition_id_column` is stored as metadata in the memoryset.
+        Params:
+            name: Name for the new memoryset (must be unique)
+            torch_data: PyTorch data loader or dataset to create the memoryset from
+            column_names: Names of the columns; must be provided when the dataset or data
+                loader returns unnamed tuples.
+            background: Whether to run the operation in the background
+            kwargs: Additional parameters for creating the memoryset. See
+                [`create`][orca_sdk.memoryset.MemorysetBase.create] attributes for details.
+        Returns:
+            Handle to the new memoryset in the OrcaCloud
+        """
+        mode = kwargs.get("if_exists", "error")
+        already_there = cls._handle_if_exists(
+            name,
+            if_exists=mode,
+            label_names=kwargs.get("label_names"),
+            embedding_model=kwargs.get("embedding_model"),
+        )
+        if already_there is not None:
+            return already_there
+        source = Datasource.from_pytorch(f"{name}_datasource", torch_data, column_names=column_names, if_exists=mode)
+        # `background` is keyword-only on this method, so `kwargs` cannot already contain it.
+        return cls.create(name, datasource=source, background=background, **kwargs)
+    @overload
+    @classmethod
+    def from_list(
+        cls,
+        name: str,
+        data: list[dict],
+        *,
+        background: Literal[True],
+        **kwargs: Any,
+    ) -> Job[Self]:
         pass
     @overload
@@ -1333,7 +1728,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         datasource = Datasource.from_list(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
-        return cls.create(name, datasource, **kwargs)
+        return cls.create(name, datasource=datasource, **kwargs)
     @overload
     @classmethod
@@ -1406,7 +1801,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         datasource = Datasource.from_dict(f"{name}_datasource", data, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
-        return cls.create(name, datasource, **kwargs)
+        return cls.create(name, datasource=datasource, **kwargs)
     @overload
     @classmethod
@@ -1472,7 +1867,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         datasource = Datasource.from_pandas(f"{name}_datasource", dataframe, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
-        return cls.create(name, datasource, **kwargs)
+        return cls.create(name, datasource=datasource, **kwargs)
     @overload
     @classmethod
@@ -1540,7 +1935,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             f"{name}_datasource", pyarrow_table, if_exists=kwargs.get("if_exists", "error")
         )
         kwargs["background"] = background
-        return cls.create(name, datasource, **kwargs)
+        return cls.create(name, datasource=datasource, **kwargs)
     @overload
     @classmethod
@@ -1613,7 +2008,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         datasource = Datasource.from_disk(f"{name}_datasource", file_path, if_exists=kwargs.get("if_exists", "error"))
         kwargs["background"] = background
-        return cls.create(name, datasource, **kwargs)
+        return cls.create(name, datasource=datasource, **kwargs)
     @classmethod
     def open(cls, name: str) -> Self:
@@ -1830,6 +2225,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         client = OrcaClient._resolve_client()
         metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
+        if metadata["insertion_job_id"] is None:
+            raise ValueError("Create memoryset operation failed to produce an insertion job")
         job = Job(
             metadata["insertion_job_id"],
             lambda: self.open(metadata["id"]),
@@ -2482,9 +2881,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                 - **`"duplicate"`**: Find potentially duplicate memories in the memoryset
                 - **`"cluster"`**: Cluster the memories in the memoryset
-                - **`"label"`**: Analyze the labels to find potential mislabelings
-                - **`"distribution"`**: Analyze the embedding distribution to populate
+                - **`"distribution"`**: Analyze the embedding distribution
                 - **`"projection"`**: Create a 2D projection of the embeddings for visualization
+                - **`"label"`**: Analyze the labels to find potential mislabelings (labeled memorysets only)
+                - **`"class_patterns"`**: Analyze class patterns and find representative memories (labeled memorysets only)
+                - **`"concepts"`**: Discover and name conceptual clusters in the memoryset (labeled memorysets only)
             lookup_count: Number of memories to lookup for each memory in the memoryset
             clear_metrics: Whether to clear any existing metrics from the memories before running the analysis
@@ -2590,35 +2991,246 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
     """
     A Handle to a collection of memories with labels in the OrcaCloud
-    Attributes:
-        id: Unique identifier for the memoryset
-        name: Unique name of the memoryset
-        description: Description of the memoryset
-        label_names: Names for the class labels in the memoryset
-        length: Number of memories in the memoryset
-        embedding_model: Embedding model used to embed the memory values for semantic search
-        created_at: When the memoryset was created, automatically generated on create
-        updated_at: When the memoryset was last updated, automatically updated on updates
-    """
+    Attributes:
+        id: Unique identifier for the memoryset
+        name: Unique name of the memoryset
+        description: Description of the memoryset
+        label_names: Names for the class labels in the memoryset
+        length: Number of memories in the memoryset
+        embedding_model: Embedding model used to embed the memory values for semantic search
+        created_at: When the memoryset was created, automatically generated on create
+        updated_at: When the memoryset was last updated, automatically updated on updates
+    """
+    label_names: list[str]
+    memory_type: MemoryType = "LABELED"
+    def __init__(self, metadata: MemorysetMetadata):
+        """Initialize the handle from memoryset metadata, which must carry `label_names`."""
+        super().__init__(metadata)
+        names = metadata["label_names"]
+        # A labeled memoryset is expected to always have label names in its metadata.
+        assert names is not None
+        self.label_names = names
+    def __eq__(self, other) -> bool:
+        """Return whether `other` is a `LabeledMemoryset` handle with the same `id`."""
+        # NOTE(review): a class that defines `__eq__` without `__hash__` is unhashable; confirm
+        # whether `__hash__` is defined elsewhere in this class.
+        if not isinstance(other, LabeledMemoryset):
+            return False
+        return self.id == other.id
+    # Overload: no datasource -> an empty memoryset is created; `label_names` is required here.
+    @overload
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        *,
+        datasource: None = None,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        description: str | None = None,
+        label_names: list[str],
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        hidden: bool = False,
+    ) -> Self:
+        pass
+    # Overload: datasource with `background=True` -> returns a job handle.
+    @overload
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = "label",
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[True],
+        hidden: bool = False,
+        subsample: int | float | None = None,
+    ) -> Job[Self]:
+        pass
+    # Overload: datasource with blocking execution (the default) -> returns the memoryset handle.
+    @overload
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = "label",
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[False] = False,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+    ) -> Self:
+        pass
+    @classmethod
+    def create(  # type: ignore[override]
+        cls,
+        name: str,
+        *,
+        datasource: Datasource | None = None,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        label_column: str | None = "label",
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        label_names: list[str] | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: bool = False,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+    ) -> Self | Job[Self]:
+        """
+        Create a new labeled memoryset in the OrcaCloud
+        If `datasource` is provided, all columns from the datasource that are not specified in the
+        `value_column`, `label_column`, `source_id_column`, or `partition_id_column` will be stored
+        as metadata in the memoryset.
-    label_names: list[str]
-    memory_type: MemoryType = "LABELED"
+        If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
+        You can add memories later using the `insert` method. In that case `value_column`,
+        `label_column`, `source_id_column`, `partition_id_column`, `remove_duplicates`,
+        `background`, and `subsample` are not used.
-    def __init__(self, metadata: MemorysetMetadata):
-        super().__init__(metadata)
-        assert metadata["label_names"] is not None
-        self.label_names = metadata["label_names"]
+        Params:
+            name: Name for the new memoryset (must be unique)
+            datasource: Optional source data to populate the memories in the memoryset. If omitted,
+                an empty memoryset will be created.
+            embedding_model: Embedding model to use for embedding memory values for semantic search.
+                If not provided, a default embedding model for the memoryset will be used.
+            value_column: Name of the column in the datasource that contains the memory values
+            label_column: Name of the column in the datasource that contains the memory labels.
+                Must contain categorical values as integers or strings. String labels will be
+                converted to integers with the unique strings extracted as `label_names`. To create
+                a memoryset in which every label is `None`, set this to `None`.
+            source_id_column: Optional name of the column in the datasource that contains the ids in
+                the system of reference
+            partition_id_column: Optional name of the column in the datasource that contains the partition ids
+            description: Optional description for the memoryset, this will be used in agentic flows,
+                so make sure it is concise and describes the contents of your memoryset not the
+                datasource or the embedding model.
+            label_names: List of human-readable names for the labels in the memoryset, must match
+                the number of labels in the `label_column`. Will be automatically inferred if string
+                labels are provided or if a [Dataset][datasets.Dataset] with a
+                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
+            max_seq_length_override: Maximum sequence length of values in the memoryset, if the
+                value is longer than this it will be truncated, will default to the model's max
+                sequence length if not provided
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing memoryset.
+            background: Whether to run the operation non-blocking and return a job handle
+            hidden: Whether the memoryset should be hidden
+            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+                datasource to insert. Use to limit the size of the initial memoryset.
-    def __eq__(self, other) -> bool:
-        return isinstance(other, LabeledMemoryset) and self.id == other.id
+        Returns:
+            Handle to the new memoryset in the OrcaCloud, or a job handle when a datasource is
+            provided and `background` is `True`
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        # NOTE(review): `index_params` defaults to a shared `{}` object; this method only forwards
+        # it and never mutates it. Confirm the base `create` does not mutate it either.
+        if datasource is None:
+            # Empty-memoryset path: only the datasource-independent settings are forwarded.
+            return super().create(
+                name,
+                datasource=None,
+                embedding_model=embedding_model,
+                description=description,
+                label_names=label_names,
+                max_seq_length_override=max_seq_length_override,
+                prompt=prompt,
+                index_type=index_type,
+                index_params=index_params,
+                if_exists=if_exists,
+                hidden=hidden,
+                memory_type="LABELED",
+            )
+        else:
+            # Type narrowing: datasource is definitely Datasource here
+            assert datasource is not None
+            # The two branches below differ only in the literal passed for `background`;
+            # presumably this lets type checkers pick the matching base-class overload. TODO confirm
+            if background:
+                return super().create(
+                    name,
+                    datasource=datasource,
+                    label_column=label_column,
+                    score_column=None,
+                    embedding_model=embedding_model,
+                    value_column=value_column,
+                    source_id_column=source_id_column,
+                    partition_id_column=partition_id_column,
+                    description=description,
+                    label_names=label_names,
+                    max_seq_length_override=max_seq_length_override,
+                    prompt=prompt,
+                    remove_duplicates=remove_duplicates,
+                    index_type=index_type,
+                    index_params=index_params,
+                    if_exists=if_exists,
+                    background=True,
+                    hidden=hidden,
+                    subsample=subsample,
+                    memory_type="LABELED",
+                )
+            else:
+                return super().create(
+                    name,
+                    datasource=datasource,
+                    label_column=label_column,
+                    score_column=None,
+                    embedding_model=embedding_model,
+                    value_column=value_column,
+                    source_id_column=source_id_column,
+                    partition_id_column=partition_id_column,
+                    description=description,
+                    label_names=label_names,
+                    max_seq_length_override=max_seq_length_override,
+                    prompt=prompt,
+                    remove_duplicates=remove_duplicates,
+                    index_type=index_type,
+                    index_params=index_params,
+                    if_exists=if_exists,
+                    background=False,
+                    hidden=hidden,
+                    subsample=subsample,
+                    memory_type="LABELED",
+                )
     @overload
     @classmethod
-    def create(
+    def from_datasource(
         cls,
         name: str,
-        datasource: Datasource,
         *,
+        datasource: Datasource,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         label_column: str | None = "label",
@@ -2640,11 +3252,11 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
     @overload
     @classmethod
-    def create(
+    def from_datasource(
         cls,
         name: str,
-        datasource: Datasource,
         *,
+        datasource: Datasource,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         label_column: str | None = "label",
@@ -2665,11 +3277,11 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         pass
     @classmethod
-    def create(  # type: ignore[override]
+    def from_datasource(  # type: ignore[override]
         cls,
         name: str,
-        datasource: Datasource,
         *,
+        datasource: Datasource,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         label_column: str | None = "label",
@@ -2688,14 +3300,16 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         subsample: int | float | None = None,
     ) -> Self | Job[Self]:
         """
-        Create a new labeled memoryset in the OrcaCloud
+        Create a new labeled memoryset in the OrcaCloud from a datasource.
+        This is a convenience method that is equivalent to calling `create` with a datasource.
         All columns from the datasource that are not specified in the `value_column`,
-        `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
+        `label_column`, `source_id_column`, or `partition_id_column` will be stored as metadata
+        in the memoryset.
         Params:
             name: Name for the new memoryset (must be unique)
-            datasource: Source data to populate the memories in the memoryset
+            datasource: Source data to populate the memories in the memoryset.
             embedding_model: Embedding model to use for embedding memory values for semantic search.
                 If not provided, a default embedding model for the memoryset will be used.
             value_column: Name of the column in the datasource that contains the memory values
@@ -2724,8 +3338,10 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
             index_params: Parameters for the vector index, defaults to `{}`
             if_exists: What to do if a memoryset with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing memoryset.
-            background: Whether to run the operation none blocking and return a job handle
+            background: Whether to run the operation none blocking and return a job handle.
             hidden: Whether the memoryset should be hidden
+            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+                datasource to insert. Use to limit the size of the initial memoryset.
         Returns:
             Handle to the new memoryset in the OrcaCloud
@@ -2734,28 +3350,52 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
             ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
                 `"open"` and the params do not match those of the existing memoryset.
         """
-        return super().create(
-            name,
-            datasource,
-            label_column=label_column,
-            score_column=None,
-            embedding_model=embedding_model,
-            value_column=value_column,
-            source_id_column=source_id_column,
-            partition_id_column=partition_id_column,
-            description=description,
-            label_names=label_names,
-            max_seq_length_override=max_seq_length_override,
-            prompt=prompt,
-            remove_duplicates=remove_duplicates,
-            index_type=index_type,
-            index_params=index_params,
-            if_exists=if_exists,
-            background=background,
-            hidden=hidden,
-            subsample=subsample,
-            memory_type="LABELED",
-        )
+        if background:
+            return super().create(
+                name,
+                datasource=datasource,
+                label_column=label_column,
+                score_column=None,
+                embedding_model=embedding_model,
+                value_column=value_column,
+                source_id_column=source_id_column,
+                partition_id_column=partition_id_column,
+                description=description,
+                label_names=label_names,
+                max_seq_length_override=max_seq_length_override,
+                prompt=prompt,
+                remove_duplicates=remove_duplicates,
+                index_type=index_type,
+                index_params=index_params,
+                if_exists=if_exists,
+                background=True,
+                hidden=hidden,
+                subsample=subsample,
+                memory_type="LABELED",
+            )
+        else:
+            return super().create(
+                name,
+                datasource=datasource,
+                label_column=label_column,
+                score_column=None,
+                embedding_model=embedding_model,
+                value_column=value_column,
+                source_id_column=source_id_column,
+                partition_id_column=partition_id_column,
+                description=description,
+                label_names=label_names,
+                max_seq_length_override=max_seq_length_override,
+                prompt=prompt,
+                remove_duplicates=remove_duplicates,
+                index_type=index_type,
+                index_params=index_params,
+                if_exists=if_exists,
+                background=False,
+                hidden=hidden,
+                subsample=subsample,
+                memory_type="LABELED",
+            )
     def display_label_analysis(self):
         """
@@ -2793,8 +3433,26 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
     def create(
         cls,
         name: str,
-        datasource: Datasource,
         *,
+        datasource: None = None,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        hidden: bool = False,
+    ) -> Self:
+        pass
+    @overload
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         score_column: str | None = "score",
@@ -2818,8 +3476,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
     def create(
         cls,
         name: str,
-        datasource: Datasource,
         *,
+        datasource: Datasource,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         score_column: str | None = "score",
         value_column: str = "value",
@@ -2842,8 +3500,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
     def create(  # type: ignore[override]
         cls,
         name: str,
-        datasource: Datasource,
         *,
+        datasource: Datasource | None = None,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
         score_column: str | None = "score",
@@ -2863,12 +3521,17 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
         """
         Create a new scored memoryset in the OrcaCloud
-        All columns from the datasource that are not specified in the `value_column`,
-        `score_column`, `source_id_column`, or `partition_id_column` will be stored as metadata in the memoryset.
+        If `datasource` is provided, all columns from the datasource that are not specified in the
+        `value_column`, `score_column`, `source_id_column`, or `partition_id_column` will be stored
+        as metadata in the memoryset.
+        If `datasource` is omitted (None), an empty memoryset will be created with no initial memories.
+        You can add memories later using the `insert` method.
         Params:
             name: Name for the new memoryset (must be unique)
-            datasource: Source data to populate the memories in the memoryset
+            datasource: Optional source data to populate the memories in the memoryset. If omitted,
+                an empty memoryset will be created.
             embedding_model: Embedding model to use for embedding memory values for semantic search.
                 If not provided, a default embedding model for the memoryset will be used.
             value_column: Name of the column in the datasource that contains the memory values
@@ -2901,23 +3564,222 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
             ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
                 `"open"` and the params do not match those of the existing memoryset.
         """
-        return super().create(
-            name,
-            datasource,
-            embedding_model=embedding_model,
-            value_column=value_column,
-            score_column=score_column,
-            source_id_column=source_id_column,
-            partition_id_column=partition_id_column,
-            description=description,
-            max_seq_length_override=max_seq_length_override,
-            prompt=prompt,
-            remove_duplicates=remove_duplicates,
-            index_type=index_type,
-            index_params=index_params,
-            if_exists=if_exists,
-            background=background,
-            hidden=hidden,
-            subsample=subsample,
-            memory_type="SCORED",
-        )
+        if datasource is None:
+            return super().create(
+                name,
+                datasource=None,
+                embedding_model=embedding_model,
+                description=description,
+                max_seq_length_override=max_seq_length_override,
+                prompt=prompt,
+                index_type=index_type,
+                index_params=index_params,
+                if_exists=if_exists,
+                hidden=hidden,
+                memory_type="SCORED",
+            )
+        else:
+            # Type narrowing: datasource is definitely Datasource here
+            assert datasource is not None
+            if background:
+                return super().create(
+                    name,
+                    datasource=datasource,
+                    embedding_model=embedding_model,
+                    value_column=value_column,
+                    score_column=score_column,
+                    source_id_column=source_id_column,
+                    partition_id_column=partition_id_column,
+                    description=description,
+                    max_seq_length_override=max_seq_length_override,
+                    prompt=prompt,
+                    remove_duplicates=remove_duplicates,
+                    index_type=index_type,
+                    index_params=index_params,
+                    if_exists=if_exists,
+                    background=True,
+                    hidden=hidden,
+                    subsample=subsample,
+                    memory_type="SCORED",
+                )
+            else:
+                return super().create(
+                    name,
+                    datasource=datasource,
+                    embedding_model=embedding_model,
+                    value_column=value_column,
+                    score_column=score_column,
+                    source_id_column=source_id_column,
+                    partition_id_column=partition_id_column,
+                    description=description,
+                    max_seq_length_override=max_seq_length_override,
+                    prompt=prompt,
+                    remove_duplicates=remove_duplicates,
+                    index_type=index_type,
+                    index_params=index_params,
+                    if_exists=if_exists,
+                    background=False,
+                    hidden=hidden,
+                    subsample=subsample,
+                    memory_type="SCORED",
+                )
+    @overload
+    @classmethod
+    def from_datasource(
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        score_column: str | None = "score",
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[True],
+        hidden: bool = False,
+        subsample: int | float | None = None,
+    ) -> Job[Self]:
+        # Typing-only stub: `background` has no default here, so this signature applies only
+        # when the caller passes `background=True`; the declared result is a `Job[Self]`.
+        ...
+    @overload
+    @classmethod
+    def from_datasource(
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        # `value_column` is listed before `score_column` so this stub reads the same as the
+        # other `from_datasource` signatures; both are keyword-only, so callers are unaffected.
+        value_column: str = "value",
+        score_column: str | None = "score",
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: Literal[False] = False,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+    ) -> Self:
+        # Typing-only stub: with `background` omitted or `False`, the declared result is `Self`.
+        ...
+    @classmethod
+    def from_datasource(  # type: ignore[override]
+        cls,
+        name: str,
+        *,
+        datasource: Datasource,
+        embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
+        value_column: str = "value",
+        score_column: str | None = "score",
+        source_id_column: str | None = None,
+        partition_id_column: str | None = None,
+        description: str | None = None,
+        max_seq_length_override: int | None = None,
+        prompt: str | None = None,
+        remove_duplicates: bool = True,
+        index_type: IndexType = "FLAT",
+        index_params: dict[str, Any] = {},
+        if_exists: CreateMode = "error",
+        background: bool = False,
+        hidden: bool = False,
+        subsample: int | float | None = None,
+    ) -> Self | Job[Self]:
+        """
+        Create a new scored memoryset in the OrcaCloud, populated from a datasource.
+        Convenience entry point: every argument is forwarded to the parent class's `create`
+        together with `memory_type="SCORED"`. Datasource columns that are not named by
+        `value_column`, `score_column`, `source_id_column`, or `partition_id_column` are stored
+        as metadata in the memoryset.
+        Params:
+            name: Name for the new memoryset (must be unique)
+            datasource: Source data used to populate the memories in the memoryset.
+            embedding_model: Embedding model used to embed memory values for semantic search.
+                When not provided, a default embedding model for the memoryset is used.
+            value_column: Name of the datasource column holding the memory values
+            score_column: Name of the datasource column holding the memory scores. Must contain
+                numerical values. Set to `None` to create a memoryset whose scores are all `None`.
+            source_id_column: Optional name of the datasource column holding the ids in the
+                system of reference
+            partition_id_column: Optional name of the datasource column holding the partition ids
+            description: Optional description for the memoryset. It is used in agentic flows, so
+                keep it concise and describe the contents of the memoryset rather than the
+                datasource or the embedding model.
+            max_seq_length_override: Maximum sequence length of values in the memoryset; longer
+                values are truncated. Defaults to the model's max sequence length when not provided.
+            prompt: Optional prompt to use when embedding documents/memories for storage
+            remove_duplicates: Whether to remove duplicates from the datasource before inserting
+                into the memoryset
+            index_type: Type of vector index to use for the memoryset, defaults to `"FLAT"`. Valid
+                values are `"FLAT"`, `"IVF_FLAT"`, `"IVF_SQ8"`, `"IVF_PQ"`, `"HNSW"`, and `"DISKANN"`.
+            index_params: Parameters for the vector index, defaults to `{}`
+            if_exists: What to do if a memoryset with the same name already exists, defaults to
+                `"error"`. The other option is `"open"`, which opens the existing memoryset.
+            background: Whether to run the operation non-blocking and return a job handle.
+            hidden: Whether the memoryset should be hidden
+            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+                datasource to insert. Use to limit the size of the initial memoryset.
+        Returns:
+            Handle to the new memoryset in the OrcaCloud, or a job handle when `background` is set
+        Raises:
+            ValueError: If the memoryset already exists and if_exists is `"error"` or if it is
+                `"open"` and the params do not match those of the existing memoryset.
+        """
+        # Every argument other than `background` is the same for both call shapes below.
+        shared_kwargs: dict[str, Any] = dict(
+            datasource=datasource,
+            embedding_model=embedding_model,
+            value_column=value_column,
+            score_column=score_column,
+            source_id_column=source_id_column,
+            partition_id_column=partition_id_column,
+            description=description,
+            max_seq_length_override=max_seq_length_override,
+            prompt=prompt,
+            remove_duplicates=remove_duplicates,
+            index_type=index_type,
+            index_params=index_params,
+            if_exists=if_exists,
+            hidden=hidden,
+            subsample=subsample,
+            memory_type="SCORED",
+        )
+        # `background` is handed on as a literal `True`/`False`, so whatever truthy or falsy
+        # value the caller supplied reaches the parent's `create` as a real bool.
+        if not background:
+            return super().create(name, **shared_kwargs, background=False)
+        return super().create(name, **shared_kwargs, background=True)

orca-sdk 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

orca-sdk 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl