PyPI - orca-sdk - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

orca-sdk 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

orca_sdk/_shared/metrics.py +186 -43
orca_sdk/_shared/metrics_test.py +99 -6
orca_sdk/_utils/data_parsing_test.py +1 -1
orca_sdk/async_client.py +52 -14
orca_sdk/classification_model.py +107 -30
orca_sdk/classification_model_test.py +327 -8
orca_sdk/client.py +52 -14
orca_sdk/conftest.py +140 -21
orca_sdk/embedding_model.py +0 -2
orca_sdk/memoryset.py +141 -26
orca_sdk/memoryset_test.py +253 -4
orca_sdk/regression_model.py +73 -16
orca_sdk/regression_model_test.py +213 -0
{orca_sdk-0.1.4.dist-info → orca_sdk-0.1.6.dist-info}/METADATA +1 -1
{orca_sdk-0.1.4.dist-info → orca_sdk-0.1.6.dist-info}/RECORD +16 -16
{orca_sdk-0.1.4.dist-info → orca_sdk-0.1.6.dist-info}/WHEEL +0 -0

orca_sdk/client.py CHANGED Viewed

@@ -135,6 +135,8 @@ class ClassificationEvaluationRequest(TypedDict):
     telemetry_tags: NotRequired[list[str] | None]
     subsample: NotRequired[int | float | None]
     ignore_unlabeled: NotRequired[bool]
+    datasource_partition_column: NotRequired[str | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
 class CleanupResponse(TypedDict):
@@ -315,12 +317,16 @@ class ListMemoriesRequest(TypedDict):
     offset: NotRequired[int]
     limit: NotRequired[int]
     filters: NotRequired[list[FilterItem]]
+    partition_id: NotRequired[str | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
 class LookupRequest(TypedDict):
     query: list[str]
     count: NotRequired[int]
     prompt: NotRequired[str | None]
+    partition_id: NotRequired[str | list[str | None] | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
 class LookupScoreMetrics(TypedDict):
@@ -547,16 +553,7 @@ class PredictiveModelUpdate(TypedDict):
 PretrainedEmbeddingModelName = Literal[
-    "CLIP_BASE",
-    "GTE_BASE",
-    "CDE_SMALL",
-    "DISTILBERT",
-    "GTE_SMALL",
-    "MXBAI_LARGE",
-    "E5_LARGE",
-    "QWEN2_1_5B",
-    "BGE_BASE",
-    "GIST_LARGE",
+    "CLIP_BASE", "GTE_BASE", "CDE_SMALL", "DISTILBERT", "GTE_SMALL", "MXBAI_LARGE", "E5_LARGE", "BGE_BASE", "GIST_LARGE"
 ]
@@ -586,6 +583,8 @@ class RegressionEvaluationRequest(TypedDict):
     telemetry_tags: NotRequired[list[str] | None]
     subsample: NotRequired[int | float | None]
     ignore_unlabeled: NotRequired[bool]
+    datasource_partition_column: NotRequired[str | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
 class RegressionMetrics(TypedDict):
@@ -629,6 +628,8 @@ class RegressionPredictionRequest(TypedDict):
     use_lookup_cache: NotRequired[bool]
     consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
     ignore_unlabeled: NotRequired[bool]
+    partition_ids: NotRequired[str | list[str | None] | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
 class ScorePredictionMemoryLookup(TypedDict):
@@ -1163,7 +1164,14 @@ class BootstrapClassificationModelRequest(TypedDict):
     num_examples_per_label: NotRequired[int]
-class BootstrapClassificationModelResult(TypedDict):
+class BootstrapLabeledMemoryDataInput(TypedDict):
+    model_description: str
+    label_names: list[str]
+    initial_examples: NotRequired[list[LabeledExample]]
+    num_examples_per_label: NotRequired[int]
+class BootstrapLabeledMemoryDataResult(TypedDict):
     model_description: str
     label_names: list[str]
     model_name: str
@@ -1216,6 +1224,8 @@ class ClassificationPredictionRequest(TypedDict):
     use_lookup_cache: NotRequired[bool]
     consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
     ignore_unlabeled: NotRequired[bool]
+    partition_ids: NotRequired[str | list[str | None] | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
 class CloneMemorysetRequest(TypedDict):
@@ -1269,6 +1279,7 @@ class CreateMemorysetRequest(TypedDict):
     datasource_score_column: NotRequired[str | None]
     datasource_value_column: str
     datasource_source_id_column: NotRequired[str | None]
+    datasource_partition_id_column: NotRequired[str | None]
     remove_duplicates: NotRequired[bool]
     pretrained_embedding_model_name: NotRequired[PretrainedEmbeddingModelName | None]
     finetuned_embedding_model_name_or_id: NotRequired[str | None]
@@ -1539,6 +1550,7 @@ class MemorysetAnalysisRequest(TypedDict):
     batch_size: NotRequired[int]
     clear_metrics: NotRequired[bool]
     configs: MemorysetAnalysisConfigs
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
 class MemorysetConceptMetrics(TypedDict):
@@ -1664,7 +1676,7 @@ class BootstrapClassificationModelMeta(TypedDict):
     datasource_meta: DatasourceMetadata
     memoryset_meta: MemorysetMetadata
     model_meta: ClassificationModelMetadata
-    agent_output: BootstrapClassificationModelResult
+    agent_output: BootstrapLabeledMemoryDataResult
 class BootstrapClassificationModelResponse(TypedDict):
@@ -2554,7 +2566,7 @@ class OrcaClient(Client):
         timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
         extensions: RequestExtensions | None = None,
     ) -> BootstrapClassificationModelResponse:
-        """Get the status of a bootstrap classification model job"""
+        """Get the status of a bootstrap labeled memory data job"""
         pass
     def GET(
@@ -3276,6 +3288,32 @@ class OrcaClient(Client):
         """Get row count from a specific datasource with optional filtering."""
         pass
+    @overload
+    def POST(
+        self,
+        path: Literal["/datasource/bootstrap_memory_data"],
+        *,
+        params: None = None,
+        json: BootstrapLabeledMemoryDataInput,
+        data: None = None,
+        files: None = None,
+        content: None = None,
+        parse_as: Literal["json"] = "json",
+        headers: HeaderTypes | None = None,
+        cookies: CookieTypes | None = None,
+        auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
+        timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        extensions: RequestExtensions | None = None,
+    ) -> BootstrapLabeledMemoryDataResult:
+        """
+        Bootstrap memory data using an AI agent.
+        This endpoint uses the bootstrap labeled memory data agent to generate
+        high-quality, diverse training examples for a classification model.
+        """
+        pass
     @overload
     def POST(
         self,
@@ -3524,7 +3562,7 @@ class OrcaClient(Client):
         """
         Bootstrap a classification model by creating a memoryset with generated memories and a classification model.
-        This endpoint uses the bootstrap_classification_model agent to generate:
+        This endpoint uses the bootstrap_labeled_memory_data agent to generate:
         1. Memoryset configuration with appropriate settings
         2. Model configuration with optimal parameters
         3. High-quality training memories for each label

orca_sdk/conftest.py CHANGED Viewed

@@ -99,34 +99,105 @@ def label_names():
 SAMPLE_DATA = [
-    {"value": "i love soup", "label": 0, "key": "g1", "score": 0.1, "source_id": "s1"},
-    {"value": "cats are cute", "label": 1, "key": "g1", "score": 0.9, "source_id": "s2"},
-    {"value": "soup is good", "label": 0, "key": "g1", "score": 0.1, "source_id": "s3"},
-    {"value": "i love cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s4"},
-    {"value": "everyone loves cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s5"},
-    {"value": "soup is great for the winter", "label": 0, "key": "g1", "score": 0.1, "source_id": "s6"},
-    {"value": "hot soup on a rainy day!", "label": 0, "key": "g1", "score": 0.1, "source_id": "s7"},
-    {"value": "cats sleep all day", "label": 1, "key": "g1", "score": 0.9, "source_id": "s8"},
-    {"value": "homemade soup recipes", "label": 0, "key": "g1", "score": 0.1, "source_id": "s9"},
-    {"value": "cats purr when happy", "label": 1, "key": "g2", "score": 0.9, "source_id": "s10"},
-    {"value": "chicken noodle soup is classic", "label": 0, "key": "g1", "score": 0.1, "source_id": "s11"},
-    {"value": "kittens are baby cats", "label": 1, "key": "g2", "score": 0.9, "source_id": "s12"},
-    {"value": "soup can be served cold too", "label": 0, "key": "g1", "score": 0.1, "source_id": "s13"},
-    {"value": "cats have nine lives", "label": 1, "key": "g2", "score": 0.9, "source_id": "s14"},
-    {"value": "tomato soup with grilled cheese", "label": 0, "key": "g1", "score": 0.1, "source_id": "s15"},
-    {"value": "cats are independent animals", "label": 1, "key": "g2", "score": 0.9, "source_id": "s16"},
-    {"value": "the beach is always fun", "label": None, "key": "g3", "score": None, "source_id": "s17"},
-    {"value": "i love the beach", "label": None, "key": "g3", "score": None, "source_id": "s18"},
-    {"value": "the ocean is healing", "label": None, "key": "g3", "score": None, "source_id": "s19"},
+    {"value": "i love soup", "label": 0, "key": "g1", "score": 0.1, "source_id": "s1", "partition_id": "p1"},
+    {"value": "cats are cute", "label": 1, "key": "g1", "score": 0.9, "source_id": "s2", "partition_id": "p1"},
+    {"value": "soup is good", "label": 0, "key": "g1", "score": 0.1, "source_id": "s3", "partition_id": "p1"},
+    {"value": "i love cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s4", "partition_id": "p1"},
+    {"value": "everyone loves cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s5", "partition_id": "p1"},
+    {
+        "value": "soup is great for the winter",
+        "label": 0,
+        "key": "g1",
+        "score": 0.1,
+        "source_id": "s6",
+        "partition_id": "p1",
+    },
+    {
+        "value": "hot soup on a rainy day!",
+        "label": 0,
+        "key": "g1",
+        "score": 0.1,
+        "source_id": "s7",
+        "partition_id": "p1",
+    },
+    {"value": "cats sleep all day", "label": 1, "key": "g1", "score": 0.9, "source_id": "s8", "partition_id": "p1"},
+    {"value": "homemade soup recipes", "label": 0, "key": "g1", "score": 0.1, "source_id": "s9", "partition_id": "p2"},
+    {"value": "cats purr when happy", "label": 1, "key": "g2", "score": 0.9, "source_id": "s10", "partition_id": "p2"},
+    {
+        "value": "chicken noodle soup is classic",
+        "label": 0,
+        "key": "g1",
+        "score": 0.1,
+        "source_id": "s11",
+        "partition_id": "p2",
+    },
+    {"value": "kittens are baby cats", "label": 1, "key": "g2", "score": 0.9, "source_id": "s12", "partition_id": "p2"},
+    {
+        "value": "soup can be served cold too",
+        "label": 0,
+        "key": "g1",
+        "score": 0.1,
+        "source_id": "s13",
+        "partition_id": "p2",
+    },
+    {"value": "cats have nine lives", "label": 1, "key": "g2", "score": 0.9, "source_id": "s14", "partition_id": "p2"},
+    {
+        "value": "tomato soup with grilled cheese",
+        "label": 0,
+        "key": "g1",
+        "score": 0.1,
+        "source_id": "s15",
+        "partition_id": "p2",
+    },
+    {
+        "value": "cats are independent animals",
+        "label": 1,
+        "key": "g2",
+        "score": 0.9,
+        "source_id": "s16",
+        "partition_id": None,
+    },
+    {
+        "value": "the beach is always fun",
+        "label": None,
+        "key": "g3",
+        "score": None,
+        "source_id": "s17",
+        "partition_id": None,
+    },
+    {"value": "i love the beach", "label": None, "key": "g3", "score": None, "source_id": "s18", "partition_id": None},
+    {
+        "value": "the ocean is healing",
+        "label": None,
+        "key": "g3",
+        "score": None,
+        "source_id": "s19",
+        "partition_id": None,
+    },
     {
         "value": "sandy feet, sand between my toes at the beach",
         "label": None,
         "key": "g3",
         "score": None,
         "source_id": "s20",
+        "partition_id": None,
+    },
+    {
+        "value": "i am such a beach bum",
+        "label": None,
+        "key": "g3",
+        "score": None,
+        "source_id": "s21",
+        "partition_id": None,
+    },
+    {
+        "value": "i will always want to be at the beach",
+        "label": None,
+        "key": "g3",
+        "score": None,
+        "source_id": "s22",
+        "partition_id": None,
     },
-    {"value": "i am such a beach bum", "label": None, "key": "g3", "score": None, "source_id": "s21"},
-    {"value": "i will always want to be at the beach", "label": None, "key": "g3", "score": None, "source_id": "s22"},
 ]
@@ -141,6 +212,7 @@ def hf_dataset(label_names: list[str]) -> Dataset:
                 "key": Value("string"),
                 "score": Value("float"),
                 "source_id": Value("string"),
+                "partition_id": Value("string"),
             }
         ),
     )
@@ -186,6 +258,18 @@ def readonly_memoryset(datasource: Datasource) -> LabeledMemoryset:
     return memoryset
+@pytest.fixture(scope="session")
+def readonly_partitioned_memoryset(datasource: Datasource) -> LabeledMemoryset:
+    memoryset = LabeledMemoryset.create(
+        "test_readonly_partitioned_memoryset",
+        datasource=datasource,
+        embedding_model=PretrainedEmbeddingModel.GTE_BASE,
+        source_id_column="source_id",
+        partition_id_column="partition_id",
+    )
+    return memoryset
 @pytest.fixture(scope="function")
 def writable_memoryset(datasource: Datasource, api_key: str) -> Generator[LabeledMemoryset, None, None]:
     """
@@ -237,6 +321,18 @@ def classification_model(readonly_memoryset: LabeledMemoryset) -> Classification
     return model
+@pytest.fixture(scope="session")
+def partitioned_classification_model(readonly_partitioned_memoryset: LabeledMemoryset) -> ClassificationModel:
+    model = ClassificationModel.create(
+        "test_partitioned_classification_model",
+        readonly_partitioned_memoryset,
+        num_classes=2,
+        memory_lookup_count=3,
+        description="test_partitioned_description",
+    )
+    return model
 # Add scored memoryset and regression model fixtures
 @pytest.fixture(scope="session")
 def scored_memoryset(datasource: Datasource) -> ScoredMemoryset:
@@ -261,3 +357,26 @@ def regression_model(scored_memoryset: ScoredMemoryset) -> RegressionModel:
         description="test_regression_description",
     )
     return model
+@pytest.fixture(scope="session")
+def readonly_partitioned_scored_memoryset(datasource: Datasource) -> ScoredMemoryset:
+    memoryset = ScoredMemoryset.create(
+        "test_readonly_partitioned_scored_memoryset",
+        datasource=datasource,
+        embedding_model=PretrainedEmbeddingModel.GTE_BASE,
+        source_id_column="source_id",
+        partition_id_column="partition_id",
+    )
+    return memoryset
+@pytest.fixture(scope="session")
+def partitioned_regression_model(readonly_partitioned_scored_memoryset: ScoredMemoryset) -> RegressionModel:
+    model = RegressionModel.create(
+        "test_partitioned_regression_model",
+        readonly_partitioned_scored_memoryset,
+        memory_lookup_count=3,
+        description="test_partitioned_regression_description",
+    )
+    return model

orca_sdk/embedding_model.py CHANGED Viewed

@@ -340,7 +340,6 @@ class PretrainedEmbeddingModel(EmbeddingModelBase):
     - **`E5_LARGE`**: E5-Large instruction-tuned embedding model from Hugging Face ([intfloat/multilingual-e5-large-instruct](https://huggingface.co/intfloat/multilingual-e5-large-instruct))
     - **`GIST_LARGE`**: GIST-Large embedding model from Hugging Face ([avsolatorio/GIST-large-Embedding-v0](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0))
     - **`MXBAI_LARGE`**: Mixbreas's Large embedding model from Hugging Face ([mixedbread-ai/mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1))
-    - **`QWEN2_1_5B`**: Alibaba's Qwen2-1.5B instruction-tuned embedding model from Hugging Face ([Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct))
     - **`BGE_BASE`**: BAAI's BGE-Base instruction-tuned embedding model from Hugging Face ([BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5))
     **Instruction Support:**
@@ -373,7 +372,6 @@ class PretrainedEmbeddingModel(EmbeddingModelBase):
     E5_LARGE = _ModelDescriptor("E5_LARGE")
     GIST_LARGE = _ModelDescriptor("GIST_LARGE")
     MXBAI_LARGE = _ModelDescriptor("MXBAI_LARGE")
-    QWEN2_1_5B = _ModelDescriptor("QWEN2_1_5B")
     BGE_BASE = _ModelDescriptor("BGE_BASE")
     name: PretrainedEmbeddingModelName

orca-sdk 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

orca-sdk 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl