PyPI - orca-sdk - Versions diffs - 0.0.98__py3-none-any.whl → 0.0.101__py3-none-any.whl - Mend

orca-sdk 0.0.98__py3-none-any.whl → 0.0.101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

orca_sdk/client.py +101 -53
orca_sdk/datasource.py +43 -22
orca_sdk/datasource_test.py +41 -7
orca_sdk/telemetry.py +1 -1
{orca_sdk-0.0.98.dist-info → orca_sdk-0.0.101.dist-info}/METADATA +1 -1
{orca_sdk-0.0.98.dist-info → orca_sdk-0.0.101.dist-info}/RECORD +7 -7
{orca_sdk-0.0.98.dist-info → orca_sdk-0.0.101.dist-info}/WHEEL +0 -0

orca_sdk/client.py CHANGED Viewed

@@ -263,8 +263,8 @@ class MemoryMetrics(TypedDict):
     neighbor_predicted_label_matches_current_label: NotRequired[bool | None]
     spread: NotRequired[float]
     uniformity: NotRequired[float]
-    subconcept_cluster_id: NotRequired[int | None]
-    subconcept_name: NotRequired[str | None]
+    concept_id: NotRequired[int | None]
+    subconcept_id: NotRequired[int | None]
 MemoryType = Literal["LABELED", "SCORED"]
@@ -301,6 +301,19 @@ class MemorysetClusterMetrics(TypedDict):
     num_clusters: int
+class MemorysetConceptAnalysisConfig(TypedDict):
+    high_level_description: NotRequired[str | None]
+    max_sample_rows: NotRequired[int]
+    max_trial_count: NotRequired[int]
+    min_desired_clusters_per_label: NotRequired[int]
+    max_desired_clusters_per_label: NotRequired[int]
+    accuracy_importance: NotRequired[float]
+    noise_penalty: NotRequired[float]
+    naming_examples_count: NotRequired[int]
+    naming_counterexample_count: NotRequired[int]
+    seed: NotRequired[int]
 class MemorysetDuplicateAnalysisConfig(TypedDict):
     potential_duplicate_threshold: NotRequired[float]
@@ -339,19 +352,6 @@ class MemorysetProjectionMetrics(TypedDict):
     pass
-class MemorysetSubconceptAnalysisConfig(TypedDict):
-    high_level_description: NotRequired[str | None]
-    max_sample_rows: NotRequired[int]
-    max_trial_count: NotRequired[int]
-    min_desired_clusters_per_label: NotRequired[int]
-    max_desired_clusters_per_label: NotRequired[int]
-    accuracy_importance: NotRequired[float]
-    noise_penalty: NotRequired[float]
-    naming_examples_count: NotRequired[int]
-    naming_counterexample_count: NotRequired[int]
-    seed: NotRequired[int]
 class MemorysetUpdate(TypedDict):
     label_names: NotRequired[list[str]]
     description: NotRequired[str | None]
@@ -628,11 +628,11 @@ class ServiceUnavailableErrorResponse(TypedDict):
     service: str
-class SubconceptClusterMetrics(TypedDict):
-    cluster_id: int
+class SubConceptMetrics(TypedDict):
+    id: int
     name: str
-    primary_label_index: NotRequired[int | None]
-    description: NotRequired[str | None]
+    description: str | None
+    primary_label: int | None
     memory_count: int
@@ -840,6 +840,13 @@ class GetDatasourceByNameOrIdEmbeddingEvaluationByTaskIdParams(TypedDict):
 class GetDatasourceByNameOrIdDownloadParams(TypedDict):
     name_or_id: str
+    file_type: NotRequired[Literal["hf_dataset", "json", "csv"]]
+    """
+    File type to download:
+    * `hf_dataset`: Zipped HuggingFace dataset (default)
+    * `json`: Row-oriented JSON array
+    * `csv`: CSV file
+    """
 class PatchClassificationModelByNameOrIdParams(TypedDict):
@@ -938,7 +945,6 @@ class GetTelemetryPredictionByPredictionIdExplanationParams(TypedDict):
 class GetTelemetryPredictionByPredictionIdActionParams(TypedDict):
     prediction_id: str
-    refresh: NotRequired[bool]
 class GetTelemetryPredictionByPredictionIdMemorySuggestionsParams(TypedDict):
@@ -1100,6 +1106,15 @@ class ColumnInfo(TypedDict):
     int_values: NotRequired[list[int] | None]
+class ConceptMetrics(TypedDict):
+    id: int
+    name: str
+    description: str | None
+    primary_label: int | None
+    memory_count: int
+    subconcepts: list[SubConceptMetrics]
 class CreateClassificationModelRequest(TypedDict):
     name: str
     description: NotRequired[str | None]
@@ -1378,7 +1393,7 @@ class MemorysetAnalysisConfigs(TypedDict):
     projection: NotRequired[MemorysetProjectionAnalysisConfig | None]
     cluster: NotRequired[MemorysetClusterAnalysisConfig | None]
     class_patterns: NotRequired[MemorysetClassPatternsAnalysisConfig | None]
-    subconcepts: NotRequired[MemorysetSubconceptAnalysisConfig | None]
+    concepts: NotRequired[MemorysetConceptAnalysisConfig | None]
 class MemorysetAnalysisRequest(TypedDict):
@@ -1388,12 +1403,21 @@ class MemorysetAnalysisRequest(TypedDict):
     configs: MemorysetAnalysisConfigs
-class MemorysetSubconceptMetrics(TypedDict):
-    clusters_by_id: dict[str, SubconceptClusterMetrics]
-    num_clusters: int
+class MemorysetConceptMetrics(TypedDict):
+    concepts: list[ConceptMetrics]
     num_outliers: int
+class MemorysetMetrics(TypedDict):
+    neighbor: NotRequired[MemorysetNeighborMetrics | None]
+    label: NotRequired[MemorysetLabelMetrics | None]
+    duplicate: NotRequired[MemorysetDuplicateMetrics | None]
+    projection: NotRequired[MemorysetProjectionMetrics | None]
+    cluster: NotRequired[MemorysetClusterMetrics | None]
+    class_patterns: NotRequired[MemorysetClassPatternsMetrics | None]
+    concepts: NotRequired[MemorysetConceptMetrics | None]
 class PaginatedUnionLabeledMemoryWithFeedbackMetricsScoredMemoryWithFeedbackMetrics(TypedDict):
     items: list[LabeledMemoryWithFeedbackMetrics | ScoredMemoryWithFeedbackMetrics]
     total: int
@@ -1449,34 +1473,6 @@ class EmbeddingEvaluationResult(TypedDict):
     evaluation_results: list[EmbeddingModelResult]
-class MemorysetMetrics(TypedDict):
-    neighbor: NotRequired[MemorysetNeighborMetrics | None]
-    label: NotRequired[MemorysetLabelMetrics | None]
-    duplicate: NotRequired[MemorysetDuplicateMetrics | None]
-    projection: NotRequired[MemorysetProjectionMetrics | None]
-    cluster: NotRequired[MemorysetClusterMetrics | None]
-    class_patterns: NotRequired[MemorysetClassPatternsMetrics | None]
-    subconcepts: NotRequired[MemorysetSubconceptMetrics | None]
-class PaginatedTask(TypedDict):
-    items: list[Task]
-    total: int
-    offset: int
-    limit: int
-class DatasourceEmbeddingEvaluationsResponse(TypedDict):
-    task_id: str
-    org_id: str
-    datasource_id: str
-    status: TaskStatus
-    result: EmbeddingEvaluationResult | None
-    payload: DatasourceEmbeddingEvaluationsTaskPayload
-    created_at: str
-    updated_at: str
 class MemorysetAnalysisResponse(TypedDict):
     task_id: str
     org_id: str
@@ -1516,6 +1512,13 @@ class MemorysetMetadata(TypedDict):
     query_prompt_override: str | None
+class PaginatedTask(TypedDict):
+    items: list[Task]
+    total: int
+    offset: int
+    limit: int
 class BootstrapClassificationModelMeta(TypedDict):
     datasource_meta: DatasourceMetadata
     memoryset_meta: MemorysetMetadata
@@ -1531,6 +1534,17 @@ class BootstrapClassificationModelResponse(TypedDict):
     input: BootstrapClassificationModelRequest | None
+class DatasourceEmbeddingEvaluationsResponse(TypedDict):
+    task_id: str
+    org_id: str
+    datasource_id: str
+    status: TaskStatus
+    result: EmbeddingEvaluationResult | None
+    payload: DatasourceEmbeddingEvaluationsTaskPayload
+    created_at: str
+    updated_at: str
 class OrcaClient(Client):
     @staticmethod
     def _parse_params(
@@ -1911,6 +1925,40 @@ class OrcaClient(Client):
         """Get an embedding evaluation task by ID."""
         pass
+    @overload
+    def GET(
+        self,
+        path: Literal["/datasource/{name_or_id}/download"],
+        *,
+        params: GetDatasourceByNameOrIdDownloadParams,
+        parse_as: Literal["json"] = "json",
+        headers: HeaderTypes | None = None,
+        cookies: CookieTypes | None = None,
+        auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
+        timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        extensions: RequestExtensions | None = None,
+    ) -> list[dict[str, Any]]:
+        """Download datasource in the specified format."""
+        pass
+    @overload
+    def GET(
+        self,
+        path: Literal["/datasource/{name_or_id}/download"],
+        *,
+        params: GetDatasourceByNameOrIdDownloadParams,
+        parse_as: Literal["text"],
+        headers: HeaderTypes | None = None,
+        cookies: CookieTypes | None = None,
+        auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
+        timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        extensions: RequestExtensions | None = None,
+    ) -> str:
+        """Download datasource in the specified format."""
+        pass
     @overload
     def GET(
         self,
@@ -1925,7 +1973,7 @@ class OrcaClient(Client):
         timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
         extensions: RequestExtensions | None = None,
     ) -> bytes:
-        """Streams a zipped dataset file response"""
+        """Download datasource in the specified format."""
         pass
     @overload

orca_sdk/datasource.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import logging
+import os
 import tempfile
 import zipfile
 from datetime import datetime
@@ -141,28 +142,6 @@ class Datasource:
             + "})"
         )
-    def download(self, output_path: str | PathLike) -> None:
-        """
-        Download the datasource as a ZIP and extract them to a specified path.
-        Params:
-            output_path: The local file path or directory where the downloaded files will be saved.
-        Returns:
-            None
-        """
-        # TODO: add progress bar to the download
-        response = orca_api.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id}, parse_as=None)
-        with tempfile.NamedTemporaryFile(suffix=".zip") as tmp_zip:
-            tmp_zip.write(response)
-            tmp_zip.flush()
-            with zipfile.ZipFile(tmp_zip.name, "r") as zf:
-                Path(output_path).mkdir(parents=True, exist_ok=True)
-                for file in zf.namelist():
-                    out_file = Path(output_path) / Path(file).name
-                    with zf.open(file) as af:
-                        out_file.write_bytes(af.read())
     @classmethod
     def from_hf_dataset(
         cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
@@ -501,3 +480,45 @@ class Datasource:
     def __len__(self) -> int:
         return self.length
+    def download(
+        self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
+    ) -> None:
+        """
+        Download the datasource to a specified path in the specified format type
+        Params:
+            output_dir: The local directory where the downloaded file will be saved.
+            file_type: The type of file to download.
+        Returns:
+            None
+        """
+        extension = "zip" if file_type == "hf_dataset" else file_type
+        output_path = Path(output_dir) / f"{self.name}.{extension}"
+        with open(output_path, "wb") as download_file:
+            with orca_api.stream("GET", f"/datasource/{self.id}/download", params={"file_type": file_type}) as response:
+                total_chunks = int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
+                with tqdm(desc=f"Downloading", total=total_chunks, disable=total_chunks is None) as progress:
+                    for chunk in response.iter_bytes():
+                        download_file.write(chunk)
+                        progress.update(1)
+        # extract the zip file
+        if extension == "zip":
+            extract_dir = Path(output_dir) / self.name
+            with zipfile.ZipFile(output_path, "r") as zip_ref:
+                zip_ref.extractall(extract_dir)
+            output_path.unlink()  # Remove the zip file after extraction
+            logging.info(f"Downloaded {extract_dir}")
+        else:
+            logging.info(f"Downloaded {output_path}")
+    def to_list(self) -> list[dict]:
+        """
+        Convert the datasource to a list of dictionaries.
+        Returns:
+            A list of dictionaries representation of the datasource.
+        """
+        return orca_api.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id, "file_type": "json"})

orca_sdk/datasource_test.py CHANGED Viewed

@@ -1,10 +1,14 @@
+import json
 import os
 import tempfile
+from typing import cast
 from uuid import uuid4
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
+from datasets import Dataset
 from .datasource import Datasource
@@ -99,13 +103,6 @@ def test_drop_datasource_invalid_input():
         Datasource.drop("not valid id")
-def test_download_datasource(datasource):
-    with tempfile.TemporaryDirectory() as temp_dir:
-        output_path = os.path.join(temp_dir, "datasource.zip")
-        datasource.download(output_path)
-        assert os.path.exists(output_path)
 def test_from_list():
     # Test creating datasource from list of dictionaries
     data = [
@@ -296,3 +293,40 @@ def test_from_disk_already_exists():
             assert datasource2.name == datasource1.name
         finally:
             os.unlink(f.name)
+def test_to_list(hf_dataset, datasource):
+    assert datasource.to_list() == hf_dataset.to_list()
+def test_download_datasource(hf_dataset, datasource):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Dataset download
+        datasource.download(temp_dir)
+        downloaded_hf_dataset_dir = f"{temp_dir}/{datasource.name}"
+        assert os.path.exists(downloaded_hf_dataset_dir)
+        assert os.path.isdir(downloaded_hf_dataset_dir)
+        assert not os.path.exists(f"{downloaded_hf_dataset_dir}.zip")
+        dataset_from_downloaded_hf_dataset = Dataset.load_from_disk(downloaded_hf_dataset_dir)
+        assert dataset_from_downloaded_hf_dataset.column_names == hf_dataset.column_names
+        assert dataset_from_downloaded_hf_dataset.to_dict() == hf_dataset.to_dict()
+        # JSON download
+        datasource.download(temp_dir, file_type="json")
+        downloaded_json_file = f"{temp_dir}/{datasource.name}.json"
+        assert os.path.exists(downloaded_json_file)
+        with open(downloaded_json_file, "r") as f:
+            content = json.load(f)
+            assert content == hf_dataset.to_list()
+        # CSV download
+        datasource.download(temp_dir, file_type="csv")
+        downloaded_csv_file = f"{temp_dir}/{datasource.name}.csv"
+        assert os.path.exists(downloaded_csv_file)
+        dataset_from_downloaded_csv = cast(Dataset, Dataset.from_csv(downloaded_csv_file))
+        assert dataset_from_downloaded_csv.column_names == hf_dataset.column_names
+        assert (
+            dataset_from_downloaded_csv.remove_columns("score").to_dict()
+            == hf_dataset.remove_columns("score").to_dict()
+        )
+        assert np.allclose(dataset_from_downloaded_csv["score"], hf_dataset["score"])

orca_sdk/telemetry.py CHANGED Viewed

@@ -562,7 +562,7 @@ class ClassificationPrediction(_Prediction):
         response = orca_api.GET(
             "/telemetry/prediction/{prediction_id}/action",
-            params={"prediction_id": self.prediction_id, "refresh": refresh},
+            params={"prediction_id": self.prediction_id},
             timeout=30,
         )
         return (response["action"], response["rationale"])

{orca_sdk-0.0.98.dist-info → orca_sdk-0.0.101.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: orca_sdk
-Version: 0.0.98
+Version: 0.0.101
 Summary: SDK for interacting with Orca Services
 License: Apache-2.0
 Author: Orca DB Inc.

{orca_sdk-0.0.98.dist-info → orca_sdk-0.0.101.dist-info}/RECORD RENAMED Viewed

@@ -19,12 +19,12 @@ orca_sdk/_utils/value_parser.py,sha256=c3qMABCCDQcIjn9N1orYYnlRwDW9JWdGwW_2TDZPL
 orca_sdk/_utils/value_parser_test.py,sha256=OybsiC-Obi32RRi9NIuwrVBRAnlyPMV1xVAaevSrb7M,1079
 orca_sdk/classification_model.py,sha256=SUiUgv_o3UUngpz3Le_L6DsijhjXVEB3yo84hrD1MX4,31172
 orca_sdk/classification_model_test.py,sha256=WganVoP-0vw1cqiVWJ2vXyGi4lwYp_hbZHultpxvFqk,19536
-orca_sdk/client.py,sha256=LEqdqMHbidh9NeZ_42RbEDR64KqXnqnX0JBjaoo2pnY,115704
+orca_sdk/client.py,sha256=gZ0e0gHoAxpvtsGYRnqH80XAZzi1EaYy2A0BplouOBA,117268
 orca_sdk/conftest.py,sha256=LHA46gDU_D0T_ogS6XOVQvGDDMD01nVZFWVcBYDConc,8885
 orca_sdk/credentials.py,sha256=KrmgP_5uqBKJXFJV6utTHIhU2odsr95VEqEXWe277DY,5074
 orca_sdk/credentials_test.py,sha256=whUweSJIEws6C8Bku-5V31Dv9hD1mFDDW7X2cCsB6g0,1629
-orca_sdk/datasource.py,sha256=344gqZsSV_N3RDlqXuLnDrPSizvciaAsSPmsiljNSmI,19329
-orca_sdk/datasource_test.py,sha256=nKQGjhX0VwCCLCdwl1ns-6kA5Ow-8pWQkSS9WV3CVww,9975
+orca_sdk/datasource.py,sha256=6wARRq-eNDJVSBABdVzn41z7s69xasTsqbozaVAsf9U,20263
+orca_sdk/datasource_test.py,sha256=wENPourrJvQN-uJJPaJI9EDuof6wVw3GirOhbrY4sFI,11564
 orca_sdk/embedding_model.py,sha256=YxMXdZ3tvnxnK93nArOr_HZ6QoRB-Mc5VNQJ0mcIdpk,26021
 orca_sdk/embedding_model_test.py,sha256=1aELyCuIzxSxUg7Z4tYtNOd8-hV5hFb-gsZTNh712OQ,7765
 orca_sdk/job.py,sha256=wWJPkkQbkNu_ylBtZN4AscU00VwWTfqlSmysRBUlivw,12787
@@ -33,8 +33,8 @@ orca_sdk/memoryset.py,sha256=hnhgHxsuV5JEDMnxazmO03VjxzPBENtHDPppHRfh9z4,85354
 orca_sdk/memoryset_test.py,sha256=14WG6u_adVmm6CSn2dM5lEfQYxybwh3s9Q7RBeTuoPE,20486
 orca_sdk/regression_model.py,sha256=je2g1BmoPCuouv5iWqDolja90F2w2vD6TooXA8KjL7c,24552
 orca_sdk/regression_model_test.py,sha256=8LDhtQeh52grZQ2Xd0ju1MQvb_hwosY_ORDDE3wS2LA,14570
-orca_sdk/telemetry.py,sha256=jOOFYYr1s3i5EASbCcmUJ_O469xCK5OFg7NVcVfTAlU,25400
+orca_sdk/telemetry.py,sha256=dRyf8fIvThkSBLDyD8BYkixg3nphsN-HbneWq7nbp_4,25380
 orca_sdk/telemetry_test.py,sha256=eT66C5lFdNg-pQdo2I__BP7Tn5fTc9aTkVo9ZhWwhU0,5519
-orca_sdk-0.0.98.dist-info/METADATA,sha256=z5MKWnC226a5mzHHN7aKJFvpFm9qTe46-gtaRTH8ppM,3613
-orca_sdk-0.0.98.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-orca_sdk-0.0.98.dist-info/RECORD,,
+orca_sdk-0.0.101.dist-info/METADATA,sha256=VmuFgh5XlRRIkRJbWd5y9APC3-b6uFZuUril5dys9vI,3614
+orca_sdk-0.0.101.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+orca_sdk-0.0.101.dist-info/RECORD,,

{orca_sdk-0.0.98.dist-info → orca_sdk-0.0.101.dist-info}/WHEEL RENAMED Viewed

File without changes

orca-sdk 0.0.98__py3-none-any.whl → 0.0.101__py3-none-any.whl

orca-sdk 0.0.98__py3-none-any.whl → 0.0.101__py3-none-any.whl