orca-sdk 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,10 @@ import re
 from pathlib import Path
 from typing import TypedDict, cast

-import gradio as gr
+try:
+    import gradio as gr  # type: ignore
+except ImportError as e:
+    raise ImportError("gradio is required for UI features. Install it with: pip install orca_sdk[ui]") from e

 from ..memoryset import LabeledMemory, LabeledMemoryset

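
Note: the hunk above turns gradio into an optional peer dependency. A minimal standalone sketch of the same guard pattern (not orca_sdk code; the helper name is illustrative):

def require_gradio():
    # Optional-dependency guard: re-raise a friendlier error that names the extra
    # to install, instead of surfacing a bare ModuleNotFoundError.
    try:
        import gradio as gr
    except ImportError as e:
        raise ImportError("gradio is required for UI features. Install it with: pip install orca_sdk[ui]") from e
    return gr
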
@@ -1,12 +1,17 @@
+from __future__ import annotations
+
 import pickle
 from dataclasses import asdict, is_dataclass
 from os import PathLike
-from typing import Any, cast
+from typing import TYPE_CHECKING, Any, cast

 from datasets import Dataset
 from datasets.exceptions import DatasetGenerationError
-from torch.utils.data import DataLoader as TorchDataLoader
-from torch.utils.data import Dataset as TorchDataset
+
+if TYPE_CHECKING:
+    # peer dependencies that are used for types only
+    from torch.utils.data import DataLoader as TorchDataLoader  # type: ignore
+    from torch.utils.data import Dataset as TorchDataset  # type: ignore


 def parse_dict_like(item: Any, column_names: list[str] | None = None) -> dict:
@@ -62,6 +67,9 @@ def hf_dataset_from_torch(
     Returns:
         A HuggingFace Dataset object containing the data from the PyTorch DataLoader or Dataset.
     """
+    # peer dependency that is guaranteed to exist if the user provided a torch dataset
+    from torch.utils.data import DataLoader as TorchDataLoader  # type: ignore
+
     if isinstance(torch_data, TorchDataLoader):
         dataloader = torch_data
     else:
@@ -0,0 +1,91 @@
+import json
+import pickle
+import tempfile
+
+from datasets import Dataset
+
+from .data_parsing import hf_dataset_from_disk
+
+
+def test_hf_dataset_from_disk_pickle_list():
+    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
+        # Given a pickle file with test data that is a list
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "wb") as f:
+            pickle.dump(test_data, f)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_pickle_dict():
+    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
+        # Given a pickle file with test data that is a dict
+        test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
+        with open(temp_file.name, "wb") as f:
+            pickle.dump(test_data, f)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_json():
+    with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
+        # Given a JSON file with test data
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "w") as f:
+            json.dump(test_data, f)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_jsonl():
+    with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
+        # Given a JSONL file with test data
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "w") as f:
+            for item in test_data:
+                f.write(json.dumps(item) + "\n")
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_csv():
+    with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
+        # Given a CSV file with test data
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "w") as f:
+            f.write("value,label\n")
+            for item in test_data:
+                f.write(f"{item['value']},{item['label']}\n")
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_parquet():
+    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
+        # Given a Parquet file with test data
+        data = {
+            "value": [f"test_{i}" for i in range(30)],
+            "label": [i % 2 for i in range(30)],
+        }
+        dataset = Dataset.from_dict(data)
+        dataset.to_parquet(temp_file.name)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
@@ -1,18 +1,17 @@
-import json
-import pickle
-import tempfile
 from collections import namedtuple
 from dataclasses import dataclass

-import pandas as pd
 import pytest
 from datasets import Dataset
 from datasets.exceptions import DatasetGenerationError
-from torch.utils.data import DataLoader as TorchDataLoader
-from torch.utils.data import Dataset as TorchDataset

 from ..conftest import SAMPLE_DATA
-from .data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
+from .data_parsing import hf_dataset_from_torch
+
+pytest.importorskip("torch")
+
+from torch.utils.data import DataLoader as TorchDataLoader  # noqa: E402
+from torch.utils.data import Dataset as TorchDataset  # noqa: E402


 class PytorchDictDataset(TorchDataset):
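
Note: pytest.importorskip("torch") skips the whole module at collection time when torch is missing, which is why the torch imports can safely follow it (the noqa: E402 comments silence the imports-not-at-top warning). A self-contained example of the same idiom, unrelated to orca_sdk's tests:

import pytest

# Skips every test in this module during collection if torch is not installed;
# otherwise returns the imported module.
torch = pytest.importorskip("torch")


def test_tensor_roundtrip():
    assert torch.tensor([1, 2, 3]).tolist() == [1, 2, 3]
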
@@ -26,16 +25,6 @@ class PytorchDictDataset(TorchDataset):
         return len(self.data)


-def test_hf_dataset_from_torch_dict():
-    # Given a Pytorch dataset that returns a dictionary for each item
-    dataset = PytorchDictDataset()
-    hf_dataset = hf_dataset_from_torch(dataset)
-    # Then the HF dataset should be created successfully
-    assert isinstance(hf_dataset, Dataset)
-    assert len(hf_dataset) == len(dataset)
-    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
-
-
 class PytorchTupleDataset(TorchDataset):
     def __init__(self):
         self.data = SAMPLE_DATA
@@ -47,6 +36,58 @@ class PytorchTupleDataset(TorchDataset):
         return len(self.data)


+DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
+
+
+class PytorchNamedTupleDataset(TorchDataset):
+    def __init__(self):
+        self.data = SAMPLE_DATA
+
+    def __getitem__(self, i):
+        return DatasetTuple(self.data[i]["value"], self.data[i]["label"])
+
+    def __len__(self):
+        return len(self.data)
+
+
+@dataclass
+class DatasetItem:
+    text: str
+    label: int
+
+
+class PytorchDataclassDataset(TorchDataset):
+    def __init__(self):
+        self.data = SAMPLE_DATA
+
+    def __getitem__(self, i):
+        return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])
+
+    def __len__(self):
+        return len(self.data)
+
+
+class PytorchInvalidDataset(TorchDataset):
+    def __init__(self):
+        self.data = SAMPLE_DATA
+
+    def __getitem__(self, i):
+        return [self.data[i]["value"], self.data[i]["label"]]
+
+    def __len__(self):
+        return len(self.data)
+
+
+def test_hf_dataset_from_torch_dict():
+    # Given a Pytorch dataset that returns a dictionary for each item
+    dataset = PytorchDictDataset()
+    hf_dataset = hf_dataset_from_torch(dataset)
+    # Then the HF dataset should be created successfully
+    assert isinstance(hf_dataset, Dataset)
+    assert len(hf_dataset) == len(dataset)
+    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
+
+
 def test_hf_dataset_from_torch_tuple():
     # Given a Pytorch dataset that returns a tuple for each item
     dataset = PytorchTupleDataset()
@@ -74,20 +115,6 @@ def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
         hf_dataset_from_torch(dataset, column_names=["value"])


-DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
-
-
-class PytorchNamedTupleDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return DatasetTuple(self.data[i]["value"], self.data[i]["label"])
-
-    def __len__(self):
-        return len(self.data)
-
-
 def test_hf_dataset_from_torch_named_tuple():
     # Given a Pytorch dataset that returns a namedtuple for each item
     dataset = PytorchNamedTupleDataset()
@@ -99,23 +126,6 @@ def test_hf_dataset_from_torch_named_tuple():
     assert hf_dataset.column_names == ["value", "label"]


-@dataclass
-class DatasetItem:
-    text: str
-    label: int
-
-
-class PytorchDataclassDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])
-
-    def __len__(self):
-        return len(self.data)
-
-
 def test_hf_dataset_from_torch_dataclass():
     # Given a Pytorch dataset that returns a dataclass for each item
     dataset = PytorchDataclassDataset()
@@ -126,17 +136,6 @@ def test_hf_dataset_from_torch_dataclass():
     assert hf_dataset.column_names == ["text", "label"]


-class PytorchInvalidDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return [self.data[i]["value"], self.data[i]["label"]]
-
-    def __len__(self):
-        return len(self.data)
-
-
 def test_hf_dataset_from_torch_invalid_dataset():
     # Given a Pytorch dataset that returns a list for each item
     dataset = PytorchInvalidDataset()
@@ -158,87 +157,3 @@ def test_hf_dataset_from_torchdataloader():
     assert isinstance(hf_dataset, Dataset)
     assert len(hf_dataset) == len(dataset)
     assert hf_dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_pickle_list():
-    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-        # Given a pickle file with test data that is a list
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "wb") as f:
-            pickle.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_pickle_dict():
-    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-        # Given a pickle file with test data that is a dict
-        test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
-        with open(temp_file.name, "wb") as f:
-            pickle.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_json():
-    with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
-        # Given a JSON file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            json.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_jsonl():
-    with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
-        # Given a JSONL file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            for item in test_data:
-                f.write(json.dumps(item) + "\n")
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_csv():
-    with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
-        # Given a CSV file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            f.write("value,label\n")
-            for item in test_data:
-                f.write(f"{item['value']},{item['label']}\n")
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_parquet():
-    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
-        # Given a Parquet file with test data
-        data = {
-            "value": [f"test_{i}" for i in range(30)],
-            "label": [i % 2 for i in range(30)],
-        }
-        df = pd.DataFrame(data)
-        df.to_parquet(temp_file.name)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
@@ -5,7 +5,10 @@ import re
 from pathlib import Path
 from typing import TYPE_CHECKING

-import gradio as gr
+try:
+    import gradio as gr  # type: ignore
+except ImportError as e:
+    raise ImportError("gradio is required for UI features. Install it with: pip install orca_sdk[ui]") from e

 from ..memoryset import LabeledMemoryLookup, LabeledMemoryset, ScoredMemoryLookup

@@ -1,27 +1,43 @@
+from __future__ import annotations
+
 import base64
 import io
-from typing import cast
+from typing import TYPE_CHECKING, Any

-import numpy as np
-from numpy.typing import NDArray
-from PIL import Image as pil
+if TYPE_CHECKING:
+    # peer dependencies that are used for types only
+    import numpy as np  # type: ignore
+    from numpy.typing import NDArray  # type: ignore
+    from PIL import Image as pil  # type: ignore

-ValueType = str | pil.Image | NDArray[np.float32]
-"""
-The type of a value in a memoryset
+    ValueType = str | pil.Image | NDArray[np.float32]
+    """
+    The type of a value in a memoryset

-- `str`: string
-- `pil.Image`: image
-- `NDArray[np.float32]`: univariate or multivariate timeseries
-"""
+    - `str`: string
+    - `pil.Image`: image
+    - `NDArray[np.float32]`: univariate or multivariate timeseries
+    """
+else:
+    ValueType = Any


 def decode_value(value: str) -> ValueType:
     if value.startswith("data:image"):
+        try:
+            from PIL import Image as pil  # type: ignore
+        except ImportError as e:
+            raise ImportError("Install Pillow to use image values") from e
+
         header, data = value.split(",", 1)
         return pil.open(io.BytesIO(base64.b64decode(data)))

     if value.startswith("data:numpy"):
+        try:
+            import numpy as np  # type: ignore
+        except ImportError as e:
+            raise ImportError("Install numpy to use timeseries values") from e
+
         header, data = value.split(",", 1)
         return np.load(io.BytesIO(base64.b64decode(data)))

@@ -29,17 +45,28 @@ def decode_value(value: str) -> ValueType:
 
 
 def encode_value(value: ValueType) -> str:
-    if isinstance(value, pil.Image):
-        header = f"data:image/{value.format.lower()};base64," if value.format else "data:image;base64,"
+    try:
+        from PIL import Image as pil  # type: ignore
+    except ImportError:
+        pil = None  # type: ignore[assignment]
+
+    try:
+        import numpy as np  # type: ignore
+    except ImportError:
+        np = None  # type: ignore[assignment]
+
+    if pil is not None and isinstance(value, pil.Image):
+        header = f"data:image/{value.format.lower()};base64," if value.format else "data:image;base64,"  # type: ignore[union-attr]
         buffer = io.BytesIO()
-        value.save(buffer, format=value.format)
+        value.save(buffer, format=value.format)  # type: ignore[union-attr]
         bytes = buffer.getvalue()
         return header + base64.b64encode(bytes).decode("utf-8")

-    if isinstance(value, np.ndarray):
-        header = f"data:numpy/{value.dtype.name};base64,"
+    if np is not None and isinstance(value, np.ndarray):
+        header = f"data:numpy/{value.dtype.name};base64,"  # type: ignore[union-attr]
         buffer = io.BytesIO()
         np.save(buffer, value)
         return header + base64.b64encode(buffer.getvalue()).decode("utf-8")

-    return value
+    # Value is already a string, or an unhandled type (fall back to str conversion)
+    return value if isinstance(value, str) else str(value)
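
Note: a self-contained sketch (not orca_sdk code; assumes numpy is installed) of the data-URI-style round trip that encode_value and decode_value implement for numpy payloads:

import base64
import io

import numpy as np

arr = np.arange(6, dtype=np.float32)

# Encode: "data:numpy/<dtype>;base64,<payload>", mirroring encode_value above.
buffer = io.BytesIO()
np.save(buffer, arr)
encoded = f"data:numpy/{arr.dtype.name};base64," + base64.b64encode(buffer.getvalue()).decode("utf-8")

# Decode: split off the header and load the array back, mirroring decode_value.
header, data = encoded.split(",", 1)
decoded = np.load(io.BytesIO(base64.b64decode(data)))
assert np.allclose(decoded, arr)
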
@@ -1,5 +1,4 @@
-import numpy as np
-from PIL import Image as pil
+import pytest

 from .value_parser import decode_value, encode_value

@@ -13,6 +12,7 @@ def test_string_parsing():


 def test_image_parsing():
+    pil = pytest.importorskip("PIL.Image")
     img = pil.new("RGB", (10, 10), color="red")
     img.format = "PNG"

@@ -22,10 +22,11 @@ def test_image_parsing():

     decoded = decode_value(encoded)
     assert isinstance(decoded, pil.Image)
-    assert decoded.size == img.size
+    assert decoded.size == img.size  # type: ignore[union-attr]


 def test_timeseries_parsing():
+    np = pytest.importorskip("numpy")
     timeseries = np.random.rand(20, 3).astype(np.float32)

     encoded = encode_value(timeseries)
@@ -34,6 +35,6 @@ def test_timeseries_parsing():

     decoded = decode_value(encoded)
     assert isinstance(decoded, np.ndarray)
-    assert decoded.shape == timeseries.shape
-    assert decoded.dtype == timeseries.dtype
+    assert decoded.shape == timeseries.shape  # type: ignore[union-attr]
+    assert decoded.dtype == timeseries.dtype  # type: ignore[union-attr]
     assert np.allclose(decoded, timeseries)