orca_sdk-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +19 -0
- orca_sdk/_generated_api_client/__init__.py +3 -0
- orca_sdk/_generated_api_client/api/__init__.py +193 -0
- orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +128 -0
- orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +170 -0
- orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +130 -0
- orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +127 -0
- orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +183 -0
- orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +170 -0
- orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +168 -0
- orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +154 -0
- orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +170 -0
- orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +161 -0
- orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +127 -0
- orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +190 -0
- orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +167 -0
- orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +127 -0
- orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_get.py +118 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +118 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +168 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +189 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +127 -0
- orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +181 -0
- orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +183 -0
- orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +168 -0
- orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +181 -0
- orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +167 -0
- orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +169 -0
- orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +188 -0
- orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +169 -0
- orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +184 -0
- orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +260 -0
- orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +127 -0
- orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +193 -0
- orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +188 -0
- orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +191 -0
- orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +187 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +188 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +157 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +127 -0
- orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +154 -0
- orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +156 -0
- orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +243 -0
- orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +162 -0
- orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +157 -0
- orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +127 -0
- orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +175 -0
- orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +171 -0
- orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +181 -0
- orca_sdk/_generated_api_client/client.py +216 -0
- orca_sdk/_generated_api_client/errors.py +38 -0
- orca_sdk/_generated_api_client/models/__init__.py +159 -0
- orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +84 -0
- orca_sdk/_generated_api_client/models/api_key_metadata.py +118 -0
- orca_sdk/_generated_api_client/models/base_model.py +55 -0
- orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +176 -0
- orca_sdk/_generated_api_client/models/classification_evaluation_result.py +114 -0
- orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +150 -0
- orca_sdk/_generated_api_client/models/column_info.py +114 -0
- orca_sdk/_generated_api_client/models/column_type.py +14 -0
- orca_sdk/_generated_api_client/models/conflict_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/create_api_key_request.py +99 -0
- orca_sdk/_generated_api_client/models/create_api_key_response.py +126 -0
- orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +259 -0
- orca_sdk/_generated_api_client/models/create_rac_model_request.py +209 -0
- orca_sdk/_generated_api_client/models/datasource_metadata.py +142 -0
- orca_sdk/_generated_api_client/models/delete_memories_request.py +70 -0
- orca_sdk/_generated_api_client/models/embed_request.py +127 -0
- orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +9 -0
- orca_sdk/_generated_api_client/models/evaluation_request.py +180 -0
- orca_sdk/_generated_api_client/models/evaluation_response.py +140 -0
- orca_sdk/_generated_api_client/models/feedback_type.py +9 -0
- orca_sdk/_generated_api_client/models/field_validation_error.py +103 -0
- orca_sdk/_generated_api_client/models/filter_item.py +231 -0
- orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +15 -0
- orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +16 -0
- orca_sdk/_generated_api_client/models/filter_item_op.py +16 -0
- orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +70 -0
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +259 -0
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +66 -0
- orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +166 -0
- orca_sdk/_generated_api_client/models/get_memories_request.py +70 -0
- orca_sdk/_generated_api_client/models/internal_server_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/label_class_metrics.py +108 -0
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +274 -0
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/label_prediction_result.py +101 -0
- orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +232 -0
- orca_sdk/_generated_api_client/models/labeled_memory.py +197 -0
- orca_sdk/_generated_api_client/models/labeled_memory_insert.py +108 -0
- orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +258 -0
- orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +277 -0
- orca_sdk/_generated_api_client/models/labeled_memory_update.py +171 -0
- orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +195 -0
- orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +9 -0
- orca_sdk/_generated_api_client/models/list_memories_request.py +104 -0
- orca_sdk/_generated_api_client/models/list_predictions_request.py +234 -0
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +9 -0
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +9 -0
- orca_sdk/_generated_api_client/models/lookup_request.py +81 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +83 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +9 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +180 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +66 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +9 -0
- orca_sdk/_generated_api_client/models/not_found_error_response.py +100 -0
- orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +20 -0
- orca_sdk/_generated_api_client/models/prediction_feedback.py +157 -0
- orca_sdk/_generated_api_client/models/prediction_feedback_category.py +115 -0
- orca_sdk/_generated_api_client/models/prediction_feedback_request.py +122 -0
- orca_sdk/_generated_api_client/models/prediction_feedback_result.py +102 -0
- orca_sdk/_generated_api_client/models/prediction_request.py +169 -0
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +97 -0
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +11 -0
- orca_sdk/_generated_api_client/models/rac_head_type.py +11 -0
- orca_sdk/_generated_api_client/models/rac_model_metadata.py +191 -0
- orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/task.py +198 -0
- orca_sdk/_generated_api_client/models/task_status.py +14 -0
- orca_sdk/_generated_api_client/models/task_status_info.py +133 -0
- orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +72 -0
- orca_sdk/_generated_api_client/models/unauthorized_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +94 -0
- orca_sdk/_generated_api_client/models/update_prediction_request.py +93 -0
- orca_sdk/_generated_api_client/py.typed +1 -0
- orca_sdk/_generated_api_client/types.py +56 -0
- orca_sdk/_utils/__init__.py +0 -0
- orca_sdk/_utils/analysis_ui.py +194 -0
- orca_sdk/_utils/analysis_ui_style.css +54 -0
- orca_sdk/_utils/auth.py +63 -0
- orca_sdk/_utils/auth_test.py +31 -0
- orca_sdk/_utils/common.py +37 -0
- orca_sdk/_utils/data_parsing.py +99 -0
- orca_sdk/_utils/data_parsing_test.py +244 -0
- orca_sdk/_utils/prediction_result_ui.css +18 -0
- orca_sdk/_utils/prediction_result_ui.py +64 -0
- orca_sdk/_utils/task.py +73 -0
- orca_sdk/classification_model.py +499 -0
- orca_sdk/classification_model_test.py +266 -0
- orca_sdk/conftest.py +117 -0
- orca_sdk/datasource.py +333 -0
- orca_sdk/datasource_test.py +95 -0
- orca_sdk/embedding_model.py +336 -0
- orca_sdk/embedding_model_test.py +173 -0
- orca_sdk/labeled_memoryset.py +1154 -0
- orca_sdk/labeled_memoryset_test.py +271 -0
- orca_sdk/orca_credentials.py +75 -0
- orca_sdk/orca_credentials_test.py +37 -0
- orca_sdk/telemetry.py +386 -0
- orca_sdk/telemetry_test.py +100 -0
- orca_sdk-0.1.0.dist-info/METADATA +39 -0
- orca_sdk-0.1.0.dist-info/RECORD +175 -0
- orca_sdk-0.1.0.dist-info/WHEEL +4 -0
orca_sdk/_utils/common.py
ADDED

@@ -0,0 +1,37 @@
from typing import Any, Literal

CreateMode = Literal["error", "open"]
"""
Mode for creating a resource.

**Options:**

- `"error"`: raise an error if a resource with the same name already exists
- `"open"`: open the resource with the same name if it exists
"""

DropMode = Literal["error", "ignore"]
"""
Mode for deleting a resource.

**Options:**

- `"error"`: raise an error if the resource does not exist
- `"ignore"`: do nothing if the resource does not exist
"""


class _UnsetSentinel:
    """See corresponding class in orcalib.pydantic_utils"""

    def __bool__(self) -> bool:
        return False

    def __repr__(self) -> str:
        return "UNSET"


UNSET: Any = _UnsetSentinel()
"""
Default value to indicate that no update should be applied to a field and it should not be set to None
"""
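
The literals and the falsy `UNSET` sentinel above are the knobs the SDK's resource helpers are built around. A minimal sketch of how they might compose; the `create_resource` and `update_resource` helpers here are invented for illustration and are not part of orca-sdk:

from typing import Any, Literal

CreateMode = Literal["error", "open"]

class _Unset:
    def __bool__(self) -> bool:
        return False

UNSET: Any = _Unset()

_resources: dict[str, dict] = {}  # hypothetical in-memory resource store

def create_resource(name: str, if_exists: CreateMode = "error") -> dict:
    # "error" raises on a name collision; "open" returns the existing resource
    if name in _resources:
        if if_exists == "error":
            raise ValueError(f"resource {name!r} already exists")
        return _resources[name]
    _resources[name] = {"name": name, "label": None}
    return _resources[name]

def update_resource(name: str, label: Any = UNSET) -> None:
    # UNSET distinguishes "leave the field alone" from an explicit None,
    # which would clear the field
    if label is not UNSET:
        _resources[name]["label"] = label
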
orca_sdk/_utils/data_parsing.py
ADDED

@@ -0,0 +1,99 @@
import pickle
from dataclasses import asdict, is_dataclass
from os import PathLike
from typing import Any, cast

from datasets import Dataset
from torch.utils.data import DataLoader as TorchDataLoader
from torch.utils.data import Dataset as TorchDataset


def parse_dict_like(item: Any, column_names: list[str] | None = None) -> dict:
    if isinstance(item, dict):
        return item

    if isinstance(item, tuple):
        if column_names is not None:
            assert len(item) == len(column_names)
            return {column_names[i]: item[i] for i in range(len(item))}
        elif hasattr(item, "_fields") and all(isinstance(field, str) for field in item._fields):  # type: ignore
            return {field: getattr(item, field) for field in item._fields}  # type: ignore
        else:
            raise ValueError("For datasets that return unnamed tuples, please provide column_names argument")

    if is_dataclass(item) and not isinstance(item, type):
        return asdict(item)

    raise ValueError(f"Cannot parse {type(item)}")


def parse_batch(batch: Any, column_names: list[str] | None = None) -> list[dict]:
    if isinstance(batch, list):
        return [parse_dict_like(item, column_names) for item in batch]

    batch = parse_dict_like(batch, column_names)
    keys = list(batch.keys())
    batch_size = len(batch[keys[0]])
    for key in keys:
        if not len(batch[key]) == batch_size:
            raise ValueError(f"Batch must consist of values of the same length, but {key} has length {len(batch[key])}")
    return [{key: batch[key][idx] for key in keys} for idx in range(batch_size)]


def hf_dataset_from_torch(torch_data: TorchDataLoader | TorchDataset, column_names: list[str] | None = None) -> Dataset:
    if isinstance(torch_data, TorchDataLoader):
        dataloader = torch_data
    else:
        dataloader = TorchDataLoader(torch_data, batch_size=1, collate_fn=lambda x: x)

    def generator():
        for batch in dataloader:
            yield from parse_batch(batch, column_names=column_names)

    return cast(Dataset, Dataset.from_generator(generator))


def hf_dataset_from_disk(file_path: str | PathLike) -> Dataset:
    """
    Load a dataset from disk into a HuggingFace Dataset object.

    Params:
        file_path: Path to the file on disk to create the memoryset from. The file type will
            be inferred from the file extension. The following file types are supported:

            - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
            - .json/.jsonl: [`JSON`][json] and JSON Lines files
            - .csv: [`CSV`][csv] files
            - .parquet: [`Parquet`](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile) files
            - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]

    Returns:
        A HuggingFace Dataset object containing the loaded data.

    Raises:
        [`ValueError`][ValueError]: If the pickle file contains unsupported data types or if
            loading the dataset fails for any reason.
    """
    if str(file_path).endswith(".pkl"):
        data = pickle.load(open(file_path, "rb"))
        if isinstance(data, list):
            return Dataset.from_list(data)
        elif isinstance(data, dict):
            return Dataset.from_dict(data)
        else:
            raise ValueError(f"Unsupported pickle file: {file_path}")
    elif str(file_path).endswith(".json"):
        hf_dataset = Dataset.from_json(file_path)
    elif str(file_path).endswith(".jsonl"):
        hf_dataset = Dataset.from_json(file_path)
    elif str(file_path).endswith(".csv"):
        hf_dataset = Dataset.from_csv(file_path)
    elif str(file_path).endswith(".parquet"):
        hf_dataset = Dataset.from_parquet(file_path)
    else:
        try:
            hf_dataset = Dataset.load_from_disk(file_path)
        except Exception as e:
            raise ValueError(f"Failed to load dataset from disk: {e}")

    return cast(Dataset, hf_dataset)
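
A quick usage sketch for the two entry points; the toy dataset and the commented file path are invented, and the import reaches into the private module shown above:

from datasets import Dataset
from torch.utils.data import Dataset as TorchDataset

from orca_sdk._utils.data_parsing import hf_dataset_from_disk, hf_dataset_from_torch

class TupleDataset(TorchDataset):
    # returns unnamed tuples, so column_names must be provided
    def __getitem__(self, i):
        return (f"example {i}", i % 2)

    def __len__(self):
        return 10

ds = hf_dataset_from_torch(TupleDataset(), column_names=["text", "label"])
assert isinstance(ds, Dataset) and ds.column_names == ["text", "label"]

# file type is inferred from the extension (.pkl, .json, .jsonl, .csv, .parquet)
# ds = hf_dataset_from_disk("data/train.jsonl")
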
orca_sdk/_utils/data_parsing_test.py
ADDED

@@ -0,0 +1,244 @@
import json
import pickle
import tempfile
from collections import namedtuple
from dataclasses import dataclass

import pandas as pd
import pytest
from datasets import Dataset
from datasets.exceptions import DatasetGenerationError
from torch.utils.data import DataLoader as TorchDataLoader
from torch.utils.data import Dataset as TorchDataset

from ..conftest import SAMPLE_DATA
from .data_parsing import hf_dataset_from_disk, hf_dataset_from_torch


class PytorchDictDataset(TorchDataset):
    def __init__(self):
        self.data = SAMPLE_DATA

    def __getitem__(self, i):
        return self.data[i]

    def __len__(self):
        return len(self.data)


def test_hf_dataset_from_torch_dict():
    # Given a Pytorch dataset that returns a dictionary for each item
    dataset = PytorchDictDataset()
    hf_dataset = hf_dataset_from_torch(dataset)
    # Then the HF dataset should be created successfully
    assert isinstance(hf_dataset, Dataset)
    assert len(hf_dataset) == len(dataset)
    assert set(hf_dataset.column_names) == {"text", "label", "key", "score", "source_id"}


class PytorchTupleDataset(TorchDataset):
    def __init__(self):
        self.data = SAMPLE_DATA

    def __getitem__(self, i):
        return self.data[i]["text"], self.data[i]["label"]

    def __len__(self):
        return len(self.data)


def test_hf_dataset_from_torch_tuple():
    # Given a Pytorch dataset that returns a tuple for each item
    dataset = PytorchTupleDataset()
    # And the correct number of column names passed in
    hf_dataset = hf_dataset_from_torch(dataset, column_names=["text", "label"])
    # Then the HF dataset should be created successfully
    assert isinstance(hf_dataset, Dataset)
    assert len(hf_dataset) == len(dataset)
    assert hf_dataset.column_names == ["text", "label"]


def test_hf_dataset_from_torch_tuple_error():
    # Given a Pytorch dataset that returns a tuple for each item
    dataset = PytorchTupleDataset()
    # Then the HF dataset should raise an error if no column names are passed in
    with pytest.raises(DatasetGenerationError):
        hf_dataset_from_torch(dataset)


def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
    # Given a Pytorch dataset that returns a tuple for each item
    dataset = PytorchTupleDataset()
    # Then the HF dataset should raise an error if not enough column names are passed in
    with pytest.raises(DatasetGenerationError):
        hf_dataset_from_torch(dataset, column_names=["value"])


DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])


class PytorchNamedTupleDataset(TorchDataset):
    def __init__(self):
        self.data = SAMPLE_DATA

    def __getitem__(self, i):
        return DatasetTuple(self.data[i]["text"], self.data[i]["label"])

    def __len__(self):
        return len(self.data)


def test_hf_dataset_from_torch_named_tuple():
    # Given a Pytorch dataset that returns a namedtuple for each item
    dataset = PytorchNamedTupleDataset()
    # And no column names are passed in
    hf_dataset = hf_dataset_from_torch(dataset)
    # Then the HF dataset should be created successfully
    assert isinstance(hf_dataset, Dataset)
    assert len(hf_dataset) == len(dataset)
    assert hf_dataset.column_names == ["value", "label"]


@dataclass
class DatasetItem:
    text: str
    label: int


class PytorchDataclassDataset(TorchDataset):
    def __init__(self):
        self.data = SAMPLE_DATA

    def __getitem__(self, i):
        return DatasetItem(text=self.data[i]["text"], label=self.data[i]["label"])

    def __len__(self):
        return len(self.data)


def test_hf_dataset_from_torch_dataclass():
    # Given a Pytorch dataset that returns a dataclass for each item
    dataset = PytorchDataclassDataset()
    hf_dataset = hf_dataset_from_torch(dataset)
    # Then the HF dataset should be created successfully
    assert isinstance(hf_dataset, Dataset)
    assert len(hf_dataset) == len(dataset)
    assert hf_dataset.column_names == ["text", "label"]


class PytorchInvalidDataset(TorchDataset):
    def __init__(self):
        self.data = SAMPLE_DATA

    def __getitem__(self, i):
        return [self.data[i]["text"], self.data[i]["label"]]

    def __len__(self):
        return len(self.data)


def test_hf_dataset_from_torch_invalid_dataset():
    # Given a Pytorch dataset that returns a list for each item
    dataset = PytorchInvalidDataset()
    # Then the HF dataset should raise an error
    with pytest.raises(DatasetGenerationError):
        hf_dataset_from_torch(dataset)


def test_hf_dataset_from_torchdataloader():
    # Given a Pytorch dataloader that returns a column-oriented batch of items
    dataset = PytorchDictDataset()

    def collate_fn(x: list[dict]):
        return {"value": [item["text"] for item in x], "label": [item["label"] for item in x]}

    dataloader = TorchDataLoader(dataset, batch_size=3, collate_fn=collate_fn)
    hf_dataset = hf_dataset_from_torch(dataloader)
    # Then the HF dataset should be created successfully
    assert isinstance(hf_dataset, Dataset)
    assert len(hf_dataset) == len(dataset)
    assert hf_dataset.column_names == ["value", "label"]


def test_hf_dataset_from_disk_pickle_list():
    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
        # Given a pickle file with test data that is a list
        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
        with open(temp_file.name, "wb") as f:
            pickle.dump(test_data, f)
        dataset = hf_dataset_from_disk(temp_file.name)
        # Then the HF dataset should be created successfully
        assert isinstance(dataset, Dataset)
        assert len(dataset) == 30
        assert dataset.column_names == ["value", "label"]


def test_hf_dataset_from_disk_pickle_dict():
    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
        # Given a pickle file with test data that is a dict
        test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
        with open(temp_file.name, "wb") as f:
            pickle.dump(test_data, f)
        dataset = hf_dataset_from_disk(temp_file.name)
        # Then the HF dataset should be created successfully
        assert isinstance(dataset, Dataset)
        assert len(dataset) == 30
        assert dataset.column_names == ["value", "label"]


def test_hf_dataset_from_disk_json():
    with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
        # Given a JSON file with test data
        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
        with open(temp_file.name, "w") as f:
            json.dump(test_data, f)
        dataset = hf_dataset_from_disk(temp_file.name)
        # Then the HF dataset should be created successfully
        assert isinstance(dataset, Dataset)
        assert len(dataset) == 30
        assert dataset.column_names == ["value", "label"]


def test_hf_dataset_from_disk_jsonl():
    with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
        # Given a JSONL file with test data
        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
        with open(temp_file.name, "w") as f:
            for item in test_data:
                f.write(json.dumps(item) + "\n")
        dataset = hf_dataset_from_disk(temp_file.name)
        # Then the HF dataset should be created successfully
        assert isinstance(dataset, Dataset)
        assert len(dataset) == 30
        assert dataset.column_names == ["value", "label"]


def test_hf_dataset_from_disk_csv():
    with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
        # Given a CSV file with test data
        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
        with open(temp_file.name, "w") as f:
            f.write("value,label\n")
            for item in test_data:
                f.write(f"{item['value']},{item['label']}\n")
        dataset = hf_dataset_from_disk(temp_file.name)
        # Then the HF dataset should be created successfully
        assert isinstance(dataset, Dataset)
        assert len(dataset) == 30
        assert dataset.column_names == ["value", "label"]


def test_hf_dataset_from_disk_parquet():
    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
        # Given a Parquet file with test data
        data = {
            "value": [f"test_{i}" for i in range(30)],
            "label": [i % 2 for i in range(30)],
        }
        df = pd.DataFrame(data)
        df.to_parquet(temp_file.name)
        dataset = hf_dataset_from_disk(temp_file.name)
        # Then the HF dataset should be created successfully
        assert isinstance(dataset, Dataset)
        assert len(dataset) == 30
        assert dataset.column_names == ["value", "label"]
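
The dataloader test above exercises the column-oriented path of `parse_batch`, which transposes a dict of equal-length columns into one row dict per example. A standalone sketch of that behavior, importing from the private module shown earlier:

from orca_sdk._utils.data_parsing import parse_batch

batch = {"value": ["a", "b", "c"], "label": [0, 1, 0]}
rows = parse_batch(batch)
assert rows == [
    {"value": "a", "label": 0},
    {"value": "b", "label": 1},
    {"value": "c", "label": 0},
]
# columns of unequal length raise ValueError
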
orca_sdk/_utils/prediction_result_ui.css
ADDED

@@ -0,0 +1,18 @@
.white {
  background-color: white;
}
.success {
  color: gray;
  font-size: 12px;
  height: 24px;
}
.html-container:has(.no-padding) {
  padding: 0;
  height: 24px;
}
.progress-bar {
  background-color: #2b9a66;
}
.progress-level-inner {
  display: none;
}
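
These selectors hook into the Gradio components created in prediction_result_ui.py below via `elem_classes` (for example, `white` stripes alternating rows and `success no-padding` styles the save-confirmation HTML). A minimal sketch of the mechanism, with inline CSS standing in for the stylesheet:

import gradio as gr

# elem_classes is how a component opts into a CSS rule by class name
with gr.Blocks(css=".white { background-color: white; }") as demo:
    with gr.Row(elem_classes="white"):
        gr.Markdown("striped row")
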
orca_sdk/_utils/prediction_result_ui.py
ADDED

@@ -0,0 +1,64 @@
import logging
import re
from pathlib import Path
from typing import TYPE_CHECKING

import gradio as gr

from ..labeled_memoryset import LabeledMemoryLookup

if TYPE_CHECKING:
    from ..telemetry import LabelPrediction


def inspect_prediction_result(prediction_result: "LabelPrediction"):
    label_names = prediction_result.memoryset.label_names

    def update_label(val: str, memory: LabeledMemoryLookup, progress=gr.Progress(track_tqdm=True)):
        progress(0)
        match = re.search(r".*\((\d+)\)$", val)
        if match:
            progress(0.5)
            new_label = int(match.group(1))
            memory.update(label=new_label)
            progress(1)
            return "✅ Changes saved"
        else:
            logging.error(f"Invalid label format: {val}")

    with gr.Blocks(
        fill_width=True,
        title="Prediction Results",
        css_paths=str(Path(__file__).parent / "prediction_result_ui.css"),
    ) as prediction_result_ui:
        gr.Markdown("# Prediction Results")
        gr.Markdown(f"**Input:** {prediction_result.input_value}")
        gr.Markdown(f"**Prediction:** {label_names[prediction_result.label]} ({prediction_result.label})")
        gr.Markdown("### Memory Lookups")

        with gr.Row(equal_height=True, variant="panel"):
            with gr.Column(scale=7):
                gr.Markdown("**Value**")
            with gr.Column(scale=3, min_width=150):
                gr.Markdown("**Label**")
        for i, mem_lookup in enumerate(prediction_result.memory_lookups):
            with gr.Row(equal_height=True, variant="panel", elem_classes="white" if i % 2 == 0 else None):
                with gr.Column(scale=7):
                    gr.Markdown(mem_lookup.value, label="Value", height=50)
                with gr.Column(scale=3, min_width=150):
                    dropdown = gr.Dropdown(
                        choices=[f"{label_name} ({i})" for i, label_name in enumerate(label_names)],
                        label="Label",
                        value=f"{label_names[mem_lookup.label]} ({mem_lookup.label})",
                        interactive=True,
                        container=False,
                    )
                    changes_saved = gr.HTML(lambda: "", elem_classes="success no-padding", every=15)
                    dropdown.change(
                        lambda val, mem_lookup=mem_lookup: update_label(val, mem_lookup),
                        inputs=[dropdown],
                        outputs=[changes_saved],
                        show_progress="full",
                    )

    prediction_result_ui.launch()
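
The dropdown values are rendered as `<label_name> (<label>)`, and `update_label` recovers the integer label with the regex above. In isolation:

import re

match = re.search(r".*\((\d+)\)$", "positive (1)")
assert match is not None and int(match.group(1)) == 1
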
orca_sdk/_utils/task.py
ADDED

@@ -0,0 +1,73 @@
import time

from tqdm.auto import tqdm

from .._generated_api_client.api import abort_task as _abort_task
from .._generated_api_client.api import get_task_status_task
from .._generated_api_client.api import list_tasks as _list_tasks
from .._generated_api_client.models import Task, TaskStatus, TaskStatusInfo

task_config = {
    "retry_interval": 3,
    "show_progress": True,
    "max_wait": 60 * 60,
}


def set_task_config(
    retry_interval: int | None = None,
    show_progress: bool | None = None,
    max_wait: int | None = None,
) -> None:
    if retry_interval is not None:
        task_config["retry_interval"] = retry_interval
    if show_progress is not None:
        task_config["show_progress"] = show_progress
    if max_wait is not None:
        task_config["max_wait"] = max_wait


def wait_for_task(task_id: str, description: str | None = None, show_progress: bool = True) -> None:
    start_time = time.time()
    pbar = None
    steps_total = None
    show_progress = show_progress and task_config["show_progress"]
    while True:
        task_status = get_task_status_task(task_id)

        # setup progress bar if steps total is known
        if task_status.steps_total is not None and steps_total is None:
            steps_total = task_status.steps_total
        if not pbar and steps_total is not None and show_progress:
            pbar = tqdm(total=steps_total, desc=description)

        # return if task is complete
        if task_status.status == TaskStatus.COMPLETED:
            if pbar:
                pbar.update(steps_total - pbar.n)
                pbar.close()
            return

        # raise error if task failed
        if task_status.status == TaskStatus.FAILED:
            raise RuntimeError(f"Task failed with {task_status.exception}")

        # raise error if task timed out
        if (time.time() - start_time) > task_config["max_wait"]:
            raise RuntimeError(f"Task {task_id} timed out after {task_config['max_wait']}s")

        # update progress bar
        if pbar and task_status.steps_completed is not None:
            pbar.update(task_status.steps_completed - pbar.n)

        # sleep before retrying
        time.sleep(task_config["retry_interval"])


def abort_task(task_id: str) -> TaskStatusInfo:
    _abort_task(task_id)
    return get_task_status_task(task_id)


def list_tasks() -> list[Task]:
    return _list_tasks()