orca-sdk 0.0.92__py3-none-any.whl → 0.0.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/_generated_api_client/api/__init__.py +8 -0
- orca_sdk/_generated_api_client/api/datasource/download_datasource_datasource_name_or_id_download_get.py +148 -0
- orca_sdk/_generated_api_client/api/memoryset/suggest_cascading_edits_memoryset_name_or_id_memory_memory_id_cascading_edits_post.py +233 -0
- orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +60 -10
- orca_sdk/_generated_api_client/api/telemetry/count_predictions_telemetry_prediction_count_post.py +10 -10
- orca_sdk/_generated_api_client/models/__init__.py +10 -0
- orca_sdk/_generated_api_client/models/cascade_edit_suggestions_request.py +154 -0
- orca_sdk/_generated_api_client/models/cascading_edit_suggestion.py +92 -0
- orca_sdk/_generated_api_client/models/classification_evaluation_result.py +62 -0
- orca_sdk/_generated_api_client/models/count_predictions_request.py +195 -0
- orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +1 -0
- orca_sdk/_generated_api_client/models/http_validation_error.py +86 -0
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +8 -0
- orca_sdk/_generated_api_client/models/labeled_memory.py +8 -0
- orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +8 -0
- orca_sdk/_generated_api_client/models/labeled_memory_with_feedback_metrics.py +8 -0
- orca_sdk/_generated_api_client/models/list_predictions_request.py +62 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_configs.py +0 -20
- orca_sdk/_generated_api_client/models/prediction_request.py +16 -7
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +5 -0
- orca_sdk/_generated_api_client/models/validation_error.py +99 -0
- orca_sdk/_utils/data_parsing.py +31 -2
- orca_sdk/_utils/data_parsing_test.py +18 -15
- orca_sdk/_utils/tqdm_file_reader.py +12 -0
- orca_sdk/classification_model.py +32 -12
- orca_sdk/classification_model_test.py +95 -34
- orca_sdk/conftest.py +87 -25
- orca_sdk/datasource.py +56 -12
- orca_sdk/datasource_test.py +9 -0
- orca_sdk/embedding_model_test.py +6 -5
- orca_sdk/memoryset.py +78 -0
- orca_sdk/memoryset_test.py +199 -123
- orca_sdk/telemetry.py +5 -3
- {orca_sdk-0.0.92.dist-info → orca_sdk-0.0.94.dist-info}/METADATA +1 -1
- {orca_sdk-0.0.92.dist-info → orca_sdk-0.0.94.dist-info}/RECORD +36 -28
- {orca_sdk-0.0.92.dist-info → orca_sdk-0.0.94.dist-info}/WHEEL +0 -0
orca_sdk/datasource.py
CHANGED

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import logging
 import tempfile
+import zipfile
 from datetime import datetime
 from os import PathLike
 from pathlib import Path
@@ -12,6 +13,7 @@ import pyarrow as pa
 from datasets import Dataset
 from torch.utils.data import DataLoader as TorchDataLoader
 from torch.utils.data import Dataset as TorchDataset
+from tqdm.auto import tqdm
 
 from ._generated_api_client.api import (
     delete_datasource,
@@ -25,6 +27,7 @@ from ._generated_api_client.client import get_client
 from ._generated_api_client.models import ColumnType, DatasourceMetadata
 from ._utils.common import CreateMode, DropMode
 from ._utils.data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
+from ._utils.tqdm_file_reader import TqdmFileReader
 
 
 class Datasource:
@@ -82,6 +85,39 @@ class Datasource:
             + "})"
         )
 
+    def download(self, output_path: str | PathLike) -> None:
+        """
+        Download the datasource as a ZIP archive and extract its contents to the specified path.
+
+        Params:
+            output_path: The local directory where the downloaded files will be saved.
+
+        Returns:
+            None
+
+        Raises:
+            LookupError: If the datasource is not found.
+            RuntimeError: If the download fails.
+        """
+        output_path = Path(output_path)
+        client = get_client().get_httpx_client()
+        url = f"/datasource/{self.id}/download"
+        response = client.get(url)
+        if response.status_code == 404:
+            raise LookupError(f"Datasource {self.id} not found")
+        if response.status_code != 200:
+            raise RuntimeError(f"Failed to download datasource: {response.status_code} {response.text}")
+
+        with tempfile.NamedTemporaryFile(suffix=".zip") as tmp_zip:
+            tmp_zip.write(response.content)
+            tmp_zip.flush()
+            with zipfile.ZipFile(tmp_zip.name, "r") as zf:
+                output_path.mkdir(parents=True, exist_ok=True)
+                for file in zf.namelist():
+                    out_file = output_path / Path(file).name
+                    with zf.open(file) as af:
+                        out_file.write_bytes(af.read())
+
     @classmethod
     def from_hf_dataset(
         cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
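
For context, a minimal usage sketch of the new download() method. The import path and names below are assumptions, not part of this diff; from_hf_dataset and download are the methods shown above:

    from datasets import Dataset
    from orca_sdk import Datasource  # assuming the class is exported at the package root

    ds = Datasource.from_hf_dataset(
        "demo_datasource", Dataset.from_dict({"text": ["hello", "world"], "label": [0, 1]})
    )
    ds.download("exported_datasource")  # ZIP contents are extracted into this directory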
@@ -113,19 +149,27 @@ class Datasource:
         with tempfile.TemporaryDirectory() as tmp_dir:
             dataset.save_to_disk(tmp_dir)
             files = []
-
-
-
-
-
-
-
-
-
-            files
-
+
+            # Calculate total size for all files
+            file_paths = list(Path(tmp_dir).iterdir())
+            total_size = sum(file_path.stat().st_size for file_path in file_paths)
+
+            with tqdm(total=total_size, unit="B", unit_scale=True, desc="Uploading") as pbar:
+                for file_path in file_paths:
+                    buffered_reader = open(file_path, "rb")
+                    tqdm_reader = TqdmFileReader(buffered_reader, pbar)
+                    files.append(("files", (file_path.name, tqdm_reader)))
+
+            # Do not use Generated client for this endpoint b/c it does not handle files properly
+            metadata = parse_create_response(
+                response=client.get_httpx_client().request(
+                    method="post",
+                    url="/datasource/",
+                    files=files,
+                    data={"name": name, "description": description},
+                )
             )
-
+
         return cls(metadata=metadata)
 
     @classmethod
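
The upload path now streams files through orca_sdk/_utils/tqdm_file_reader.py (+12 lines), which this diff does not show. A minimal sketch of such a wrapper, inferred only from the call site above; the actual implementation may differ:

    class TqdmFileReader:
        """Wraps a binary file and advances a shared tqdm bar as bytes are read."""

        def __init__(self, file, pbar):
            self._file = file
            self._pbar = pbar  # one tqdm bar shared across all files in the upload

        def read(self, size: int = -1) -> bytes:
            chunk = self._file.read(size)
            self._pbar.update(len(chunk))  # count only the bytes actually read
            return chunk

        def __getattr__(self, name):
            # delegate everything else (name, close, ...) to the wrapped file
            return getattr(self._file, name)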
orca_sdk/datasource_test.py
CHANGED

@@ -1,3 +1,5 @@
+import os
+import tempfile
 from uuid import uuid4
 
 import pytest
@@ -94,3 +96,10 @@ def test_drop_datasource_unauthorized(datasource, unauthorized):
 def test_drop_datasource_invalid_input():
     with pytest.raises(ValueError, match=r"Invalid input:.*"):
         Datasource.drop("not valid id")
+
+
+def test_download_datasource(datasource):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        output_path = os.path.join(temp_dir, "datasource.zip")
+        datasource.download(output_path)
+        assert os.path.exists(output_path)
orca_sdk/embedding_model_test.py
CHANGED

@@ -53,7 +53,7 @@ def test_embed_text_unauthenticated(unauthenticated):
 
 @pytest.fixture(scope="session")
 def finetuned_model(datasource) -> FinetunedEmbeddingModel:
-    return PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model", datasource
+    return PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model", datasource)
 
 
 def test_finetune_model_with_datasource(finetuned_model: FinetunedEmbeddingModel):
@@ -65,8 +65,10 @@ def test_finetune_model_with_datasource(finetuned_model: FinetunedEmbeddingModel
     assert finetuned_model._status == TaskStatus.COMPLETED
 
 
-def test_finetune_model_with_memoryset(
-    finetuned_model = PretrainedEmbeddingModel.DISTILBERT.finetune(
+def test_finetune_model_with_memoryset(readonly_memoryset: LabeledMemoryset):
+    finetuned_model = PretrainedEmbeddingModel.DISTILBERT.finetune(
+        "test_finetuned_model_from_memoryset", readonly_memoryset
+    )
     assert finetuned_model is not None
     assert finetuned_model.name == "test_finetuned_model_from_memoryset"
     assert finetuned_model.base_model == PretrainedEmbeddingModel.DISTILBERT
@@ -109,7 +111,6 @@ def test_use_finetuned_model_in_memoryset(datasource: Datasource, finetuned_mode
         "test_memoryset_finetuned_model",
         datasource,
         embedding_model=finetuned_model,
-        value_column="text",
     )
     assert memoryset is not None
     assert memoryset.name == "test_memoryset_finetuned_model"
@@ -152,7 +153,7 @@ def test_all_finetuned_models_unauthorized(unauthorized, finetuned_model: Finetu
 
 
 def test_drop_finetuned_model(datasource: Datasource):
-    PretrainedEmbeddingModel.DISTILBERT.finetune("finetuned_model_to_delete", datasource
+    PretrainedEmbeddingModel.DISTILBERT.finetune("finetuned_model_to_delete", datasource)
     assert FinetunedEmbeddingModel.open("finetuned_model_to_delete")
     FinetunedEmbeddingModel.drop("finetuned_model_to_delete")
     with pytest.raises(LookupError):
orca_sdk/memoryset.py
CHANGED

@@ -7,6 +7,7 @@ from typing import Any, Iterable, Literal, cast, overload
 
 import pandas as pd
 import pyarrow as pa
+from attrs import fields
 from datasets import Dataset
 from torch.utils.data import DataLoader as TorchDataLoader
 from torch.utils.data import Dataset as TorchDataset
@@ -29,11 +30,14 @@ from ._generated_api_client.api import (
     memoryset_lookup_gpu,
     potential_duplicate_groups,
     query_memoryset,
+    suggest_cascading_edits,
     update_memories_gpu,
     update_memory_gpu,
     update_memoryset,
 )
 from ._generated_api_client.models import (
+    CascadeEditSuggestionsRequest,
+    CascadingEditSuggestion,
     CloneLabeledMemorysetRequest,
     CreateLabeledMemorysetRequest,
     DeleteMemoriesRequest,
@@ -1180,6 +1184,63 @@ class LabeledMemoryset:
         updated_memories = [LabeledMemory(self.id, memory) for memory in response]
         return updated_memories[0] if isinstance(updates, dict) else updated_memories
 
+    def get_cascading_edits_suggestions(
+        self: LabeledMemoryset,
+        memory: LabeledMemory,
+        *,
+        old_label: int,
+        new_label: int,
+        max_neighbors: int = 50,
+        max_validation_neighbors: int = 10,
+        similarity_threshold: float | None = None,
+        only_if_has_old_label: bool = True,
+        exclude_if_new_label: bool = True,
+        suggestion_cooldown_time: float = 3600.0 * 24.0,  # 1 day
+        label_confirmation_cooldown_time: float = 3600.0 * 24.0 * 7,  # 1 week
+    ) -> list[CascadingEditSuggestion]:
+        """
+        Suggest cascading edits for a given memory based on nearby points with similar labels.
+
+        This method is meant to be called after a user changes a memory's label. It looks for
+        nearby candidates in embedding space that may be subject to similar relabeling and
+        returns them as suggestions. The system uses scoring heuristics, label filters, and
+        cooldown tracking to reduce noise and improve usability.
+
+        Params:
+            memory: The memory whose label was just changed.
+            old_label: The label this memory used to have.
+            new_label: The label it was changed to.
+            max_neighbors: Maximum number of neighbors to consider.
+            max_validation_neighbors: Maximum number of neighbors to use for label suggestion.
+            similarity_threshold: If set, only include neighbors with a lookup score above this threshold.
+            only_if_has_old_label: If True, only consider neighbors that have the old label.
+            exclude_if_new_label: If True, exclude neighbors that already have the new label.
+            suggestion_cooldown_time: Minimum time (in seconds) since the last suggestion for a neighbor
+                to be considered again.
+            label_confirmation_cooldown_time: Minimum time (in seconds) since a neighbor's label was
+                confirmed for it to be considered for suggestions.
+
+        Returns:
+            A list of CascadingEditSuggestion objects, each containing a neighbor and the suggested new label.
+        """
+        return suggest_cascading_edits(
+            name_or_id=self.id,
+            memory_id=memory.memory_id,
+            body=CascadeEditSuggestionsRequest(
+                old_label=old_label,
+                new_label=new_label,
+                max_neighbors=max_neighbors,
+                max_validation_neighbors=max_validation_neighbors,
+                similarity_threshold=similarity_threshold,
+                only_if_has_old_label=only_if_has_old_label,
+                exclude_if_new_label=exclude_if_new_label,
+                suggestion_cooldown_time=suggestion_cooldown_time,
+                label_confirmation_cooldown_time=label_confirmation_cooldown_time,
+            ),
+        )
+
     def delete(self, memory_id: str | Iterable[str]) -> None:
         """
         Delete memories from the memoryset
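
For context, a hedged sketch of calling the new suggestion API after relabeling a memory; the memoryset handle, memory object, and label values are illustrative assumptions, not part of this diff:

    # memoryset is a LabeledMemoryset and memory a LabeledMemory obtained elsewhere,
    # e.g. via LabeledMemoryset.open(...) (hypothetical here)
    suggestions = memoryset.get_cascading_edits_suggestions(
        memory,
        old_label=0,
        new_label=1,
        max_neighbors=25,           # consider fewer neighbors than the default 50
        similarity_threshold=0.85,  # only suggest edits for fairly close neighbors
    )
    for suggestion in suggestions:
        print(suggestion)  # each CascadingEditSuggestion pairs a neighbor with a suggested label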
@@ -1229,6 +1290,9 @@ class LabeledMemoryset:
         Returns:
             dictionary with aggregate metrics for each analysis that was run
 
+        Raises:
+            ValueError: If an invalid analysis name is provided
+
         Examples:
             Run label and duplicate analysis:
             >>> memoryset.analyze("label", {"name": "duplicate", "possible_duplicate_threshold": 0.99})
@@ -1263,12 +1327,26 @@ class LabeledMemoryset:
            Display label analysis to review potential mislabelings:
            >>> memoryset.display_label_analysis()
        """
+
+        # Get valid analysis names from MemorysetAnalysisConfigs
+        valid_analysis_names = {
+            field.name for field in fields(MemorysetAnalysisConfigs) if field.name != "additional_properties"
+        }
+
         configs: dict[str, dict] = {}
         for analysis in analyses:
             if isinstance(analysis, str):
+                error_msg = (
+                    f"Invalid analysis name: {analysis}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
+                )
+                if analysis not in valid_analysis_names:
+                    raise ValueError(error_msg)
                 configs[analysis] = {}
             else:
                 name = analysis.pop("name")  # type: ignore
+                error_msg = f"Invalid analysis name: {name}. Valid names are: {', '.join(sorted(valid_analysis_names))}"
+                if name not in valid_analysis_names:
+                    raise ValueError(error_msg)
                 configs[name] = analysis  # type: ignore
 
         analysis = analyze_memoryset(