orca-sdk 0.0.91__py3-none-any.whl → 0.0.93__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. orca_sdk/_generated_api_client/api/__init__.py +4 -0
  2. orca_sdk/_generated_api_client/api/memoryset/suggest_cascading_edits_memoryset_name_or_id_memory_memory_id_cascading_edits_post.py +233 -0
  3. orca_sdk/_generated_api_client/models/__init__.py +4 -0
  4. orca_sdk/_generated_api_client/models/base_label_prediction_result.py +9 -1
  5. orca_sdk/_generated_api_client/models/cascade_edit_suggestions_request.py +154 -0
  6. orca_sdk/_generated_api_client/models/cascading_edit_suggestion.py +92 -0
  7. orca_sdk/_generated_api_client/models/classification_evaluation_result.py +62 -0
  8. orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +1 -0
  9. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +8 -0
  10. orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +8 -8
  11. orca_sdk/_generated_api_client/models/labeled_memory.py +8 -0
  12. orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +8 -0
  13. orca_sdk/_generated_api_client/models/labeled_memory_with_feedback_metrics.py +8 -0
  14. orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +8 -0
  15. orca_sdk/_generated_api_client/models/prediction_request.py +16 -7
  16. orca_sdk/_shared/__init__.py +1 -0
  17. orca_sdk/_shared/metrics.py +195 -0
  18. orca_sdk/_shared/metrics_test.py +169 -0
  19. orca_sdk/_utils/data_parsing.py +31 -2
  20. orca_sdk/_utils/data_parsing_test.py +18 -15
  21. orca_sdk/_utils/tqdm_file_reader.py +12 -0
  22. orca_sdk/classification_model.py +170 -27
  23. orca_sdk/classification_model_test.py +74 -32
  24. orca_sdk/conftest.py +86 -25
  25. orca_sdk/datasource.py +22 -12
  26. orca_sdk/embedding_model_test.py +6 -5
  27. orca_sdk/memoryset.py +78 -0
  28. orca_sdk/memoryset_test.py +197 -123
  29. orca_sdk/telemetry.py +3 -0
  30. {orca_sdk-0.0.91.dist-info → orca_sdk-0.0.93.dist-info}/METADATA +3 -1
  31. {orca_sdk-0.0.91.dist-info → orca_sdk-0.0.93.dist-info}/RECORD +32 -25
  32. {orca_sdk-0.0.91.dist-info → orca_sdk-0.0.93.dist-info}/WHEEL +0 -0
@@ -0,0 +1,169 @@
+ """
+ IMPORTANT:
+ - This is a shared file between OrcaLib and the Orca SDK.
+ - Please ensure that it does not have any dependencies on the OrcaLib code.
+ - Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.
+ """
+
+ from typing import Literal
+
+ import numpy as np
+ import pytest
+
+ from .metrics import (
+     EvalPrediction,
+     calculate_pr_curve,
+     calculate_roc_curve,
+     classification_scores,
+     compute_classifier_metrics,
+     softmax,
+ )
+
+
+ def test_binary_metrics():
+     y_true = np.array([0, 1, 1, 0, 1])
+     y_score = np.array([0.1, 0.9, 0.8, 0.3, 0.2])
+
+     metrics = classification_scores(y_true, y_score)
+
+     assert metrics["accuracy"] == 0.8
+     assert metrics["f1_score"] == 0.8
+     assert metrics["roc_auc"] is not None
+     assert metrics["roc_auc"] > 0.8
+     assert metrics["roc_auc"] < 1.0
+     assert metrics["pr_auc"] is not None
+     assert metrics["pr_auc"] > 0.8
+     assert metrics["pr_auc"] < 1.0
+     assert metrics["log_loss"] is not None
+     assert metrics["log_loss"] > 0.0
+
+
+ def test_multiclass_metrics_with_2_classes():
+     y_true = np.array([0, 1, 1, 0, 1])
+     y_score = np.array([[0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])
+
+     metrics = classification_scores(y_true, y_score)
+
+     assert metrics["accuracy"] == 0.8
+     assert metrics["f1_score"] == 0.8
+     assert metrics["roc_auc"] is not None
+     assert metrics["roc_auc"] > 0.8
+     assert metrics["roc_auc"] < 1.0
+     assert metrics["pr_auc"] is not None
+     assert metrics["pr_auc"] > 0.8
+     assert metrics["pr_auc"] < 1.0
+     assert metrics["log_loss"] is not None
+     assert metrics["log_loss"] > 0.0
+
+
+ @pytest.mark.parametrize(
+     "average, multiclass",
+     [("micro", "ovr"), ("macro", "ovr"), ("weighted", "ovr"), ("micro", "ovo"), ("macro", "ovo"), ("weighted", "ovo")],
+ )
+ def test_multiclass_metrics_with_3_classes(
+     average: Literal["micro", "macro", "weighted"], multiclass: Literal["ovr", "ovo"]
+ ):
+     y_true = np.array([0, 1, 1, 0, 2])
+     y_score = np.array([[0.9, 0.1, 0.0], [0.1, 0.9, 0.0], [0.2, 0.8, 0.0], [0.7, 0.3, 0.0], [0.0, 0.0, 1.0]])
+
+     metrics = classification_scores(y_true, y_score, average=average, multi_class=multiclass)
+
+     assert metrics["accuracy"] == 1.0
+     assert metrics["f1_score"] == 1.0
+     assert metrics["roc_auc"] is not None
+     assert metrics["roc_auc"] > 0.8
+     assert metrics["pr_auc"] is None
+     assert metrics["log_loss"] is not None
+     assert metrics["log_loss"] > 0.0
+
+
+ def test_does_not_modify_logits_unless_necessary():
+     logits = np.array([[0.1, 0.9], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])
+     references = np.array([0, 1, 0, 1])
+     metrics = compute_classifier_metrics(EvalPrediction(logits, references))
+     assert metrics["log_loss"] == classification_scores(references, logits)["log_loss"]
+
+
+ def test_normalizes_logits_if_necessary():
+     logits = np.array([[1.2, 3.9], [1.2, 5.8], [1.2, 2.7], [1.2, 1.3]])
+     references = np.array([0, 1, 0, 1])
+     metrics = compute_classifier_metrics(EvalPrediction(logits, references))
+     assert (
+         metrics["log_loss"] == classification_scores(references, logits / logits.sum(axis=1, keepdims=True))["log_loss"]
+     )
+
+
+ def test_softmaxes_logits_if_necessary():
+     logits = np.array([[-1.2, 3.9], [1.2, -5.8], [1.2, 2.7], [1.2, 1.3]])
+     references = np.array([0, 1, 0, 1])
+     metrics = compute_classifier_metrics(EvalPrediction(logits, references))
+     assert metrics["log_loss"] == classification_scores(references, softmax(logits))["log_loss"]
+
+
+ def test_precision_recall_curve():
+     y_true = np.array([0, 1, 1, 0, 1])
+     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
+
+     precision, recall, thresholds = calculate_pr_curve(y_true, y_score)
+     assert precision is not None
+     assert recall is not None
+     assert thresholds is not None
+
+     assert len(precision) == len(recall) == len(thresholds) == 6
+     assert precision[0] == 0.6
+     assert recall[0] == 1.0
+     assert precision[-1] == 1.0
+     assert recall[-1] == 0.0
+
+     # test that thresholds are sorted
+     assert np.all(np.diff(thresholds) >= 0)
+
+
+ def test_roc_curve():
+     y_true = np.array([0, 1, 1, 0, 1])
+     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
+
+     fpr, tpr, thresholds = calculate_roc_curve(y_true, y_score)
+     assert fpr is not None
+     assert tpr is not None
+     assert thresholds is not None
+
+     assert len(fpr) == len(tpr) == len(thresholds) == 6
+     assert fpr[0] == 1.0
+     assert tpr[0] == 1.0
+     assert fpr[-1] == 0.0
+     assert tpr[-1] == 0.0
+
+     # test that thresholds are sorted
+     assert np.all(np.diff(thresholds) >= 0)
+
+
+ def test_precision_recall_curve_max_length():
+     y_true = np.array([0, 1, 1, 0, 1])
+     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
+
+     precision, recall, thresholds = calculate_pr_curve(y_true, y_score, max_length=5)
+     assert len(precision) == len(recall) == len(thresholds) == 5
+
+     assert precision[0] == 0.6
+     assert recall[0] == 1.0
+     assert precision[-1] == 1.0
+     assert recall[-1] == 0.0
+
+     # test that thresholds are sorted
+     assert np.all(np.diff(thresholds) >= 0)
+
+
+ def test_roc_curve_max_length():
+     y_true = np.array([0, 1, 1, 0, 1])
+     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
+
+     fpr, tpr, thresholds = calculate_roc_curve(y_true, y_score, max_length=5)
+     assert len(fpr) == len(tpr) == len(thresholds) == 5
+     assert fpr[0] == 1.0
+     assert tpr[0] == 1.0
+     assert fpr[-1] == 0.0
+     assert tpr[-1] == 0.0
+
+     # test that thresholds are sorted
+     assert np.all(np.diff(thresholds) >= 0)
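For orientation, here is a minimal usage sketch of the shared helpers these tests exercise. It only uses the signatures and result keys the tests above assert on; importing from the private `orca_sdk._shared.metrics` module is an assumption about packaging, not a documented public API.

```python
# Illustrative only: mirrors the calls exercised by the tests above.
# Anything beyond the asserted keys and parameters is an assumption.
import numpy as np

from orca_sdk._shared.metrics import calculate_pr_curve, classification_scores

y_true = np.array([0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.9, 0.8, 0.3, 0.2])  # scores for the positive class

metrics = classification_scores(y_true, y_score)
print(metrics["accuracy"], metrics["f1_score"], metrics["roc_auc"], metrics["pr_auc"], metrics["log_loss"])

# max_length caps the number of curve points, as test_precision_recall_curve_max_length shows
precision, recall, thresholds = calculate_pr_curve(y_true, y_score, max_length=5)
```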
@@ -1,12 +1,16 @@
+ import logging
  import pickle
  from dataclasses import asdict, is_dataclass
  from os import PathLike
+ from tempfile import TemporaryDirectory
  from typing import Any, cast

  from datasets import Dataset
  from torch.utils.data import DataLoader as TorchDataLoader
  from torch.utils.data import Dataset as TorchDataset

+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+

  def parse_dict_like(item: Any, column_names: list[str] | None = None) -> dict:
      if isinstance(item, dict):
@@ -40,7 +44,24 @@ def parse_batch(batch: Any, column_names: list[str] | None = None) -> list[dict]
      return [{key: batch[key][idx] for key in keys} for idx in range(batch_size)]


- def hf_dataset_from_torch(torch_data: TorchDataLoader | TorchDataset, column_names: list[str] | None = None) -> Dataset:
+ def hf_dataset_from_torch(
+     torch_data: TorchDataLoader | TorchDataset, column_names: list[str] | None = None, ignore_cache=False
+ ) -> Dataset:
+     """
+     Create a HuggingFace Dataset from a PyTorch DataLoader or Dataset.
+
+     NOTE: It's important to ignore the cached files when testing (i.e., ignore_cache=True), because
+     cached results can mask changes you've made to tests. This can make a test appear to succeed
+     when it's actually broken or vice versa.
+
+     Params:
+         torch_data: A PyTorch DataLoader or Dataset object to create the HuggingFace Dataset from.
+         column_names: Optional list of column names to use for the dataset. If not provided,
+             the column names will be inferred from the data.
+         ignore_cache: If True, the dataset will not be cached on disk.
+     Returns:
+         A HuggingFace Dataset object containing the data from the PyTorch DataLoader or Dataset.
+     """
      if isinstance(torch_data, TorchDataLoader):
          dataloader = torch_data
      else:
@@ -50,7 +71,15 @@ def hf_dataset_from_torch(torch_data: TorchDataLoader | TorchDataset, column_nam
          for batch in dataloader:
              yield from parse_batch(batch, column_names=column_names)

-     return cast(Dataset, Dataset.from_generator(generator))
+     if ignore_cache:
+         with TemporaryDirectory() as temp_dir:
+             ds = Dataset.from_generator(generator, cache_dir=temp_dir)
+     else:
+         ds = Dataset.from_generator(generator)
+
+     if not isinstance(ds, Dataset):
+         raise ValueError(f"Failed to create dataset from generator: {type(ds)}")
+     return ds


  def hf_dataset_from_disk(file_path: str | PathLike) -> Dataset:
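The new `ignore_cache` flag routes `Dataset.from_generator` through a throwaway `TemporaryDirectory` cache so stale generator output cannot leak between runs. A hedged usage sketch follows; `ToyDataset` is a stand-in for any torch dataset that yields dicts, not part of the package.

```python
# Sketch of calling the updated helper; ToyDataset is illustrative only.
from torch.utils.data import Dataset as TorchDataset

from orca_sdk._utils.data_parsing import hf_dataset_from_torch


class ToyDataset(TorchDataset):
    def __init__(self):
        self.items = [{"value": "hello", "label": 0}, {"value": "world", "label": 1}]

    def __getitem__(self, i):
        return self.items[i]

    def __len__(self):
        return len(self.items)


# ignore_cache=True builds the dataset inside a TemporaryDirectory, so cached
# generator output from a previous run cannot mask changes.
hf_dataset = hf_dataset_from_torch(ToyDataset(), ignore_cache=True)
print(hf_dataset.column_names)  # e.g. ['value', 'label']
```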
@@ -1,4 +1,5 @@
  import json
+ import logging
  import pickle
  import tempfile
  from collections import namedtuple
@@ -14,6 +15,8 @@ from torch.utils.data import Dataset as TorchDataset
  from ..conftest import SAMPLE_DATA
  from .data_parsing import hf_dataset_from_disk, hf_dataset_from_torch

+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+

  class PytorchDictDataset(TorchDataset):
      def __init__(self):
@@ -29,11 +32,11 @@ class PytorchDictDataset(TorchDataset):
  def test_hf_dataset_from_torch_dict():
      # Given a Pytorch dataset that returns a dictionary for each item
      dataset = PytorchDictDataset()
-     hf_dataset = hf_dataset_from_torch(dataset)
+     hf_dataset = hf_dataset_from_torch(dataset, ignore_cache=True)
      # Then the HF dataset should be created successfully
      assert isinstance(hf_dataset, Dataset)
      assert len(hf_dataset) == len(dataset)
-     assert set(hf_dataset.column_names) == {"text", "label", "key", "score", "source_id"}
+     assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id"}


  class PytorchTupleDataset(TorchDataset):
@@ -41,7 +44,7 @@ class PytorchTupleDataset(TorchDataset):
          self.data = SAMPLE_DATA

      def __getitem__(self, i):
-         return self.data[i]["text"], self.data[i]["label"]
+         return self.data[i]["value"], self.data[i]["label"]

      def __len__(self):
          return len(self.data)
@@ -51,11 +54,11 @@ def test_hf_dataset_from_torch_tuple():
      # Given a Pytorch dataset that returns a tuple for each item
      dataset = PytorchTupleDataset()
      # And the correct number of column names passed in
-     hf_dataset = hf_dataset_from_torch(dataset, column_names=["text", "label"])
+     hf_dataset = hf_dataset_from_torch(dataset, column_names=["value", "label"], ignore_cache=True)
      # Then the HF dataset should be created successfully
      assert isinstance(hf_dataset, Dataset)
      assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["text", "label"]
+     assert hf_dataset.column_names == ["value", "label"]


  def test_hf_dataset_from_torch_tuple_error():
@@ -63,7 +66,7 @@ def test_hf_dataset_from_torch_tuple_error():
      dataset = PytorchTupleDataset()
      # Then the HF dataset should raise an error if no column names are passed in
      with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset)
+         hf_dataset_from_torch(dataset, ignore_cache=True)


  def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
@@ -71,7 +74,7 @@ def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
      dataset = PytorchTupleDataset()
      # Then the HF dataset should raise an error if not enough column names are passed in
      with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset, column_names=["value"])
+         hf_dataset_from_torch(dataset, column_names=["value"], ignore_cache=True)


  DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
@@ -82,7 +85,7 @@ class PytorchNamedTupleDataset(TorchDataset):
          self.data = SAMPLE_DATA

      def __getitem__(self, i):
-         return DatasetTuple(self.data[i]["text"], self.data[i]["label"])
+         return DatasetTuple(self.data[i]["value"], self.data[i]["label"])

      def __len__(self):
          return len(self.data)
@@ -92,7 +95,7 @@ def test_hf_dataset_from_torch_named_tuple():
      # Given a Pytorch dataset that returns a namedtuple for each item
      dataset = PytorchNamedTupleDataset()
      # And no column names are passed in
-     hf_dataset = hf_dataset_from_torch(dataset)
+     hf_dataset = hf_dataset_from_torch(dataset, ignore_cache=True)
      # Then the HF dataset should be created successfully
      assert isinstance(hf_dataset, Dataset)
      assert len(hf_dataset) == len(dataset)
@@ -110,7 +113,7 @@ class PytorchDataclassDataset(TorchDataset):
          self.data = SAMPLE_DATA

      def __getitem__(self, i):
-         return DatasetItem(text=self.data[i]["text"], label=self.data[i]["label"])
+         return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])

      def __len__(self):
          return len(self.data)
@@ -119,7 +122,7 @@ class PytorchDataclassDataset(TorchDataset):
  def test_hf_dataset_from_torch_dataclass():
      # Given a Pytorch dataset that returns a dataclass for each item
      dataset = PytorchDataclassDataset()
-     hf_dataset = hf_dataset_from_torch(dataset)
+     hf_dataset = hf_dataset_from_torch(dataset, ignore_cache=True)
      # Then the HF dataset should be created successfully
      assert isinstance(hf_dataset, Dataset)
      assert len(hf_dataset) == len(dataset)
@@ -131,7 +134,7 @@ class PytorchInvalidDataset(TorchDataset):
          self.data = SAMPLE_DATA

      def __getitem__(self, i):
-         return [self.data[i]["text"], self.data[i]["label"]]
+         return [self.data[i]["value"], self.data[i]["label"]]

      def __len__(self):
          return len(self.data)
@@ -142,7 +145,7 @@ def test_hf_dataset_from_torch_invalid_dataset():
      dataset = PytorchInvalidDataset()
      # Then the HF dataset should raise an error
      with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset)
+         hf_dataset_from_torch(dataset, ignore_cache=True)


  def test_hf_dataset_from_torchdataloader():
@@ -150,10 +153,10 @@ def test_hf_dataset_from_torchdataloader():
      dataset = PytorchDictDataset()

      def collate_fn(x: list[dict]):
-         return {"value": [item["text"] for item in x], "label": [item["label"] for item in x]}
+         return {"value": [item["value"] for item in x], "label": [item["label"] for item in x]}

      dataloader = TorchDataLoader(dataset, batch_size=3, collate_fn=collate_fn)
-     hf_dataset = hf_dataset_from_torch(dataloader)
+     hf_dataset = hf_dataset_from_torch(dataloader, ignore_cache=True)
      # Then the HF dataset should be created successfully
      assert isinstance(hf_dataset, Dataset)
      assert len(hf_dataset) == len(dataset)
@@ -0,0 +1,12 @@
+ class TqdmFileReader:
+     def __init__(self, file_obj, pbar):
+         self.file_obj = file_obj
+         self.pbar = pbar
+
+     def read(self, size=-1):
+         data = self.file_obj.read(size)
+         self.pbar.update(len(data))
+         return data
+
+     def __getattr__(self, attr):
+         return getattr(self.file_obj, attr)
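`TqdmFileReader` wraps a file object, advances a progress bar on every `read()`, and delegates every other attribute via `__getattr__`. A minimal sketch of wiring it to a tqdm bar; the file path and chunk size below are placeholders.

```python
# Minimal sketch: any consumer that calls .read() on the wrapper advances the bar.
import os

from tqdm import tqdm

from orca_sdk._utils.tqdm_file_reader import TqdmFileReader

path = "data.csv"  # hypothetical local file
with open(path, "rb") as f, tqdm(total=os.path.getsize(path), unit="B", unit_scale=True) as pbar:
    reader = TqdmFileReader(f, pbar)
    while reader.read(1024 * 1024):  # each chunk read bumps the progress bar
        pass
```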
@@ -1,10 +1,22 @@
  from __future__ import annotations

  import logging
+ import os
  from contextlib import contextmanager
  from datetime import datetime
  from typing import Any, Generator, Iterable, Literal, cast, overload
- from uuid import UUID
+ from uuid import UUID, uuid4
+
+ import numpy as np
+
+ import numpy as np
+ from datasets import Dataset
+ from sklearn.metrics import (
+     accuracy_score,
+     auc,
+     f1_score,
+     roc_auc_score,
+ )

  from ._generated_api_client.api import (
      create_evaluation,
@@ -19,9 +31,11 @@ from ._generated_api_client.api import (
      update_model,
  )
  from ._generated_api_client.models import (
+     ClassificationEvaluationResult,
      CreateRACModelRequest,
      EvaluationRequest,
      ListPredictionsRequest,
+     PrecisionRecallCurve,
  )
  from ._generated_api_client.models import (
      PredictionSortItemItemType0 as PredictionSortColumns,
@@ -33,8 +47,10 @@ from ._generated_api_client.models import (
      RACHeadType,
      RACModelMetadata,
      RACModelUpdate,
+     ROCCurve,
  )
  from ._generated_api_client.models.prediction_request import PredictionRequest
+ from ._shared.metrics import calculate_pr_curve, calculate_roc_curve
  from ._utils.common import UNSET, CreateMode, DropMode
  from ._utils.task import wait_for_task
  from .datasource import Datasource
@@ -299,7 +315,8 @@ class ClassificationModel:
          value: list[str],
          expected_labels: list[int] | None = None,
          tags: set[str] = set(),
-         disable_telemetry: bool = False,
+         save_telemetry: bool = True,
+         save_telemetry_synchronously: bool = False,
      ) -> list[LabelPrediction]:
          pass

@@ -309,7 +326,8 @@ class ClassificationModel:
          value: str,
          expected_labels: int | None = None,
          tags: set[str] = set(),
-         disable_telemetry: bool = False,
+         save_telemetry: bool = True,
+         save_telemetry_synchronously: bool = False,
      ) -> LabelPrediction:
          pass

@@ -318,7 +336,8 @@ class ClassificationModel:
          value: list[str] | str,
          expected_labels: list[int] | int | None = None,
          tags: set[str] = set(),
-         disable_telemetry: bool = False,
+         save_telemetry: bool = True,
+         save_telemetry_synchronously: bool = False,
      ) -> list[LabelPrediction] | LabelPrediction:
          """
          Predict label(s) for the given input value(s) grounded in similar memories
@@ -327,7 +346,10 @@ class ClassificationModel:
              value: Value(s) to predict the labels of
              expected_labels: Expected label(s) for the given input to record for model evaluation
              tags: Tags to add to the prediction(s)
-             disable_telemetry: Whether to disable telemetry for the prediction(s)
+             save_telemetry: Whether to enable telemetry for the prediction(s)
+             save_telemetry_synchronously: Whether to save telemetry synchronously. If `False`, telemetry will be saved
+                 asynchronously in the background. This may result in a delay in the telemetry being available. Please note that this
+                 may be overridden by the ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY environment variable.

          Returns:
              Label prediction or list of label predictions
@@ -345,6 +367,13 @@ class ClassificationModel:
              ]
          """

+         if "ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY" in os.environ:
+             env_var = os.environ["ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY"]
+             logging.info(
+                 f"ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY is set to {env_var}, which will override the parameter save_telemetry_synchronously = {save_telemetry_synchronously}"
+             )
+             save_telemetry_synchronously = env_var.lower() == "true"
+
          response = predict_gpu(
              self.id,
              body=PredictionRequest(
@@ -356,11 +385,12 @@ class ClassificationModel:
                      else [expected_labels] if expected_labels is not None else None
                  ),
                  tags=list(tags),
-                 disable_telemetry=disable_telemetry,
+                 save_telemetry=save_telemetry,
+                 save_telemetry_synchronously=save_telemetry_synchronously,
              ),
          )

-         if not disable_telemetry and any(p.prediction_id is None for p in response):
+         if save_telemetry and any(p.prediction_id is None for p in response):
              raise RuntimeError("Failed to save prediction to database.")

          predictions = [
@@ -372,6 +402,7 @@ class ClassificationModel:
                  anomaly_score=prediction.anomaly_score,
                  memoryset=self.memoryset,
                  model=self,
+                 logits=prediction.logits,
              )
              for prediction in response
          ]
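`predict()` now takes `save_telemetry` and `save_telemetry_synchronously` in place of `disable_telemetry`, and the `ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY` environment variable, when set, overrides the keyword argument. A hedged sketch, assuming an already-created `ClassificationModel`; the input text, label, and tag are placeholders.

```python
# Hedged sketch: `model` is assumed to be an existing ClassificationModel.
import os

from orca_sdk.classification_model import ClassificationModel


def run_smoke_prediction(model: ClassificationModel) -> None:
    # When set, the env var wins over the keyword argument (see the override logic above).
    os.environ["ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY"] = "true"

    prediction = model.predict(
        "the flight was delayed for three hours",
        expected_labels=0,
        tags={"smoke-test"},
        save_telemetry=True,                 # replaces the old disable_telemetry=False
        save_telemetry_synchronously=False,  # forced to True by the env var above
    )
    print(prediction.label, prediction.logits)
```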
@@ -444,46 +475,158 @@ class ClassificationModel:
              for prediction in predictions
          ]

-     def evaluate(
+     def _calculate_metrics(
+         self,
+         predictions: list[LabelPrediction],
+         expected_labels: list[int],
+     ) -> ClassificationEvaluationResult:
+
+         targets_array = np.array(expected_labels)
+         predictions_array = np.array([p.label for p in predictions])
+
+         logits_array = np.array([p.logits for p in predictions])
+
+         f1 = float(f1_score(targets_array, predictions_array, average="weighted"))
+         accuracy = float(accuracy_score(targets_array, predictions_array))
+
+         # Only compute ROC AUC and PR AUC for binary classification
+         unique_classes = np.unique(targets_array)
+
+         pr_curve = None
+         roc_curve = None
+
+         if len(unique_classes) == 2:
+             try:
+                 precisions, recalls, pr_thresholds = calculate_pr_curve(targets_array, logits_array)
+                 pr_auc = float(auc(recalls, precisions))
+
+                 pr_curve = PrecisionRecallCurve(
+                     precisions=precisions.tolist(),
+                     recalls=recalls.tolist(),
+                     thresholds=pr_thresholds.tolist(),
+                     auc=pr_auc,
+                 )
+
+                 fpr, tpr, roc_thresholds = calculate_roc_curve(targets_array, logits_array)
+                 roc_auc = float(roc_auc_score(targets_array, logits_array[:, 1]))
+
+                 roc_curve = ROCCurve(
+                     false_positive_rates=fpr.tolist(),
+                     true_positive_rates=tpr.tolist(),
+                     thresholds=roc_thresholds.tolist(),
+                     auc=roc_auc,
+                 )
+             except ValueError as e:
+                 logging.warning(f"Error calculating PR and ROC curves: {e}")
+
+         return ClassificationEvaluationResult(
+             f1_score=f1,
+             accuracy=accuracy,
+             loss=0.0,
+             precision_recall_curve=pr_curve,
+             roc_curve=roc_curve,
+         )
+
+     def _evaluate_datasource(
          self,
          datasource: Datasource,
+         value_column: str,
+         label_column: str,
+         record_predictions: bool,
+         tags: set[str] | None,
+     ) -> dict[str, Any]:
+         response = create_evaluation(
+             self.id,
+             body=EvaluationRequest(
+                 datasource_id=datasource.id,
+                 datasource_label_column=label_column,
+                 datasource_value_column=value_column,
+                 memoryset_override_id=self._memoryset_override_id,
+                 record_telemetry=record_predictions,
+                 telemetry_tags=list(tags) if tags else None,
+             ),
+         )
+         wait_for_task(response.task_id, description="Running evaluation")
+         response = get_evaluation(self.id, UUID(response.task_id))
+         assert response.result is not None
+         return response.result.to_dict()
+
+     def _evaluate_dataset(
+         self,
+         dataset: Dataset,
+         value_column: str,
+         label_column: str,
+         record_predictions: bool,
+         tags: set[str],
+         batch_size: int,
+     ) -> dict[str, Any]:
+         predictions = []
+         expected_labels = []
+
+         for i in range(0, len(dataset), batch_size):
+             batch = dataset[i : i + batch_size]
+             predictions.extend(
+                 self.predict(
+                     batch[value_column],
+                     expected_labels=batch[label_column],
+                     tags=tags,
+                     save_telemetry=record_predictions,
+                     save_telemetry_synchronously=(not record_predictions),
+                 )
+             )
+             expected_labels.extend(batch[label_column])
+
+         return self._calculate_metrics(predictions, expected_labels).to_dict()
+
+     def evaluate(
+         self,
+         data: Datasource | Dataset,
          value_column: str = "value",
          label_column: str = "label",
          record_predictions: bool = False,
-         tags: set[str] | None = None,
+         tags: set[str] = {"evaluation"},
+         batch_size: int = 100,
      ) -> dict[str, Any]:
          """
-         Evaluate the classification model on a given datasource
+         Evaluate the classification model on a given dataset or datasource

          Params:
-             datasource: Datasource to evaluate the model on
+             data: Dataset or Datasource to evaluate the model on
              value_column: Name of the column that contains the input values to the model
              label_column: Name of the column containing the expected labels
              record_predictions: Whether to record [`LabelPrediction`][orca_sdk.telemetry.LabelPrediction]s for analysis
              tags: Optional tags to add to the recorded [`LabelPrediction`][orca_sdk.telemetry.LabelPrediction]s
+             batch_size: Batch size for processing Dataset inputs (only used when input is a Dataset)

          Returns:
-             Dictionary with evaluation metrics
+             Dictionary with evaluation metrics, including anomaly score statistics (mean, median, variance)

          Examples:
+             Evaluate using a Datasource:
              >>> model.evaluate(datasource, value_column="text", label_column="airline_sentiment")
              { "f1_score": 0.85, "roc_auc": 0.85, "pr_auc": 0.85, "accuracy": 0.85, "loss": 0.35, ... }
+
+             Evaluate using a Dataset:
+             >>> model.evaluate(dataset, value_column="text", label_column="sentiment")
+             { "f1_score": 0.85, "roc_auc": 0.85, "pr_auc": 0.85, "accuracy": 0.85, "loss": 0.35, ... }
          """
-         response = create_evaluation(
-             self.id,
-             body=EvaluationRequest(
-                 datasource_id=datasource.id,
-                 datasource_label_column=label_column,
-                 datasource_value_column=value_column,
-                 memoryset_override_id=self._memoryset_override_id,
-                 record_telemetry=record_predictions,
-                 telemetry_tags=list(tags) if tags else None,
-             ),
-         )
-         wait_for_task(response.task_id, description="Running evaluation")
-         response = get_evaluation(self.id, UUID(response.task_id))
-         assert response.result is not None
-         return response.result.to_dict()
+         if isinstance(data, Datasource):
+             return self._evaluate_datasource(
+                 datasource=data,
+                 value_column=value_column,
+                 label_column=label_column,
+                 record_predictions=record_predictions,
+                 tags=tags,
+             )
+         else:
+             return self._evaluate_dataset(
+                 dataset=data,
+                 value_column=value_column,
+                 label_column=label_column,
+                 record_predictions=record_predictions,
+                 tags=tags,
+                 batch_size=batch_size,
+             )

      def finetune(self, datasource: Datasource):
          # do not document until implemented
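`evaluate()` now dispatches on input type: a `Datasource` still runs the server-side evaluation task, while a HuggingFace `Dataset` is predicted client-side in batches and scored with `_calculate_metrics`. A hedged sketch of the Dataset path; the columns, data, and `model` variable are placeholders, and the printed keys follow the docstring above.

```python
# Hedged sketch of the new Dataset path through evaluate().
from datasets import Dataset

from orca_sdk.classification_model import ClassificationModel


def evaluate_locally(model: ClassificationModel) -> dict:
    dataset = Dataset.from_dict(
        {
            "text": ["great flight", "lost my luggage", "on time", "rude staff"],
            "sentiment": [1, 0, 1, 0],
        }
    )
    # Dataset inputs are predicted client-side in batches of `batch_size`
    # and scored via _calculate_metrics; Datasource inputs stay server-side.
    results = model.evaluate(
        dataset,
        value_column="text",
        label_column="sentiment",
        batch_size=2,
    )
    print(results["accuracy"], results["f1_score"])
    return results
```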