orca-sdk 0.0.93__py3-none-any.whl → 0.0.95__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. orca_sdk/__init__.py +13 -4
  2. orca_sdk/_generated_api_client/api/__init__.py +84 -34
  3. orca_sdk/_generated_api_client/api/classification_model/create_classification_model_classification_model_post.py +170 -0
  4. orca_sdk/_generated_api_client/api/classification_model/{get_model_classification_model_name_or_id_get.py → delete_classification_model_classification_model_name_or_id_delete.py} +20 -20
  5. orca_sdk/_generated_api_client/api/classification_model/{delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py → delete_classification_model_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py} +4 -4
  6. orca_sdk/_generated_api_client/api/classification_model/{create_evaluation_classification_model_model_name_or_id_evaluation_post.py → evaluate_classification_model_classification_model_model_name_or_id_evaluation_post.py} +14 -14
  7. orca_sdk/_generated_api_client/api/classification_model/get_classification_model_classification_model_name_or_id_get.py +156 -0
  8. orca_sdk/_generated_api_client/api/classification_model/{get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py → get_classification_model_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py} +16 -16
  9. orca_sdk/_generated_api_client/api/classification_model/{list_evaluations_classification_model_model_name_or_id_evaluation_get.py → list_classification_model_evaluations_classification_model_model_name_or_id_evaluation_get.py} +16 -16
  10. orca_sdk/_generated_api_client/api/classification_model/list_classification_models_classification_model_get.py +127 -0
  11. orca_sdk/_generated_api_client/api/classification_model/{predict_gpu_classification_model_name_or_id_prediction_post.py → predict_label_gpu_classification_model_name_or_id_prediction_post.py} +14 -14
  12. orca_sdk/_generated_api_client/api/classification_model/update_classification_model_classification_model_name_or_id_patch.py +183 -0
  13. orca_sdk/_generated_api_client/api/datasource/download_datasource_datasource_name_or_id_download_get.py +172 -0
  14. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +22 -22
  15. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +22 -22
  16. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +38 -16
  17. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +29 -12
  18. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +12 -12
  19. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +17 -14
  20. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +72 -19
  21. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +31 -12
  22. orca_sdk/_generated_api_client/api/memoryset/potential_duplicate_groups_memoryset_name_or_id_potential_duplicate_groups_get.py +49 -20
  23. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +38 -16
  24. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +54 -29
  25. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +44 -26
  26. orca_sdk/_generated_api_client/api/memoryset/update_memoryset_memoryset_name_or_id_patch.py +22 -22
  27. orca_sdk/_generated_api_client/api/predictive_model/__init__.py +0 -0
  28. orca_sdk/_generated_api_client/api/predictive_model/list_predictive_models_predictive_model_get.py +150 -0
  29. orca_sdk/_generated_api_client/api/regression_model/__init__.py +0 -0
  30. orca_sdk/_generated_api_client/api/{classification_model/create_model_classification_model_post.py → regression_model/create_regression_model_regression_model_post.py} +27 -27
  31. orca_sdk/_generated_api_client/api/regression_model/delete_regression_model_evaluation_regression_model_model_name_or_id_evaluation_task_id_delete.py +168 -0
  32. orca_sdk/_generated_api_client/api/{classification_model/delete_model_classification_model_name_or_id_delete.py → regression_model/delete_regression_model_regression_model_name_or_id_delete.py} +5 -5
  33. orca_sdk/_generated_api_client/api/regression_model/evaluate_regression_model_regression_model_model_name_or_id_evaluation_post.py +183 -0
  34. orca_sdk/_generated_api_client/api/regression_model/get_regression_model_evaluation_regression_model_model_name_or_id_evaluation_task_id_get.py +170 -0
  35. orca_sdk/_generated_api_client/api/regression_model/get_regression_model_regression_model_name_or_id_get.py +156 -0
  36. orca_sdk/_generated_api_client/api/regression_model/list_regression_model_evaluations_regression_model_model_name_or_id_evaluation_get.py +161 -0
  37. orca_sdk/_generated_api_client/api/{classification_model/list_models_classification_model_get.py → regression_model/list_regression_models_regression_model_get.py} +17 -17
  38. orca_sdk/_generated_api_client/api/regression_model/predict_score_gpu_regression_model_name_or_id_prediction_post.py +190 -0
  39. orca_sdk/_generated_api_client/api/{classification_model/update_model_classification_model_name_or_id_patch.py → regression_model/update_regression_model_regression_model_name_or_id_patch.py} +27 -27
  40. orca_sdk/_generated_api_client/api/task/get_task_task_task_id_get.py +156 -0
  41. orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +60 -10
  42. orca_sdk/_generated_api_client/api/telemetry/count_predictions_telemetry_prediction_count_post.py +10 -10
  43. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +35 -12
  44. orca_sdk/_generated_api_client/api/telemetry/list_memories_with_feedback_telemetry_memories_post.py +20 -12
  45. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +35 -12
  46. orca_sdk/_generated_api_client/models/__init__.py +90 -24
  47. orca_sdk/_generated_api_client/models/base_score_prediction_result.py +108 -0
  48. orca_sdk/_generated_api_client/models/{evaluation_request.py → classification_evaluation_request.py} +13 -45
  49. orca_sdk/_generated_api_client/models/{classification_evaluation_result.py → classification_metrics.py} +106 -56
  50. orca_sdk/_generated_api_client/models/{rac_model_metadata.py → classification_model_metadata.py} +51 -43
  51. orca_sdk/_generated_api_client/models/{prediction_request.py → classification_prediction_request.py} +31 -6
  52. orca_sdk/_generated_api_client/models/{clone_labeled_memoryset_request.py → clone_memoryset_request.py} +5 -5
  53. orca_sdk/_generated_api_client/models/column_info.py +31 -0
  54. orca_sdk/_generated_api_client/models/count_predictions_request.py +195 -0
  55. orca_sdk/_generated_api_client/models/{create_rac_model_request.py → create_classification_model_request.py} +25 -57
  56. orca_sdk/_generated_api_client/models/{create_labeled_memoryset_request.py → create_memoryset_request.py} +73 -56
  57. orca_sdk/_generated_api_client/models/create_memoryset_request_index_params.py +66 -0
  58. orca_sdk/_generated_api_client/models/create_memoryset_request_index_type.py +13 -0
  59. orca_sdk/_generated_api_client/models/create_regression_model_request.py +137 -0
  60. orca_sdk/_generated_api_client/models/embedding_evaluation_payload.py +187 -0
  61. orca_sdk/_generated_api_client/models/embedding_evaluation_response.py +10 -0
  62. orca_sdk/_generated_api_client/models/evaluation_response.py +22 -9
  63. orca_sdk/_generated_api_client/models/evaluation_response_classification_metrics.py +140 -0
  64. orca_sdk/_generated_api_client/models/evaluation_response_regression_metrics.py +140 -0
  65. orca_sdk/_generated_api_client/models/http_validation_error.py +86 -0
  66. orca_sdk/_generated_api_client/models/list_predictions_request.py +62 -0
  67. orca_sdk/_generated_api_client/models/memory_type.py +9 -0
  68. orca_sdk/_generated_api_client/models/memoryset_analysis_configs.py +0 -20
  69. orca_sdk/_generated_api_client/models/{labeled_memoryset_metadata.py → memoryset_metadata.py} +73 -13
  70. orca_sdk/_generated_api_client/models/memoryset_metadata_index_params.py +55 -0
  71. orca_sdk/_generated_api_client/models/memoryset_metadata_index_type.py +13 -0
  72. orca_sdk/_generated_api_client/models/{labeled_memoryset_update.py → memoryset_update.py} +19 -31
  73. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +1 -0
  74. orca_sdk/_generated_api_client/models/{paginated_labeled_memory_with_feedback_metrics.py → paginated_union_labeled_memory_with_feedback_metrics_scored_memory_with_feedback_metrics.py} +37 -10
  75. orca_sdk/_generated_api_client/models/{precision_recall_curve.py → pr_curve.py} +5 -13
  76. orca_sdk/_generated_api_client/models/{rac_model_update.py → predictive_model_update.py} +14 -5
  77. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +11 -1
  78. orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +5 -0
  79. orca_sdk/_generated_api_client/models/rar_head_type.py +8 -0
  80. orca_sdk/_generated_api_client/models/regression_evaluation_request.py +148 -0
  81. orca_sdk/_generated_api_client/models/regression_metrics.py +172 -0
  82. orca_sdk/_generated_api_client/models/regression_model_metadata.py +177 -0
  83. orca_sdk/_generated_api_client/models/regression_prediction_request.py +195 -0
  84. orca_sdk/_generated_api_client/models/roc_curve.py +0 -8
  85. orca_sdk/_generated_api_client/models/score_prediction_memory_lookup.py +196 -0
  86. orca_sdk/_generated_api_client/models/score_prediction_memory_lookup_metadata.py +68 -0
  87. orca_sdk/_generated_api_client/models/score_prediction_with_memories_and_feedback.py +252 -0
  88. orca_sdk/_generated_api_client/models/scored_memory.py +172 -0
  89. orca_sdk/_generated_api_client/models/scored_memory_insert.py +128 -0
  90. orca_sdk/_generated_api_client/models/scored_memory_insert_metadata.py +68 -0
  91. orca_sdk/_generated_api_client/models/scored_memory_lookup.py +180 -0
  92. orca_sdk/_generated_api_client/models/scored_memory_lookup_metadata.py +68 -0
  93. orca_sdk/_generated_api_client/models/scored_memory_metadata.py +68 -0
  94. orca_sdk/_generated_api_client/models/scored_memory_update.py +171 -0
  95. orca_sdk/_generated_api_client/models/scored_memory_update_metadata_type_0.py +68 -0
  96. orca_sdk/_generated_api_client/models/scored_memory_with_feedback_metrics.py +193 -0
  97. orca_sdk/_generated_api_client/models/scored_memory_with_feedback_metrics_feedback_metrics.py +68 -0
  98. orca_sdk/_generated_api_client/models/scored_memory_with_feedback_metrics_metadata.py +68 -0
  99. orca_sdk/_generated_api_client/models/update_prediction_request.py +20 -0
  100. orca_sdk/_generated_api_client/models/validation_error.py +99 -0
  101. orca_sdk/_shared/__init__.py +9 -1
  102. orca_sdk/_shared/metrics.py +257 -87
  103. orca_sdk/_shared/metrics_test.py +136 -77
  104. orca_sdk/_utils/data_parsing.py +0 -3
  105. orca_sdk/_utils/data_parsing_test.py +0 -3
  106. orca_sdk/_utils/prediction_result_ui.py +55 -23
  107. orca_sdk/classification_model.py +184 -174
  108. orca_sdk/classification_model_test.py +178 -142
  109. orca_sdk/conftest.py +77 -26
  110. orca_sdk/datasource.py +34 -0
  111. orca_sdk/datasource_test.py +9 -1
  112. orca_sdk/embedding_model.py +136 -14
  113. orca_sdk/embedding_model_test.py +10 -6
  114. orca_sdk/job.py +329 -0
  115. orca_sdk/job_test.py +48 -0
  116. orca_sdk/memoryset.py +882 -161
  117. orca_sdk/memoryset_test.py +58 -23
  118. orca_sdk/regression_model.py +647 -0
  119. orca_sdk/regression_model_test.py +338 -0
  120. orca_sdk/telemetry.py +225 -106
  121. orca_sdk/telemetry_test.py +34 -30
  122. {orca_sdk-0.0.93.dist-info → orca_sdk-0.0.95.dist-info}/METADATA +2 -4
  123. {orca_sdk-0.0.93.dist-info → orca_sdk-0.0.95.dist-info}/RECORD +124 -74
  124. orca_sdk/_utils/task.py +0 -73
  125. {orca_sdk-0.0.93.dist-info → orca_sdk-0.0.95.dist-info}/WHEEL +0 -0
--- orca_sdk/_shared/metrics_test.py
+++ orca_sdk/_shared/metrics_test.py
@@ -9,13 +9,13 @@ from typing import Literal
 
 import numpy as np
 import pytest
+import sklearn.metrics
 
 from .metrics import (
-    EvalPrediction,
+    calculate_classification_metrics,
     calculate_pr_curve,
+    calculate_regression_metrics,
     calculate_roc_curve,
-    classification_scores,
-    compute_classifier_metrics,
     softmax,
 )
 
@@ -24,36 +24,36 @@ def test_binary_metrics():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([0.1, 0.9, 0.8, 0.3, 0.2])
 
-    metrics = classification_scores(y_true, y_score)
+    metrics = calculate_classification_metrics(y_true, y_score)
 
-    assert metrics["accuracy"] == 0.8
-    assert metrics["f1_score"] == 0.8
-    assert metrics["roc_auc"] is not None
-    assert metrics["roc_auc"] > 0.8
-    assert metrics["roc_auc"] < 1.0
-    assert metrics["pr_auc"] is not None
-    assert metrics["pr_auc"] > 0.8
-    assert metrics["pr_auc"] < 1.0
-    assert metrics["log_loss"] is not None
-    assert metrics["log_loss"] > 0.0
+    assert metrics.accuracy == 0.8
+    assert metrics.f1_score == 0.8
+    assert metrics.roc_auc is not None
+    assert metrics.roc_auc > 0.8
+    assert metrics.roc_auc < 1.0
+    assert metrics.pr_auc is not None
+    assert metrics.pr_auc > 0.8
+    assert metrics.pr_auc < 1.0
+    assert metrics.loss is not None
+    assert metrics.loss > 0.0
 
 
 def test_multiclass_metrics_with_2_classes():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([[0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])
 
-    metrics = classification_scores(y_true, y_score)
+    metrics = calculate_classification_metrics(y_true, y_score)
 
-    assert metrics["accuracy"] == 0.8
-    assert metrics["f1_score"] == 0.8
-    assert metrics["roc_auc"] is not None
-    assert metrics["roc_auc"] > 0.8
-    assert metrics["roc_auc"] < 1.0
-    assert metrics["pr_auc"] is not None
-    assert metrics["pr_auc"] > 0.8
-    assert metrics["pr_auc"] < 1.0
-    assert metrics["log_loss"] is not None
-    assert metrics["log_loss"] > 0.0
+    assert metrics.accuracy == 0.8
+    assert metrics.f1_score == 0.8
+    assert metrics.roc_auc is not None
+    assert metrics.roc_auc > 0.8
+    assert metrics.roc_auc < 1.0
+    assert metrics.pr_auc is not None
+    assert metrics.pr_auc > 0.8
+    assert metrics.pr_auc < 1.0
+    assert metrics.loss is not None
+    assert metrics.loss > 0.0
 
 
 @pytest.mark.parametrize(
@@ -66,104 +66,163 @@ def test_multiclass_metrics_with_3_classes(
     y_true = np.array([0, 1, 1, 0, 2])
     y_score = np.array([[0.9, 0.1, 0.0], [0.1, 0.9, 0.0], [0.2, 0.8, 0.0], [0.7, 0.3, 0.0], [0.0, 0.0, 1.0]])
 
-    metrics = classification_scores(y_true, y_score, average=average, multi_class=multiclass)
+    metrics = calculate_classification_metrics(y_true, y_score, average=average, multi_class=multiclass)
 
-    assert metrics["accuracy"] == 1.0
-    assert metrics["f1_score"] == 1.0
-    assert metrics["roc_auc"] is not None
-    assert metrics["roc_auc"] > 0.8
-    assert metrics["pr_auc"] is None
-    assert metrics["log_loss"] is not None
-    assert metrics["log_loss"] > 0.0
+    assert metrics.accuracy == 1.0
+    assert metrics.f1_score == 1.0
+    assert metrics.roc_auc is not None
+    assert metrics.roc_auc > 0.8
+    assert metrics.pr_auc is None
+    assert metrics.loss is not None
+    assert metrics.loss > 0.0
 
 
 def test_does_not_modify_logits_unless_necessary():
     logits = np.array([[0.1, 0.9], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])
-    references = np.array([0, 1, 0, 1])
-    metrics = compute_classifier_metrics(EvalPrediction(logits, references))
-    assert metrics["log_loss"] == classification_scores(references, logits)["log_loss"]
+    expected_labels = [0, 1, 0, 1]
+    assert calculate_classification_metrics(expected_labels, logits).loss == sklearn.metrics.log_loss(
+        expected_labels, logits
+    )
 
 
 def test_normalizes_logits_if_necessary():
     logits = np.array([[1.2, 3.9], [1.2, 5.8], [1.2, 2.7], [1.2, 1.3]])
-    references = np.array([0, 1, 0, 1])
-    metrics = compute_classifier_metrics(EvalPrediction(logits, references))
-    assert (
-        metrics["log_loss"] == classification_scores(references, logits / logits.sum(axis=1, keepdims=True))["log_loss"]
+    expected_labels = [0, 1, 0, 1]
+    assert calculate_classification_metrics(expected_labels, logits).loss == sklearn.metrics.log_loss(
+        expected_labels, logits / logits.sum(axis=1, keepdims=True)
     )
 
 
 def test_softmaxes_logits_if_necessary():
     logits = np.array([[-1.2, 3.9], [1.2, -5.8], [1.2, 2.7], [1.2, 1.3]])
-    references = np.array([0, 1, 0, 1])
-    metrics = compute_classifier_metrics(EvalPrediction(logits, references))
-    assert metrics["log_loss"] == classification_scores(references, softmax(logits))["log_loss"]
+    expected_labels = [0, 1, 0, 1]
+    assert calculate_classification_metrics(expected_labels, logits).loss == sklearn.metrics.log_loss(
+        expected_labels, softmax(logits)
+    )
 
 
 def test_precision_recall_curve():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
 
-    precision, recall, thresholds = calculate_pr_curve(y_true, y_score)
-    assert precision is not None
-    assert recall is not None
-    assert thresholds is not None
+    pr_curve = calculate_pr_curve(y_true, y_score)
 
-    assert len(precision) == len(recall) == len(thresholds) == 6
-    assert precision[0] == 0.6
-    assert recall[0] == 1.0
-    assert precision[-1] == 1.0
-    assert recall[-1] == 0.0
+    assert len(pr_curve["precisions"]) == len(pr_curve["recalls"]) == len(pr_curve["thresholds"]) == 6
+    assert np.allclose(pr_curve["precisions"][0], 0.6)
+    assert np.allclose(pr_curve["recalls"][0], 1.0)
+    assert np.allclose(pr_curve["precisions"][-1], 1.0)
+    assert np.allclose(pr_curve["recalls"][-1], 0.0)
 
     # test that thresholds are sorted
-    assert np.all(np.diff(thresholds) >= 0)
+    assert np.all(np.diff(pr_curve["thresholds"]) >= 0)
 
 
 def test_roc_curve():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
 
-    fpr, tpr, thresholds = calculate_roc_curve(y_true, y_score)
-    assert fpr is not None
-    assert tpr is not None
-    assert thresholds is not None
+    roc_curve = calculate_roc_curve(y_true, y_score)
 
-    assert len(fpr) == len(tpr) == len(thresholds) == 6
-    assert fpr[0] == 1.0
-    assert tpr[0] == 1.0
-    assert fpr[-1] == 0.0
-    assert tpr[-1] == 0.0
+    assert (
+        len(roc_curve["false_positive_rates"])
+        == len(roc_curve["true_positive_rates"])
+        == len(roc_curve["thresholds"])
+        == 6
+    )
+    assert roc_curve["false_positive_rates"][0] == 1.0
+    assert roc_curve["true_positive_rates"][0] == 1.0
+    assert roc_curve["false_positive_rates"][-1] == 0.0
+    assert roc_curve["true_positive_rates"][-1] == 0.0
 
     # test that thresholds are sorted
-    assert np.all(np.diff(thresholds) >= 0)
+    assert np.all(np.diff(roc_curve["thresholds"]) >= 0)
 
 
 def test_precision_recall_curve_max_length():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
 
-    precision, recall, thresholds = calculate_pr_curve(y_true, y_score, max_length=5)
-    assert len(precision) == len(recall) == len(thresholds) == 5
+    pr_curve = calculate_pr_curve(y_true, y_score, max_length=5)
+    assert len(pr_curve["precisions"]) == len(pr_curve["recalls"]) == len(pr_curve["thresholds"]) == 5
 
-    assert precision[0] == 0.6
-    assert recall[0] == 1.0
-    assert precision[-1] == 1.0
-    assert recall[-1] == 0.0
+    assert np.allclose(pr_curve["precisions"][0], 0.6)
+    assert np.allclose(pr_curve["recalls"][0], 1.0)
+    assert np.allclose(pr_curve["precisions"][-1], 1.0)
+    assert np.allclose(pr_curve["recalls"][-1], 0.0)
 
     # test that thresholds are sorted
-    assert np.all(np.diff(thresholds) >= 0)
+    assert np.all(np.diff(pr_curve["thresholds"]) >= 0)
 
 
 def test_roc_curve_max_length():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
 
-    fpr, tpr, thresholds = calculate_roc_curve(y_true, y_score, max_length=5)
-    assert len(fpr) == len(tpr) == len(thresholds) == 5
-    assert fpr[0] == 1.0
-    assert tpr[0] == 1.0
-    assert fpr[-1] == 0.0
-    assert tpr[-1] == 0.0
+    roc_curve = calculate_roc_curve(y_true, y_score, max_length=5)
+    assert (
+        len(roc_curve["false_positive_rates"])
+        == len(roc_curve["true_positive_rates"])
+        == len(roc_curve["thresholds"])
+        == 5
+    )
+    assert np.allclose(roc_curve["false_positive_rates"][0], 1.0)
+    assert np.allclose(roc_curve["true_positive_rates"][0], 1.0)
+    assert np.allclose(roc_curve["false_positive_rates"][-1], 0.0)
+    assert np.allclose(roc_curve["true_positive_rates"][-1], 0.0)
 
     # test that thresholds are sorted
-    assert np.all(np.diff(thresholds) >= 0)
+    assert np.all(np.diff(roc_curve["thresholds"]) >= 0)
+
+
+# Regression Metrics Tests
+
+
+def test_perfect_regression_predictions():
+    y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+    y_pred = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    assert metrics.mse == 0.0
+    assert metrics.rmse == 0.0
+    assert metrics.mae == 0.0
+    assert metrics.r2 == 1.0
+    assert metrics.explained_variance == 1.0
+    assert metrics.loss == 0.0
+    assert metrics.anomaly_score_mean is None
+    assert metrics.anomaly_score_median is None
+    assert metrics.anomaly_score_variance is None
+
+
+def test_basic_regression_metrics():
+    y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+    y_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1], dtype=np.float32)
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    # Check that all metrics are reasonable
+    assert metrics.mse > 0.0
+    assert metrics.rmse == pytest.approx(np.sqrt(metrics.mse))
+    assert metrics.mae > 0.0
+    assert 0.0 <= metrics.r2 <= 1.0
+    assert 0.0 <= metrics.explained_variance <= 1.0
+    assert metrics.loss == metrics.mse
+
+    # Check specific values based on the data
+    expected_mse = np.mean((y_true - y_pred) ** 2)
+    assert metrics.mse == pytest.approx(expected_mse)
+
+    expected_mae = np.mean(np.abs(y_true - y_pred))
+    assert metrics.mae == pytest.approx(expected_mae)
+
+
+def test_regression_metrics_with_anomaly_scores():
+    y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+    y_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1], dtype=np.float32)
+    anomaly_scores = [0.1, 0.2, 0.15, 0.3, 0.25]
+
+    metrics = calculate_regression_metrics(y_true, y_pred, anomaly_scores)
+
+    assert metrics.anomaly_score_mean == pytest.approx(np.mean(anomaly_scores))
+    assert metrics.anomaly_score_median == pytest.approx(np.median(anomaly_scores))
+    assert metrics.anomaly_score_variance == pytest.approx(np.var(anomaly_scores))
--- orca_sdk/_utils/data_parsing.py
+++ orca_sdk/_utils/data_parsing.py
@@ -1,4 +1,3 @@
-import logging
 import pickle
 from dataclasses import asdict, is_dataclass
 from os import PathLike
@@ -9,8 +8,6 @@ from datasets import Dataset
 from torch.utils.data import DataLoader as TorchDataLoader
 from torch.utils.data import Dataset as TorchDataset
 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
-
 
 def parse_dict_like(item: Any, column_names: list[str] | None = None) -> dict:
     if isinstance(item, dict):
--- orca_sdk/_utils/data_parsing_test.py
+++ orca_sdk/_utils/data_parsing_test.py
@@ -1,5 +1,4 @@
 import json
-import logging
 import pickle
 import tempfile
 from collections import namedtuple
@@ -15,8 +14,6 @@ from torch.utils.data import Dataset as TorchDataset
 from ..conftest import SAMPLE_DATA
 from .data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
-
 
 class PytorchDictDataset(TorchDataset):
     def __init__(self):
--- orca_sdk/_utils/prediction_result_ui.py
+++ orca_sdk/_utils/prediction_result_ui.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 import re
 from pathlib import Path
@@ -5,14 +7,13 @@ from typing import TYPE_CHECKING
 
 import gradio as gr
 
-from ..memoryset import LabeledMemoryLookup
+from ..memoryset import LabeledMemoryLookup, ScoredMemoryLookup, LabeledMemoryset
 
 if TYPE_CHECKING:
-    from ..telemetry import LabelPrediction
+    from ..telemetry import _Prediction
 
 
-def inspect_prediction_result(prediction_result: "LabelPrediction"):
-    label_names = prediction_result.memoryset.label_names
+def inspect_prediction_result(prediction_result: _Prediction):
 
     def update_label(val: str, memory: LabeledMemoryLookup, progress=gr.Progress(track_tqdm=True)):
         progress(0)
@@ -26,6 +27,12 @@ def inspect_prediction_result(prediction_result: "LabelPrediction"):
         else:
             logging.error(f"Invalid label format: {val}")
 
+    def update_score(val: float, memory: ScoredMemoryLookup, progress=gr.Progress(track_tqdm=True)):
+        progress(0)
+        memory.update(score=val)
+        progress(1)
+        return "✅ Changes saved"
+
     with gr.Blocks(
         fill_width=True,
         title="Prediction Results",
@@ -33,14 +40,21 @@ def inspect_prediction_result(prediction_result: "LabelPrediction"):
     ) as prediction_result_ui:
         gr.Markdown("# Prediction Results")
         gr.Markdown(f"**Input:** {prediction_result.input_value}")
-        gr.Markdown(f"**Prediction:** {label_names[prediction_result.label]} ({prediction_result.label})")
+
+        if isinstance(prediction_result.memoryset, LabeledMemoryset) and prediction_result.label is not None:
+            label_names = prediction_result.memoryset.label_names
+            gr.Markdown(f"**Prediction:** {label_names[prediction_result.label]} ({prediction_result.label})")
+        else:
+            gr.Markdown(f"**Prediction:** {prediction_result.score:.2f}")
+
         gr.Markdown("### Memory Lookups")
 
         with gr.Row(equal_height=True, variant="panel"):
             with gr.Column(scale=7):
                 gr.Markdown("**Value**")
             with gr.Column(scale=3, min_width=150):
-                gr.Markdown("**Label**")
+                gr.Markdown("**Label**" if prediction_result.label is not None else "**Score**")
+
         for i, mem_lookup in enumerate(prediction_result.memory_lookups):
             with gr.Row(equal_height=True, variant="panel", elem_classes="white" if i % 2 == 0 else None):
                 with gr.Column(scale=7):
@@ -48,27 +62,45 @@ def inspect_prediction_result(prediction_result: "LabelPrediction"):
                         (
                             mem_lookup.value
                             if isinstance(mem_lookup.value, str)
-                            else "Time series data"
-                            if isinstance(mem_lookup.value, list)
-                            else "Image data"
+                            else "Time series data" if isinstance(mem_lookup.value, list) else "Image data"
                         ),
                         label="Value",
                         height=50,
                     )
                 with gr.Column(scale=3, min_width=150):
-                    dropdown = gr.Dropdown(
-                        choices=[f"{label_name} ({i})" for i, label_name in enumerate(label_names)],
-                        label="Label",
-                        value=f"{label_names[mem_lookup.label]} ({mem_lookup.label})",
-                        interactive=True,
-                        container=False,
-                    )
-                    changes_saved = gr.HTML(lambda: "", elem_classes="success no-padding", every=15)
-                    dropdown.change(
-                        lambda val, mem_lookup=mem_lookup: update_label(val, mem_lookup),
-                        inputs=[dropdown],
-                        outputs=[changes_saved],
-                        show_progress="full",
-                    )
+                    if (
+                        isinstance(prediction_result.memoryset, LabeledMemoryset)
+                        and prediction_result.label is not None
+                        and isinstance(mem_lookup, LabeledMemoryLookup)
+                    ):
+                        label_names = prediction_result.memoryset.label_names
+                        dropdown = gr.Dropdown(
+                            choices=[f"{label_name} ({i})" for i, label_name in enumerate(label_names)],
+                            label="Label",
+                            value=f"{label_names[mem_lookup.label]} ({mem_lookup.label})",
+                            interactive=True,
+                            container=False,
+                        )
+                        changes_saved = gr.HTML(lambda: "", elem_classes="success no-padding", every=15)
+                        dropdown.change(
+                            lambda val, mem=mem_lookup: update_label(val, mem),
+                            inputs=[dropdown],
+                            outputs=[changes_saved],
+                            show_progress="full",
+                        )
+                    elif prediction_result.score is not None and isinstance(mem_lookup, ScoredMemoryLookup):
+                        input = gr.Number(
+                            value=mem_lookup.score,
+                            label="Score",
+                            interactive=True,
+                            container=False,
+                        )
+                        changes_saved = gr.HTML(lambda: "", elem_classes="success no-padding", every=15)
+                        input.change(
+                            lambda val, mem=mem_lookup: update_score(val, mem),
+                            inputs=[input],
+                            outputs=[changes_saved],
+                            show_progress="full",
+                        )
 
     prediction_result_ui.launch()