orca-sdk 0.0.103__py3-none-any.whl → 0.0.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/_shared/metrics.py +31 -9
- orca_sdk/_shared/metrics_test.py +30 -4
- orca_sdk/_utils/prediction_result_ui.py +5 -1
- orca_sdk/classification_model.py +32 -1
- orca_sdk/classification_model_test.py +18 -0
- orca_sdk/client.py +297 -257
- orca_sdk/conftest.py +12 -0
- orca_sdk/datasource.py +1 -1
- orca_sdk/datasource_test.py +6 -1
- orca_sdk/embedding_model.py +28 -1
- orca_sdk/job_test.py +20 -10
- orca_sdk/memoryset.py +9 -23
- orca_sdk/memoryset_test.py +3 -2
- orca_sdk/regression_model.py +29 -1
- orca_sdk/regression_model_test.py +18 -1
- {orca_sdk-0.0.103.dist-info → orca_sdk-0.0.104.dist-info}/METADATA +14 -14
- {orca_sdk-0.0.103.dist-info → orca_sdk-0.0.104.dist-info}/RECORD +18 -18
- {orca_sdk-0.0.103.dist-info → orca_sdk-0.0.104.dist-info}/WHEEL +1 -1
orca_sdk/_shared/metrics.py
CHANGED
@@ -2,7 +2,7 @@
 This module contains metrics for usage with the Hugging Face Trainer.

 IMPORTANT:
-- This is a shared file between OrcaLib and the
+- This is a shared file between OrcaLib and the OrcaSDK.
 - Please ensure that it does not have any dependencies on the OrcaLib code.
 - Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.

@@ -147,13 +147,16 @@ def calculate_roc_curve(

 @dataclass
 class ClassificationMetrics:
+    coverage: float
+    """Percentage of predictions that are not none"""
+
     f1_score: float
     """F1 score of the predictions"""

     accuracy: float
     """Accuracy of the predictions"""

-    loss: float
+    loss: float | None
     """Cross-entropy loss of the logits"""

     anomaly_score_mean: float | None = None
@@ -225,12 +228,15 @@ def calculate_classification_metrics(
         raise ValueError("Logits must be 1 or 2 dimensional")

     predictions = np.argmax(probabilities, axis=-1)
+    predictions[np.isnan(probabilities).all(axis=-1)] = -1  # set predictions to -1 for all nan logits

     num_classes_references = len(set(references))
     num_classes_predictions = len(set(predictions))
+    num_none_predictions = np.isnan(probabilities).all(axis=-1).sum()
+    coverage = 1 - num_none_predictions / len(probabilities)

     if average is None:
-        average = "binary" if num_classes_references == 2 else "weighted"
+        average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"

     anomaly_score_mean = float(np.mean(anomaly_scores)) if anomaly_scores else None
     anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
@@ -240,13 +246,17 @@ def calculate_classification_metrics(
     f1 = sklearn.metrics.f1_score(references, predictions, average=average)
     # Ensure sklearn sees the full class set corresponding to probability columns
     # to avoid errors when y_true does not contain all classes.
-    loss = sklearn.metrics.log_loss(
-        references,
-        probabilities,
-        labels=list(range(probabilities.shape[1])),
+    loss = (
+        sklearn.metrics.log_loss(
+            references,
+            probabilities,
+            labels=list(range(probabilities.shape[1])),
+        )
+        if num_none_predictions == 0
+        else None
     )

-    if num_classes_references == num_classes_predictions:
+    if num_classes_references == num_classes_predictions and num_none_predictions == 0:
         # special case for binary classification: https://github.com/scikit-learn/scikit-learn/issues/20186
         if num_classes_references == 2:
             roc_auc = sklearn.metrics.roc_auc_score(references, logits[:, 1])
@@ -265,9 +275,10 @@ def calculate_classification_metrics(
         roc_curve = None

     return ClassificationMetrics(
+        coverage=coverage,
         accuracy=float(accuracy),
         f1_score=float(f1),
-        loss=float(loss),
+        loss=float(loss) if loss is not None else None,
         anomaly_score_mean=anomaly_score_mean,
         anomaly_score_median=anomaly_score_median,
         anomaly_score_variance=anomaly_score_variance,
@@ -280,6 +291,9 @@ def calculate_classification_metrics(

 @dataclass
 class RegressionMetrics:
+    coverage: float
+    """Percentage of predictions that are not none"""
+
     mse: float
     """Mean squared error of the predictions"""

@@ -351,6 +365,13 @@ def calculate_regression_metrics(
     anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
     anomaly_score_variance = float(np.var(anomaly_scores)) if anomaly_scores else None

+    none_prediction_mask = np.isnan(predictions)
+    num_none_predictions = none_prediction_mask.sum()
+    coverage = 1 - num_none_predictions / len(predictions)
+    if num_none_predictions > 0:
+        references = references[~none_prediction_mask]
+        predictions = predictions[~none_prediction_mask]
+
     # Calculate core regression metrics
     mse = float(sklearn.metrics.mean_squared_error(references, predictions))
     rmse = float(np.sqrt(mse))
@@ -359,6 +380,7 @@ def calculate_regression_metrics(
     explained_var = float(sklearn.metrics.explained_variance_score(references, predictions))

     return RegressionMetrics(
+        coverage=coverage,
         mse=mse,
         rmse=rmse,
         mae=mae,
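A minimal sketch (not part of the package diff) of how the new coverage field and the NaN handling above behave, assuming the shared module is importable as orca_sdk._shared.metrics; the values mirror test_handles_nan_logits in the test file below.

    import numpy as np
    from orca_sdk._shared.metrics import calculate_classification_metrics

    # Two of the four rows have all-NaN logits, so they count as "no prediction".
    logits = np.array([[np.nan, np.nan], [np.nan, np.nan], [0.1, 0.9], [0.2, 0.8]])
    metrics = calculate_classification_metrics([0, 1, 0, 1], logits)
    print(metrics.coverage)  # 0.5 -> half of the rows produced a usable prediction
    print(metrics.loss)      # None -> log loss is skipped when any row is all NaN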
orca_sdk/_shared/metrics_test.py
CHANGED
@@ -1,6 +1,6 @@
 """
 IMPORTANT:
-- This is a shared file between OrcaLib and the
+- This is a shared file between OrcaLib and the OrcaSDK.
 - Please ensure that it does not have any dependencies on the OrcaLib code.
 - Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.
 """
@@ -101,6 +101,20 @@ def test_softmaxes_logits_if_necessary():
     )


+def test_handles_nan_logits():
+    logits = np.array([[np.nan, np.nan], [np.nan, np.nan], [0.1, 0.9], [0.2, 0.8]])
+    expected_labels = [0, 1, 0, 1]
+    metrics = calculate_classification_metrics(expected_labels, logits)
+    assert metrics.loss is None
+    assert metrics.accuracy == 0.25
+    assert metrics.f1_score == 0.25
+    assert metrics.roc_auc is None
+    assert metrics.pr_auc is None
+    assert metrics.pr_curve is None
+    assert metrics.roc_curve is None
+    assert metrics.coverage == 0.5
+
+
 def test_precision_recall_curve():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
@@ -153,7 +167,7 @@ def test_log_loss_handles_missing_classes_in_y_true():
     metrics = calculate_classification_metrics(y_true, y_score)
     expected_loss = sklearn.metrics.log_loss(y_true, y_score, labels=[0, 1, 2])

-    assert
+    assert metrics.loss is not None
     assert np.allclose(metrics.loss, expected_loss)


@@ -194,8 +208,6 @@ def test_roc_curve_max_length():


 # Regression Metrics Tests
-
-
 def test_perfect_regression_predictions():
     y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
     y_pred = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
@@ -245,3 +257,17 @@ def test_regression_metrics_with_anomaly_scores():
     assert metrics.anomaly_score_mean == pytest.approx(np.mean(anomaly_scores))
     assert metrics.anomaly_score_median == pytest.approx(np.median(anomaly_scores))
     assert metrics.anomaly_score_variance == pytest.approx(np.var(anomaly_scores))
+
+
+def test_regression_metrics_handles_nans():
+    y_true = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+    y_pred = np.array([1.1, 1.9, np.nan], dtype=np.float32)
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    assert np.allclose(metrics.coverage, 0.6666666666666666)
+    assert metrics.mse > 0.0
+    assert metrics.rmse > 0.0
+    assert metrics.mae > 0.0
+    assert 0.0 <= metrics.r2 <= 1.0
+    assert 0.0 <= metrics.explained_variance <= 1.0
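A short sketch (not part of the package diff) of the regression-side behavior added above, assuming the same import path: NaN predictions are masked out before MSE/RMSE/MAE are computed, and coverage reports the surviving fraction, mirroring test_regression_metrics_handles_nans.

    import numpy as np
    from orca_sdk._shared.metrics import calculate_regression_metrics

    y_true = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    y_pred = np.array([1.1, 1.9, np.nan], dtype=np.float32)  # one missing prediction
    metrics = calculate_regression_metrics(y_true, y_pred)
    print(metrics.coverage)  # ~0.667 -> 2 of 3 predictions were not NaN
    print(metrics.mse)       # computed only over the two non-NaN pairs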
orca_sdk/_utils/prediction_result_ui.py
CHANGED
@@ -77,7 +77,11 @@ def inspect_prediction_result(prediction_result: PredictionBase):
     dropdown = gr.Dropdown(
         choices=[f"{label_name} ({i})" for i, label_name in enumerate(label_names)],
         label="Label",
-        value=f"{label_names[mem_lookup.label]} ({mem_lookup.label})",
+        value=(
+            f"{label_names[mem_lookup.label]} ({mem_lookup.label})"
+            if mem_lookup.label is not None
+            else "None"
+        ),
         interactive=True,
         container=False,
     )
orca_sdk/classification_model.py
CHANGED
@@ -343,6 +343,7 @@ class ClassificationModel:
         save_telemetry: TelemetryMode = "on",
         prompt: str | None = None,
         use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
     ) -> list[ClassificationPrediction]:
         pass

@@ -356,6 +357,7 @@ class ClassificationModel:
         save_telemetry: TelemetryMode = "on",
         prompt: str | None = None,
         use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
     ) -> ClassificationPrediction:
         pass

@@ -368,6 +370,7 @@ class ClassificationModel:
         save_telemetry: TelemetryMode = "on",
         prompt: str | None = None,
         use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
     ) -> list[ClassificationPrediction] | ClassificationPrediction:
         """
         Predict label(s) for the given input value(s) grounded in similar memories
@@ -384,10 +387,16 @@ class ClassificationModel:
                 * `"sync"`: Save telemetry synchronously
                 * `"async"`: Save telemetry asynchronously
             prompt: Optional prompt to use for instruction-tuned embedding models
+            use_lookup_cache: Whether to use cached lookup results for faster predictions
+            timeout_seconds: Timeout in seconds for the request, defaults to 10 seconds

         Returns:
             Label prediction or list of label predictions

+        Raises:
+            ValueError: If timeout_seconds is not a positive integer
+            TimeoutError: If the request times out after the specified duration
+
         Examples:
             Predict the label for a single value:
             >>> prediction = model.predict("I am happy", tags={"test"})
@@ -405,6 +414,9 @@ class ClassificationModel:
             ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy' })
         """

+        if timeout_seconds <= 0:
+            raise ValueError("timeout_seconds must be a positive integer")
+
         parsed_filters = [
             _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
         ]
@@ -437,6 +449,7 @@ class ClassificationModel:
                "prompt": prompt,
                "use_lookup_cache": use_lookup_cache,
            },
+           timeout=timeout_seconds,
        )

         if telemetry_on and any(p["prediction_id"] is None for p in response):
@@ -557,7 +570,19 @@ class ClassificationModel:
                 params={"model_name_or_id": self.id, "task_id": response["task_id"]},
             )
             assert res["result"] is not None
-            return ClassificationMetrics(
+            return ClassificationMetrics(
+                coverage=res["result"].get("coverage"),
+                f1_score=res["result"].get("f1_score"),
+                accuracy=res["result"].get("accuracy"),
+                loss=res["result"].get("loss"),
+                anomaly_score_mean=res["result"].get("anomaly_score_mean"),
+                anomaly_score_median=res["result"].get("anomaly_score_median"),
+                anomaly_score_variance=res["result"].get("anomaly_score_variance"),
+                roc_auc=res["result"].get("roc_auc"),
+                pr_auc=res["result"].get("pr_auc"),
+                pr_curve=res["result"].get("pr_curve"),
+                roc_curve=res["result"].get("roc_curve"),
+            )

         job = Job(response["task_id"], get_value)
         return job if background else job.result()
@@ -571,6 +596,12 @@ class ClassificationModel:
         tags: set[str],
         batch_size: int,
     ) -> ClassificationMetrics:
+        if len(dataset) == 0:
+            raise ValueError("Evaluation dataset cannot be empty")
+
+        if any(x is None for x in dataset[label_column]):
+            raise ValueError("Evaluation dataset cannot contain None values in the label column")
+
         predictions = [
             prediction
             for i in range(0, len(dataset), batch_size)
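A hedged usage sketch (not part of the package diff) of the new timeout_seconds argument on ClassificationModel.predict, assuming an existing model handle as in the docstring examples above.

    # Defaults to 10 seconds; pass a larger value for slow batches.
    prediction = model.predict("I am happy", tags={"test"}, timeout_seconds=30)

    # Non-positive values are rejected before any request is made.
    model.predict("I am happy", timeout_seconds=0)  # raises ValueError
    # If the request does not finish within the window, a TimeoutError is raised.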
orca_sdk/classification_model_test.py
CHANGED
@@ -10,6 +10,7 @@ from .conftest import skip_in_ci
 from .datasource import Datasource
 from .embedding_model import PretrainedEmbeddingModel
 from .memoryset import LabeledMemoryset
+from .telemetry import ClassificationPrediction


 def test_create_model(classification_model: ClassificationModel, readonly_memoryset: LabeledMemoryset):
@@ -193,6 +194,16 @@ def test_evaluate(classification_model, eval_datasource: Datasource, eval_datase
     assert np.allclose(result.roc_curve["true_positive_rates"], [1.0, 0.5, 0.5, 0.0])


+def test_evaluate_datasource_with_nones_raises_error(classification_model: ClassificationModel, datasource: Datasource):
+    with pytest.raises(ValueError):
+        classification_model.evaluate(datasource, record_predictions=True, tags={"test"})
+
+
+def test_evaluate_dataset_with_nones_raises_error(classification_model: ClassificationModel, hf_dataset: Dataset):
+    with pytest.raises(ValueError):
+        classification_model.evaluate(hf_dataset, record_predictions=True, tags={"test"})
+
+
 def test_evaluate_with_telemetry(classification_model: ClassificationModel, eval_dataset: Dataset):
     result = classification_model.evaluate(eval_dataset, record_predictions=True, tags={"test"})
     assert result is not None
@@ -223,6 +234,13 @@ def test_predict(classification_model: ClassificationModel, label_names: list[st
     assert predictions[1].logits[0] < predictions[1].logits[1]


+def test_classification_prediction_has_no_label(classification_model: ClassificationModel):
+    """Ensure optional score is None for classification predictions."""
+    prediction = classification_model.predict("Do you want to go to the beach?")
+    assert isinstance(prediction, ClassificationPrediction)
+    assert prediction.label is None
+
+
 def test_predict_disable_telemetry(classification_model: ClassificationModel, label_names: list[str]):
     predictions = classification_model.predict(["Do you love soup?", "Are cats cute?"], save_telemetry="off")
     assert len(predictions) == 2