orca-sdk 0.0.94-py3-none-any.whl → 0.0.95-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. orca_sdk/__init__.py +13 -4
  2. orca_sdk/_generated_api_client/api/__init__.py +80 -34
  3. orca_sdk/_generated_api_client/api/classification_model/create_classification_model_classification_model_post.py +170 -0
  4. orca_sdk/_generated_api_client/api/classification_model/{get_model_classification_model_name_or_id_get.py → delete_classification_model_classification_model_name_or_id_delete.py} +20 -20
  5. orca_sdk/_generated_api_client/api/classification_model/{delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py → delete_classification_model_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py} +4 -4
  6. orca_sdk/_generated_api_client/api/classification_model/{create_evaluation_classification_model_model_name_or_id_evaluation_post.py → evaluate_classification_model_classification_model_model_name_or_id_evaluation_post.py} +14 -14
  7. orca_sdk/_generated_api_client/api/classification_model/get_classification_model_classification_model_name_or_id_get.py +156 -0
  8. orca_sdk/_generated_api_client/api/classification_model/{get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py → get_classification_model_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py} +16 -16
  9. orca_sdk/_generated_api_client/api/classification_model/{list_evaluations_classification_model_model_name_or_id_evaluation_get.py → list_classification_model_evaluations_classification_model_model_name_or_id_evaluation_get.py} +16 -16
  10. orca_sdk/_generated_api_client/api/classification_model/list_classification_models_classification_model_get.py +127 -0
  11. orca_sdk/_generated_api_client/api/classification_model/{predict_gpu_classification_model_name_or_id_prediction_post.py → predict_label_gpu_classification_model_name_or_id_prediction_post.py} +14 -14
  12. orca_sdk/_generated_api_client/api/classification_model/update_classification_model_classification_model_name_or_id_patch.py +183 -0
  13. orca_sdk/_generated_api_client/api/datasource/download_datasource_datasource_name_or_id_download_get.py +24 -0
  14. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +22 -22
  15. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +22 -22
  16. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +38 -16
  17. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +29 -12
  18. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +12 -12
  19. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +17 -14
  20. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +72 -19
  21. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +31 -12
  22. orca_sdk/_generated_api_client/api/memoryset/potential_duplicate_groups_memoryset_name_or_id_potential_duplicate_groups_get.py +49 -20
  23. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +38 -16
  24. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +54 -29
  25. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +44 -26
  26. orca_sdk/_generated_api_client/api/memoryset/update_memoryset_memoryset_name_or_id_patch.py +22 -22
  27. orca_sdk/_generated_api_client/api/predictive_model/__init__.py +0 -0
  28. orca_sdk/_generated_api_client/api/predictive_model/list_predictive_models_predictive_model_get.py +150 -0
  29. orca_sdk/_generated_api_client/api/regression_model/__init__.py +0 -0
  30. orca_sdk/_generated_api_client/api/{classification_model/create_model_classification_model_post.py → regression_model/create_regression_model_regression_model_post.py} +27 -27
  31. orca_sdk/_generated_api_client/api/regression_model/delete_regression_model_evaluation_regression_model_model_name_or_id_evaluation_task_id_delete.py +168 -0
  32. orca_sdk/_generated_api_client/api/{classification_model/delete_model_classification_model_name_or_id_delete.py → regression_model/delete_regression_model_regression_model_name_or_id_delete.py} +5 -5
  33. orca_sdk/_generated_api_client/api/regression_model/evaluate_regression_model_regression_model_model_name_or_id_evaluation_post.py +183 -0
  34. orca_sdk/_generated_api_client/api/regression_model/get_regression_model_evaluation_regression_model_model_name_or_id_evaluation_task_id_get.py +170 -0
  35. orca_sdk/_generated_api_client/api/regression_model/get_regression_model_regression_model_name_or_id_get.py +156 -0
  36. orca_sdk/_generated_api_client/api/regression_model/list_regression_model_evaluations_regression_model_model_name_or_id_evaluation_get.py +161 -0
  37. orca_sdk/_generated_api_client/api/{classification_model/list_models_classification_model_get.py → regression_model/list_regression_models_regression_model_get.py} +17 -17
  38. orca_sdk/_generated_api_client/api/regression_model/predict_score_gpu_regression_model_name_or_id_prediction_post.py +190 -0
  39. orca_sdk/_generated_api_client/api/{classification_model/update_model_classification_model_name_or_id_patch.py → regression_model/update_regression_model_regression_model_name_or_id_patch.py} +27 -27
  40. orca_sdk/_generated_api_client/api/task/get_task_task_task_id_get.py +156 -0
  41. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +35 -12
  42. orca_sdk/_generated_api_client/api/telemetry/list_memories_with_feedback_telemetry_memories_post.py +20 -12
  43. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +35 -12
  44. orca_sdk/_generated_api_client/models/__init__.py +84 -24
  45. orca_sdk/_generated_api_client/models/base_score_prediction_result.py +108 -0
  46. orca_sdk/_generated_api_client/models/{evaluation_request.py → classification_evaluation_request.py} +13 -45
  47. orca_sdk/_generated_api_client/models/{classification_evaluation_result.py → classification_metrics.py} +106 -56
  48. orca_sdk/_generated_api_client/models/{rac_model_metadata.py → classification_model_metadata.py} +51 -43
  49. orca_sdk/_generated_api_client/models/{prediction_request.py → classification_prediction_request.py} +31 -6
  50. orca_sdk/_generated_api_client/models/{clone_labeled_memoryset_request.py → clone_memoryset_request.py} +5 -5
  51. orca_sdk/_generated_api_client/models/column_info.py +31 -0
  52. orca_sdk/_generated_api_client/models/{create_rac_model_request.py → create_classification_model_request.py} +25 -57
  53. orca_sdk/_generated_api_client/models/{create_labeled_memoryset_request.py → create_memoryset_request.py} +73 -56
  54. orca_sdk/_generated_api_client/models/create_memoryset_request_index_params.py +66 -0
  55. orca_sdk/_generated_api_client/models/create_memoryset_request_index_type.py +13 -0
  56. orca_sdk/_generated_api_client/models/create_regression_model_request.py +137 -0
  57. orca_sdk/_generated_api_client/models/embedding_evaluation_payload.py +187 -0
  58. orca_sdk/_generated_api_client/models/embedding_evaluation_response.py +10 -0
  59. orca_sdk/_generated_api_client/models/evaluation_response.py +22 -9
  60. orca_sdk/_generated_api_client/models/evaluation_response_classification_metrics.py +140 -0
  61. orca_sdk/_generated_api_client/models/evaluation_response_regression_metrics.py +140 -0
  62. orca_sdk/_generated_api_client/models/memory_type.py +9 -0
  63. orca_sdk/_generated_api_client/models/{labeled_memoryset_metadata.py → memoryset_metadata.py} +73 -13
  64. orca_sdk/_generated_api_client/models/memoryset_metadata_index_params.py +55 -0
  65. orca_sdk/_generated_api_client/models/memoryset_metadata_index_type.py +13 -0
  66. orca_sdk/_generated_api_client/models/{labeled_memoryset_update.py → memoryset_update.py} +19 -31
  67. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +1 -0
  68. orca_sdk/_generated_api_client/models/{paginated_labeled_memory_with_feedback_metrics.py → paginated_union_labeled_memory_with_feedback_metrics_scored_memory_with_feedback_metrics.py} +37 -10
  69. orca_sdk/_generated_api_client/models/{precision_recall_curve.py → pr_curve.py} +5 -13
  70. orca_sdk/_generated_api_client/models/{rac_model_update.py → predictive_model_update.py} +14 -5
  71. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +11 -1
  72. orca_sdk/_generated_api_client/models/rar_head_type.py +8 -0
  73. orca_sdk/_generated_api_client/models/regression_evaluation_request.py +148 -0
  74. orca_sdk/_generated_api_client/models/regression_metrics.py +172 -0
  75. orca_sdk/_generated_api_client/models/regression_model_metadata.py +177 -0
  76. orca_sdk/_generated_api_client/models/regression_prediction_request.py +195 -0
  77. orca_sdk/_generated_api_client/models/roc_curve.py +0 -8
  78. orca_sdk/_generated_api_client/models/score_prediction_memory_lookup.py +196 -0
  79. orca_sdk/_generated_api_client/models/score_prediction_memory_lookup_metadata.py +68 -0
  80. orca_sdk/_generated_api_client/models/score_prediction_with_memories_and_feedback.py +252 -0
  81. orca_sdk/_generated_api_client/models/scored_memory.py +172 -0
  82. orca_sdk/_generated_api_client/models/scored_memory_insert.py +128 -0
  83. orca_sdk/_generated_api_client/models/scored_memory_insert_metadata.py +68 -0
  84. orca_sdk/_generated_api_client/models/scored_memory_lookup.py +180 -0
  85. orca_sdk/_generated_api_client/models/scored_memory_lookup_metadata.py +68 -0
  86. orca_sdk/_generated_api_client/models/scored_memory_metadata.py +68 -0
  87. orca_sdk/_generated_api_client/models/scored_memory_update.py +171 -0
  88. orca_sdk/_generated_api_client/models/scored_memory_update_metadata_type_0.py +68 -0
  89. orca_sdk/_generated_api_client/models/scored_memory_with_feedback_metrics.py +193 -0
  90. orca_sdk/_generated_api_client/models/scored_memory_with_feedback_metrics_feedback_metrics.py +68 -0
  91. orca_sdk/_generated_api_client/models/scored_memory_with_feedback_metrics_metadata.py +68 -0
  92. orca_sdk/_generated_api_client/models/update_prediction_request.py +20 -0
  93. orca_sdk/_shared/__init__.py +9 -1
  94. orca_sdk/_shared/metrics.py +257 -87
  95. orca_sdk/_shared/metrics_test.py +136 -77
  96. orca_sdk/_utils/data_parsing.py +0 -3
  97. orca_sdk/_utils/data_parsing_test.py +0 -3
  98. orca_sdk/_utils/prediction_result_ui.py +55 -23
  99. orca_sdk/classification_model.py +183 -175
  100. orca_sdk/classification_model_test.py +147 -157
  101. orca_sdk/conftest.py +76 -26
  102. orca_sdk/datasource_test.py +0 -1
  103. orca_sdk/embedding_model.py +136 -14
  104. orca_sdk/embedding_model_test.py +10 -6
  105. orca_sdk/job.py +329 -0
  106. orca_sdk/job_test.py +48 -0
  107. orca_sdk/memoryset.py +882 -161
  108. orca_sdk/memoryset_test.py +56 -23
  109. orca_sdk/regression_model.py +647 -0
  110. orca_sdk/regression_model_test.py +338 -0
  111. orca_sdk/telemetry.py +223 -106
  112. orca_sdk/telemetry_test.py +34 -30
  113. {orca_sdk-0.0.94.dist-info → orca_sdk-0.0.95.dist-info}/METADATA +2 -4
  114. {orca_sdk-0.0.94.dist-info → orca_sdk-0.0.95.dist-info}/RECORD +115 -69
  115. orca_sdk/_utils/task.py +0 -73
  116. {orca_sdk-0.0.94.dist-info → orca_sdk-0.0.95.dist-info}/WHEEL +0 -0
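Much of the listing above is the RAC-era generated client being renamed to classification-specific endpoints, with a parallel regression_model package added alongside it. As a rough orientation before the classification_model.py diff below, a hedged sketch of how the renamed client functions line up (new names are taken from the renamed files above and the import hunk below; the mapping is illustrative, not exhaustive):

# Illustrative mapping only: the 0.0.94 names appear as comments, the 0.0.95
# names are imported below; all of them live in the generated client package.
from orca_sdk._generated_api_client.api import (
    create_classification_model,          # was: create_model
    get_classification_model,             # was: get_model
    list_classification_models,           # was: list_models
    delete_classification_model,          # was: delete_model
    evaluate_classification_model,        # was: create_evaluation
    get_classification_model_evaluation,  # was: get_evaluation
    predict_label_gpu,                    # was: predict_gpu
    update_classification_model,          # was: update_model
)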
--- orca_sdk/classification_model.py (0.0.94)
+++ orca_sdk/classification_model.py (0.0.95)
@@ -5,37 +5,29 @@ import os
 from contextlib import contextmanager
 from datetime import datetime
 from typing import Any, Generator, Iterable, Literal, cast, overload
-from uuid import UUID, uuid4
+from uuid import UUID

-import numpy as np
-
-import numpy as np
 from datasets import Dataset
-from sklearn.metrics import (
-    accuracy_score,
-    auc,
-    f1_score,
-    roc_auc_score,
-)

 from ._generated_api_client.api import (
-    create_evaluation,
-    create_model,
-    delete_model,
-    get_evaluation,
-    get_model,
-    list_models,
+    create_classification_model,
+    delete_classification_model,
+    evaluate_classification_model,
+    get_classification_model,
+    get_classification_model_evaluation,
+    list_classification_models,
     list_predictions,
-    predict_gpu,
+    predict_label_gpu,
     record_prediction_feedback,
-    update_model,
+    update_classification_model,
 )
 from ._generated_api_client.models import (
-    ClassificationEvaluationResult,
-    CreateRACModelRequest,
-    EvaluationRequest,
+    ClassificationEvaluationRequest,
+    ClassificationModelMetadata,
+    ClassificationPredictionRequest,
+    CreateClassificationModelRequest,
+    LabelPredictionWithMemoriesAndFeedback,
     ListPredictionsRequest,
-    PrecisionRecallCurve,
 )
 from ._generated_api_client.models import (
     PredictionSortItemItemType0 as PredictionSortColumns,
@@ -43,19 +35,19 @@ from ._generated_api_client.models import (
 from ._generated_api_client.models import (
     PredictionSortItemItemType1 as PredictionSortDirection,
 )
-from ._generated_api_client.models import (
-    RACHeadType,
-    RACModelMetadata,
-    RACModelUpdate,
-    ROCCurve,
-)
-from ._generated_api_client.models.prediction_request import PredictionRequest
-from ._shared.metrics import calculate_pr_curve, calculate_roc_curve
+from ._generated_api_client.models import PredictiveModelUpdate, RACHeadType
+from ._generated_api_client.types import UNSET as CLIENT_UNSET
+from ._shared.metrics import ClassificationMetrics, calculate_classification_metrics
 from ._utils.common import UNSET, CreateMode, DropMode
-from ._utils.task import wait_for_task
 from .datasource import Datasource
-from .memoryset import LabeledMemoryset
-from .telemetry import LabelPrediction, _parse_feedback
+from .job import Job
+from .memoryset import (
+    FilterItem,
+    FilterItemTuple,
+    LabeledMemoryset,
+    _parse_filter_item_from_tuple,
+)
+from .telemetry import ClassificationPrediction, _parse_feedback


 class ClassificationModel:
@@ -72,6 +64,7 @@ class ClassificationModel:
         memory_lookup_count: Number of memories the model uses for each prediction
         weigh_memories: If using a KNN head, whether the model weighs memories by their lookup score
         min_memory_weight: If using a KNN head, minimum lookup score memories have to be over to not be ignored
+        locked: Whether the model is locked to prevent accidental deletion
         created_at: When the model was created
     """

@@ -85,9 +78,10 @@ class ClassificationModel:
     weigh_memories: bool | None
     min_memory_weight: float | None
     version: int
+    locked: bool
     created_at: datetime

-    def __init__(self, metadata: RACModelMetadata):
+    def __init__(self, metadata: ClassificationModelMetadata):
         # for internal use only, do not document
         self.id = metadata.id
         self.name = metadata.name
@@ -99,10 +93,11 @@ class ClassificationModel:
         self.weigh_memories = metadata.weigh_memories
         self.min_memory_weight = metadata.min_memory_weight
         self.version = metadata.version
+        self.locked = metadata.locked
         self.created_at = metadata.created_at

         self._memoryset_override_id: str | None = None
-        self._last_prediction: LabelPrediction | None = None
+        self._last_prediction: ClassificationPrediction | None = None
         self._last_prediction_was_batch: bool = False

     def __eq__(self, other) -> bool:
@@ -120,7 +115,7 @@ class ClassificationModel:
         )

     @property
-    def last_prediction(self) -> LabelPrediction:
+    def last_prediction(self) -> ClassificationPrediction:
         """
         Last prediction made by the model

@@ -208,8 +203,8 @@ class ClassificationModel:

             return existing

-        metadata = create_model(
-            body=CreateRACModelRequest(
+        metadata = create_classification_model(
+            body=CreateClassificationModelRequest(
                 name=name,
                 memoryset_id=memoryset.id,
                 head_type=RACHeadType(head_type),
@@ -236,7 +231,7 @@ class ClassificationModel:
         Raises:
             LookupError: If the classification model does not exist
         """
-        return cls(get_model(name))
+        return cls(get_classification_model(name))

     @classmethod
     def exists(cls, name_or_id: str) -> bool:
@@ -263,7 +258,7 @@ class ClassificationModel:
         Returns:
             List of handles to all classification models in the OrcaCloud
         """
-        return [cls(metadata) for metadata in list_models()]
+        return [cls(metadata) for metadata in list_classification_models()]

     @classmethod
     def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
@@ -282,7 +277,7 @@ class ClassificationModel:
             LookupError: If the classification model does not exist and if_not_exists is `"error"`
         """
         try:
-            delete_model(name_or_id)
+            delete_classification_model(name_or_id)
             logging.info(f"Deleted model {name_or_id}")
         except LookupError:
             if if_not_exists == "error":
@@ -290,34 +285,53 @@ class ClassificationModel:

     def refresh(self):
         """Refresh the model data from the OrcaCloud"""
-        self.__dict__.update(ClassificationModel.open(self.name).__dict__)
+        self.__dict__.update(self.open(self.name).__dict__)

-    def update_metadata(self, *, description: str | None = UNSET) -> None:
+    def set(self, *, description: str | None = UNSET, locked: bool = UNSET) -> None:
         """
-        Update editable classification model metadata properties.
+        Update editable attributes of the model.
+
+        Note:
+            If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.

         Params:
-            description: Value to set for the description, defaults to `[UNSET]` if not provided.
+            description: Value to set for the description
+            locked: Value to set for the locked status

         Examples:
             Update the description:
-            >>> model.update(description="New description")
+            >>> model.set(description="New description")

             Remove description:
-            >>> model.update(description=None)
+            >>> model.set(description=None)
+
+            Lock the model:
+            >>> model.set(locked=True)
         """
-        update_model(self.id, body=RACModelUpdate(description=description))
+        update_data = PredictiveModelUpdate(
+            description=CLIENT_UNSET if description is UNSET else description,
+            locked=CLIENT_UNSET if locked is UNSET else locked,
+        )
+        update_classification_model(self.id, body=update_data)
         self.refresh()

+    def lock(self) -> None:
+        """Lock the model to prevent accidental deletion"""
+        self.set(locked=True)
+
+    def unlock(self) -> None:
+        """Unlock the model to allow deletion"""
+        self.set(locked=False)
+
     @overload
     def predict(
         self,
         value: list[str],
         expected_labels: list[int] | None = None,
-        tags: set[str] = set(),
-        save_telemetry: bool = True,
-        save_telemetry_synchronously: bool = False,
-    ) -> list[LabelPrediction]:
+        filters: list[FilterItemTuple] = [],
+        tags: set[str] | None = None,
+        save_telemetry: Literal["off", "on", "sync", "async"] = "on",
+    ) -> list[ClassificationPrediction]:
         pass

     @overload
@@ -325,20 +339,20 @@ class ClassificationModel:
         self,
         value: str,
         expected_labels: int | None = None,
-        tags: set[str] = set(),
-        save_telemetry: bool = True,
-        save_telemetry_synchronously: bool = False,
-    ) -> LabelPrediction:
+        filters: list[FilterItemTuple] = [],
+        tags: set[str] | None = None,
+        save_telemetry: Literal["off", "on", "sync", "async"] = "on",
+    ) -> ClassificationPrediction:
         pass

     def predict(
         self,
         value: list[str] | str,
         expected_labels: list[int] | int | None = None,
-        tags: set[str] = set(),
-        save_telemetry: bool = True,
-        save_telemetry_synchronously: bool = False,
-    ) -> list[LabelPrediction] | LabelPrediction:
+        filters: list[FilterItemTuple] = [],
+        tags: set[str] | None = None,
+        save_telemetry: Literal["off", "on", "sync", "async"] = "on",
+    ) -> list[ClassificationPrediction] | ClassificationPrediction:
         """
         Predict label(s) for the given input value(s) grounded in similar memories

@@ -346,10 +360,12 @@ class ClassificationModel:
             value: Value(s) to get predict the labels of
             expected_labels: Expected label(s) for the given input to record for model evaluation
             tags: Tags to add to the prediction(s)
-            save_telemetry: Whether to enable telemetry for the prediction(s)
-            save_telemetry_synchronously: Whether to save telemetry synchronously. If `False`, telemetry will be saved
-                asynchronously in the background. This may result in a delay in the telemetry being available. Please note that this
-                may be overriden by the ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY environment variable.
+            save_telemetry: Whether to save telemetry for the prediction(s). One of
+                * `"off"`: Do not save telemetry
+                * `"on"`: Save telemetry asynchronously unless the `ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY`
+                    environment variable is set.
+                * `"sync"`: Save telemetry synchronously
+                * `"async"`: Save telemetry asynchronously

         Returns:
             Label prediction or list of label predictions
@@ -357,49 +373,51 @@ class ClassificationModel:
         Examples:
             Predict the label for a single value:
             >>> prediction = model.predict("I am happy", tags={"test"})
-            LabelPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy' })
+            ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy' })

             Predict the labels for a list of values:
             >>> predictions = model.predict(["I am happy", "I am sad"], expected_labels=[1, 0])
             [
-                LabelPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy'}),
-                LabelPrediction({label: <negative: 0>, confidence: 0.05, anomaly_score: 0.1, input_value: 'I am sad'}),
+                ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy'}),
+                ClassificationPrediction({label: <negative: 0>, confidence: 0.05, anomaly_score: 0.1, input_value: 'I am sad'}),
             ]
         """

-        if "ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY" in os.environ:
-            env_var = os.environ["ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY"]
-            logging.info(
-                f"ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY is set to {env_var} which will override the parameter save_telemetry_synchronously = {save_telemetry_synchronously}"
-            )
-            save_telemetry_synchronously = env_var.lower() == "true"
+        parsed_filters = [
+            _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
+        ]

-        response = predict_gpu(
+        if not all(isinstance(filter, FilterItem) for filter in parsed_filters):
+            raise ValueError(f"Cannot filter on {filters} - telemetry filters are not supported for predictions")
+
+        response = predict_label_gpu(
             self.id,
-            body=PredictionRequest(
+            body=ClassificationPredictionRequest(
                 input_values=value if isinstance(value, list) else [value],
                 memoryset_override_id=self._memoryset_override_id,
                 expected_labels=(
                     expected_labels
                     if isinstance(expected_labels, list)
-                    else [expected_labels]
-                    if expected_labels is not None
-                    else None
+                    else [expected_labels] if expected_labels is not None else None
+                ),
+                tags=list(tags or set()),
+                save_telemetry=save_telemetry != "off",
+                save_telemetry_synchronously=(
+                    os.getenv("ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY", "0") != "0" or save_telemetry == "sync"
                 ),
-                tags=list(tags),
-                save_telemetry=save_telemetry,
-                save_telemetry_synchronously=save_telemetry_synchronously,
+                filters=cast(list[FilterItem], parsed_filters),
             ),
         )

-        if save_telemetry and any(p.prediction_id is None for p in response):
+        if save_telemetry != "off" and any(p.prediction_id is None for p in response):
            raise RuntimeError("Failed to save prediction to database.")

         predictions = [
-            LabelPrediction(
+            ClassificationPrediction(
                 prediction_id=prediction.prediction_id,
                 label=prediction.label,
                 label_name=prediction.label_name,
+                score=None,
                 confidence=prediction.confidence,
                 anomaly_score=prediction.anomaly_score,
                 memoryset=self.memoryset,
@@ -420,7 +438,7 @@ class ClassificationModel:
         tag: str | None = None,
         sort: list[tuple[PredictionSortColumns, PredictionSortDirection]] = [],
         expected_label_match: bool | None = None,
-    ) -> list[LabelPrediction]:
+    ) -> list[ClassificationPrediction]:
         """
         Get a list of predictions made by this model

@@ -440,19 +458,19 @@ class ClassificationModel:
             Get the last 3 predictions:
             >>> predictions = model.predictions(limit=3, sort=[("timestamp", "desc")])
             [
-                LabeledPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy'}),
-                LabeledPrediction({label: <negative: 0>, confidence: 0.05, anomaly_score: 0.1, input_value: 'I am sad'}),
-                LabeledPrediction({label: <positive: 1>, confidence: 0.90, anomaly_score: 0.1, input_value: 'I am ecstatic'}),
+                ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy'}),
+                ClassificationPrediction({label: <negative: 0>, confidence: 0.05, anomaly_score: 0.1, input_value: 'I am sad'}),
+                ClassificationPrediction({label: <positive: 1>, confidence: 0.90, anomaly_score: 0.1, input_value: 'I am ecstatic'}),
             ]


             Get second most confident prediction:
             >>> predictions = model.predictions(sort=[("confidence", "desc")], offset=1, limit=1)
-            [LabeledPrediction({label: <positive: 1>, confidence: 0.90, anomaly_score: 0.1, input_value: 'I am having a good day'})]
+            [ClassificationPrediction({label: <positive: 1>, confidence: 0.90, anomaly_score: 0.1, input_value: 'I am having a good day'})]

             Get predictions where the expected label doesn't match the predicted label:
             >>> predictions = model.predictions(expected_label_match=False)
-            [LabeledPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy', expected_label: 0})]
+            [ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy', expected_label: 0})]
         """
         predictions = list_predictions(
             body=ListPredictionsRequest(
@@ -465,10 +483,11 @@ class ClassificationModel:
             ),
         )
         return [
-            LabelPrediction(
+            ClassificationPrediction(
                 prediction_id=prediction.prediction_id,
                 label=prediction.label,
                 label_name=prediction.label_name,
+                score=None,
                 confidence=prediction.confidence,
                 anomaly_score=prediction.anomaly_score,
                 memoryset=self.memoryset,
@@ -476,59 +495,9 @@ class ClassificationModel:
                 telemetry=prediction,
             )
             for prediction in predictions
+            if isinstance(prediction, LabelPredictionWithMemoriesAndFeedback)
         ]

-    def _calculate_metrics(
-        self,
-        predictions: list[LabelPrediction],
-        expected_labels: list[int],
-    ) -> ClassificationEvaluationResult:
-        targets_array = np.array(expected_labels)
-        predictions_array = np.array([p.label for p in predictions])
-
-        logits_array = np.array([p.logits for p in predictions])
-
-        f1 = float(f1_score(targets_array, predictions_array, average="weighted"))
-        accuracy = float(accuracy_score(targets_array, predictions_array))
-
-        # Only compute ROC AUC and PR AUC for binary classification
-        unique_classes = np.unique(targets_array)
-
-        pr_curve = None
-        roc_curve = None
-
-        if len(unique_classes) == 2:
-            try:
-                precisions, recalls, pr_thresholds = calculate_pr_curve(targets_array, logits_array)
-                pr_auc = float(auc(recalls, precisions))
-
-                pr_curve = PrecisionRecallCurve(
-                    precisions=precisions.tolist(),
-                    recalls=recalls.tolist(),
-                    thresholds=pr_thresholds.tolist(),
-                    auc=pr_auc,
-                )
-
-                fpr, tpr, roc_thresholds = calculate_roc_curve(targets_array, logits_array)
-                roc_auc = float(roc_auc_score(targets_array, logits_array[:, 1]))
-
-                roc_curve = ROCCurve(
-                    false_positive_rates=fpr.tolist(),
-                    true_positive_rates=tpr.tolist(),
-                    thresholds=roc_thresholds.tolist(),
-                    auc=roc_auc,
-                )
-            except ValueError as e:
-                logging.warning(f"Error calculating PR and ROC curves: {e}")
-
-        return ClassificationEvaluationResult(
-            f1_score=f1,
-            accuracy=accuracy,
-            loss=0.0,
-            precision_recall_curve=pr_curve,
-            roc_curve=roc_curve,
-        )
-
     def _evaluate_datasource(
         self,
         datasource: Datasource,
@@ -536,10 +505,11 @@ class ClassificationModel:
         label_column: str,
         record_predictions: bool,
         tags: set[str] | None,
-    ) -> dict[str, Any]:
-        response = create_evaluation(
+        background: bool = False,
+    ) -> ClassificationMetrics | Job[ClassificationMetrics]:
+        response = evaluate_classification_model(
             self.id,
-            body=EvaluationRequest(
+            body=ClassificationEvaluationRequest(
                 datasource_id=datasource.id,
                 datasource_label_column=label_column,
                 datasource_value_column=value_column,
@@ -548,10 +518,13 @@ class ClassificationModel:
                 telemetry_tags=list(tags) if tags else None,
             ),
         )
-        wait_for_task(response.task_id, description="Running evaluation")
-        response = get_evaluation(self.id, UUID(response.task_id))
-        assert response.result is not None
-        return response.result.to_dict()
+
+        job = Job(
+            response.task_id,
+            lambda: (r := get_classification_model_evaluation(self.id, UUID(response.task_id)).result)
+            and ClassificationMetrics(**r.to_dict()),
+        )
+        return job if background else job.result()

     def _evaluate_dataset(
         self,
@@ -561,34 +534,64 @@ class ClassificationModel:
         record_predictions: bool,
         tags: set[str],
         batch_size: int,
-    ) -> dict[str, Any]:
-        predictions = []
-        expected_labels = []
-
-        for i in range(0, len(dataset), batch_size):
-            batch = dataset[i : i + batch_size]
-            predictions.extend(
-                self.predict(
-                    batch[value_column],
-                    expected_labels=batch[label_column],
-                    tags=tags,
-                    save_telemetry=record_predictions,
-                    save_telemetry_synchronously=(not record_predictions),
-                )
+    ) -> ClassificationMetrics:
+        predictions = [
+            prediction
+            for i in range(0, len(dataset), batch_size)
+            for prediction in self.predict(
+                dataset[i : i + batch_size][value_column],
+                expected_labels=dataset[i : i + batch_size][label_column],
+                tags=tags,
+                save_telemetry="sync" if record_predictions else "off",
             )
-            expected_labels.extend(batch[label_column])
+        ]
+
+        return calculate_classification_metrics(
+            expected_labels=dataset[label_column],
+            logits=[p.logits for p in predictions],
+            anomaly_scores=[p.anomaly_score for p in predictions],
+            include_curves=True,
+        )
+
+    @overload
+    def evaluate(
+        self,
+        data: Datasource | Dataset,
+        *,
+        value_column: str = "value",
+        label_column: str = "label",
+        record_predictions: bool = False,
+        tags: set[str] = {"evaluation"},
+        batch_size: int = 100,
+        background: Literal[True],
+    ) -> Job[ClassificationMetrics]:
+        pass

-        return self._calculate_metrics(predictions, expected_labels).to_dict()
+    @overload
+    def evaluate(
+        self,
+        data: Datasource | Dataset,
+        *,
+        value_column: str = "value",
+        label_column: str = "label",
+        record_predictions: bool = False,
+        tags: set[str] = {"evaluation"},
+        batch_size: int = 100,
+        background: Literal[False] = False,
+    ) -> ClassificationMetrics:
+        pass

     def evaluate(
         self,
         data: Datasource | Dataset,
+        *,
         value_column: str = "value",
         label_column: str = "label",
         record_predictions: bool = False,
         tags: set[str] = {"evaluation"},
         batch_size: int = 100,
-    ) -> dict[str, Any]:
+        background: bool = False,
+    ) -> ClassificationMetrics | Job[ClassificationMetrics]:
         """
         Evaluate the classification model on a given dataset or datasource

@@ -596,21 +599,23 @@ class ClassificationModel:
             data: Dataset or Datasource to evaluate the model on
             value_column: Name of the column that contains the input values to the model
             label_column: Name of the column containing the expected labels
-            record_predictions: Whether to record [`LabelPrediction`][orca_sdk.telemetry.LabelPrediction]s for analysis
-            tags: Optional tags to add to the recorded [`LabelPrediction`][orca_sdk.telemetry.LabelPrediction]s
+            record_predictions: Whether to record [`ClassificationPrediction`][orca_sdk.telemetry.ClassificationPrediction]s for analysis
+            tags: Optional tags to add to the recorded [`ClassificationPrediction`][orca_sdk.telemetry.ClassificationPrediction]s
             batch_size: Batch size for processing Dataset inputs (only used when input is a Dataset)
+            background: Whether to run the operation in the background and return a job handle

         Returns:
-            Dictionary with evaluation metrics, including anomaly score statistics (mean, median, variance)
+            EvaluationResult containing metrics including accuracy, F1 score, ROC AUC, PR AUC, and anomaly score statistics

         Examples:
-            Evaluate using a Datasource:
             >>> model.evaluate(datasource, value_column="text", label_column="airline_sentiment")
-            { "f1_score": 0.85, "roc_auc": 0.85, "pr_auc": 0.85, "accuracy": 0.85, "loss": 0.35, ... }
-
-            Evaluate using a Dataset:
-            >>> model.evaluate(dataset, value_column="text", label_column="sentiment")
-            { "f1_score": 0.85, "roc_auc": 0.85, "pr_auc": 0.85, "accuracy": 0.85, "loss": 0.35, ... }
+            ClassificationMetrics({
+                accuracy: 0.8500,
+                f1_score: 0.8500,
+                roc_auc: 0.8500,
+                pr_auc: 0.8500,
+                anomaly_score: 0.3500 ± 0.0500,
+            })
         """
         if isinstance(data, Datasource):
             return self._evaluate_datasource(
@@ -619,8 +624,9 @@ class ClassificationModel:
                 label_column=label_column,
                 record_predictions=record_predictions,
                 tags=tags,
+                background=background,
             )
-        else:
+        elif isinstance(data, Dataset):
             return self._evaluate_dataset(
                 dataset=data,
                 value_column=value_column,
@@ -629,6 +635,8 @@ class ClassificationModel:
                 tags=tags,
                 batch_size=batch_size,
             )
+        else:
+            raise ValueError(f"Invalid data type: {type(data)}")

     def finetune(self, datasource: Datasource):
         # do not document until implemented
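Taken together, the classification_model.py hunks above replace the two boolean telemetry flags with a single `save_telemetry` literal, add a `locked` attribute along with `set()`, `lock()` and `unlock()`, and let `evaluate()` either return `ClassificationMetrics` directly or hand back a `Job` when `background=True`. A minimal usage sketch of the 0.0.95 surface, assuming `ClassificationModel` and `Datasource` are re-exported from the top-level `orca_sdk` package, that `Datasource.open()` mirrors `ClassificationModel.open()`, that `Job.result()` waits for the evaluation task (as its use inside `_evaluate_datasource()` suggests), and that the `"sentiment"` and `"reviews"` names are placeholders for resources that already exist in your OrcaCloud account:

from datasets import Dataset

from orca_sdk import ClassificationModel, Datasource  # assumed top-level re-exports

model = ClassificationModel.open("sentiment")  # "sentiment" is a placeholder model name

# Telemetry is now controlled by one literal ("off" | "on" | "sync" | "async")
# instead of the old save_telemetry / save_telemetry_synchronously booleans.
prediction = model.predict("I am happy", tags={"demo"}, save_telemetry="sync")
print(prediction.label, prediction.confidence)

# New in 0.0.95: editable attributes via set(), plus a lock against accidental deletion.
model.set(description="Sentiment classifier", locked=True)
model.unlock()

# Evaluating an in-memory Dataset returns ClassificationMetrics synchronously.
dataset = Dataset.from_dict({"value": ["great", "awful"], "label": [1, 0]})
metrics = model.evaluate(dataset, value_column="value", label_column="label")
print(metrics)

# Evaluating a Datasource with background=True returns a Job handle instead.
datasource = Datasource.open("reviews")  # "reviews" is a placeholder datasource name
job = model.evaluate(datasource, value_column="text", label_column="sentiment", background=True)
print(job.result())  # assumed to block until the evaluation task finishes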