PyPI - orca-sdk - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

orca-sdk 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (185) hide show

orca_sdk/classification_model.py CHANGED Viewed

@@ -4,38 +4,58 @@ import logging
 from contextlib import contextmanager
 from datetime import datetime
 from typing import Any, Generator, Iterable, Literal, cast, overload
-from uuid import UUID
-from ._generated_api_client.api import (
-    create_evaluation,
-    create_model,
-    delete_model,
-    get_evaluation,
-    get_model,
-    list_models,
-    list_predictions,
-    predict_gpu,
-    record_prediction_feedback,
-)
-from ._generated_api_client.models import (
-    ClassificationEvaluationResult,
-    CreateRACModelRequest,
-    EvaluationRequest,
-    ListPredictionsRequest,
+from datasets import Dataset
+from ._shared.metrics import ClassificationMetrics, calculate_classification_metrics
+from ._utils.common import UNSET, CreateMode, DropMode
+from .client import (
+    BootstrapClassificationModelMeta,
+    BootstrapClassificationModelResult,
+    ClassificationModelMetadata,
+    PredictiveModelUpdate,
+    RACHeadType,
+    orca_api,
 )
-from ._generated_api_client.models import (
-    ListPredictionsRequestSortItemItemType0 as PredictionSortColumns,
+from .datasource import Datasource
+from .job import Job
+from .memoryset import (
+    FilterItem,
+    FilterItemTuple,
+    LabeledMemoryset,
+    _is_metric_column,
+    _parse_filter_item_from_tuple,
 )
-from ._generated_api_client.models import (
-    ListPredictionsRequestSortItemItemType1 as PredictionSortDirection,
+from .telemetry import (
+    ClassificationPrediction,
+    TelemetryMode,
+    _get_telemetry_config,
+    _parse_feedback,
 )
-from ._generated_api_client.models import RACHeadType, RACModelMetadata
-from ._generated_api_client.models.prediction_request import PredictionRequest
-from ._utils.common import CreateMode, DropMode
-from ._utils.task import wait_for_task
-from .datasource import Datasource
-from .memoryset import LabeledMemoryset
-from .telemetry import LabelPrediction, _parse_feedback
+class BootstrappedClassificationModel:
+    datasource: Datasource | None
+    memoryset: LabeledMemoryset | None
+    classification_model: ClassificationModel | None
+    agent_output: BootstrapClassificationModelResult | None
+    def __init__(self, metadata: BootstrapClassificationModelMeta):
+        self.datasource = Datasource.open(metadata["datasource_meta"]["id"])
+        self.memoryset = LabeledMemoryset.open(metadata["memoryset_meta"]["id"])
+        self.classification_model = ClassificationModel.open(metadata["model_meta"]["id"])
+        self.agent_output = metadata["agent_output"]
+    def __repr__(self):
+        return (
+            "BootstrappedClassificationModel({\n"
+            f"    datasource: {self.datasource},\n"
+            f"    memoryset: {self.memoryset},\n"
+            f"    classification_model: {self.classification_model},\n"
+            f"    agent_output: {self.agent_output},\n"
+            "})"
+        )
 class ClassificationModel:
@@ -45,17 +65,20 @@ class ClassificationModel:
     Attributes:
         id: Unique identifier for the model
         name: Unique name of the model
+        description: Optional description of the model
         memoryset: Memoryset that the model uses
         head_type: Classification head type of the model
         num_classes: Number of distinct classes the model can predict
         memory_lookup_count: Number of memories the model uses for each prediction
         weigh_memories: If using a KNN head, whether the model weighs memories by their lookup score
         min_memory_weight: If using a KNN head, minimum lookup score memories have to be over to not be ignored
+        locked: Whether the model is locked to prevent accidental deletion
         created_at: When the model was created
     """
     id: str
     name: str
+    description: str | None
     memoryset: LabeledMemoryset
     head_type: RACHeadType
     num_classes: int
@@ -63,23 +86,26 @@ class ClassificationModel:
     weigh_memories: bool | None
     min_memory_weight: float | None
     version: int
+    locked: bool
     created_at: datetime
-    def __init__(self, metadata: RACModelMetadata):
+    def __init__(self, metadata: ClassificationModelMetadata):
         # for internal use only, do not document
-        self.id = metadata.id
-        self.name = metadata.name
-        self.memoryset = LabeledMemoryset.open(metadata.memoryset_id)
-        self.head_type = metadata.head_type
-        self.num_classes = metadata.num_classes
-        self.memory_lookup_count = metadata.memory_lookup_count
-        self.weigh_memories = metadata.weigh_memories
-        self.min_memory_weight = metadata.min_memory_weight
-        self.version = metadata.version
-        self.created_at = metadata.created_at
+        self.id = metadata["id"]
+        self.name = metadata["name"]
+        self.description = metadata["description"]
+        self.memoryset = LabeledMemoryset.open(metadata["memoryset_id"])
+        self.head_type = metadata["head_type"]
+        self.num_classes = metadata["num_classes"]
+        self.memory_lookup_count = metadata["memory_lookup_count"]
+        self.weigh_memories = metadata["weigh_memories"]
+        self.min_memory_weight = metadata["min_memory_weight"]
+        self.version = metadata["version"]
+        self.locked = metadata["locked"]
+        self.created_at = datetime.fromisoformat(metadata["created_at"])
         self._memoryset_override_id: str | None = None
-        self._last_prediction: LabelPrediction | None = None
+        self._last_prediction: ClassificationPrediction | None = None
         self._last_prediction_was_batch: bool = False
     def __eq__(self, other) -> bool:
@@ -97,7 +123,7 @@ class ClassificationModel:
         )
     @property
-    def last_prediction(self) -> LabelPrediction:
+    def last_prediction(self) -> ClassificationPrediction:
         """
         Last prediction made by the model
@@ -119,8 +145,9 @@ class ClassificationModel:
         cls,
         name: str,
         memoryset: LabeledMemoryset,
-        head_type: Literal["BMMOE", "FF", "KNN", "MMOE"] = "KNN",
+        head_type: RACHeadType = "KNN",
         *,
+        description: str | None = None,
         num_classes: int | None = None,
         memory_lookup_count: int | None = None,
         weigh_memories: bool = True,
@@ -141,6 +168,8 @@ class ClassificationModel:
             min_memory_weight: If using a KNN head, minimum lookup score memories have to be over to not be ignored
             if_exists: What to do if a model with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing model.
+            description: Optional description for the model, this will be used in agentic flows,
+                so make sure it is concise and describes the purpose of your model.
         Returns:
             Handle to the new model in the OrcaCloud
@@ -182,16 +211,18 @@ class ClassificationModel:
                 return existing
-        metadata = create_model(
-            body=CreateRACModelRequest(
-                name=name,
-                memoryset_id=memoryset.id,
-                head_type=RACHeadType(head_type),
-                memory_lookup_count=memory_lookup_count,
-                num_classes=num_classes,
-                weigh_memories=weigh_memories,
-                min_memory_weight=min_memory_weight,
-            ),
+        metadata = orca_api.POST(
+            "/classification_model",
+            json={
+                "name": name,
+                "memoryset_name_or_id": memoryset.id,
+                "head_type": head_type,
+                "memory_lookup_count": memory_lookup_count,
+                "num_classes": num_classes,
+                "weigh_memories": weigh_memories,
+                "min_memory_weight": min_memory_weight,
+                "description": description,
+            },
         )
         return cls(metadata)
@@ -209,7 +240,7 @@ class ClassificationModel:
         Raises:
             LookupError: If the classification model does not exist
         """
-        return cls(get_model(name))
+        return cls(orca_api.GET("/classification_model/{name_or_id}", params={"name_or_id": name}))
     @classmethod
     def exists(cls, name_or_id: str) -> bool:
@@ -236,7 +267,7 @@ class ClassificationModel:
         Returns:
             List of handles to all classification models in the OrcaCloud
         """
-        return [cls(metadata) for metadata in list_models()]
+        return [cls(metadata) for metadata in orca_api.GET("/classification_model")]
     @classmethod
     def drop(cls, name_or_id: str, if_not_exists: DropMode = "error"):
@@ -255,73 +286,189 @@ class ClassificationModel:
             LookupError: If the classification model does not exist and if_not_exists is `"error"`
         """
         try:
-            delete_model(name_or_id)
+            orca_api.DELETE("/classification_model/{name_or_id}", params={"name_or_id": name_or_id})
             logging.info(f"Deleted model {name_or_id}")
         except LookupError:
             if if_not_exists == "error":
                 raise
+    def refresh(self):
+        """Refresh the model data from the OrcaCloud"""
+        self.__dict__.update(self.open(self.name).__dict__)
+    def set(self, *, description: str | None = UNSET, locked: bool = UNSET) -> None:
+        """
+        Update editable attributes of the model.
+        Note:
+            If a field is not provided, it will default to [UNSET][orca_sdk.UNSET] and not be updated.
+        Params:
+            description: Value to set for the description
+            locked: Value to set for the locked status
+        Examples:
+            Update the description:
+            >>> model.set(description="New description")
+            Remove description:
+            >>> model.set(description=None)
+            Lock the model:
+            >>> model.set(locked=True)
+        """
+        update: PredictiveModelUpdate = {}
+        if description is not UNSET:
+            update["description"] = description
+        if locked is not UNSET:
+            update["locked"] = locked
+        orca_api.PATCH("/classification_model/{name_or_id}", params={"name_or_id": self.id}, json=update)
+        self.refresh()
+    def lock(self) -> None:
+        """Lock the model to prevent accidental deletion"""
+        self.set(locked=True)
+    def unlock(self) -> None:
+        """Unlock the model to allow deletion"""
+        self.set(locked=False)
     @overload
     def predict(
-        self, value: list[str], expected_labels: list[int] | None = None, tags: set[str] = set()
-    ) -> list[LabelPrediction]:
+        self,
+        value: list[str],
+        expected_labels: list[int] | None = None,
+        filters: list[FilterItemTuple] = [],
+        tags: set[str] | None = None,
+        save_telemetry: TelemetryMode = "on",
+        prompt: str | None = None,
+        use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
+    ) -> list[ClassificationPrediction]:
         pass
     @overload
-    def predict(self, value: str, expected_labels: int | None = None, tags: set[str] = set()) -> LabelPrediction:
+    def predict(
+        self,
+        value: str,
+        expected_labels: int | None = None,
+        filters: list[FilterItemTuple] = [],
+        tags: set[str] | None = None,
+        save_telemetry: TelemetryMode = "on",
+        prompt: str | None = None,
+        use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
+    ) -> ClassificationPrediction:
         pass
     def predict(
-        self, value: list[str] | str, expected_labels: list[int] | int | None = None, tags: set[str] = set()
-    ) -> list[LabelPrediction] | LabelPrediction:
+        self,
+        value: list[str] | str,
+        expected_labels: list[int] | list[str] | int | str | None = None,
+        filters: list[FilterItemTuple] = [],
+        tags: set[str] | None = None,
+        save_telemetry: TelemetryMode = "on",
+        prompt: str | None = None,
+        use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
+    ) -> list[ClassificationPrediction] | ClassificationPrediction:
         """
         Predict label(s) for the given input value(s) grounded in similar memories
         Params:
             value: Value(s) to get predict the labels of
             expected_labels: Expected label(s) for the given input to record for model evaluation
+            filters: Optional filters to apply during memory lookup
             tags: Tags to add to the prediction(s)
+            save_telemetry: Whether to save telemetry for the prediction(s). One of
+                * `"off"`: Do not save telemetry
+                * `"on"`: Save telemetry asynchronously unless the `ORCA_SAVE_TELEMETRY_SYNCHRONOUSLY`
+                  environment variable is set.
+                * `"sync"`: Save telemetry synchronously
+                * `"async"`: Save telemetry asynchronously
+            prompt: Optional prompt to use for instruction-tuned embedding models
+            use_lookup_cache: Whether to use cached lookup results for faster predictions
+            timeout_seconds: Timeout in seconds for the request, defaults to 10 seconds
         Returns:
             Label prediction or list of label predictions
+        Raises:
+            ValueError: If timeout_seconds is not a positive integer
+            TimeoutError: If the request times out after the specified duration
         Examples:
             Predict the label for a single value:
             >>> prediction = model.predict("I am happy", tags={"test"})
-            LabelPrediction({label: <positive: 1>, confidence: 0.95, input_value: 'I am happy' })
+            ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy' })
             Predict the labels for a list of values:
             >>> predictions = model.predict(["I am happy", "I am sad"], expected_labels=[1, 0])
             [
-                LabelPrediction({label: <positive: 1>, confidence: 0.95, input_value: 'I am happy'}),
-                LabelPrediction({label: <negative: 0>, confidence: 0.05, input_value: 'I am sad'}),
+                ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy'}),
+                ClassificationPrediction({label: <negative: 0>, confidence: 0.05, anomaly_score: 0.1, input_value: 'I am sad'}),
             ]
+            Using a prompt with an instruction-tuned embedding model:
+            >>> prediction = model.predict("I am happy", prompt="Represent this text for sentiment classification:")
+            ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy' })
         """
-        response = predict_gpu(
-            self.id,
-            body=PredictionRequest(
-                input_values=value if isinstance(value, list) else [value],
-                memoryset_override_id=self._memoryset_override_id,
-                expected_labels=(
-                    expected_labels
-                    if isinstance(expected_labels, list)
-                    else [expected_labels]
-                    if expected_labels is not None
-                    else None
-                ),
-                tags=list(tags),
-            ),
+        if timeout_seconds <= 0:
+            raise ValueError("timeout_seconds must be a positive integer")
+        parsed_filters = [
+            _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
+        ]
+        if any(_is_metric_column(filter[0]) for filter in filters):
+            raise ValueError(f"Cannot filter on {filters} - telemetry filters are not supported for predictions")
+        if isinstance(expected_labels, int):
+            expected_labels = [expected_labels]
+        elif isinstance(expected_labels, str):
+            expected_labels = [self.memoryset.label_names.index(expected_labels)]
+        elif isinstance(expected_labels, list):
+            expected_labels = [
+                self.memoryset.label_names.index(label) if isinstance(label, str) else label
+                for label in expected_labels
+            ]
+        telemetry_on, telemetry_sync = _get_telemetry_config(save_telemetry)
+        response = orca_api.POST(
+            "/gpu/classification_model/{name_or_id}/prediction",
+            params={"name_or_id": self.id},
+            json={
+                "input_values": value if isinstance(value, list) else [value],
+                "memoryset_override_name_or_id": self._memoryset_override_id,
+                "expected_labels": expected_labels,
+                "tags": list(tags or set()),
+                "save_telemetry": telemetry_on,
+                "save_telemetry_synchronously": telemetry_sync,
+                "filters": cast(list[FilterItem], parsed_filters),
+                "prompt": prompt,
+                "use_lookup_cache": use_lookup_cache,
+            },
+            timeout=timeout_seconds,
         )
+        if telemetry_on and any(p["prediction_id"] is None for p in response):
+            raise RuntimeError("Failed to save prediction to database.")
         predictions = [
-            LabelPrediction(
-                prediction_id=prediction.prediction_id,
-                label=prediction.label,
-                label_name=prediction.label_name,
-                confidence=prediction.confidence,
+            ClassificationPrediction(
+                prediction_id=prediction["prediction_id"],
+                label=prediction["label"],
+                label_name=prediction["label_name"],
+                score=None,
+                confidence=prediction["confidence"],
+                anomaly_score=prediction["anomaly_score"],
                 memoryset=self.memoryset,
                 model=self,
+                logits=prediction["logits"],
+                input_value=input_value,
             )
-            for prediction in response
+            for prediction, input_value in zip(response, value if isinstance(value, list) else [value])
         ]
         self._last_prediction_was_batch = isinstance(value, list)
         self._last_prediction = predictions[-1]
@@ -332,8 +479,9 @@ class ClassificationModel:
         limit: int = 100,
         offset: int = 0,
         tag: str | None = None,
-        sort: list[tuple[PredictionSortColumns, PredictionSortDirection]] = [],
-    ) -> list[LabelPrediction]:
+        sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
+        expected_label_match: bool | None = None,
+    ) -> list[ClassificationPrediction]:
         """
         Get a list of predictions made by this model
@@ -343,6 +491,8 @@ class ClassificationModel:
             tag: Optional tag to filter predictions by
             sort: Optional list of columns and directions to sort the predictions by.
                 Predictions can be sorted by `timestamp` or `confidence`.
+            expected_label_match: Optional filter to only include predictions where the expected
+                label does (`True`) or doesn't (`False`) match the predicted label
         Returns:
             List of label predictions
@@ -351,78 +501,209 @@ class ClassificationModel:
             Get the last 3 predictions:
             >>> predictions = model.predictions(limit=3, sort=[("timestamp", "desc")])
             [
-                LabeledPrediction({label: <positive: 1>, confidence: 0.95, input_value: 'I am happy'}),
-                LabeledPrediction({label: <negative: 0>, confidence: 0.05, input_value: 'I am sad'}),
-                LabeledPrediction({label: <positive: 1>, confidence: 0.90, input_value: 'I am ecstatic'}),
+                ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy'}),
+                ClassificationPrediction({label: <negative: 0>, confidence: 0.05, anomaly_score: 0.1, input_value: 'I am sad'}),
+                ClassificationPrediction({label: <positive: 1>, confidence: 0.90, anomaly_score: 0.1, input_value: 'I am ecstatic'}),
             ]
             Get second most confident prediction:
             >>> predictions = model.predictions(sort=[("confidence", "desc")], offset=1, limit=1)
-            [LabeledPrediction({label: <positive: 1>, confidence: 0.90, input_value: 'I am having a good day'})]
+            [ClassificationPrediction({label: <positive: 1>, confidence: 0.90, anomaly_score: 0.1, input_value: 'I am having a good day'})]
+            Get predictions where the expected label doesn't match the predicted label:
+            >>> predictions = model.predictions(expected_label_match=False)
+            [ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy', expected_label: 0})]
         """
-        predictions = list_predictions(
-            body=ListPredictionsRequest(
-                model_id=self.id,
-                limit=limit,
-                offset=offset,
-                sort=cast(list[list[PredictionSortColumns | PredictionSortDirection]], sort),
-                tag=tag,
-            ),
+        predictions = orca_api.POST(
+            "/telemetry/prediction",
+            json={
+                "model_id": self.id,
+                "limit": limit,
+                "offset": offset,
+                "sort": [list(sort_item) for sort_item in sort],
+                "tag": tag,
+                "expected_label_match": expected_label_match,
+            },
         )
         return [
-            LabelPrediction(
-                prediction_id=prediction.prediction_id,
-                label=prediction.label,
-                label_name=prediction.label_name,
-                confidence=prediction.confidence,
+            ClassificationPrediction(
+                prediction_id=prediction["prediction_id"],
+                label=prediction["label"],
+                label_name=prediction["label_name"],
+                score=None,
+                confidence=prediction["confidence"],
+                anomaly_score=prediction["anomaly_score"],
                 memoryset=self.memoryset,
                 model=self,
                 telemetry=prediction,
             )
             for prediction in predictions
+            if "label" in prediction
         ]
-    def evaluate(
+    def _evaluate_datasource(
         self,
         datasource: Datasource,
+        value_column: str,
+        label_column: str,
+        record_predictions: bool,
+        tags: set[str] | None,
+        background: bool = False,
+    ) -> ClassificationMetrics | Job[ClassificationMetrics]:
+        response = orca_api.POST(
+            "/classification_model/{model_name_or_id}/evaluation",
+            params={"model_name_or_id": self.id},
+            json={
+                "datasource_name_or_id": datasource.id,
+                "datasource_label_column": label_column,
+                "datasource_value_column": value_column,
+                "memoryset_override_name_or_id": self._memoryset_override_id,
+                "record_telemetry": record_predictions,
+                "telemetry_tags": list(tags) if tags else None,
+            },
+        )
+        def get_value():
+            res = orca_api.GET(
+                "/classification_model/{model_name_or_id}/evaluation/{task_id}",
+                params={"model_name_or_id": self.id, "task_id": response["task_id"]},
+            )
+            assert res["result"] is not None
+            return ClassificationMetrics(
+                coverage=res["result"].get("coverage"),
+                f1_score=res["result"].get("f1_score"),
+                accuracy=res["result"].get("accuracy"),
+                loss=res["result"].get("loss"),
+                anomaly_score_mean=res["result"].get("anomaly_score_mean"),
+                anomaly_score_median=res["result"].get("anomaly_score_median"),
+                anomaly_score_variance=res["result"].get("anomaly_score_variance"),
+                roc_auc=res["result"].get("roc_auc"),
+                pr_auc=res["result"].get("pr_auc"),
+                pr_curve=res["result"].get("pr_curve"),
+                roc_curve=res["result"].get("roc_curve"),
+            )
+        job = Job(response["task_id"], get_value)
+        return job if background else job.result()
+    def _evaluate_dataset(
+        self,
+        dataset: Dataset,
+        value_column: str,
+        label_column: str,
+        record_predictions: bool,
+        tags: set[str],
+        batch_size: int,
+    ) -> ClassificationMetrics:
+        if len(dataset) == 0:
+            raise ValueError("Evaluation dataset cannot be empty")
+        if any(x is None for x in dataset[label_column]):
+            raise ValueError("Evaluation dataset cannot contain None values in the label column")
+        predictions = [
+            prediction
+            for i in range(0, len(dataset), batch_size)
+            for prediction in self.predict(
+                dataset[i : i + batch_size][value_column],
+                expected_labels=dataset[i : i + batch_size][label_column],
+                tags=tags,
+                save_telemetry="sync" if record_predictions else "off",
+            )
+        ]
+        return calculate_classification_metrics(
+            expected_labels=dataset[label_column],
+            logits=[p.logits for p in predictions],
+            anomaly_scores=[p.anomaly_score for p in predictions],
+            include_curves=True,
+        )
+    @overload
+    def evaluate(
+        self,
+        data: Datasource | Dataset,
+        *,
         value_column: str = "value",
         label_column: str = "label",
         record_predictions: bool = False,
-        tags: set[str] | None = None,
-    ) -> dict[str, float]:
+        tags: set[str] = {"evaluation"},
+        batch_size: int = 100,
+        background: Literal[True],
+    ) -> Job[ClassificationMetrics]:
+        pass
+    @overload
+    def evaluate(
+        self,
+        data: Datasource | Dataset,
+        *,
+        value_column: str = "value",
+        label_column: str = "label",
+        record_predictions: bool = False,
+        tags: set[str] = {"evaluation"},
+        batch_size: int = 100,
+        background: Literal[False] = False,
+    ) -> ClassificationMetrics:
+        pass
+    def evaluate(
+        self,
+        data: Datasource | Dataset,
+        *,
+        value_column: str = "value",
+        label_column: str = "label",
+        record_predictions: bool = False,
+        tags: set[str] = {"evaluation"},
+        batch_size: int = 100,
+        background: bool = False,
+    ) -> ClassificationMetrics | Job[ClassificationMetrics]:
         """
-        Evaluate the classification model on a given datasource
+        Evaluate the classification model on a given dataset or datasource
         Params:
-            datasource: Datasource to evaluate the model on
+            data: Dataset or Datasource to evaluate the model on
             value_column: Name of the column that contains the input values to the model
             label_column: Name of the column containing the expected labels
-            record_predictions: Whether to record [`LabelPrediction`][orca_sdk.telemetry.LabelPrediction]s for analysis
-            tags: Optional tags to add to the recorded [`LabelPrediction`][orca_sdk.telemetry.LabelPrediction]s
+            record_predictions: Whether to record [`ClassificationPrediction`][orca_sdk.telemetry.ClassificationPrediction]s for analysis
+            tags: Optional tags to add to the recorded [`ClassificationPrediction`][orca_sdk.telemetry.ClassificationPrediction]s
+            batch_size: Batch size for processing Dataset inputs (only used when input is a Dataset)
+            background: Whether to run the operation in the background and return a job handle
         Returns:
-            Dictionary with evaluation metrics
+            EvaluationResult containing metrics including accuracy, F1 score, ROC AUC, PR AUC, and anomaly score statistics
         Examples:
             >>> model.evaluate(datasource, value_column="text", label_column="airline_sentiment")
-            { "f1_score": 0.85, "roc_auc": 0.85, "pr_auc": 0.85, "accuracy": 0.85, "loss": 0.35 }
+            ClassificationMetrics({
+                accuracy: 0.8500,
+                f1_score: 0.8500,
+                roc_auc: 0.8500,
+                pr_auc: 0.8500,
+                anomaly_score: 0.3500 ± 0.0500,
+            })
         """
-        response = create_evaluation(
-            self.id,
-            body=EvaluationRequest(
-                datasource_id=datasource.id,
-                datasource_label_column=label_column,
-                datasource_value_column=value_column,
-                memoryset_override_id=self._memoryset_override_id,
-                record_telemetry=record_predictions,
-                telemetry_tags=list(tags) if tags else None,
-            ),
-        )
-        wait_for_task(response.task_id, description="Running evaluation")
-        response = get_evaluation(self.id, UUID(response.task_id))
-        assert response.result is not None
-        return response.result.to_dict()
+        if isinstance(data, Datasource):
+            return self._evaluate_datasource(
+                datasource=data,
+                value_column=value_column,
+                label_column=label_column,
+                record_predictions=record_predictions,
+                tags=tags,
+                background=background,
+            )
+        elif isinstance(data, Dataset):
+            return self._evaluate_dataset(
+                dataset=data,
+                value_column=value_column,
+                label_column=label_column,
+                record_predictions=record_predictions,
+                tags=tags,
+                batch_size=batch_size,
+            )
+        else:
+            raise ValueError(f"Invalid data type: {type(data)}")
     def finetune(self, datasource: Datasource):
         #  do not document until implemented
@@ -492,8 +773,37 @@ class ClassificationModel:
             ValueError: If the value does not match previous value types for the category, or is a
                 [`float`][float] that is not between `-1.0` and `+1.0`.
         """
-        record_prediction_feedback(
-            body=[
+        orca_api.PUT(
+            "/telemetry/prediction/feedback",
+            json=[
                 _parse_feedback(f) for f in (cast(list[dict], [feedback]) if isinstance(feedback, dict) else feedback)
             ],
         )
+    @staticmethod
+    def bootstrap_model(
+        model_description: str,
+        label_names: list[str],
+        initial_examples: list[tuple[str, str]],
+        num_examples_per_label: int,
+        background: bool = False,
+    ) -> Job[BootstrappedClassificationModel] | BootstrappedClassificationModel:
+        response = orca_api.POST(
+            "/agents/bootstrap_classification_model",
+            json={
+                "model_description": model_description,
+                "label_names": label_names,
+                "initial_examples": [{"text": text, "label_name": label_name} for text, label_name in initial_examples],
+                "num_examples_per_label": num_examples_per_label,
+            },
+        )
+        def get_result() -> BootstrappedClassificationModel:
+            res = orca_api.GET(
+                "/agents/bootstrap_classification_model/{task_id}", params={"task_id": response["task_id"]}
+            )
+            assert res["result"] is not None
+            return BootstrappedClassificationModel(res["result"])
+        job = Job(response["task_id"], get_result)
+        return job if background else job.result()

orca-sdk 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

orca-sdk 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl