scorebook 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +8 -1
- scorebook/evaluate/_async/evaluate_async.py +100 -125
- scorebook/evaluate/_sync/evaluate.py +100 -126
- scorebook/evaluate/evaluate_helpers.py +24 -24
- scorebook/exceptions.py +6 -2
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +33 -54
- scorebook/utils/__init__.py +8 -1
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/progress_bars.py +67 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/METADATA +2 -2
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/RECORD +21 -13
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/evaluate_helpers.py
CHANGED

@@ -2,9 +2,8 @@
 
 import asyncio
 import dataclasses
-import inspect
 import logging
-from typing import Any, Callable, Dict, Iterable, List,
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
 from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
@@ -25,30 +24,34 @@ from scorebook.utils import expand_dict, is_awaitable
 logger = logging.getLogger(__name__)
 
 
-
-
+# TODO: Remove this when backend supports boolean item metrics
+NORMALIZE_METRICS_FOR_UPLOAD = True
 
-    if upload_results == "auto":
-        upload_results = get_token() is not None
-        logger.debug("Auto upload results resolved to: %s", upload_results)
 
-
+def normalize_metric_value(value: Any) -> Any:
+    """Normalize metric values for API upload compatibility.
 
-
-
-
+    TEMPORARY WORKAROUND: The Trismik API currently rejects boolean metric values.
+    This function converts boolean values to floats (True -> 1.0, False -> 0.0)
+    to ensure upload compatibility.
 
     Args:
-
+        value: The metric value to normalize
 
     Returns:
-
+        Float if value is bool, otherwise unchanged
+
+    TODO: Remove this function when backend supports boolean metrics natively.
+    To revert: Set NORMALIZE_METRICS_FOR_UPLOAD = False
     """
-    if
-
+    if not NORMALIZE_METRICS_FOR_UPLOAD:
+        return value
+
+    # Convert booleans to floats for API compatibility
+    if isinstance(value, bool):
+        return float(value)  # True -> 1.0, False -> 0.0
 
-
-    return show_progress
+    return value
 
 
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
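Note on the workaround above: with NORMALIZE_METRICS_FOR_UPLOAD left at its default of True, only booleans are touched and everything else passes through. A minimal sketch (the import path assumes the function lands in evaluate_helpers.py, as the hunk context suggests):

from scorebook.evaluate.evaluate_helpers import normalize_metric_value

normalize_metric_value(True)       # 1.0
normalize_metric_value(False)      # 0.0
normalize_metric_value(0.875)      # 0.875, unchanged
normalize_metric_value("correct")  # "correct", unchanged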
@@ -109,7 +112,7 @@ def prepare_datasets(
 
         # Prepare adaptive datasets
         elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-            datasets_out.append(AdaptiveEvalDataset(dataset
+            datasets_out.append(AdaptiveEvalDataset(dataset))
 
         # TODO: dataset name string registry
         elif isinstance(dataset, str):
@@ -220,9 +223,9 @@ def build_adaptive_eval_run_spec(
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
-    dataset
+    # Keep the full dataset name including ":adaptive" suffix for backend API
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
-
+        adaptive_dataset,
         dataset_index,
         hyperparameter_config,
         hyperparameter_config_index,
@@ -345,10 +348,7 @@ def make_trismik_inference(
     """
 
    # Check if the inference function is async
-    is_async =
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
+    is_async = is_awaitable(inference_function)
 
     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
         # Single TrismikMultipleChoiceTextItem dataclass
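The removed inline check is folded into scorebook.utils.is_awaitable. A rough sketch of what that helper is assumed to do, based on the code it replaces (the helper's real implementation is not part of this diff):

import inspect
from typing import Any, Callable


def is_awaitable_sketch(fn: Callable[..., Any]) -> bool:
    # Treat plain async def functions and callable objects with an
    # async __call__ as awaitable; mirrors the removed inline check.
    return inspect.iscoroutinefunction(fn) or inspect.iscoroutinefunction(
        getattr(fn, "__call__", None)
    )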
scorebook/exceptions.py
CHANGED

@@ -84,10 +84,14 @@ class MetricComputationError(EvaluationError):
         )
 
 
-class
+class ScoreError(ScoreBookError):
+    """Raised when there are errors during scoring."""
+
+
+class DataMismatchError(ScoreError):
     """Raised when there's a mismatch between outputs and expected labels."""
 
-    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str = "Dataset"):
         """Initialize data mismatch error."""
         self.outputs_count = outputs_count
         self.labels_count = labels_count
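DataMismatchError now sits under a new ScoreError base and no longer requires a dataset name, so the scoring helpers can raise it with counts alone. A small sketch of the resulting behaviour:

from scorebook.exceptions import DataMismatchError, ScoreError

try:
    raise DataMismatchError(outputs_count=10, labels_count=8)
except ScoreError as err:  # caught via the new base class
    print(err.outputs_count, err.labels_count)  # 10 8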
scorebook/score/_async/__init__.py
File without changes

scorebook/score/_async/score_async.py
ADDED
@@ -0,0 +1,145 @@
+import logging
+from typing import Any, Dict, List, Literal, Optional, Union, cast
+
+from scorebook.exceptions import DataMismatchError, ParameterValidationError
+from scorebook.score.score_helpers import (
+    calculate_metric_scores_async,
+    format_results,
+    resolve_metrics,
+    validate_items,
+)
+from scorebook.trismik.upload_results import upload_result_async
+from scorebook.types import Metrics
+from scorebook.utils import resolve_show_progress, resolve_upload_results, scoring_progress_context
+
+logger = logging.getLogger(__name__)
+
+
+async def score_async(
+    items: List[Dict[str, Any]],
+    metrics: Metrics,
+    output_column: str = "output",
+    label_column: str = "label",
+    input_column: str = "input",
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    dataset_name: Optional[str] = None,
+    model_name: Optional[str] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    experiment_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+    upload_results: Union[Literal["auto"], bool] = "auto",
+    show_progress: Optional[bool] = None,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """Score pre-computed model outputs against labels using specified metrics.
+
+    Args:
+        items: List of dictionaries containing model outputs and labels. Each item should
+            have keys matching the output_column and label_column parameters.
+        metrics: Metric(s) to compute. Can be a single Metric class, instance, string name,
+            or a list of any combination of these.
+        output_column: Key in items dictionaries containing model outputs. Defaults to "output".
+        label_column: Key in items dictionaries containing ground truth labels. Defaults to "label".
+        input_column: Key in items dictionaries containing inputs for reference.
+            Defaults to "input".
+        hyperparameters: Optional dictionary of hyperparameters used during inference.
+            Defaults to None.
+        dataset_name: Optional name of the dataset being evaluated. Defaults to None.
+        model_name: Optional name of the model being evaluated. Defaults to None.
+        metadata: Optional dictionary of additional metadata to store with results.
+            Defaults to None.
+        experiment_id: Optional experiment identifier for grouping related runs.
+            Required if upload_results is True. Defaults to None.
+        project_id: Optional Trismik project ID for uploading results.
+            Required if upload_results is True. Defaults to None.
+        upload_results: Whether to upload results to Trismik. Can be True, False, or "auto"
+            (uploads if experiment_id and project_id are provided). Defaults to "auto".
+        show_progress: Whether to display a progress bar during scoring. If None, uses
+            SHOW_PROGRESS_BARS from settings (defaults to True). Defaults to None.
+
+    Returns:
+        Dictionary containing scoring results with keys:
+            - "aggregate_results": List with one dict containing aggregate metric scores
+            - "item_results": List of dicts with per-item scores and data
+    """
+
+    # Resolve and validate parameters
+    upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
+
+    # Validate upload requirements
+    if upload_results and (experiment_id is None or project_id is None):
+        raise ParameterValidationError(
+            "experiment_id and project_id are required to upload a run",
+        )
+
+    # Validate items parameter
+    validate_items(items, output_column, label_column)
+
+    # Validate hyperparameters is a dict (not list)
+    if hyperparameters is not None and not isinstance(hyperparameters, dict):
+        raise ParameterValidationError("hyperparameters must be a dict")
+
+    # Resolve metrics to a list of Metrics
+    metric_instances = resolve_metrics(metrics)
+
+    # Extract outputs and labels from items
+    inputs = [item.get(input_column) for item in items]
+    outputs = [item.get(output_column) for item in items]
+    labels = [item.get(label_column) for item in items]
+
+    # Validate outputs and labels have same length
+    if len(outputs) != len(labels):
+        raise DataMismatchError(len(outputs), len(labels), dataset_name)
+
+    # Compute scores for each metric with progress display
+    with scoring_progress_context(
+        total_metrics=len(metric_instances),
+        enabled=show_progress_bars,
+    ) as progress_bar:
+        metric_scores = await calculate_metric_scores_async(
+            metrics=metric_instances,
+            outputs=outputs,
+            labels=labels,
+            dataset_name=dataset_name,
+            progress_bar=progress_bar,
+        )
+
+    # Build results
+    results: Dict[str, List[Dict[str, Any]]] = format_results(
+        inputs=inputs,
+        outputs=outputs,
+        labels=labels,
+        metric_scores=metric_scores,
+        hyperparameters=hyperparameters,
+        dataset_name=dataset_name,
+    )
+
+    # Upload if requested
+    if upload_results and experiment_id and project_id:
+        try:
+            run_id = await upload_result_async(
+                run_result=results,
+                experiment_id=experiment_id,
+                project_id=project_id,
+                dataset_name=dataset_name,
+                hyperparameters=hyperparameters,
+                metadata=metadata,
+                model_name=model_name,
+            )
+            logger.info(f"Score results uploaded successfully with run_id: {run_id}")
+
+            # Add run_id to aggregate results
+            if results.get("aggregate_results"):
+                results["aggregate_results"][0]["run_id"] = run_id
+
+            # Add run_id to each item result
+            if results.get("item_results"):
+                for item in results["item_results"]:
+                    item["run_id"] = run_id
+
+        except Exception as e:
+            logger.warning(f"Failed to upload score results: {e}")
+            # Don't raise - continue execution even if upload fails
+
+    logger.info("Async scoring complete")
+    return results
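A minimal usage sketch of the new async entry point (the items, the "accuracy" metric name, and the scorebook.score import path are assumptions, not taken from this diff):

import asyncio

from scorebook.score import score_async  # assuming the package re-exports it

items = [
    {"input": "2 + 2", "output": "4", "label": "4"},
    {"input": "3 + 3", "output": "5", "label": "6"},
]

results = asyncio.run(
    score_async(items, metrics="accuracy", dataset_name="toy-math", upload_results=False)
)
print(results["aggregate_results"][0])
print(results["item_results"][0])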
scorebook/score/_sync/__init__.py
File without changes

scorebook/score/_sync/score.py
ADDED
@@ -0,0 +1,145 @@
+import logging
+from typing import Any, Dict, List, Literal, Optional, Union, cast
+
+from scorebook.exceptions import DataMismatchError, ParameterValidationError
+from scorebook.score.score_helpers import (
+    calculate_metric_scores,
+    format_results,
+    resolve_metrics,
+    validate_items,
+)
+from scorebook.trismik.upload_results import upload_result
+from scorebook.types import Metrics
+from scorebook.utils import resolve_show_progress, resolve_upload_results, scoring_progress_context
+
+logger = logging.getLogger(__name__)
+
+
+def score(
+    items: List[Dict[str, Any]],
+    metrics: Metrics,
+    output_column: str = "output",
+    label_column: str = "label",
+    input_column: str = "input",
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    dataset_name: Optional[str] = None,
+    model_name: Optional[str] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    experiment_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+    upload_results: Union[Literal["auto"], bool] = "auto",
+    show_progress: Optional[bool] = None,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """Score pre-computed model outputs against labels using specified metrics.
+
+    Args:
+        items: List of dictionaries containing model outputs and labels. Each item should
+            have keys matching the output_column and label_column parameters.
+        metrics: Metric(s) to compute. Can be a single Metric class, instance, string name,
+            or a list of any combination of these.
+        output_column: Key in items dictionaries containing model outputs. Defaults to "output".
+        label_column: Key in items dictionaries containing ground truth labels. Defaults to "label".
+        input_column: Key in items dictionaries containing inputs for reference.
+            Defaults to "input".
+        hyperparameters: Optional dictionary of hyperparameters used during inference.
+            Defaults to None.
+        dataset_name: Optional name of the dataset being evaluated. Defaults to None.
+        model_name: Optional name of the model being evaluated. Defaults to None.
+        metadata: Optional dictionary of additional metadata to store with results.
+            Defaults to None.
+        experiment_id: Optional experiment identifier for grouping related runs.
+            Required if upload_results is True. Defaults to None.
+        project_id: Optional Trismik project ID for uploading results.
+            Required if upload_results is True. Defaults to None.
+        upload_results: Whether to upload results to Trismik. Can be True, False, or "auto"
+            (uploads if experiment_id and project_id are provided). Defaults to "auto".
+        show_progress: Whether to display a progress bar during scoring. If None, uses
+            SHOW_PROGRESS_BARS from settings (defaults to True). Defaults to None.
+
+    Returns:
+        Dictionary containing scoring results with keys:
+            - "aggregate_results": List with one dict containing aggregate metric scores
+            - "item_results": List of dicts with per-item scores and data
+    """
+
+    # Resolve and validate parameters
+    upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
+
+    # Validate upload requirements
+    if upload_results and (experiment_id is None or project_id is None):
+        raise ParameterValidationError(
+            "experiment_id and project_id are required to upload a run",
+        )
+
+    # Validate items parameter
+    validate_items(items, output_column, label_column)
+
+    # Validate hyperparameters is a dict (not list)
+    if hyperparameters is not None and not isinstance(hyperparameters, dict):
+        raise ParameterValidationError("hyperparameters must be a dict")
+
+    # Resolve metrics to a list of Metrics
+    metric_instances = resolve_metrics(metrics)
+
+    # Extract outputs and labels from items
+    inputs = [item.get(input_column) for item in items]
+    outputs = [item.get(output_column) for item in items]
+    labels = [item.get(label_column) for item in items]
+
+    # Validate outputs and labels have same length
+    if len(outputs) != len(labels):
+        raise DataMismatchError(len(outputs), len(labels), dataset_name)
+
+    # Compute scores for each metric with progress display
+    with scoring_progress_context(
+        total_metrics=len(metric_instances),
+        enabled=show_progress_bars,
+    ) as progress_bar:
+        metric_scores = calculate_metric_scores(
+            metrics=metric_instances,
+            outputs=outputs,
+            labels=labels,
+            dataset_name=dataset_name,
+            progress_bar=progress_bar,
+        )
+
+    # Build results
+    results: Dict[str, List[Dict[str, Any]]] = format_results(
+        inputs=inputs,
+        outputs=outputs,
+        labels=labels,
+        metric_scores=metric_scores,
+        hyperparameters=hyperparameters,
+        dataset_name=dataset_name,
+    )
+
+    # Upload if requested
+    if upload_results and experiment_id and project_id:
+        try:
+            run_id = upload_result(
+                run_result=results,
+                experiment_id=experiment_id,
+                project_id=project_id,
+                dataset_name=dataset_name,
+                hyperparameters=hyperparameters,
+                metadata=metadata,
+                model_name=model_name,
+            )
+            logger.info(f"Score results uploaded successfully with run_id: {run_id}")
+
+            # Add run_id to aggregate results
+            if results.get("aggregate_results"):
+                results["aggregate_results"][0]["run_id"] = run_id
+
+            # Add run_id to each item result
+            if results.get("item_results"):
+                for item in results["item_results"]:
+                    item["run_id"] = run_id
+
+        except Exception as e:
+            logger.warning(f"Failed to upload score results: {e}")
+            # Don't raise - continue execution even if upload fails
+
+    logger.info("Scoring complete")
+    return results
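The synchronous variant mirrors score_async; a sketch under the same assumptions (import path and metric name are guesses):

from scorebook.score import score  # assuming the package re-exports it

items = [
    {"output": "yes", "label": "yes"},
    {"output": "no", "label": "yes"},
]
results = score(items, metrics="accuracy", dataset_name="toy-qa", upload_results=False)

for row in results["item_results"]:
    print(row["id"], row["output"], row["label"])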
scorebook/score/score_helpers.py
ADDED

@@ -0,0 +1,207 @@
+"""Helper functions shared between score() and score_async()."""
+
+import logging
+from typing import Any, Dict, List, Mapping, Optional, Type, Union
+
+from scorebook.exceptions import DataMismatchError, ParameterValidationError
+from scorebook.metrics.metric_base import MetricBase
+from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.types import MetricScore
+from scorebook.utils import is_awaitable
+
+logger = logging.getLogger(__name__)
+
+
+def validate_items(items: List[Dict[str, Any]], output_column: str, label_column: str) -> None:
+    """Validate the items parameter."""
+    if not isinstance(items, list):
+        raise ParameterValidationError("items must be a list")
+
+    if len(items) == 0:
+        raise ParameterValidationError("items list cannot be empty")
+
+    required = {output_column, label_column}
+    for idx, item in enumerate(items):
+        if not isinstance(item, Mapping):
+            raise ParameterValidationError(f"Item at index {idx} is not a dict")
+
+        missing = required - item.keys()
+        if missing:
+            for key in sorted(missing):
+                raise ParameterValidationError(f"Item at index {idx} missing required '{key}' key")
+
+
+def resolve_metrics(
+    metrics: Union[
+        str, MetricBase, Type[MetricBase], List[Union[str, MetricBase, Type[MetricBase]]]
+    ]
+) -> List[MetricBase]:
+    """Resolve metrics parameter to list of MetricBase instances."""
+    # Ensure metrics is a list
+    if not isinstance(metrics, list):
+        metrics = [metrics]
+
+    # Resolve each metric
+    metric_instances = []
+    for metric in metrics:
+        if isinstance(metric, str) or (isinstance(metric, type) and issubclass(metric, MetricBase)):
+            # Use MetricRegistry to resolve string names or classes
+            metric_instance = MetricRegistry.get(metric)
+            metric_instances.append(metric_instance)
+        elif isinstance(metric, MetricBase):
+            # Already an instance
+            metric_instances.append(metric)
+        else:
+            raise ParameterValidationError(
+                f"Invalid metric type: {type(metric)}. "
+                "Metrics must be string names, MetricBase classes, or MetricBase instances"
+            )
+
+    return metric_instances
+
+
+async def calculate_metric_scores_async(
+    metrics: List[MetricBase],
+    outputs: List[Any],
+    labels: List[Any],
+    dataset_name: Optional[str],
+    progress_bar: Optional[Any] = None,
+) -> List[MetricScore]:
+    """Calculate metric scores asynchronously (supports both sync and async metrics).
+
+    Args:
+        metrics: List of metric instances to compute scores for.
+        outputs: List of model outputs.
+        labels: List of ground truth labels.
+        dataset_name: Name of the dataset being scored.
+        progress_bar: Optional progress bar to update during computation.
+
+    Returns:
+        List of MetricScore objects containing aggregate and item-level scores.
+
+    Raises:
+        DataMismatchError: If outputs and labels have different lengths.
+    """
+    if len(outputs) != len(labels):
+        raise DataMismatchError(len(outputs), len(labels), dataset_name)
+
+    metric_scores: List[MetricScore] = []
+    for metric in metrics:
+
+        if progress_bar is not None:
+            progress_bar.set_current_metric(metric.name)
+
+        if is_awaitable(metric.score):
+            aggregate_scores, item_scores = await metric.score(outputs, labels)
+        else:
+            aggregate_scores, item_scores = metric.score(outputs, labels)
+
+        metric_scores.append(MetricScore(metric.name, aggregate_scores, item_scores))
+
+        if progress_bar is not None:
+            progress_bar.update(1)
+
+    return metric_scores
+
+
+def calculate_metric_scores(
+    metrics: List[MetricBase],
+    outputs: List[Any],
+    labels: List[Any],
+    dataset_name: Optional[str],
+    progress_bar: Optional[Any] = None,
+) -> List[MetricScore]:
+    """Calculate metric scores synchronously (sync metrics only).
+
+    Args:
+        metrics: List of metric instances to compute scores for.
+        outputs: List of model outputs.
+        labels: List of ground truth labels.
+        dataset_name: Name of the dataset being scored.
+        progress_bar: Optional progress bar to update during computation.
+
+    Returns:
+        List of MetricScore objects containing aggregate and item-level scores.
+
+    Raises:
+        DataMismatchError: If outputs and labels have different lengths.
+        ParameterValidationError: If any metric has an async score method.
+    """
+    if len(outputs) != len(labels):
+        raise DataMismatchError(len(outputs), len(labels), dataset_name)
+
+    metric_scores: List[MetricScore] = []
+    for metric in metrics:
+
+        if progress_bar is not None:
+            progress_bar.set_current_metric(metric.name)
+
+        if is_awaitable(metric.score):
+            raise ParameterValidationError(
+                f"Metric '{metric.name}' has an async score() method. "
+                "Use score_async() instead of score() for async metrics."
+            )
+
+        aggregate_scores, item_scores = metric.score(outputs, labels)
+        metric_scores.append(MetricScore(metric.name, aggregate_scores, item_scores))
+
+        if progress_bar is not None:
+            progress_bar.update(1)
+
+    return metric_scores
+
+
+def format_results(
+    inputs: Optional[List[Any]],
+    outputs: List[Any],
+    labels: List[Any],
+    metric_scores: List[MetricScore],
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    dataset_name: Optional[str] = None,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """Format results dict with both aggregates and items."""
+    # Use defaults if not provided
+    hyperparameters = hyperparameters or {}
+    dataset_name = dataset_name or "scored_items"
+
+    # Build aggregate results
+    aggregate_result = {
+        "dataset": dataset_name,
+        **hyperparameters,
+    }
+
+    # Add aggregate scores from metrics
+    for metric_score in metric_scores:
+        for key, value in metric_score.aggregate_scores.items():
+            score_key = (
+                key if key == metric_score.metric_name else f"{metric_score.metric_name}_{key}"
+            )
+            aggregate_result[score_key] = value
+
+    # Build item results
+    item_results = []
+    for idx in range(len(outputs)):
+        item_result: Dict[str, Any] = {
+            "id": idx,
+            "dataset": dataset_name,
+            "output": outputs[idx],
+            "label": labels[idx],
+            **hyperparameters,
+        }
+
+        # Add input if present
+        if inputs is not None and inputs[idx] is not None:
+            item_result["input"] = inputs[idx]
+
+        # Add item-level metric scores
+        for metric_score in metric_scores:
+            if idx < len(metric_score.item_scores):
+                item_result[metric_score.metric_name] = metric_score.item_scores[idx]
+
+        item_results.append(item_result)
+
+    # Always return both aggregates and items
+    return {
+        "aggregate_results": [aggregate_result],
+        "item_results": item_results,
+    }
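For orientation, format_results above produces a structure along these lines for a two-item run with a single metric named "accuracy" (values and the metric name are illustrative; a run_id key is added to every row only when an upload succeeds):

example = {
    "aggregate_results": [
        {"dataset": "toy-qa", "accuracy": 0.5},
    ],
    "item_results": [
        {"id": 0, "dataset": "toy-qa", "output": "yes", "label": "yes", "input": "q1", "accuracy": 1.0},
        {"id": 1, "dataset": "toy-qa", "output": "no", "label": "yes", "input": "q2", "accuracy": 0.0},
    ],
}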