scorebook 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +8 -1
- scorebook/eval_datasets/eval_dataset.py +18 -0
- scorebook/evaluate/_async/evaluate_async.py +116 -126
- scorebook/evaluate/_sync/evaluate.py +116 -127
- scorebook/evaluate/evaluate_helpers.py +98 -25
- scorebook/exceptions.py +6 -2
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +35 -54
- scorebook/utils/__init__.py +8 -1
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/progress_bars.py +67 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/METADATA +2 -2
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/RECORD +22 -14
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/WHEEL +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/_sync/evaluate.py CHANGED

```diff
@@ -2,13 +2,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.
-
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -19,12 +14,12 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-
-    resolve_upload_results,
-    score_metrics,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,15 +27,20 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from
-
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -59,6 +59,7 @@ def evaluate(
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
@@ -83,14 +84,14 @@
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +102,14 @@
     with trismik_client or nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)
 
         with evaluation_progress_context(
@@ -145,34 +153,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-            )
-            run_result.run_id = run_id
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=False)
-                # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
@@ -191,6 +197,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +206,9 @@ def execute_run(
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +226,79 @@ def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-def execute_classic_eval_run(
-
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-
+    scores = None
 
     try:
+        # 1. Run inference
         inference_outputs = run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-
-
-
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )
 
     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 def run_inference_callable(
@@ -296,93 +360,6 @@ def execute_adaptive_eval_run(
         return AdaptiveEvalRunResult(run, False, {})
 
 
-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
@@ -403,8 +380,20 @@ def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(
```
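Taken together, these changes mean classic runs no longer assemble Trismik request objects by hand: the inference outputs are handed to the new `score()` entry point, which scores the items, optionally uploads them, and reports the resulting `run_id` under `aggregate_results`. A rough sketch of that call shape follows; the keyword names mirror the `score(...)` call in the diff above, while the `from scorebook.score import score` path, the plain-string metric spec, and the example items are assumptions, not something this diff confirms.

```python
# Hedged sketch of the new score() entry point, not a confirmed API:
# keyword names are copied from execute_classic_eval_run() above; the
# import path and the "accuracy" metric spec are assumptions.
from scorebook.score import score

items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

scores = score(
    items=items,
    metrics=["accuracy"],   # evaluate() passes run.dataset.metrics here
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,   # skip the Trismik upload path entirely
    show_progress=False,
)

# When uploads are enabled, the diff shows the run_id surfacing here:
run_id = None
if scores.get("aggregate_results"):
    run_id = scores["aggregate_results"][0].get("run_id")
```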
scorebook/evaluate/evaluate_helpers.py CHANGED

```diff
@@ -2,9 +2,8 @@
 
 import asyncio
 import dataclasses
-import inspect
 import logging
-from typing import Any, Callable, Dict, Iterable, List,
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
 from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
@@ -25,30 +24,34 @@ from scorebook.utils import expand_dict, is_awaitable
 logger = logging.getLogger(__name__)
 
 
-
-
+# TODO: Remove this when backend supports boolean item metrics
+NORMALIZE_METRICS_FOR_UPLOAD = True
 
-    if upload_results == "auto":
-        upload_results = get_token() is not None
-        logger.debug("Auto upload results resolved to: %s", upload_results)
 
-
+def normalize_metric_value(value: Any) -> Any:
+    """Normalize metric values for API upload compatibility.
 
-
-
-
+    TEMPORARY WORKAROUND: The Trismik API currently rejects boolean metric values.
+    This function converts boolean values to floats (True -> 1.0, False -> 0.0)
+    to ensure upload compatibility.
 
     Args:
-
+        value: The metric value to normalize
 
     Returns:
-
+        Float if value is bool, otherwise unchanged
+
+    TODO: Remove this function when backend supports boolean metrics natively.
+    To revert: Set NORMALIZE_METRICS_FOR_UPLOAD = False
     """
-    if
-
+    if not NORMALIZE_METRICS_FOR_UPLOAD:
+        return value
+
+    # Convert booleans to floats for API compatibility
+    if isinstance(value, bool):
+        return float(value)  # True -> 1.0, False -> 0.0
 
-
-    return show_progress
+    return value
 
 
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
@@ -88,6 +91,7 @@ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
 
 def prepare_datasets(
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     sample_size: Optional[int] = None,
 ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
@@ -101,6 +105,12 @@
 
         # Prepare classic datasets
         if isinstance(dataset, EvalDataset):
+            # Warn if dataset split differs from provided split parameter
+            if split is not None and dataset.split is not None and dataset.split != split:
+                logger.warning(
+                    f"Dataset '{dataset.name}' has split '{dataset.split}' but evaluate split "
+                    f"parameter is '{split}'. The dataset split will be used."
+                )
 
             if sample_size is not None:
                 dataset = dataset.sample(sample_size)
@@ -108,8 +118,17 @@
             datasets_out.append(dataset)
 
         # Prepare adaptive datasets
-        elif isinstance(dataset, str) and
-
+        elif isinstance(dataset, str) and ":adaptive" in dataset:
+            # Parse adaptive dataset
+            parts = dataset.split(":")
+            if len(parts) != 2 or parts[1] != "adaptive":
+                raise ParameterValidationError(
+                    f"Invalid adaptive dataset format: '{dataset}'. "
+                    f"Use 'test_id:adaptive' format and specify split via the split parameter."
+                )
+
+            # Use the split parameter for all adaptive datasets
+            datasets_out.append(AdaptiveEvalDataset(name=dataset, split=split))
 
         # TODO: dataset name string registry
         elif isinstance(dataset, str):
@@ -171,6 +190,7 @@ def build_eval_run_specs(
                 hyperparameters_index,
                 experiment_id,
                 project_id,
+                dataset.split,
                 metadata,
             )
         )
@@ -217,17 +237,19 @@ def build_adaptive_eval_run_spec(
     hyperparameter_config_index: int,
     experiment_id: str,
     project_id: str,
+    split: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
-    dataset
+    # Keep the full dataset name including ":adaptive" suffix for backend API
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
-
+        adaptive_dataset,
         dataset_index,
         hyperparameter_config,
         hyperparameter_config_index,
         experiment_id,
         project_id,
+        split,
         metadata,
     )
     logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
@@ -345,10 +367,7 @@ def make_trismik_inference(
     """
 
     # Check if the inference function is async
-    is_async =
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
+    is_async = is_awaitable(inference_function)
 
     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
         # Single TrismikMultipleChoiceTextItem dataclass
@@ -386,3 +405,57 @@ def make_trismik_inference(
     )
 
     return sync_trismik_inference_function
+
+
+def resolve_adaptive_split(
+    test_id: str,
+    user_specified_split: Optional[str],
+    available_splits: List[str],
+) -> str:
+    """Resolve the dataset split to use for adaptive evaluation.
+
+    Resolution order:
+    1. If user specified a split, validate it exists and use it
+    2. If not specified and exactly one split is available, use it
+    3. If not specified and multiple splits are available, raise an error
+    4. If no splits are available, raise an error
+
+    Args:
+        test_id: The test dataset ID (without ":adaptive" suffix)
+        user_specified_split: Optional split name specified by the user
+        available_splits: List of available split names for this dataset
+
+    Returns:
+        The resolved split name to use
+
+    Raises:
+        ScoreBookError: If the specified split doesn't exist, multiple splits exist without
+            user specification, or no splits are available
+    """
+    logger.debug(f"Available splits for {test_id}: {available_splits}")
+
+    # If user specified a split, validate and use it
+    if user_specified_split is not None:
+        if user_specified_split in available_splits:
+            logger.info(f"Using user-specified split '{user_specified_split}' for {test_id}")
+            return user_specified_split
+        else:
+            raise ScoreBookError(
+                f"Specified split '{user_specified_split}' not found for dataset '{test_id}'. "
+                f"Available splits: {available_splits}"
+            )
+
+    # No split specified - check available splits
+    if len(available_splits) == 0:
+        raise ScoreBookError(f"No splits available for dataset '{test_id}'. ")
+    elif len(available_splits) == 1:
+        # Exactly one split - auto-select it
+        selected_split = available_splits[0]
+        logger.info(f"Auto-selecting only available split '{selected_split}' for {test_id}")
+        return selected_split
+    else:
+        # Multiple splits available - user must specify
+        raise ScoreBookError(
+            f"Multiple splits available for dataset '{test_id}': {available_splits}. "
+            f"Please specify which split to use via evaluate's 'split' parameter."
+        )
```
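The new `resolve_adaptive_split()` helper pins down how adaptive runs choose a split: a user-supplied split must be one of the available splits, a single available split is auto-selected, and every other case raises `ScoreBookError`. A small illustration of those three outcomes, using made-up test IDs and split names; the import paths follow the modules shown in this diff:

```python
from scorebook.evaluate.evaluate_helpers import resolve_adaptive_split
from scorebook.exceptions import ScoreBookError

# Only one split exists and none was requested -> it is auto-selected.
assert resolve_adaptive_split("my-test", None, ["validation"]) == "validation"

# A requested split is accepted only if it is actually available.
assert resolve_adaptive_split("my-test", "test", ["validation", "test"]) == "test"

# Several splits with none requested, or an unknown split, raise ScoreBookError.
for requested, available in ((None, ["validation", "test"]), ("train", ["validation", "test"])):
    try:
        resolve_adaptive_split("my-test", requested, available)
    except ScoreBookError as exc:
        print(exc)
```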
scorebook/exceptions.py CHANGED

```diff
@@ -84,10 +84,14 @@ class MetricComputationError(EvaluationError):
         )
 
 
-class
+class ScoreError(ScoreBookError):
+    """Raised when there are errors during scoring."""
+
+
+class DataMismatchError(ScoreError):
     """Raised when there's a mismatch between outputs and expected labels."""
 
-    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str = "Dataset"):
         """Initialize data mismatch error."""
         self.outputs_count = outputs_count
         self.labels_count = labels_count
```
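The exceptions change re-parents `DataMismatchError` under a new `ScoreError` base (itself a `ScoreBookError`) and gives `dataset_name` a default of `"Dataset"`. A quick illustration with made-up counts:

```python
from scorebook.exceptions import DataMismatchError, ScoreBookError, ScoreError

try:
    raise DataMismatchError(outputs_count=10, labels_count=8)  # dataset_name now defaults to "Dataset"
except ScoreError as exc:
    # Still a ScoreBookError, so existing broad handlers keep working.
    assert isinstance(exc, ScoreBookError)
    print(exc.outputs_count, exc.labels_count)
```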