scorebook 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
@@ -3,15 +3,10 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata

-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -20,11 +15,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._async.score_async import score_async
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,13 +27,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import async_nullcontext, evaluation_progress
+from scorebook.utils import (
+    async_nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)

 logger = logging.getLogger(__name__)


 async def evaluate_async(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -50,6 +50,7 @@ async def evaluate_async(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +68,8 @@ async def evaluate_async(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.

     Returns:
         The evaluation results in the format specified by return parameters:
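
For orientation, here is a minimal usage sketch (not part of the diff) of the new show_progress parameter introduced in 0.0.12. The top-level import path, the dataset identifier "my_dataset", and the hyperparameter values are assumptions for illustration; the inference callable follows the contract enforced by run_inference_callable further down in this diff (a list of inputs in, a list of predictions of equal length out).

```python
import asyncio
from typing import Any, List

# Import path assumed; the diffed module lives under scorebook.evaluate.
from scorebook import evaluate_async


async def my_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    # Stand-in for a real model call: one non-empty prediction per input.
    return [f"prediction for {item!r}" for item in inputs]


results = asyncio.run(
    evaluate_async(
        my_inference,
        datasets="my_dataset",                 # placeholder dataset identifier
        hyperparameters={"temperature": 0.0},  # placeholder hyperparameters
        upload_results=False,
        show_progress=True,  # new in 0.0.12: overrides the SHOW_PROGRESS_BARS setting
    )
)
```
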
@@ -75,6 +78,7 @@
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate_async)

     # Prepare datasets, hyperparameters, and eval run specs
@@ -85,7 +89,7 @@ async def evaluate_async(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )

-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -95,10 +99,24 @@

     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
-        with evaluation_progress(
+        # Calculate total items across all runs
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-            hyperparameter_config_count=len(hyperparameter_configs),
-            run_count=len(eval_run_specs),
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = await execute_runs(
                 inference,
@@ -133,23 +151,32 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)

+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-            run_id = await upload_classic_run_results(
-                run_result, experiment_id, project_id, inference, metadata, trismik_client
-            )
-            run_result.run_id = run_id
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)

         return run_result

@@ -168,6 +195,7 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -176,7 +204,9 @@
     """Execute a single evaluation run."""

     if isinstance(run, EvalRunSpec):
-        return await execute_classic_eval_run(inference, run)
+        return await execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )

     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -194,35 +224,90 @@ async def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


-async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
-    """Execute a classic evaluation run."""
+async def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)

     inference_outputs = None
-    metric_scores = None
+    scores = None

     try:
+        # 1. Run inference
         inference_outputs = await run_inference_callable(
-            inference, run.dataset.items, run.hyperparameter_config
+            inference, run.inputs, run.hyperparameter_config
+        )
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = await score_async(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)

     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )


 async def run_inference_callable(
     inference: Callable,
-    items: List[Dict[str, Any]],
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""

     try:
-        predictions = await inference(items, **hyperparameter_config)
+        predictions = await inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +315,11 @@ async def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e

-    if not isinstance(predictions, list) or len(predictions) != len(items):
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the input items. "
-            f"Items length: {len(items)}, predictions length: {len(predictions)}"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )

     if all(prediction == "" for prediction in predictions):
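
The hunks above pin down the contract for a plain inference callable: it is awaited with the list of inputs plus the hyperparameter config expanded as keyword arguments, it must return a list with one prediction per input, and a result consisting entirely of empty strings triggers an additional check. A hedged sketch of a conforming callable; the echo behaviour and the "temperature" hyperparameter are illustrative only:

```python
from typing import Any, List


async def echo_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    """Toy inference callable satisfying run_inference_callable's checks."""
    temperature = hyperparameters.get("temperature", 0.0)  # hypothetical hyperparameter
    # Return exactly one non-empty prediction per input so the length check
    # and the empty-output check above both pass.
    return [f"echo({item!r}, temperature={temperature})" for item in inputs]
```
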
@@ -257,100 +342,20 @@ async def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)

-    if trismik_client is None:
-        raise ScoreBookError("Trismik client is required for adaptive evaluation")
-
-    adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
-
-    return adaptive_eval_run_result
-
-
-async def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")

-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec items, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(item),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
+        adaptive_eval_run_result = await run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
         )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+        return adaptive_eval_run_result

-    return run_id
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})


 async def run_adaptive_evaluation(
@@ -410,4 +415,4 @@ async def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)

-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
+ return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)