scorebook 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +3 -3
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/_async/evaluate_async.py +58 -28
- scorebook/evaluate/_sync/evaluate.py +58 -28
- scorebook/evaluate/evaluate_helpers.py +31 -8
- scorebook/exceptions.py +48 -0
- scorebook/inference/clients/bedrock.py +1 -1
- scorebook/inference/clients/portkey.py +1 -1
- scorebook/inference/clients/vertex.py +1 -1
- scorebook/settings.py +3 -0
- scorebook/types.py +8 -5
- scorebook/utils/__init__.py +4 -4
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +752 -70
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/METADATA +2 -1
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/RECORD +21 -20
- scorebook/eval_dataset.py +0 -404
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/WHEEL +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/_async/evaluate_async.py
CHANGED

```diff
@@ -11,7 +11,7 @@ from trismik.types import (
     TrismikRunMetadata,
 )
 
-from scorebook.
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -20,6 +20,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_show_progress,
     resolve_upload_results,
     score_metrics,
     validate_parameters,
@@ -32,7 +33,7 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import async_nullcontext,
+from scorebook.utils import async_nullcontext, evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +51,7 @@ async def evaluate_async(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +69,8 @@ async def evaluate_async(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +79,7 @@ async def evaluate_async(
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate_async)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -95,10 +100,17 @@
 
     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
-
+        # Calculate total items across all runs
+        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-
-
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = await execute_runs(
                 inference,
@@ -136,7 +148,10 @@ async def execute_runs(
         run_result = await execute_run(
             inference, run, experiment_id, project_id, metadata, trismik_client
         )
-
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            items_processed = len(run.dataset.items)
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         if (
             upload_results
@@ -146,10 +161,18 @@ async def execute_runs(
             and run_result.run_completed
             and trismik_client is not None
         ):
-
-
-
-
+            try:
+                run_id = await upload_classic_run_results(
+                    run_result, experiment_id, project_id, inference, metadata, trismik_client
+                )
+                run_result.run_id = run_id
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=True)
+            except Exception as e:
+                logger.warning(f"Failed to upload run results: {e}")
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=False)
+                # Continue evaluation even if upload fails
 
         return run_result
 
@@ -203,7 +226,7 @@ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> Cla
 
     try:
         inference_outputs = await run_inference_callable(
-            inference, run.
+            inference, run.inputs, run.hyperparameter_config
         )
         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
         logger.debug("Classic evaluation completed for run %s", run)
@@ -216,13 +239,13 @@ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> Cla
 
 async def run_inference_callable(
     inference: Callable,
-
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = await inference(
+        predictions = await inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +253,11 @@ async def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the
-            f"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,15 +280,20 @@ async def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-
-
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-
-
-
-
+        adaptive_eval_run_result = await run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
+        )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-
+        return adaptive_eval_run_result
+
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 async def upload_classic_run_results(
@@ -291,10 +319,12 @@ async def upload_classic_run_results(
     """
    model = get_model_name(inference_callable)
 
-    # Create eval items from run_spec
+    # Create eval items from run_spec inputs, outputs, and labels
     items: List[TrismikClassicEvalItem] = []
-
-
+    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+    for idx, (input_value, output) in enumerate(inputs_outputs):
+        labels = run_result.run_spec.labels
+        label = labels[idx] if idx < len(labels) else ""
 
         # Calculate item-level metrics for this item
         item_metrics: Dict[str, Any] = {}
@@ -309,7 +339,7 @@
 
         eval_item = TrismikClassicEvalItem(
             datasetItemId=str(idx),
-            modelInput=str(
+            modelInput=str(input_value),
             modelOutput=str(output),
             goldOutput=str(label),
             metrics=item_metrics,
@@ -410,4 +440,4 @@ async def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
```
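The reworked `run_inference_callable` makes the inference-callable contract explicit: the callable now receives the run's `inputs` list plus the hyperparameter config as keyword arguments, and must return a list of predictions with the same length as `inputs`, otherwise an `InferenceError` is raised. A minimal sketch of an async callable satisfying that contract; the model call itself is a placeholder, not part of the package:

```python
from typing import Any, List


# Hypothetical inference callable for evaluate_async. Per the diff it is
# invoked as `await inference(inputs, **hyperparameter_config)` and must
# return one prediction per input.
async def my_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    temperature = hyperparameters.get("temperature", 0.0)  # example hyperparameter
    predictions: List[str] = []
    for item in inputs:
        # Placeholder: call your model here; echoing the input keeps the sketch runnable.
        predictions.append(f"answer for {item!r} (temperature={temperature})")
    return predictions  # len(predictions) == len(inputs), as the new check requires
```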
scorebook/evaluate/_sync/evaluate.py
CHANGED

```diff
@@ -10,7 +10,7 @@ from trismik.types import (
     TrismikRunMetadata,
 )
 
-from scorebook.
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -19,6 +19,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_show_progress,
     resolve_upload_results,
     score_metrics,
     validate_parameters,
@@ -32,7 +33,7 @@ from scorebook.types import (
     EvalRunSpec,
 )
 from contextlib import nullcontext
-from scorebook.utils import
+from scorebook.utils import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +51,7 @@ def evaluate(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +69,8 @@ def evaluate(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +79,7 @@ def evaluate(
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -95,10 +100,17 @@ def evaluate(
 
     with trismik_client or nullcontext():
         # Execute evaluation runs
-
+        # Calculate total items across all runs
+        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-
-
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = execute_runs(
                 inference,
@@ -136,7 +148,10 @@ def execute_runs(
         run_result = execute_run(
             inference, run, experiment_id, project_id, metadata, trismik_client
         )
-
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            items_processed = len(run.dataset.items)
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         if (
             upload_results
@@ -146,10 +161,18 @@ def execute_runs(
             and run_result.run_completed
             and trismik_client is not None
         ):
-
-
-
-
+            try:
+                run_id = upload_classic_run_results(
+                    run_result, experiment_id, project_id, inference, metadata, trismik_client
+                )
+                run_result.run_id = run_id
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=True)
+            except Exception as e:
+                logger.warning(f"Failed to upload run results: {e}")
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=False)
+                # Continue evaluation even if upload fails
 
         return run_result
 
@@ -203,7 +226,7 @@ def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEv
 
     try:
         inference_outputs = run_inference_callable(
-            inference, run.
+            inference, run.inputs, run.hyperparameter_config
         )
         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
         logger.debug("Classic evaluation completed for run %s", run)
@@ -216,13 +239,13 @@ def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEv
 
 def run_inference_callable(
     inference: Callable,
-
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = inference(
+        predictions = inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +253,11 @@ def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the
-            f"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,15 +280,20 @@ def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-
-
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-
-
-
-
+        adaptive_eval_run_result = run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
+        )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-
+        return adaptive_eval_run_result
+
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 def upload_classic_run_results(
@@ -291,10 +319,12 @@ def upload_classic_run_results(
     """
     model = get_model_name(inference_callable)
 
-    # Create eval items from run_spec
+    # Create eval items from run_spec inputs, outputs, and labels
    items: List[TrismikClassicEvalItem] = []
-
-
+    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+    for idx, (input_value, output) in enumerate(inputs_outputs):
+        labels = run_result.run_spec.labels
+        label = labels[idx] if idx < len(labels) else ""
 
         # Calculate item-level metrics for this item
         item_metrics: Dict[str, Any] = {}
@@ -309,7 +339,7 @@
 
         eval_item = TrismikClassicEvalItem(
             datasetItemId=str(idx),
-            modelInput=str(
+            modelInput=str(input_value),
             modelOutput=str(output),
             goldOutput=str(label),
             metrics=item_metrics,
@@ -410,4 +440,4 @@ def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
```
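Both entry points gain the same optional `show_progress` flag. A hedged sketch of how the synchronous `evaluate` might be called with it; the positional argument order and the `EvalDataset` construction are assumptions, since only the keyword parameters and module paths appear in this diff:

```python
from typing import Any, List

from scorebook.eval_datasets import EvalDataset  # import path shown in this diff
from scorebook.evaluate._sync.evaluate import evaluate  # module path from the file list


def my_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    # Must return one prediction per input (see run_inference_callable above).
    return [str(item) for item in inputs]


def run_eval(dataset: EvalDataset) -> Any:
    # `dataset` is assumed to be an already-constructed EvalDataset; its
    # constructor lives in eval_datasets/eval_dataset.py and is not shown here.
    return evaluate(
        my_inference,
        dataset,
        upload_results=False,
        show_progress=False,  # None (default) falls back to settings.SHOW_PROGRESS_BARS
    )
```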
scorebook/evaluate/evaluate_helpers.py
CHANGED

```diff
@@ -35,6 +35,22 @@ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool
     return upload_results
 
 
+def resolve_show_progress(show_progress: Optional[bool]) -> bool:
+    """Resolve whether to show progress bars.
+
+    Args:
+        show_progress: Explicit setting (None uses default from settings)
+
+    Returns:
+        bool: Whether to show progress bars
+    """
+    if show_progress is None:
+        from scorebook.settings import SHOW_PROGRESS_BARS
+
+        return bool(SHOW_PROGRESS_BARS)
+    return show_progress
+
+
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
     """Validate all parameters for evaluation."""
 
@@ -172,15 +188,22 @@ def build_classic_eval_run_spec(
     hyperparameters: Dict[str, Any],
     hyperparameters_index: int,
 ) -> EvalRunSpec:
-    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
-
-
+    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
+
+    Extracts input and label values from the appropriate columns in the dataset.
+    The column names are determined by dataset.input and dataset.label,
+    which may be original field names (e.g., "question", "answer") or computed
+    column names (e.g., "*input", "*label") if templates were used.
+    """
+    # Extract inputs and labels using the dataset's column specifications
+    inputs = dataset[dataset.input]  # Returns List[Any]
+    labels = dataset[dataset.label]  # Returns List[Any]
     eval_run_spec = EvalRunSpec(
         dataset,
         dataset_index,
         hyperparameters,
         hyperparameters_index,
-
+        inputs,
         labels,
     )
     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
@@ -256,7 +279,7 @@ def create_trismik_sync_client() -> TrismikClient:
 def get_model_name(
     inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
 ) -> str:
-    """Determine a model's name with the fallback "
+    """Determine a model's name with the fallback "Model"."""
 
     # First priority: metadata.model
     if metadata and "model" in metadata:
@@ -266,8 +289,8 @@ def get_model_name(
     if inference_callable and hasattr(inference_callable, "model"):
         return str(inference_callable.model)
 
-    # Fallback: "
-    return "
+    # Fallback: "Model"
+    return "Model"
 
 
 def format_results(
@@ -292,7 +315,7 @@ def format_results(
     # Remove inference output if not requested
     if not return_output:
         for item in item_scores:
-            item.pop("
+            item.pop("output", None)
 
     results["item_results"] = item_scores
 
```
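The new `resolve_show_progress` helper is small enough to illustrate directly; this sketch only restates the tri-state behaviour shown in the diff:

```python
from scorebook.evaluate.evaluate_helpers import resolve_show_progress

resolve_show_progress(True)   # -> True, explicit per-call override
resolve_show_progress(False)  # -> False, explicit per-call override
resolve_show_progress(None)   # -> falls back to scorebook.settings.SHOW_PROGRESS_BARS
```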
scorebook/exceptions.py
CHANGED

```diff
@@ -10,6 +10,54 @@ class ScoreBookError(Exception):
     """Base exception class for all Scorebook-related errors."""
 
 
+class EvalDatasetError(ScoreBookError):
+    """Base exception class for all EvalDataset errors."""
+
+
+class DatasetConfigurationError(EvalDatasetError):
+    """Raised when dataset configuration is invalid (e.g., mutually exclusive parameters)."""
+
+
+class MissingFieldError(EvalDatasetError):
+    """Raised when required field is missing from dataset."""
+
+    def __init__(self, field_name: str, field_type: str, available_fields: list[str]):
+        """Initialize missing field error with structured context."""
+        self.field_name = field_name
+        self.field_type = field_type  # "input" or "label"
+        self.available_fields = available_fields
+        super().__init__(
+            f"{field_type.capitalize()} field '{field_name}' not found. "
+            f"Available fields: {', '.join(available_fields)}"
+        )
+
+
+class DatasetLoadError(EvalDatasetError):
+    """Raised when dataset fails to load from source (file or remote)."""
+
+
+class DatasetParseError(EvalDatasetError):
+    """Raised when dataset file cannot be parsed (CSV, JSON, YAML)."""
+
+
+class DatasetNotInitializedError(EvalDatasetError):
+    """Raised when operations are attempted on uninitialized dataset."""
+
+
+class DatasetSampleError(EvalDatasetError):
+    """Raised when sampling parameters are invalid."""
+
+    def __init__(self, sample_size: int, dataset_size: int, dataset_name: str):
+        """Initialize dataset sample error with structured context."""
+        self.sample_size = sample_size
+        self.dataset_size = dataset_size
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Sample size {sample_size} exceeds dataset size {dataset_size} "
+            f"for dataset '{dataset_name}'"
+        )
+
+
 class EvaluationError(ScoreBookError):
     """Raised when there are errors during model evaluation."""
 
```
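The new `EvalDataset` exception hierarchy derives from `ScoreBookError`, so existing broad handlers keep working while callers can catch dataset problems specifically. A short sketch using the structured exceptions defined above; the trigger is contrived, since the dataset code that raises them lives in eval_datasets/eval_dataset.py:

```python
from scorebook.exceptions import EvalDatasetError, MissingFieldError, ScoreBookError

try:
    # Contrived trigger: in practice the dataset layer raises MissingFieldError
    # when a configured input/label column is absent.
    raise MissingFieldError("answer", "label", ["question", "choices"])
except MissingFieldError as err:
    # Structured context survives on the exception instance.
    print(err.field_type, err.field_name, err.available_fields)
except EvalDatasetError:
    print("some other dataset problem")
except ScoreBookError:
    print("any other scorebook error")
```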
scorebook/settings.py
CHANGED

```diff
@@ -16,3 +16,6 @@ TRISMIK_ADAPTIVE_TESTING_URL = f"{TRISMIK_API_BASE_URL}/adaptive-testing"
 
 # Allow override via environment variable
 TRISMIK_SERVICE_URL = os.environ.get("TRISMIK_SERVICE_URL", TRISMIK_ADAPTIVE_TESTING_URL)
+
+# Progress bar configuration
+SHOW_PROGRESS_BARS = os.environ.get("SCOREBOOK_SHOW_PROGRESS_BARS", "true").lower() == "true"
```
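`SHOW_PROGRESS_BARS` is read from the `SCOREBOOK_SHOW_PROGRESS_BARS` environment variable at module import time, so an override has to be in place before `scorebook.settings` is imported. A small sketch:

```python
import os

# Must be set before scorebook.settings is imported, since the flag is
# evaluated once at import time.
os.environ["SCOREBOOK_SHOW_PROGRESS_BARS"] = "false"

from scorebook.settings import SHOW_PROGRESS_BARS

print(SHOW_PROGRESS_BARS)  # False; any value other than "true" (case-insensitive) disables bars
```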
scorebook/types.py
CHANGED

```diff
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union
 
-from scorebook.
+from scorebook.eval_datasets import EvalDataset
 
 
 @dataclass
@@ -21,7 +21,7 @@ class EvalRunSpec:
     dataset_index: int
     hyperparameter_config: Dict[str, Any]
     hyperparameters_index: int
-
+    inputs: List[Any]
     labels: List[Any]
 
     def __str__(self) -> str:
@@ -64,13 +64,15 @@ class ClassicEvalRunResult:
 
         if self.outputs:
             for idx, output in enumerate(self.outputs):
-                if idx >= len(self.run_spec.
+                if idx >= len(self.run_spec.inputs):
                     break
 
                 result = {
-                    "
+                    "id": idx,
                     "dataset_name": self.run_spec.dataset.name,
-                    "
+                    "input": self.run_spec.inputs[idx],
+                    "label": self.run_spec.labels[idx] if idx < len(self.run_spec.labels) else None,
+                    "output": output,
                     **self.run_spec.hyperparameter_config,
                 }
 
@@ -125,6 +127,7 @@ class AdaptiveEvalRunResult:
     """Results from executing an adaptive evaluation run."""
 
     run_spec: AdaptiveEvalRunSpec
+    run_completed: bool
     scores: Dict[str, Any]
 
     @property
```
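With `inputs` and `labels` now carried on `EvalRunSpec`, each item-level row built by `ClassicEvalRunResult` gains explicit `id`, `input`, `label`, and `output` keys alongside the hyperparameter config, and `format_results` strips `output` when `return_output` is False. An illustration of the row shape implied by the diff; the concrete values are made up:

```python
# Example of a single entry in results["item_results"] after this change
# (values are illustrative; hyperparameters echo the run's config).
item_result = {
    "id": 0,
    "dataset_name": "my_dataset",
    "input": "What is 2 + 2?",
    "label": "4",
    "output": "4",       # removed by format_results when return_output=False
    "temperature": 0.0,  # **hyperparameter_config is merged into the row
}
```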
scorebook/utils/__init__.py
CHANGED

```diff
@@ -1,9 +1,9 @@
 """Utility functions and common helpers for the Scorebook framework."""
 
 from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.build_prompt import build_prompt
 from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import
+from scorebook.utils.progress_bars import evaluation_progress_context
+from scorebook.utils.render_template import render_template
 from scorebook.utils.transform_helpers import expand_dict
 
 __all__ = [
@@ -11,6 +11,6 @@ __all__ = [
     "is_awaitable",
     "validate_path",
     "expand_dict",
-    "
-    "
+    "evaluation_progress_context",
+    "render_template",
 ]
```
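`build_prompt` is gone from `scorebook.utils` (see the `build_prompt.py → render_template.py` rename in the file list); `render_template` and `evaluation_progress_context` are the new exports. A hedged sketch of driving the progress context manager directly, using only the keyword arguments and callbacks that appear at the new call sites in `evaluate()`/`evaluate_async()`; calling it outside those functions is illustrative, not a documented API:

```python
from scorebook.utils import evaluation_progress_context

# Keyword arguments mirror the evaluate()/evaluate_async() call sites added in 0.0.11.
with evaluation_progress_context(
    total_eval_runs=4,
    total_items=400,
    dataset_count=2,
    hyperparam_count=2,
    model_display="Model",  # get_model_name() falls back to "Model"
    enabled=True,           # False (or show_progress=False upstream) disables the bars
) as progress_bars:
    if progress_bars is not None:
        progress_bars.on_run_completed(100, True)          # items processed, run succeeded
        progress_bars.on_upload_completed(succeeded=True)  # upload outcome for that run
```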