scorebook 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +11 -4
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/_async/evaluate_async.py +135 -130
- scorebook/evaluate/_sync/evaluate.py +135 -131
- scorebook/evaluate/evaluate_helpers.py +46 -23
- scorebook/exceptions.py +54 -2
- scorebook/inference/clients/bedrock.py +1 -1
- scorebook/inference/clients/portkey.py +1 -1
- scorebook/inference/clients/vertex.py +1 -1
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/settings.py +3 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +36 -54
- scorebook/utils/__init__.py +11 -4
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +819 -70
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/METADATA +3 -2
- scorebook-0.0.12.dist-info/RECORD +50 -0
- scorebook/eval_dataset.py +0 -404
- scorebook-0.0.10.dist-info/RECORD +0 -41
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0
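The most visible changes are the new `scorebook/score/` package, the move of `EvalDataset` into `scorebook/eval_datasets/`, and a new `show_progress` option on `evaluate()`. For orientation only, a call against the 0.0.12 surface might look like the sketch below; the import paths mirror modules touched in this diff, while the dataset path and inference function are placeholders, not part of the package.

```python
# Hedged sketch: module paths follow the files changed in this diff;
# "my_dataset.json" and my_inference are illustrative placeholders.
from scorebook.eval_datasets import EvalDataset          # moved from scorebook/eval_dataset.py
from scorebook.evaluate._sync.evaluate import evaluate   # sync entry point changed below

def my_inference(inputs, **hyperparameters):
    # Must return one prediction per input item (see run_inference_callable below).
    return ["placeholder answer" for _ in inputs]

results = evaluate(
    my_inference,
    datasets="my_dataset.json",       # or an EvalDataset instance
    hyperparameters={"temperature": 0.0},
    upload_results=False,             # skip the Trismik upload path
    show_progress=False,              # new in 0.0.12; None defers to settings
)
```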
scorebook/evaluate/_sync/evaluate.py
CHANGED
@@ -2,15 +2,10 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -19,11 +14,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -31,14 +26,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from 
-
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -50,6 +49,7 @@ def evaluate(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +67,8 @@
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +77,7 @@
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -85,7 +88,7 @@
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -95,10 +98,24 @@
 
     with trismik_client or nullcontext():
         # Execute evaluation runs
-
+        # Calculate total items across all runs
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-
-
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = execute_runs(
                 inference,
@@ -133,23 +150,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
        run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)
 
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
@@ -168,6 +194,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -176,7 +203,9 @@ def execute_run(
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -194,35 +223,90 @@ def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-def execute_classic_eval_run(
-
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-
+    scores = None
 
     try:
+        # 1. Run inference
         inference_outputs = run_inference_callable(
-            inference, run.
+            inference, run.inputs, run.hyperparameter_config
+        )
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
 
     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 def run_inference_callable(
     inference: Callable,
-
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = inference(
+        predictions = inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +314,11 @@ def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the 
-            f"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,100 +341,20 @@ def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-
-
-
-    adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
-
-    return adaptive_eval_run_result
-
-
-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-
-
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec items, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(item),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
+        adaptive_eval_run_result = run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
        )
-
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+        return adaptive_eval_run_result
 
-
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 def run_adaptive_evaluation(
@@ -410,4 +414,4 @@ def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
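In the rewritten `execute_classic_eval_run` above, per-item scoring and the Trismik upload both go through the new `score()` function instead of the removed `score_metrics()`/`upload_classic_run_results()` pair. Going by that call site, a standalone use might look like the sketch below; the keyword names are copied from the diff, while the string metric name and the exact return shape are assumptions on my part (the new code only shows that `aggregate_results[0]` may carry a `run_id`).

```python
# Hedged sketch of the new scoring entry point; "accuracy" is an assumed metric
# spec (the diff passes run.dataset.metrics), and the items are illustrative.
from scorebook.score._sync.score import score

items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

scores = score(
    items=items,
    metrics=["accuracy"],          # assumption: metric specs accepted here
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,          # with uploads on, run_id shows up in aggregate_results
    show_progress=False,
)
print(scores.get("aggregate_results"))
```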
scorebook/evaluate/evaluate_helpers.py
CHANGED
@@ -2,9 +2,8 @@
 
 import asyncio
 import dataclasses
-import inspect
 import logging
-from typing import Any, Callable, Dict, Iterable, List, 
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
 from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
@@ -25,14 +24,34 @@ from scorebook.utils import expand_dict, is_awaitable
 logger = logging.getLogger(__name__)
 
 
-
-
+# TODO: Remove this when backend supports boolean item metrics
+NORMALIZE_METRICS_FOR_UPLOAD = True
 
-    if upload_results == "auto":
-        upload_results = get_token() is not None
-        logger.debug("Auto upload results resolved to: %s", upload_results)
 
-
+def normalize_metric_value(value: Any) -> Any:
+    """Normalize metric values for API upload compatibility.
+
+    TEMPORARY WORKAROUND: The Trismik API currently rejects boolean metric values.
+    This function converts boolean values to floats (True -> 1.0, False -> 0.0)
+    to ensure upload compatibility.
+
+    Args:
+        value: The metric value to normalize
+
+    Returns:
+        Float if value is bool, otherwise unchanged
+
+    TODO: Remove this function when backend supports boolean metrics natively.
+    To revert: Set NORMALIZE_METRICS_FOR_UPLOAD = False
+    """
+    if not NORMALIZE_METRICS_FOR_UPLOAD:
+        return value
+
+    # Convert booleans to floats for API compatibility
+    if isinstance(value, bool):
+        return float(value)  # True -> 1.0, False -> 0.0
+
+    return value
 
 
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
@@ -93,7 +112,7 @@ def prepare_datasets(
 
     # Prepare adaptive datasets
     elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-        datasets_out.append(AdaptiveEvalDataset(dataset
+        datasets_out.append(AdaptiveEvalDataset(dataset))
 
     # TODO: dataset name string registry
     elif isinstance(dataset, str):
@@ -172,15 +191,22 @@ def build_classic_eval_run_spec(
     hyperparameters: Dict[str, Any],
     hyperparameters_index: int,
 ) -> EvalRunSpec:
-    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
-
-
+    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
+
+    Extracts input and label values from the appropriate columns in the dataset.
+    The column names are determined by dataset.input and dataset.label,
+    which may be original field names (e.g., "question", "answer") or computed
+    column names (e.g., "*input", "*label") if templates were used.
+    """
+    # Extract inputs and labels using the dataset's column specifications
+    inputs = dataset[dataset.input]  # Returns List[Any]
+    labels = dataset[dataset.label]  # Returns List[Any]
     eval_run_spec = EvalRunSpec(
         dataset,
         dataset_index,
         hyperparameters,
         hyperparameters_index,
-
+        inputs,
         labels,
     )
     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
@@ -197,9 +223,9 @@ def build_adaptive_eval_run_spec(
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
-    dataset
+    # Keep the full dataset name including ":adaptive" suffix for backend API
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
-
+        adaptive_dataset,
         dataset_index,
         hyperparameter_config,
         hyperparameter_config_index,
@@ -256,7 +282,7 @@ def create_trismik_sync_client() -> TrismikClient:
 def get_model_name(
     inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
 ) -> str:
-    """Determine a model's name with the fallback "
+    """Determine a model's name with the fallback "Model"."""
 
     # First priority: metadata.model
     if metadata and "model" in metadata:
@@ -266,8 +292,8 @@ def get_model_name(
     if inference_callable and hasattr(inference_callable, "model"):
         return str(inference_callable.model)
 
-    # Fallback: "
-    return "
+    # Fallback: "Model"
+    return "Model"
 
 
 def format_results(
@@ -292,7 +318,7 @@ def format_results(
     # Remove inference output if not requested
     if not return_output:
         for item in item_scores:
-            item.pop("
+            item.pop("output", None)
 
     results["item_results"] = item_scores
 
@@ -322,10 +348,7 @@ def make_trismik_inference(
     """
 
     # Check if the inference function is async
-    is_async = (
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
+    is_async = is_awaitable(inference_function)
 
     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
         # Single TrismikMultipleChoiceTextItem dataclass
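Two smaller helper changes above are easy to miss: metric values are normalized before upload (booleans become floats while the `NORMALIZE_METRICS_FOR_UPLOAD` workaround is active), and the model-name fallback is now the literal string "Model". A quick sketch of that behavior exactly as written in the hunks above, not an official example:

```python
# Behavior as written in the evaluate_helpers.py hunks above.
from scorebook.evaluate.evaluate_helpers import get_model_name, normalize_metric_value

assert normalize_metric_value(True) == 1.0    # bools coerced for the upload API
assert normalize_metric_value(0.87) == 0.87   # other values pass through unchanged
assert get_model_name() == "Model"            # fallback when no metadata or .model attribute
```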
scorebook/exceptions.py
CHANGED
@@ -10,6 +10,54 @@ class ScoreBookError(Exception):
     """Base exception class for all Scorebook-related errors."""
 
 
+class EvalDatasetError(ScoreBookError):
+    """Base exception class for all EvalDataset errors."""
+
+
+class DatasetConfigurationError(EvalDatasetError):
+    """Raised when dataset configuration is invalid (e.g., mutually exclusive parameters)."""
+
+
+class MissingFieldError(EvalDatasetError):
+    """Raised when required field is missing from dataset."""
+
+    def __init__(self, field_name: str, field_type: str, available_fields: list[str]):
+        """Initialize missing field error with structured context."""
+        self.field_name = field_name
+        self.field_type = field_type  # "input" or "label"
+        self.available_fields = available_fields
+        super().__init__(
+            f"{field_type.capitalize()} field '{field_name}' not found. "
+            f"Available fields: {', '.join(available_fields)}"
+        )
+
+
+class DatasetLoadError(EvalDatasetError):
+    """Raised when dataset fails to load from source (file or remote)."""
+
+
+class DatasetParseError(EvalDatasetError):
+    """Raised when dataset file cannot be parsed (CSV, JSON, YAML)."""
+
+
+class DatasetNotInitializedError(EvalDatasetError):
+    """Raised when operations are attempted on uninitialized dataset."""
+
+
+class DatasetSampleError(EvalDatasetError):
+    """Raised when sampling parameters are invalid."""
+
+    def __init__(self, sample_size: int, dataset_size: int, dataset_name: str):
+        """Initialize dataset sample error with structured context."""
+        self.sample_size = sample_size
+        self.dataset_size = dataset_size
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Sample size {sample_size} exceeds dataset size {dataset_size} "
+            f"for dataset '{dataset_name}'"
+        )
+
+
 class EvaluationError(ScoreBookError):
     """Raised when there are errors during model evaluation."""
 
@@ -36,10 +84,14 @@ class MetricComputationError(EvaluationError):
     )
 
 
-class 
+class ScoreError(ScoreBookError):
+    """Raised when there are errors during scoring."""
+
+
+class DataMismatchError(ScoreError):
     """Raised when there's a mismatch between outputs and expected labels."""
 
-    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str = "Dataset"):
         """Initialize data mismatch error."""
         self.outputs_count = outputs_count
         self.labels_count = labels_count
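The expanded exception hierarchy gives callers something narrower than `ScoreBookError` to catch for dataset problems. A small sketch of how the classes above compose; only the exception types and their attributes come from this diff, the raising function is a placeholder:

```python
# Sketch: the loader below is a stand-in for any EvalDataset loading/sampling call;
# the exception classes and inheritance are taken from scorebook/exceptions.py above.
from scorebook.exceptions import DatasetSampleError, EvalDatasetError, ScoreBookError

def load_my_dataset():
    # Placeholder that simulates an oversized sample request.
    raise DatasetSampleError(sample_size=500, dataset_size=100, dataset_name="demo")

try:
    load_my_dataset()
except DatasetSampleError as e:
    print(f"asked for {e.sample_size} items, dataset has {e.dataset_size}")  # structured context
except EvalDatasetError:
    print("other dataset error (configuration, load, parse, ...)")
except ScoreBookError:
    print("any other scorebook failure")
```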