scorebook 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,254 @@
+"""Upload evaluation and scoring results to Trismik's experimentation platform."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from trismik.types import (
+    TrismikClassicEvalItem,
+    TrismikClassicEvalMetric,
+    TrismikClassicEvalRequest,
+    TrismikClassicEvalResponse,
+)
+
+from scorebook.evaluate.evaluate_helpers import (
+    create_trismik_async_client,
+    create_trismik_sync_client,
+    get_model_name,
+    normalize_metric_value,
+)
+
+logger = logging.getLogger(__name__)
+
+# Known fields that are not metrics or hyperparameters
+KNOWN_AGGREGATE_FIELDS = {"dataset", "run_id", "run_completed"}
+KNOWN_ITEM_FIELDS = {"id", "dataset_name", "input", "output", "label", "run_id"}
+
+
+def upload_result(
+    run_result: Dict[str, List[Dict[str, Any]]],
+    experiment_id: str,
+    project_id: str,
+    dataset_name: Optional[str] = None,
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    model_name: Optional[str] = None,
+) -> str:
+    """Upload evaluation or scoring results to Trismik's platform (synchronous).
+
+    This function uploads results in the format returned by the evaluate or score
+    functions to the Trismik platform for tracking and analysis.
+
+    Args:
+        run_result: Dict with keys 'aggregate_results' and 'item_results' containing
+            evaluation/scoring results. Structure matches the output of evaluate()/score().
+        experiment_id: Trismik experiment identifier
+        project_id: Trismik project identifier
+        dataset_name: Optional dataset name. If not provided, extracted from metadata
+            or defaults to "Dataset"
+        hyperparameters: Optional dict of hyperparameters. If not provided, extracted
+            from run_result.
+        metadata: Optional metadata dict (can include 'model' and 'dataset' keys)
+        model_name: Optional model name. If not provided, extracted from metadata
+            or defaults to "Model"
+
+    Returns:
+        str: Run ID assigned by Trismik
+
+    Raises:
+        Exception: If upload fails (re-raises underlying exceptions)
+    """
+    # Create Trismik client
+    trismik_client = create_trismik_sync_client()
+
+    # Get model name - use provided model_name, or extract from metadata, or use default
+    if model_name is not None:
+        model = model_name
+    else:
+        model = get_model_name(metadata=metadata)
+
+    # Get dataset name - use provided dataset_name, or extract from metadata, or use default
+    if dataset_name is None:
+        if metadata and "dataset" in metadata:
+            dataset_name = str(metadata["dataset"])
+        else:
+            dataset_name = "Dataset"
+
+    # Extract aggregate and item results
+    aggregate_results = run_result.get("aggregate_results", [])
+    item_results = run_result.get("item_results", [])
+
+    # Use provided hyperparameters or default to empty dict
+    # Note: We don't extract hyperparameters from aggregate_results to avoid
+    # misclassifying metrics as hyperparameters
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    # Create eval items from item_results
+    trismik_items: List[TrismikClassicEvalItem] = []
+    for item in item_results:
+        # Extract inputs, outputs, labels
+        item_id = str(item.get("id", 0))
+        model_input = str(item.get("input", ""))
+        model_output = str(item.get("output", ""))
+        gold_output = str(item.get("label", ""))
+
+        # Extract item-level metrics (exclude known fields and hyperparameters)
+        item_metrics: Dict[str, Any] = {}
+        for key, value in item.items():
+            if key not in KNOWN_ITEM_FIELDS and key not in (hyperparameters or {}):
+                # Normalize metric value for API compatibility
+                item_metrics[key] = normalize_metric_value(value)
+
+        eval_item = TrismikClassicEvalItem(  # pragma: allowlist secret
+            datasetItemId=item_id,
+            modelInput=model_input,
+            modelOutput=model_output,
+            goldOutput=gold_output,
+            metrics=item_metrics,
+        )
+        trismik_items.append(eval_item)
+
+    # Extract aggregate metrics from aggregate_results
+    trismik_metrics: List[TrismikClassicEvalMetric] = []
+    if aggregate_results:
+        for key, value in aggregate_results[0].items():
+            if key not in KNOWN_AGGREGATE_FIELDS and key not in (hyperparameters or {}):
+                # This is a metric  # pragma: allowlist secret
+                metric = TrismikClassicEvalMetric(metricId=key, value=normalize_metric_value(value))
+                trismik_metrics.append(metric)  # pragma: allowlist secret
+
+    # Create classic eval request
+    classic_eval_request = TrismikClassicEvalRequest(
+        project_id,
+        experiment_id,
+        dataset_name,
+        model,
+        hyperparameters,
+        trismik_items,
+        trismik_metrics,
+    )
+
+    # Submit to Trismik  # pragma: allowlist secret
+    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
+        classic_eval_request
+    )  # pragma: allowlist secret
+
+    run_id: str = response.id
+    logger.info(f"Run result uploaded successfully to Trismik with run_id: {run_id}")
+
+    return run_id
+
+
+async def upload_result_async(
+    run_result: Dict[str, List[Dict[str, Any]]],
+    experiment_id: str,
+    project_id: str,
+    dataset_name: Optional[str] = None,
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    model_name: Optional[str] = None,
+) -> str:
+    """Upload evaluation or scoring results to Trismik's platform (asynchronous).
+
+    This function uploads results in the format returned by the evaluate or
+    score functions to the Trismik platform for tracking and analysis.
+
+    Args:
+        run_result: Dict with keys 'aggregate_results' and 'item_results' containing
+            evaluation/scoring results. Structure matches the output of evaluate()/score().
+        experiment_id: Trismik experiment identifier
+        project_id: Trismik project identifier
+        dataset_name: Optional dataset name. If not provided, extracted from metadata
+            or defaults to "Dataset"
+        hyperparameters: Optional dict of hyperparameters. If not provided, extracted
+            from run_result.
+        metadata: Optional metadata dict (can include 'model' and 'dataset' keys)
+        model_name: Optional model name. If not provided, extracted from metadata
+            or defaults to "Model"
+
+    Returns:
+        str: Run ID assigned by Trismik
+
+    Raises:
+        Exception: If upload fails (re-raises underlying exceptions)
+    """
+    # Create Trismik async client
+    trismik_client = create_trismik_async_client()
+
+    # Get model name - use provided model_name, or extract from metadata, or use default
+    if model_name is not None:
+        model = model_name
+    else:
+        model = get_model_name(metadata=metadata)
+
+    # Get dataset name - use provided dataset_name, or extract from metadata, or use default
+    if dataset_name is None:
+        if metadata and "dataset" in metadata:
+            dataset_name = str(metadata["dataset"])
+        else:
+            dataset_name = "Dataset"
+
+    # Extract aggregate and item results
+    aggregate_results = run_result.get("aggregate_results", [])
+    item_results = run_result.get("item_results", [])
+
+    # Use provided hyperparameters or default to empty dict
+    # Note: We don't extract hyperparameters from aggregate_results to avoid
+    # misclassifying metrics as hyperparameters
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    # Create eval items from item_results
+    trismik_items: List[TrismikClassicEvalItem] = []
+    for item in item_results:
+        # Extract inputs, outputs, labels
+        item_id = str(item.get("id", 0))
+        model_input = str(item.get("input", ""))
+        model_output = str(item.get("output", ""))
+        gold_output = str(item.get("label", ""))
+
+        # Extract item-level metrics (exclude known fields and hyperparameters)
+        item_metrics: Dict[str, Any] = {}
+        for key, value in item.items():
+            if key not in KNOWN_ITEM_FIELDS and key not in (hyperparameters or {}):
+                # Normalize metric value for API compatibility
+                item_metrics[key] = normalize_metric_value(value)
+
+        eval_item = TrismikClassicEvalItem(  # pragma: allowlist secret
+            datasetItemId=item_id,
+            modelInput=model_input,
+            modelOutput=model_output,
+            goldOutput=gold_output,
+            metrics=item_metrics,
+        )
+        trismik_items.append(eval_item)
+
+    # Extract aggregate metrics from aggregate_results
+    trismik_metrics: List[TrismikClassicEvalMetric] = []
+    if aggregate_results:
+        for key, value in aggregate_results[0].items():
+            if key not in KNOWN_AGGREGATE_FIELDS and key not in (hyperparameters or {}):
+                # This is a metric  # pragma: allowlist secret
+                metric = TrismikClassicEvalMetric(metricId=key, value=normalize_metric_value(value))
+                trismik_metrics.append(metric)  # pragma: allowlist secret
+
+    # Create classic eval request
+    classic_eval_request = TrismikClassicEvalRequest(
+        project_id,
+        experiment_id,
+        dataset_name,
+        model,
+        hyperparameters,
+        trismik_items,
+        trismik_metrics,
+    )
+
+    # Submit to Trismik (async)  # pragma: allowlist secret
+    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
+        classic_eval_request
+    )  # pragma: allowlist secret
+
+    run_id: str = response.id
+    logger.info(f"Run result uploaded successfully to Trismik with run_id: {run_id}")
+
+    return run_id
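For orientation, a minimal usage sketch of the new upload helper. It is a hypothetical example, not code from the package: the run_result shape follows the docstring above, the IDs, metric names, and values are placeholders, and the import path is assumed because the new module's filename is not shown in this diff.

# Hypothetical import path -- the new module's filename is not shown in this diff.
from scorebook import upload_result

run_result = {
    "aggregate_results": [
        {"dataset": "demo_dataset", "run_completed": True, "accuracy": 0.5}
    ],
    "item_results": [
        {"id": 0, "input": "2 + 2 = ?", "output": "4", "label": "4", "accuracy": 1.0},
        {"id": 1, "input": "3 + 5 = ?", "output": "7", "label": "8", "accuracy": 0.0},
    ],
}

run_id = upload_result(
    run_result,
    experiment_id="exp-123",  # placeholder Trismik experiment id
    project_id="proj-456",  # placeholder Trismik project id
    hyperparameters={"temperature": 0.0},
    metadata={"model": "demo-model", "dataset": "demo_dataset"},
)
print(run_id)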
scorebook/types.py CHANGED
@@ -1,9 +1,15 @@
 """Type definitions for scorebook evaluation framework."""
 
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
+from scorebook.metrics.metric_base import MetricBase
+
+# Type alias for metrics parameter
+Metrics = Union[
+    str, "MetricBase", Type["MetricBase"], Sequence[Union[str, "MetricBase", Type["MetricBase"]]]
+]
 
 
 @dataclass
@@ -21,7 +27,7 @@ class EvalRunSpec:
     dataset_index: int
     hyperparameter_config: Dict[str, Any]
     hyperparameters_index: int
-    items: List[Dict[str, Any]]
+    inputs: List[Any]
     labels: List[Any]
 
     def __str__(self) -> str:
@@ -54,77 +60,44 @@ class ClassicEvalRunResult:
     run_spec: EvalRunSpec
     run_completed: bool
    outputs: Optional[List[Any]]
-    scores: Optional[Dict[str, Any]]
+    scores: Optional[Dict[str, List[Dict[str, Any]]]]  # score_async format
     run_id: Optional[str] = None
 
    @property
     def item_scores(self) -> List[Dict[str, Any]]:
         """Return a list of dictionaries containing scores for each evaluated item."""
-        results = []
-
-        if self.outputs:
-            for idx, output in enumerate(self.outputs):
-                if idx >= len(self.run_spec.items):
-                    break
-
-                result = {
-                    "item_id": idx,
-                    "dataset_name": self.run_spec.dataset.name,
-                    "inference_output": output,
-                    **self.run_spec.hyperparameter_config,
-                }
-
-                # Add run_id if available
-                if self.run_id is not None:
-                    result["run_id"] = self.run_id
-
-                # Add individual item scores if available
-                if self.scores is not None:
-                    for metric_name, metric_data in self.scores.items():
-                        if isinstance(metric_data, dict) and "item_scores" in metric_data:
                            if idx < len(metric_data["item_scores"]):
-                                result[metric_name] = metric_data["item_scores"][idx]
-                        else:
-                            # If scores is just a single value, replicate it for each item
-                            result[metric_name] = metric_data
-
-                results.append(result)
-
-        return results
+        if self.scores and "item_results" in self.scores:
+            # score_async already built this in the exact format we need
+            return self.scores["item_results"]
+        return []
 
     @property
     def aggregate_scores(self) -> Dict[str, Any]:
         """Return the aggregated scores for this run."""
-        result = {
+        if (
+            self.scores
+            and "aggregate_results" in self.scores
+            and len(self.scores["aggregate_results"]) > 0
+        ):
+            result = self.scores["aggregate_results"][0].copy()
+            # Add run_completed (not included in score_async format)
+            result["run_completed"] = self.run_completed
+            return result
+
+        # Fallback if no scores available
+        return {
             "dataset": self.run_spec.dataset.name,
             "run_completed": self.run_completed,
            **self.run_spec.hyperparameter_config,
         }
 
-        # Add run_id if available
-        if self.run_id is not None:
-            result["run_id"] = self.run_id
-
-        # Add aggregate scores from metrics
-        if self.scores is not None:
-            for metric_name, metric_data in self.scores.items():
-                if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                    # Flatten the aggregate scores from each metric
-                    for key, value in metric_data["aggregate_scores"].items():
-                        score_key = key if key == metric_name else f"{metric_name}_{key}"
-                        result[score_key] = value
-                else:
-                    # If scores is just a single value, use it as is
-                    result[metric_name] = metric_data
-
-        return result
-
 
 @dataclass
 class AdaptiveEvalRunResult:
     """Results from executing an adaptive evaluation run."""
 
     run_spec: AdaptiveEvalRunSpec
+    run_completed: bool
     scores: Dict[str, Any]
 
     @property
@@ -177,3 +150,12 @@ class EvalResult:
             results.append(run_result.aggregate_scores)
 
         return results
+
+
+@dataclass
+class MetricScore:
+    """Container for metric scores across multiple runs."""
+
+    metric_name: str
+    aggregate_scores: Dict[str, Any]
+    item_scores: List[Dict[str, Any]]
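The rewritten item_scores and aggregate_scores properties no longer assemble result rows themselves; they read the dict produced by the scoring path. An illustrative sketch of that shape follows; only the two top-level keys come from the diff, the field names and values inside are made up.

scores = {
    "aggregate_results": [
        {"dataset": "demo_dataset", "run_id": "run-1", "accuracy": 0.5, "temperature": 0.0}
    ],
    "item_results": [
        {"id": 0, "dataset_name": "demo_dataset", "output": "4", "label": "4", "accuracy": 1.0},
        {"id": 1, "dataset_name": "demo_dataset", "output": "7", "label": "8", "accuracy": 0.0},
    ],
}
# item_scores returns scores["item_results"] as-is;
# aggregate_scores returns scores["aggregate_results"][0] plus a "run_completed" key.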
@@ -1,16 +1,23 @@
 """Utility functions and common helpers for the Scorebook framework."""
 
+from contextlib import nullcontext
+
 from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.build_prompt import build_prompt
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
 from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress
+from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
+from scorebook.utils.render_template import render_template
 from scorebook.utils.transform_helpers import expand_dict
 
 __all__ = [
     "async_nullcontext",
+    "nullcontext",
     "is_awaitable",
+    "resolve_show_progress",
+    "resolve_upload_results",
     "validate_path",
     "expand_dict",
-    "evaluation_progress",
-    "build_prompt",
+    "evaluation_progress_context",
+    "scoring_progress_context",
+    "render_template",
 ]
@@ -0,0 +1,41 @@
+"""Common helper functions shared across scorebook modules."""
+
+import logging
+from typing import Literal, Optional, Union
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
+    """Resolve the upload_results parameter based on trismik login status.
+
+    Args:
+        upload_results: Can be True, False, or "auto". When "auto", resolves to True
+            if user is logged in to Trismik, False otherwise.
+
+    Returns:
+        bool: Whether to upload results to Trismik
+    """
+    if upload_results == "auto":
+        from scorebook.trismik.credentials import get_token
+
+        upload_results = get_token() is not None
+        logger.debug("Auto upload results resolved to: %s", upload_results)
+
+    return upload_results
+
+
+def resolve_show_progress(show_progress: Optional[bool]) -> bool:
+    """Resolve whether to show progress bars.
+
+    Args:
+        show_progress: Explicit setting (None uses default from settings)
+
+    Returns:
+        bool: Whether to show progress bars
+    """
+    if show_progress is None:
+        from scorebook.settings import SHOW_PROGRESS_BARS
+
+        return bool(SHOW_PROGRESS_BARS)
+    return show_progress
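Both helpers are re-exported from scorebook.utils (see the __init__ diff above). A short usage sketch, assuming a default scorebook configuration:

from scorebook.utils import resolve_show_progress, resolve_upload_results

# "auto" resolves to True only when a Trismik token is available; booleans pass through.
should_upload = resolve_upload_results("auto")

# None falls back to scorebook.settings.SHOW_PROGRESS_BARS; booleans pass through.
show_progress = resolve_show_progress(None)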
@@ -1,15 +1,18 @@
 """Input/output helper functions for Scorebook."""
 
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple, Union
 
 
-def validate_path(file_path: str, expected_suffix: Optional[str] = None) -> Path:
+def validate_path(
+    file_path: Union[str, Path], expected_suffix: Optional[Union[str, Tuple[str, ...]]] = None
+) -> Path:
     """Validate that a file path exists and optionally check its suffix.
 
     Args:
         file_path: Path to the file as string or Path object
-        expected_suffix: Optional file extension to validate (e.g. ".json", ".csv")
+        expected_suffix: Optional file extension(s) to validate.
+            Can be a single string (e.g. ".json") or tuple of strings (e.g. (".yaml", ".yml"))
 
     Returns:
         Path object for the validated file path
@@ -22,7 +25,17 @@ def validate_path(file_path: str, expected_suffix: Optional[str] = None) -> Path
     if not path.exists():
         raise FileNotFoundError(f"File not found: {file_path}")
 
-    if expected_suffix and path.suffix.lower() != expected_suffix.lower():
-        raise ValueError(f"File must have {expected_suffix} extension, got: {path.suffix}")
+    if expected_suffix:
+        # Convert single suffix to tuple for uniform handling
+        allowed_suffixes = (
+            (expected_suffix,) if isinstance(expected_suffix, str) else expected_suffix
+        )
+        allowed_suffixes_lower = tuple(s.lower() for s in allowed_suffixes)
+
+        if path.suffix.lower() not in allowed_suffixes_lower:
+            suffix_list = ", ".join(f"'{s}'" for s in allowed_suffixes)
+            raise ValueError(
+                f"File must have one of ({suffix_list}) extensions, got: '{path.suffix}'"
+            )
 
     return path
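validate_path now accepts either a single suffix or a tuple of allowed suffixes. A hedged sketch, assuming the referenced files exist in the working directory:

from pathlib import Path

from scorebook.utils import validate_path

# Single suffix, as before (assumes results.json exists).
results_path: Path = validate_path("results.json", expected_suffix=".json")

# Tuple of suffixes, new in this version (assumes config.yml exists).
config_path: Path = validate_path("config.yml", expected_suffix=(".yaml", ".yml"))

# A mismatch now reports all accepted extensions, e.g.
# ValueError: File must have one of ('.yaml', '.yml') extensions, got: '.csv'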