scorebook 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py
CHANGED
@@ -16,6 +16,7 @@ from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate._async.evaluate_async import evaluate_async
 from scorebook.evaluate._sync.evaluate import evaluate
 from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.metrics.core.metric_registry import scorebook_metric
 from scorebook.score._async.score_async import score_async
 from scorebook.score._sync.score import score
 from scorebook.utils.render_template import render_template
@@ -35,4 +36,5 @@ __all__ = [
     "create_project_async",
     "upload_result",
     "upload_result_async",
+    "scorebook_metric",
 ]
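The change above re-exports the `scorebook_metric` decorator from the package root. A minimal sketch of what that enables, assuming only what this diff and the metrics README (added later in this release) show — `scorebook_metric` registers a `MetricBase` subclass whose `score()` returns `(aggregate_scores, item_scores)`; the metric name `ExactLength` is hypothetical:

```python
# Hedged sketch: a user-defined metric registered via the newly exported decorator.
from typing import Any, Dict, List, Tuple

from scorebook import scorebook_metric
from scorebook.metrics import MetricBase


@scorebook_metric
class ExactLength(MetricBase):
    """Toy metric: fraction of outputs whose length matches the label's length."""

    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
        if not outputs:
            return {"exactlength": 0.0}, []
        item_scores = [len(str(o)) == len(str(lab)) for o, lab in zip(outputs, labels)]
        return {"exactlength": sum(item_scores) / len(item_scores)}, item_scores
```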
scorebook/dashboard/credentials.py
CHANGED
@@ -3,8 +3,10 @@
 import logging
 import os
 import pathlib
+import warnings
 from typing import Optional

+from dotenv import load_dotenv
 from trismik import TrismikClient

 from scorebook.settings import TRISMIK_SERVICE_URL
@@ -92,16 +94,44 @@ def validate_token(token: str) -> bool:
     return False


-def login(trismik_api_key: str) -> None:
+def login(trismik_api_key: Optional[str] = None) -> None:
     """Login to trismik by saving API key locally.

+    If no API key is provided, the function will attempt to read it from the
+    TRISMIK_API_KEY environment variable or .env file (using python-dotenv).
+    Environment variables take precedence over .env file values.
+
     Args:
-        trismik_api_key: The API key to use.
+        trismik_api_key: The API key to use. If not provided, reads from
+            environment or .env file.
     Raises:
-        ValueError: If API key is empty or invalid.
+        ValueError: If API key is empty, not found, or invalid.
+
+    Warns:
+        UserWarning: If an explicit API key is passed but TRISMIK_API_KEY
+            environment variable is also set.
     """
+    # Warn if user passes explicit key but env var is also set
+    if trismik_api_key is not None and os.environ.get("TRISMIK_API_KEY"):
+        warnings.warn(
+            "TRISMIK_API_KEY environment variable is set. The environment variable "
+            "takes precedence over the stored token when calling evaluate(). "
+            "To use the explicitly provided key, unset the TRISMIK_API_KEY "
+            "environment variable.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    if trismik_api_key is None:
+        # Load from .env file if TRISMIK_API_KEY is not already set in environment
+        load_dotenv()
+        trismik_api_key = os.environ.get("TRISMIK_API_KEY")
+
     if not trismik_api_key:
-        raise ValueError(
+        raise ValueError(
+            "API key cannot be empty. Either pass it as a parameter or "
+            "set the TRISMIK_API_KEY environment variable or .env file."
+        )

     # Validate token
     if not validate_token(trismik_api_key):
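A hedged sketch of the call patterns the new `login()` signature allows, based only on the behavior described in the docstring above (explicit key, environment variable, or a `.env` file read via python-dotenv); the key value shown is a placeholder:

```python
# Sketch of the three ways to supply the Trismik API key after this change.
import os

from scorebook.dashboard.credentials import login

# 1. Explicit key (warns if TRISMIK_API_KEY is also set in the environment):
login("my-trismik-api-key")  # placeholder key

# 2. No argument: falls back to the TRISMIK_API_KEY environment variable...
os.environ["TRISMIK_API_KEY"] = "my-trismik-api-key"
login()

# 3. ...or to a .env file containing TRISMIK_API_KEY=..., loaded via load_dotenv().
```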
scorebook/eval_datasets/eval_dataset.py
CHANGED
@@ -18,8 +18,8 @@ from scorebook.exceptions import (
     DatasetSampleError,
     MissingFieldError,
 )
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
 from scorebook.utils.io_helpers import validate_path
 from scorebook.utils.render_template import render_template

scorebook/evaluate/_async/evaluate_async.py
CHANGED
@@ -113,8 +113,6 @@ async def evaluate_async(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -151,19 +149,31 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference,
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )

         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            # Classic evals
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

         # Update upload progress for classic evals
@@ -195,11 +205,12 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""

@@ -218,6 +229,7 @@ async def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )

     else:
@@ -338,6 +350,7 @@ async def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -347,7 +360,7 @@ async def execute_adaptive_eval_run(
         raise ScoreBookError("Trismik client is required for adaptive evaluation")

     adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -365,6 +378,7 @@ async def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.

@@ -375,6 +389,7 @@ async def run_adaptive_evaluation(
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -404,6 +419,7 @@ async def run_adaptive_evaluation(
             inference_setup={},
         ),
         item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
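The `on_progress` parameter threaded through this file (and mirrored in the sync module below) is a plain `(current, total)` callback invoked as adaptive-evaluation items are processed. A minimal, standalone sketch of that contract only — the wiring into the Trismik adaptive runner is internal to scorebook and not reproduced here:

```python
# Hedged sketch of the (current, total) progress-callback shape added in this diff.
from typing import Callable


def make_console_progress(label: str) -> Callable[[int, int], None]:
    """Return a callback that prints adaptive-eval progress as 'label: current/total'."""

    def on_progress(current: int, total: int) -> None:
        print(f"{label}: {current}/{total} items", end="\r")

    return on_progress


# A caller owning its own display could pass such a callback wherever an
# on_progress parameter is accepted.
callback = make_console_progress("adaptive eval")
callback(3, 60)  # prints "adaptive eval: 3/60 items"
```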
scorebook/evaluate/_sync/evaluate.py
CHANGED
@@ -112,8 +112,6 @@ def evaluate(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -150,19 +148,31 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference,
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )

         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            # Classic evals
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

         # Update upload progress for classic evals
@@ -194,11 +204,12 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""

@@ -217,6 +228,7 @@ def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )

     else:
@@ -337,6 +349,7 @@ def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -346,7 +359,7 @@ def execute_adaptive_eval_run(
         raise ScoreBookError("Trismik client is required for adaptive evaluation")

     adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -364,6 +377,7 @@ def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.

@@ -374,6 +388,7 @@ def run_adaptive_evaluation(
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -403,6 +418,7 @@ def run_adaptive_evaluation(
             inference_setup={},
         ),
         item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
scorebook/metrics/README.md
ADDED
@@ -0,0 +1,121 @@
+# Adding Metrics to Scorebook
+
+This guide explains how to add new metrics to Scorebook.
+
+## Quick Start
+
+1. Create a metric file: `src/scorebook/metrics/yourmetric.py`
+2. Implement the metric class
+3. Add tests
+4. Submit PR for review
+
+### Where to Put Tests
+
+Tests go in one of two directories:
+
+- **`tests/unit/test_metrics/`** - For fast tests using mocked data. These run on every commit.
+- **`tests/extended/test_metrics/`** - For tests that require external dependencies, large datasets, or are computationally expensive.
+
+Most metrics only need unit tests. Use extended tests when your metric relies on external APIs, models, or takes significant time to run.
+
+See [CONTRIBUTING.md](../../../CONTRIBUTING.md) for instructions on running tests.
+
+---
+
+## Requirements
+
+Your metric must:
+
+- Use the `@scorebook_metric` decorator
+- Inherit from `MetricBase`
+- Implement the `score()` static method
+
+The `score()` method returns a tuple of `(aggregate_scores, item_scores)`:
+
+- **aggregate_scores**: A `Dict[str, float]` with overall metric values (e.g., `{"accuracy": 0.85}`)
+- **item_scores**: A `List` of per-item scores. For metrics that produce a single value per item, use `int`, `float`, `bool`, or `str`. For metrics that produce multiple values per item, use a `Dict[str, Union[int, float, bool, str]]` where keys are metric names.
+
+---
+
+## File Naming
+
+Metric files must use normalized names (lowercase, no underscores/spaces). This naming convention is required for the registry's lazy loading system to work.
+
+1. User requests a metric by name (e.g., `"f1_score"`, `"F1Score"`, or `"f1 score"`)
+2. The registry normalizes the input → `"f1score"`
+3. The registry imports `scorebook.metrics.f1score`
+4. The `@scorebook_metric` decorator registers the class
+
+**Examples:**
+- Class: `F1Score` → File: `f1score.py` → User can request: `"f1score"`, `"F1Score"`, `"f1_score"`, `"f1 score"`
+- Class: `MeanSquaredError` → File: `meansquarederror.py` → User can request: `"MeanSquaredError"`, `"mean_squared_error"`, etc.
+
+**Collision detection:** Class names that normalize to the same key will raise an error at registration time. For example, `F1Score` and `F1_Score` both normalize to `"f1score"` and cannot coexist.
+
+---
+
+## Implementation Template
+
+Create your metric file in `src/scorebook/metrics/yourmetric.py`:
+
+```python
+"""Brief description of the metric."""
+
+from typing import Any, Dict, List, Tuple
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class YourMetric(MetricBase):
+    """One-line description of what this metric measures.
+
+    Formula or explanation (e.g., Accuracy = correct / total).
+    """
+
+    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate metric score between outputs and labels.
+
+        Args:
+            outputs: A list of model inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+            - Aggregate scores dict (e.g., {"your_metric": 0.85})
+            - List of per-item scores
+
+        Raises:
+            ValueError: If outputs and labels have different lengths.
+        """
+        # Input validation
+        if len(outputs) != len(labels):
+            raise ValueError("Number of outputs must match number of labels")
+
+        if not outputs:
+            return {"your_metric": 0.0}, []
+
+        # Calculate per-item scores
+        item_scores = [calculate_score(out, lab) for out, lab in zip(outputs, labels)]
+
+        # Calculate aggregate score
+        aggregate_score = sum(item_scores) / len(item_scores)
+
+        return {"your_metric": aggregate_score}, item_scores
+```
+
+---
+
+## Documentation
+
+Each metric should have:
+
+1. **Module-level docstring**: Brief description at the top of the file
+2. **Class docstring**: What the metric measures, formula, and any limitations
+3. **Method docstring**: Args, Returns, and Raises sections
+
+---
+
+## Example
+
+See `src/scorebook/metrics/accuracy.py` for a complete reference implementation.
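The "File Naming" section of the README above describes how a requested metric name is normalized before the registry lazily imports the matching module. A minimal sketch of that normalization, assuming only the documented behavior (lowercase, drop underscores and spaces); the actual logic lives in scorebook/metrics/core/metric_registry.py and may differ:

```python
# Sketch of the name normalization described in the new metrics README.
import importlib


def normalize_metric_name(name: str) -> str:
    """Map 'F1Score', 'f1_score', or 'f1 score' to 'f1score'."""
    return name.lower().replace("_", "").replace(" ", "")


def lazy_import_metric_module(name: str):
    """Import scorebook.metrics.<normalized>, triggering @scorebook_metric registration."""
    return importlib.import_module(f"scorebook.metrics.{normalize_metric_name(name)}")


assert normalize_metric_name("F1 Score") == "f1score"
```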
scorebook/metrics/__init__.py
CHANGED
scorebook/metrics/accuracy.py
CHANGED
@@ -2,11 +2,10 @@

 from typing import Any, Dict, List, Tuple

-from scorebook.metrics
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics import MetricBase, scorebook_metric


-@
+@scorebook_metric
 class Accuracy(MetricBase):
     """Accuracy metric for evaluating model predictions of any type.

@@ -25,9 +24,6 @@ class Accuracy(MetricBase):
         The aggregate accuracy score for all items (correct predictions / total predictions).
         The item scores for each output-label pair (true/false).
         """
-        if len(outputs) != len(labels):
-            raise ValueError("Number of outputs must match number of labels")
-
         if not outputs:  # Handle empty lists
             return {"accuracy": 0.0}, []

scorebook/metrics/bertscore.py
ADDED
@@ -0,0 +1,50 @@
+"""BertScore implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple
+
+import bert_score
+
+from scorebook.metrics import scorebook_metric
+from scorebook.metrics.core.metric_base import MetricBase
+
+
+@scorebook_metric
+class BertScore(MetricBase):
+    """BertScore metric for evaluating model predictions against reference text."""
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize BertScore metric."""
+        defaults = {"lang": "en", "verbose": False}
+        self.kwargs = {**defaults, **kwargs}  # User kwargs override defaults
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate bert score between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            A tuple containing:
+            - aggregate_scores (Dict[str, float]): Dictionary with average precision,
+              recall, and F1 scores for all items.
+            - item_scores (List[Dict[str, float]]): List of dictionaries with precision,
+              recall, and F1 scores for each output-label pair.
+        """
+        if not outputs:  # Handle empty lists
+            return {"precision": 0.0, "recall": 0.0, "F1": 0.0}, []
+
+        # Calculate item scores
+        p_scores, r_scores, f1_scores = bert_score.score(outputs, labels, **self.kwargs)
+
+        item_scores = [
+            {"precision": p, "recall": r, "F1": f1}
+            for p, r, f1 in zip(p_scores.tolist(), r_scores.tolist(), f1_scores.tolist())
+        ]
+        aggregate_scores = {
+            "precision": p_scores.mean().item(),
+            "recall": r_scores.mean().item(),
+            "F1": f1_scores.mean().item(),
+        }
+
+        return aggregate_scores, item_scores
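A hedged usage sketch for the new BertScore metric, assuming only what the diff above shows (keyword arguments are forwarded to `bert_score.score()`). Note that bert-score downloads a pretrained model on first use, so a call like this belongs in the extended rather than the unit test suite:

```python
# Sketch: scoring a single prediction/reference pair with the new BertScore class.
from scorebook.metrics.bertscore import BertScore

metric = BertScore(lang="en")  # kwargs are passed through to bert_score.score()
aggregate, per_item = metric.score(
    outputs=["The cat sat on the mat."],
    labels=["A cat was sitting on the mat."],
)
print(aggregate)   # e.g. {"precision": ..., "recall": ..., "F1": ...}
print(per_item[0])  # per-item precision/recall/F1 for the single pair
```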
scorebook/metrics/bleu.py
ADDED
@@ -0,0 +1,82 @@
+"""BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+from typing import Any, Dict, List, Tuple
+
+import sacrebleu
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class BLEU(MetricBase):
+    """BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+    def __init__(self, compact: bool = True, **kwargs: Any) -> None:
+        """
+        Generate BLEU metric.
+
+        :param compact: if True, returns only the BLEU metric; if False,
+            returns the full signature of BLEU.
+        :param kwargs: additional arguments passed to BLEU.
+        """
+
+        self.compact = compact
+        self.corpus_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+        # Overwrite effective order for sentence level scores
+        kwargs["effective_order"] = True
+        self.sentence_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate BLEU scores between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            The aggregate (corpus-level) BLEU score for all items.
+            The item (sentence-level) BLEU scores for each output-label pair.
+        """
+
+        if not outputs:  # Handle empty lists
+            return {"BLEU": 0.0}, []
+
+        item_scores = []
+        # Calculate item scores
+        for output, label in zip(outputs, labels):
+            item_bleu: sacrebleu.metrics.BLEUScore = self.sentence_bleu.sentence_score(
+                output, [label]
+            )
+            item_score = {
+                "BLEU": item_bleu.score,
+            }
+
+            if not self.compact:
+                item_score["1-gram"] = item_bleu.precisions[0]
+                item_score["2-gram"] = item_bleu.precisions[1]
+                item_score["3-gram"] = item_bleu.precisions[2]
+                item_score["4-gram"] = item_bleu.precisions[3]
+                item_score["BP"] = item_bleu.bp
+                item_score["ratio"] = item_bleu.ratio
+                item_score["hyp_len"] = item_bleu.sys_len
+                item_score["ref_len"] = item_bleu.ref_len
+
+            item_scores.append(item_score)
+
+        # Calculate aggregate score
+
+        corpus_bleu: sacrebleu.metrics.BLEUScore = self.corpus_bleu.corpus_score(outputs, [labels])
+        aggregate_scores = {"BLEU": corpus_bleu.score}
+
+        if not self.compact:
+            aggregate_scores["1-gram"] = corpus_bleu.precisions[0]
+            aggregate_scores["2-gram"] = corpus_bleu.precisions[1]
+            aggregate_scores["3-gram"] = corpus_bleu.precisions[2]
+            aggregate_scores["4-gram"] = corpus_bleu.precisions[3]
+            aggregate_scores["BP"] = corpus_bleu.bp
+            aggregate_scores["ratio"] = corpus_bleu.ratio
+            aggregate_scores["hyp_len"] = corpus_bleu.sys_len
+            aggregate_scores["ref_len"] = corpus_bleu.ref_len
+
+        return aggregate_scores, item_scores
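A hedged usage sketch for the new BLEU metric, assuming only what the diff above shows: `compact=True` returns just the BLEU score, while `compact=False` adds n-gram precisions, the brevity penalty (BP), the length ratio, and hypothesis/reference lengths:

```python
# Sketch: corpus-level and per-sentence BLEU with the full (non-compact) signature.
from scorebook.metrics.bleu import BLEU

metric = BLEU(compact=False)
aggregate, per_item = metric.score(
    outputs=["the cat is on the mat"],
    labels=["there is a cat on the mat"],
)
print(aggregate["BLEU"], aggregate["BP"])  # corpus BLEU and brevity penalty
print(per_item[0]["1-gram"])               # unigram precision for the first item
```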
scorebook/metrics/core/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Core metric framework components."""
scorebook/metrics/{metric_base.py → core/metric_base.py}
RENAMED
@@ -12,9 +12,8 @@ class MetricBase(ABC):
         """Return the metric name based on the class name."""
         return self.__class__.__name__.lower()

-    @staticmethod
     @abstractmethod
-    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
         """Calculate the metric score for a list of outputs and labels.

         Args: