scorebook 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
  23. scorebook-0.0.15.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py CHANGED
@@ -16,6 +16,7 @@ from scorebook.eval_datasets.eval_dataset import EvalDataset
  from scorebook.evaluate._async.evaluate_async import evaluate_async
  from scorebook.evaluate._sync.evaluate import evaluate
  from scorebook.inference.inference_pipeline import InferencePipeline
+ from scorebook.metrics.core.metric_registry import scorebook_metric
  from scorebook.score._async.score_async import score_async
  from scorebook.score._sync.score import score
  from scorebook.utils.render_template import render_template
@@ -35,4 +36,5 @@ __all__ = [
      "create_project_async",
      "upload_result",
      "upload_result_async",
+     "scorebook_metric",
  ]
scorebook/dashboard/credentials.py CHANGED
@@ -3,8 +3,10 @@
  import logging
  import os
  import pathlib
+ import warnings
  from typing import Optional

+ from dotenv import load_dotenv
  from trismik import TrismikClient

  from scorebook.settings import TRISMIK_SERVICE_URL
@@ -92,16 +94,44 @@ def validate_token(token: str) -> bool:
      return False


- def login(trismik_api_key: str) -> None:
+ def login(trismik_api_key: Optional[str] = None) -> None:
      """Login to trismik by saving API key locally.

+     If no API key is provided, the function will attempt to read it from the
+     TRISMIK_API_KEY environment variable or .env file (using python-dotenv).
+     Environment variables take precedence over .env file values.
+
      Args:
-         trismik_api_key: The API key to use.
+         trismik_api_key: The API key to use. If not provided, reads from
+             environment or .env file.
      Raises:
-         ValueError: If API key is empty or invalid.
+         ValueError: If API key is empty, not found, or invalid.
+
+     Warns:
+         UserWarning: If an explicit API key is passed but TRISMIK_API_KEY
+             environment variable is also set.
      """
+     # Warn if user passes explicit key but env var is also set
+     if trismik_api_key is not None and os.environ.get("TRISMIK_API_KEY"):
+         warnings.warn(
+             "TRISMIK_API_KEY environment variable is set. The environment variable "
+             "takes precedence over the stored token when calling evaluate(). "
+             "To use the explicitly provided key, unset the TRISMIK_API_KEY "
+             "environment variable.",
+             UserWarning,
+             stacklevel=2,
+         )
+
+     if trismik_api_key is None:
+         # Load from .env file if TRISMIK_API_KEY is not already set in environment
+         load_dotenv()
+         trismik_api_key = os.environ.get("TRISMIK_API_KEY")
+
      if not trismik_api_key:
-         raise ValueError("API key cannot be empty")
+         raise ValueError(
+             "API key cannot be empty. Either pass it as a parameter or "
+             "set the TRISMIK_API_KEY environment variable or .env file."
+         )

      # Validate token
      if not validate_token(trismik_api_key):
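A minimal usage sketch of the reworked `login()` above, assuming it is importable from `scorebook.dashboard.credentials` (the path shown in the file list); the key values below are placeholders:

```python
import os

from scorebook.dashboard.credentials import login  # import path inferred from the file list

# Passing a key explicitly still works, but now warns if TRISMIK_API_KEY is also
# set, since the environment variable takes precedence when evaluate() runs.
login(trismik_api_key="trismik-xxxx")  # placeholder key

# New in 0.0.15: omit the argument and let login() resolve the key from the
# TRISMIK_API_KEY environment variable or a local .env file (via python-dotenv).
os.environ["TRISMIK_API_KEY"] = "trismik-xxxx"  # placeholder key
login()
```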
scorebook/eval_datasets/eval_dataset.py CHANGED
@@ -18,8 +18,8 @@ from scorebook.exceptions import (
      DatasetSampleError,
      MissingFieldError,
  )
- from scorebook.metrics.metric_base import MetricBase
- from scorebook.metrics.metric_registry import MetricRegistry
+ from scorebook.metrics.core.metric_base import MetricBase
+ from scorebook.metrics.core.metric_registry import MetricRegistry
  from scorebook.utils.io_helpers import validate_path
  from scorebook.utils.render_template import render_template
scorebook/evaluate/_async/evaluate_async.py CHANGED
@@ -113,8 +113,6 @@ async def evaluate_async(
      with evaluation_progress_context(
          total_eval_runs=len(eval_run_specs),
          total_items=total_items,
-         dataset_count=len(datasets),
-         hyperparam_count=len(hyperparameter_configs),
          model_display=model_display,
          enabled=show_progress_bars,
      ) as progress_bars:
@@ -151,19 +149,31 @@ async def execute_runs(
      async def worker(
          run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
      ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         # Create progress callback for adaptive evals
+         on_progress: Optional[Callable[[int, int], None]] = None
+         if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+             def _on_progress(current: int, total: int) -> None:
+                 progress_bars.on_item_progress(current, total)
+
+             on_progress = _on_progress
+
          # Execute run (score_async handles upload internally for classic evals)
          run_result = await execute_run(
-             inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
+             inference,
+             run,
+             upload_results,
+             experiment_id,
+             project_id,
+             metadata,
+             trismik_client,
+             on_progress,
          )

          # Update progress bars with items processed and success status
          if progress_bars is not None:
-             # Classic evals have .items; adaptive evals use max_iterations
-             items_processed = (
-                 len(run.dataset.items)
-                 if isinstance(run, EvalRunSpec)
-                 else evaluation_settings["max_iterations"]
-             )
+             # Classic evals: update items count; Adaptive evals: items already tracked via callback
+             items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
              progress_bars.on_run_completed(items_processed, run_result.run_completed)

          # Update upload progress for classic evals
@@ -195,11 +205,12 @@ async def execute_runs(
  async def execute_run(
      inference: Callable,
      run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-     upload_results: bool,  # NEW PARAMETER
+     upload_results: bool,
      experiment_id: Optional[str] = None,
      project_id: Optional[str] = None,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
      """Execute a single evaluation run."""

@@ -218,6 +229,7 @@ async def execute_run(
              resolved_project_id,
              metadata,
              trismik_client,
+             on_progress,
          )

      else:
@@ -338,6 +350,7 @@ async def execute_adaptive_eval_run(
      project_id: str,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Execute an adaptive evaluation run."""
      logger.debug("Executing adaptive run for %s", run)
@@ -347,7 +360,7 @@ async def execute_adaptive_eval_run(
          raise ScoreBookError("Trismik client is required for adaptive evaluation")

      adaptive_eval_run_result = await run_adaptive_evaluation(
-         inference, run, experiment_id, project_id, metadata, trismik_client
+         inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
      )
      logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -365,6 +378,7 @@ async def run_adaptive_evaluation(
      project_id: str,
      metadata: Any,
      trismik_client: Union[TrismikClient, TrismikAsyncClient],
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Run an adaptive evaluation using the Trismik API.

@@ -375,6 +389,7 @@ async def run_adaptive_evaluation(
          project_id: Trismik project ID
          metadata: Additional metadata
          trismik_client: Trismik client instance
+         on_progress: Optional callback for progress updates (current, total)
      Returns:
          Results from the adaptive evaluation
      """
@@ -404,6 +419,7 @@ async def run_adaptive_evaluation(
              inference_setup={},
          ),
          item_processor=make_trismik_inference(inference_with_hyperparams),
+         on_progress=on_progress,
          return_dict=False,
      )
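The `on_progress` hook added throughout this file (and mirrored in the sync module below) is a plain `(current, total)` callback; a sketch of that contract, using a hypothetical `make_console_progress` helper rather than Scorebook's internal `progress_bars.on_item_progress` closure:

```python
from typing import Callable, Optional


def make_console_progress(label: str = "adaptive eval") -> Callable[[int, int], None]:
    """Build a callback with the same (current, total) signature used above."""

    def _on_progress(current: int, total: int) -> None:
        print(f"{label}: {current}/{total} items")

    return _on_progress


on_progress: Optional[Callable[[int, int], None]] = make_console_progress()
on_progress(3, 40)  # prints "adaptive eval: 3/40 items"
```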
scorebook/evaluate/_sync/evaluate.py CHANGED
@@ -112,8 +112,6 @@ def evaluate(
      with evaluation_progress_context(
          total_eval_runs=len(eval_run_specs),
          total_items=total_items,
-         dataset_count=len(datasets),
-         hyperparam_count=len(hyperparameter_configs),
          model_display=model_display,
          enabled=show_progress_bars,
      ) as progress_bars:
@@ -150,19 +148,31 @@ def execute_runs(
      def worker(
          run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
      ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         # Create progress callback for adaptive evals
+         on_progress: Optional[Callable[[int, int], None]] = None
+         if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+             def _on_progress(current: int, total: int) -> None:
+                 progress_bars.on_item_progress(current, total)
+
+             on_progress = _on_progress
+
          # Execute run (score_async handles upload internally for classic evals)
          run_result = execute_run(
-             inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
+             inference,
+             run,
+             upload_results,
+             experiment_id,
+             project_id,
+             metadata,
+             trismik_client,
+             on_progress,
          )

          # Update progress bars with items processed and success status
          if progress_bars is not None:
-             # Classic evals have .items; adaptive evals use max_iterations
-             items_processed = (
-                 len(run.dataset.items)
-                 if isinstance(run, EvalRunSpec)
-                 else evaluation_settings["max_iterations"]
-             )
+             # Classic evals: update items count; Adaptive evals: items already tracked via callback
+             items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
              progress_bars.on_run_completed(items_processed, run_result.run_completed)

          # Update upload progress for classic evals
@@ -194,11 +204,12 @@ def execute_runs(
  def execute_run(
      inference: Callable,
      run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-     upload_results: bool,  # NEW PARAMETER
+     upload_results: bool,
      experiment_id: Optional[str] = None,
      project_id: Optional[str] = None,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
      """Execute a single evaluation run."""

@@ -217,6 +228,7 @@ def execute_run(
              resolved_project_id,
              metadata,
              trismik_client,
+             on_progress,
          )

      else:
@@ -337,6 +349,7 @@ def execute_adaptive_eval_run(
      project_id: str,
      metadata: Optional[Dict[str, Any]] = None,
      trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Execute an adaptive evaluation run."""
      logger.debug("Executing adaptive run for %s", run)
@@ -346,7 +359,7 @@ def execute_adaptive_eval_run(
          raise ScoreBookError("Trismik client is required for adaptive evaluation")

      adaptive_eval_run_result = run_adaptive_evaluation(
-         inference, run, experiment_id, project_id, metadata, trismik_client
+         inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
      )
      logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -364,6 +377,7 @@ def run_adaptive_evaluation(
      project_id: str,
      metadata: Any,
      trismik_client: Union[TrismikClient, TrismikAsyncClient],
+     on_progress: Optional[Callable[[int, int], None]] = None,
  ) -> AdaptiveEvalRunResult:
      """Run an adaptive evaluation using the Trismik API.

@@ -374,6 +388,7 @@ def run_adaptive_evaluation(
          project_id: Trismik project ID
          metadata: Additional metadata
          trismik_client: Trismik client instance
+         on_progress: Optional callback for progress updates (current, total)
      Returns:
          Results from the adaptive evaluation
      """
@@ -403,6 +418,7 @@ def run_adaptive_evaluation(
              inference_setup={},
          ),
          item_processor=make_trismik_inference(inference_with_hyperparams),
+         on_progress=on_progress,
          return_dict=False,
      )
scorebook/metrics/README.md ADDED
@@ -0,0 +1,121 @@
+ # Adding Metrics to Scorebook
+
+ This guide explains how to add new metrics to Scorebook.
+
+ ## Quick Start
+
+ 1. Create a metric file: `src/scorebook/metrics/yourmetric.py`
+ 2. Implement the metric class
+ 3. Add tests
+ 4. Submit PR for review
+
+ ### Where to Put Tests
+
+ Tests go in one of two directories:
+
+ - **`tests/unit/test_metrics/`** - For fast tests using mocked data. These run on every commit.
+ - **`tests/extended/test_metrics/`** - For tests that require external dependencies, large datasets, or are computationally expensive.
+
+ Most metrics only need unit tests. Use extended tests when your metric relies on external APIs, models, or takes significant time to run.
+
+ See [CONTRIBUTING.md](../../../CONTRIBUTING.md) for instructions on running tests.
+
+ ---
+
+ ## Requirements
+
+ Your metric must:
+
+ - Use the `@scorebook_metric` decorator
+ - Inherit from `MetricBase`
+ - Implement the `score()` static method
+
+ The `score()` method returns a tuple of `(aggregate_scores, item_scores)`:
+
+ - **aggregate_scores**: A `Dict[str, float]` with overall metric values (e.g., `{"accuracy": 0.85}`)
+ - **item_scores**: A `List` of per-item scores. For metrics that produce a single value per item, use `int`, `float`, `bool`, or `str`. For metrics that produce multiple values per item, use a `Dict[str, Union[int, float, bool, str]]` where keys are metric names.
+
+ ---
+
+ ## File Naming
+
+ Metric files must use normalized names (lowercase, no underscores/spaces). This naming convention is required for the registry's lazy loading system to work.
+
+ 1. User requests a metric by name (e.g., `"f1_score"`, `"F1Score"`, or `"f1 score"`)
+ 2. The registry normalizes the input → `"f1score"`
+ 3. The registry imports `scorebook.metrics.f1score`
+ 4. The `@scorebook_metric` decorator registers the class
+
+ **Examples:**
+ - Class: `F1Score` → File: `f1score.py` → User can request: `"f1score"`, `"F1Score"`, `"f1_score"`, `"f1 score"`
+ - Class: `MeanSquaredError` → File: `meansquarederror.py` → User can request: `"MeanSquaredError"`, `"mean_squared_error"`, etc.
+
+ **Collision detection:** Class names that normalize to the same key will raise an error at registration time. For example, `F1Score` and `F1_Score` both normalize to `"f1score"` and cannot coexist.
+
+ ---
+
+ ## Implementation Template
+
+ Create your metric file in `src/scorebook/metrics/yourmetric.py`:
+
+ ```python
+ """Brief description of the metric."""
+
+ from typing import Any, Dict, List, Tuple
+
+ from scorebook.metrics import MetricBase, scorebook_metric
+
+
+ @scorebook_metric
+ class YourMetric(MetricBase):
+     """One-line description of what this metric measures.
+
+     Formula or explanation (e.g., Accuracy = correct / total).
+     """
+
+     def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+         """Calculate metric score between outputs and labels.
+
+         Args:
+             outputs: A list of model inference outputs.
+             labels: A list of ground truth labels.
+
+         Returns:
+             Tuple containing:
+             - Aggregate scores dict (e.g., {"your_metric": 0.85})
+             - List of per-item scores
+
+         Raises:
+             ValueError: If outputs and labels have different lengths.
+         """
+         # Input validation
+         if len(outputs) != len(labels):
+             raise ValueError("Number of outputs must match number of labels")
+
+         if not outputs:
+             return {"your_metric": 0.0}, []
+
+         # Calculate per-item scores
+         item_scores = [calculate_score(out, lab) for out, lab in zip(outputs, labels)]
+
+         # Calculate aggregate score
+         aggregate_score = sum(item_scores) / len(item_scores)
+
+         return {"your_metric": aggregate_score}, item_scores
+ ```
+
+ ---
+
+ ## Documentation
+
+ Each metric should have:
+
+ 1. **Module-level docstring**: Brief description at the top of the file
+ 2. **Class docstring**: What the metric measures, formula, and any limitations
+ 3. **Method docstring**: Args, Returns, and Raises sections
+
+ ---
+
+ ## Example
+
+ See `src/scorebook/metrics/accuracy.py` for a complete reference implementation.
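The File Naming section above implies roughly the following normalization rule; this sketch uses a hypothetical `normalize_metric_name` helper and is not the actual code in `core/metric_registry.py`:

```python
def normalize_metric_name(name: str) -> str:
    """Hypothetical helper mirroring the rule described in the README above."""
    return name.lower().replace("_", "").replace(" ", "")


assert normalize_metric_name("F1Score") == "f1score"
assert normalize_metric_name("f1 score") == "f1score"
assert normalize_metric_name("mean_squared_error") == "meansquarederror"

# The registry would then lazily import the matching module, e.g.
# importlib.import_module("scorebook.metrics." + normalize_metric_name("F1Score")),
# and the @scorebook_metric decorator registers the class on import.
```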
scorebook/metrics/__init__.py CHANGED
@@ -1 +1,9 @@
  """Metrics for evaluating model predictions."""
+
+ from scorebook.metrics.core.metric_base import MetricBase
+ from scorebook.metrics.core.metric_registry import scorebook_metric
+
+ __all__ = [
+     "MetricBase",
+     "scorebook_metric",
+ ]
scorebook/metrics/accuracy.py CHANGED
@@ -2,11 +2,10 @@

  from typing import Any, Dict, List, Tuple

- from scorebook.metrics.metric_base import MetricBase
- from scorebook.metrics.metric_registry import MetricRegistry
+ from scorebook.metrics import MetricBase, scorebook_metric


- @MetricRegistry.register()
+ @scorebook_metric
  class Accuracy(MetricBase):
      """Accuracy metric for evaluating model predictions of any type.

@@ -25,9 +24,6 @@ class Accuracy(MetricBase):
              The aggregate accuracy score for all items (correct predictions / total predictions).
              The item scores for each output-label pair (true/false).
          """
-         if len(outputs) != len(labels):
-             raise ValueError("Number of outputs must match number of labels")
-
          if not outputs:  # Handle empty lists
              return {"accuracy": 0.0}, []
scorebook/metrics/bertscore.py ADDED
@@ -0,0 +1,50 @@
+ """BertScore implementation for Scorebook."""
+
+ from typing import Any, Dict, List, Tuple
+
+ import bert_score
+
+ from scorebook.metrics import scorebook_metric
+ from scorebook.metrics.core.metric_base import MetricBase
+
+
+ @scorebook_metric
+ class BertScore(MetricBase):
+     """BertScore metric for evaluating model predictions against reference text."""
+
+     def __init__(self, **kwargs: Any) -> None:
+         """Initialize BertScore metric."""
+         defaults = {"lang": "en", "verbose": False}
+         self.kwargs = {**defaults, **kwargs}  # User kwargs override defaults
+
+     def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+         """Calculate bert score between predictions and references.
+
+         Args:
+             outputs: A list of inference outputs.
+             labels: A list of ground truth labels.
+
+         Returns:
+             A tuple containing:
+             - aggregate_scores (Dict[str, float]): Dictionary with average precision,
+               recall, and F1 scores for all items.
+             - item_scores (List[Dict[str, float]]): List of dictionaries with precision,
+               recall, and F1 scores for each output-label pair.
+         """
+         if not outputs:  # Handle empty lists
+             return {"precision": 0.0, "recall": 0.0, "F1": 0.0}, []
+
+         # Calculate item scores
+         p_scores, r_scores, f1_scores = bert_score.score(outputs, labels, **self.kwargs)
+
+         item_scores = [
+             {"precision": p, "recall": r, "F1": f1}
+             for p, r, f1 in zip(p_scores.tolist(), r_scores.tolist(), f1_scores.tolist())
+         ]
+         aggregate_scores = {
+             "precision": p_scores.mean().item(),
+             "recall": r_scores.mean().item(),
+             "F1": f1_scores.mean().item(),
+         }
+
+         return aggregate_scores, item_scores
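A hedged usage sketch for the new `BertScore` metric above; it requires the `bert-score` package, the first call downloads a model, and the exact numbers depend on that model:

```python
from scorebook.metrics.bertscore import BertScore

metric = BertScore(lang="en")  # extra kwargs are passed through to bert_score.score
aggregate, per_item = metric.score(
    outputs=["The cat sat on the mat."],
    labels=["A cat was sitting on the mat."],
)
print(aggregate)          # {"precision": ..., "recall": ..., "F1": ...}
print(per_item[0]["F1"])  # per-item F1 for the first output/label pair
```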
scorebook/metrics/bleu.py ADDED
@@ -0,0 +1,82 @@
+ """BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+ from typing import Any, Dict, List, Tuple
+
+ import sacrebleu
+
+ from scorebook.metrics import MetricBase, scorebook_metric
+
+
+ @scorebook_metric
+ class BLEU(MetricBase):
+     """BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+     def __init__(self, compact: bool = True, **kwargs: Any) -> None:
+         """
+         Generate BLEU metric.
+
+         :param compact: if True, returns only the BLEU metric; if False,
+             returns the full signature of BLEU.
+         :param kwargs: additional arguments passed to BLEU.
+         """
+
+         self.compact = compact
+         self.corpus_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+         # Overwrite effective order for sentence level scores
+         kwargs["effective_order"] = True
+         self.sentence_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+     def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+         """Calculate accuracy score between predictions and references.
+
+         Args:
+             outputs: A list of inference outputs.
+             labels: A list of ground truth labels.
+
+         Returns:
+             The aggregate accuracy score for all items (correct predictions / total predictions).
+             The item scores for each output-label pair (true/false).
+         """
+
+         if not outputs:  # Handle empty lists
+             return {"BLEU": 0.0}, []
+
+         item_scores = []
+         # Calculate item scores
+         for output, label in zip(outputs, labels):
+             item_bleu: sacrebleu.metrics.BLEUScore = self.sentence_bleu.sentence_score(
+                 output, [label]
+             )
+             item_score = {
+                 "BLEU": item_bleu.score,
+             }
+
+             if not self.compact:
+                 item_score["1-gram"] = item_bleu.precisions[0]
+                 item_score["2-gram"] = item_bleu.precisions[1]
+                 item_score["3-gram"] = item_bleu.precisions[2]
+                 item_score["4-gram"] = item_bleu.precisions[3]
+                 item_score["BP"] = item_bleu.bp
+                 item_score["ratio"] = item_bleu.ratio
+                 item_score["hyp_len"] = item_bleu.sys_len
+                 item_score["ref_len"] = item_bleu.ref_len
+
+             item_scores.append(item_score)
+
+         # Calculate aggregate score
+
+         corpus_bleu: sacrebleu.metrics.BLEUScore = self.corpus_bleu.corpus_score(outputs, [labels])
+         aggregate_scores = {"BLEU": corpus_bleu.score}
+
+         if not self.compact:
+             aggregate_scores["1-gram"] = corpus_bleu.precisions[0]
+             aggregate_scores["2-gram"] = corpus_bleu.precisions[1]
+             aggregate_scores["3-gram"] = corpus_bleu.precisions[2]
+             aggregate_scores["4-gram"] = corpus_bleu.precisions[3]
+             aggregate_scores["BP"] = corpus_bleu.bp
+             aggregate_scores["ratio"] = corpus_bleu.ratio
+             aggregate_scores["hyp_len"] = corpus_bleu.sys_len
+             aggregate_scores["ref_len"] = corpus_bleu.ref_len
+
+         return aggregate_scores, item_scores
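And a similar sketch for the new `BLEU` metric above (backed by sacrebleu); `compact=False` exposes the per-n-gram precisions, brevity penalty, and length fields built in the implementation:

```python
from scorebook.metrics.bleu import BLEU

metric = BLEU(compact=False)  # compact=True (the default) keeps only the "BLEU" key
aggregate, per_item = metric.score(
    outputs=["the cat sat on the mat"],
    labels=["the cat is on the mat"],
)
print(aggregate["BLEU"], aggregate["1-gram"], aggregate["BP"])
print(per_item[0]["BLEU"])
```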
scorebook/metrics/core/__init__.py ADDED
@@ -0,0 +1 @@
+ """Core metric framework components."""
scorebook/metrics/{metric_base.py → core/metric_base.py} RENAMED
@@ -12,9 +12,8 @@ class MetricBase(ABC):
          """Return the metric name based on the class name."""
          return self.__class__.__name__.lower()

-     @staticmethod
      @abstractmethod
-     def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+     def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
          """Calculate the metric score for a list of outputs and labels.

          Args: