scorebook 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -0
- scorebook/dashboard/credentials.py +34 -4
- scorebook/eval_datasets/eval_dataset.py +2 -2
- scorebook/evaluate/_async/evaluate_async.py +27 -11
- scorebook/evaluate/_sync/evaluate.py +27 -11
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +8 -0
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/score_helpers.py +28 -11
- scorebook/types.py +2 -2
- scorebook/utils/progress_bars.py +58 -786
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -107
- scorebook-0.0.14.dist-info/RECORD +0 -53
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py
CHANGED
@@ -16,6 +16,7 @@ from scorebook.eval_datasets.eval_dataset import EvalDataset
 from scorebook.evaluate._async.evaluate_async import evaluate_async
 from scorebook.evaluate._sync.evaluate import evaluate
 from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.metrics.core.metric_registry import scorebook_metric
 from scorebook.score._async.score_async import score_async
 from scorebook.score._sync.score import score
 from scorebook.utils.render_template import render_template
@@ -35,4 +36,5 @@ __all__ = [
     "create_project_async",
     "upload_result",
     "upload_result_async",
+    "scorebook_metric",
 ]
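The change above re-exports the `scorebook_metric` decorator from the package root. A minimal sketch of what that enables, assuming only what this diff and the metrics README (added later in this release) show — `scorebook_metric` registers a `MetricBase` subclass whose `score()` returns `(aggregate_scores, item_scores)`; the metric name `ExactLength` is hypothetical:

```python
# Hedged sketch: a user-defined metric registered via the newly exported decorator.
from typing import Any, Dict, List, Tuple

from scorebook import scorebook_metric
from scorebook.metrics import MetricBase


@scorebook_metric
class ExactLength(MetricBase):
    """Toy metric: fraction of outputs whose length matches the label's length."""

    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
        if not outputs:
            return {"exactlength": 0.0}, []
        item_scores = [len(str(o)) == len(str(lab)) for o, lab in zip(outputs, labels)]
        return {"exactlength": sum(item_scores) / len(item_scores)}, item_scores
```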
scorebook/dashboard/credentials.py
CHANGED
@@ -3,8 +3,10 @@
 import logging
 import os
 import pathlib
+import warnings
 from typing import Optional

+from dotenv import load_dotenv
 from trismik import TrismikClient

 from scorebook.settings import TRISMIK_SERVICE_URL
@@ -92,16 +94,44 @@ def validate_token(token: str) -> bool:
     return False


-def login(trismik_api_key: str) -> None:
+def login(trismik_api_key: Optional[str] = None) -> None:
     """Login to trismik by saving API key locally.

+    If no API key is provided, the function will attempt to read it from the
+    TRISMIK_API_KEY environment variable or .env file (using python-dotenv).
+    Environment variables take precedence over .env file values.
+
     Args:
-        trismik_api_key: The API key to use.
+        trismik_api_key: The API key to use. If not provided, reads from
+            environment or .env file.
     Raises:
-        ValueError: If API key is empty or invalid.
+        ValueError: If API key is empty, not found, or invalid.
+
+    Warns:
+        UserWarning: If an explicit API key is passed but TRISMIK_API_KEY
+            environment variable is also set.
     """
+    # Warn if user passes explicit key but env var is also set
+    if trismik_api_key is not None and os.environ.get("TRISMIK_API_KEY"):
+        warnings.warn(
+            "TRISMIK_API_KEY environment variable is set. The environment variable "
+            "takes precedence over the stored token when calling evaluate(). "
+            "To use the explicitly provided key, unset the TRISMIK_API_KEY "
+            "environment variable.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    if trismik_api_key is None:
+        # Load from .env file if TRISMIK_API_KEY is not already set in environment
+        load_dotenv()
+        trismik_api_key = os.environ.get("TRISMIK_API_KEY")
+
     if not trismik_api_key:
-        raise ValueError(
+        raise ValueError(
+            "API key cannot be empty. Either pass it as a parameter or "
+            "set the TRISMIK_API_KEY environment variable or .env file."
+        )

     # Validate token
     if not validate_token(trismik_api_key):
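A hedged sketch of the call patterns the new `login()` signature allows, based only on the behavior described in the docstring above (explicit key, environment variable, or a `.env` file read via python-dotenv); the key value shown is a placeholder:

```python
# Sketch of the three ways to supply the Trismik API key after this change.
import os

from scorebook.dashboard.credentials import login

# 1. Explicit key (warns if TRISMIK_API_KEY is also set in the environment):
login("my-trismik-api-key")  # placeholder key

# 2. No argument: falls back to the TRISMIK_API_KEY environment variable...
os.environ["TRISMIK_API_KEY"] = "my-trismik-api-key"
login()

# 3. ...or to a .env file containing TRISMIK_API_KEY=..., loaded via load_dotenv().
```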
scorebook/eval_datasets/eval_dataset.py
CHANGED
@@ -18,8 +18,8 @@ from scorebook.exceptions import (
     DatasetSampleError,
     MissingFieldError,
 )
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
 from scorebook.utils.io_helpers import validate_path
 from scorebook.utils.render_template import render_template

scorebook/evaluate/_async/evaluate_async.py
CHANGED
@@ -113,8 +113,6 @@ async def evaluate_async(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -151,19 +149,31 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference,
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )

         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            # Classic evals
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

         # Update upload progress for classic evals
@@ -195,11 +205,12 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""

@@ -218,6 +229,7 @@ async def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )

     else:
@@ -338,6 +350,7 @@ async def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -347,7 +360,7 @@ async def execute_adaptive_eval_run(
         raise ScoreBookError("Trismik client is required for adaptive evaluation")

     adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -365,6 +378,7 @@ async def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.

@@ -375,6 +389,7 @@ async def run_adaptive_evaluation(
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -404,6 +419,7 @@ async def run_adaptive_evaluation(
             inference_setup={},
         ),
         item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
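The `on_progress` parameter threaded through this file (and mirrored in the sync module below) is a plain `(current, total)` callback invoked as adaptive-evaluation items are processed. A minimal, standalone sketch of that contract only — the wiring into the Trismik adaptive runner is internal to scorebook and not reproduced here:

```python
# Hedged sketch of the (current, total) progress-callback shape added in this diff.
from typing import Callable


def make_console_progress(label: str) -> Callable[[int, int], None]:
    """Return a callback that prints adaptive-eval progress as 'label: current/total'."""

    def on_progress(current: int, total: int) -> None:
        print(f"{label}: {current}/{total} items", end="\r")

    return on_progress


# A caller owning its own display could pass such a callback wherever an
# on_progress parameter is accepted.
callback = make_console_progress("adaptive eval")
callback(3, 60)  # prints "adaptive eval: 3/60 items"
```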
scorebook/evaluate/_sync/evaluate.py
CHANGED
@@ -112,8 +112,6 @@ def evaluate(
     with evaluation_progress_context(
         total_eval_runs=len(eval_run_specs),
         total_items=total_items,
-        dataset_count=len(datasets),
-        hyperparam_count=len(hyperparameter_configs),
         model_display=model_display,
         enabled=show_progress_bars,
     ) as progress_bars:
@@ -150,19 +148,31 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Create progress callback for adaptive evals
+        on_progress: Optional[Callable[[int, int], None]] = None
+        if progress_bars is not None and isinstance(run, AdaptiveEvalRunSpec):
+
+            def _on_progress(current: int, total: int) -> None:
+                progress_bars.on_item_progress(current, total)
+
+            on_progress = _on_progress
+
         # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference,
+            inference,
+            run,
+            upload_results,
+            experiment_id,
+            project_id,
+            metadata,
+            trismik_client,
+            on_progress,
         )

         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            # Classic evals
-            items_processed = (
-                len(run.dataset.items)
-                if isinstance(run, EvalRunSpec)
-                else evaluation_settings["max_iterations"]
-            )
+            # Classic evals: update items count; Adaptive evals: items already tracked via callback
+            items_processed = len(run.dataset.items) if isinstance(run, EvalRunSpec) else 0
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

         # Update upload progress for classic evals
@@ -194,11 +204,12 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
-    upload_results: bool,
+    upload_results: bool,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""

@@ -217,6 +228,7 @@ def execute_run(
             resolved_project_id,
             metadata,
             trismik_client,
+            on_progress,
         )

     else:
@@ -337,6 +349,7 @@ def execute_adaptive_eval_run(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
@@ -346,7 +359,7 @@ def execute_adaptive_eval_run(
         raise ScoreBookError("Trismik client is required for adaptive evaluation")

     adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
+        inference, run, experiment_id, project_id, metadata, trismik_client, on_progress
     )
     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

@@ -364,6 +377,7 @@ def run_adaptive_evaluation(
     project_id: str,
     metadata: Any,
     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+    on_progress: Optional[Callable[[int, int], None]] = None,
 ) -> AdaptiveEvalRunResult:
     """Run an adaptive evaluation using the Trismik API.

@@ -374,6 +388,7 @@ def run_adaptive_evaluation(
         project_id: Trismik project ID
         metadata: Additional metadata
         trismik_client: Trismik client instance
+        on_progress: Optional callback for progress updates (current, total)
     Returns:
         Results from the adaptive evaluation
     """
@@ -403,6 +418,7 @@ def run_adaptive_evaluation(
             inference_setup={},
         ),
         item_processor=make_trismik_inference(inference_with_hyperparams),
+        on_progress=on_progress,
         return_dict=False,
     )
scorebook/metrics/README.md
ADDED
@@ -0,0 +1,121 @@
+# Adding Metrics to Scorebook
+
+This guide explains how to add new metrics to Scorebook.
+
+## Quick Start
+
+1. Create a metric file: `src/scorebook/metrics/yourmetric.py`
+2. Implement the metric class
+3. Add tests
+4. Submit PR for review
+
+### Where to Put Tests
+
+Tests go in one of two directories:
+
+- **`tests/unit/test_metrics/`** - For fast tests using mocked data. These run on every commit.
+- **`tests/extended/test_metrics/`** - For tests that require external dependencies, large datasets, or are computationally expensive.
+
+Most metrics only need unit tests. Use extended tests when your metric relies on external APIs, models, or takes significant time to run.
+
+See [CONTRIBUTING.md](../../../CONTRIBUTING.md) for instructions on running tests.
+
+---
+
+## Requirements
+
+Your metric must:
+
+- Use the `@scorebook_metric` decorator
+- Inherit from `MetricBase`
+- Implement the `score()` static method
+
+The `score()` method returns a tuple of `(aggregate_scores, item_scores)`:
+
+- **aggregate_scores**: A `Dict[str, float]` with overall metric values (e.g., `{"accuracy": 0.85}`)
+- **item_scores**: A `List` of per-item scores. For metrics that produce a single value per item, use `int`, `float`, `bool`, or `str`. For metrics that produce multiple values per item, use a `Dict[str, Union[int, float, bool, str]]` where keys are metric names.
+
+---
+
+## File Naming
+
+Metric files must use normalized names (lowercase, no underscores/spaces). This naming convention is required for the registry's lazy loading system to work.
+
+1. User requests a metric by name (e.g., `"f1_score"`, `"F1Score"`, or `"f1 score"`)
+2. The registry normalizes the input → `"f1score"`
+3. The registry imports `scorebook.metrics.f1score`
+4. The `@scorebook_metric` decorator registers the class
+
+**Examples:**
+- Class: `F1Score` → File: `f1score.py` → User can request: `"f1score"`, `"F1Score"`, `"f1_score"`, `"f1 score"`
+- Class: `MeanSquaredError` → File: `meansquarederror.py` → User can request: `"MeanSquaredError"`, `"mean_squared_error"`, etc.
+
+**Collision detection:** Class names that normalize to the same key will raise an error at registration time. For example, `F1Score` and `F1_Score` both normalize to `"f1score"` and cannot coexist.
+
+---
+
+## Implementation Template
+
+Create your metric file in `src/scorebook/metrics/yourmetric.py`:
+
+```python
+"""Brief description of the metric."""
+
+from typing import Any, Dict, List, Tuple
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class YourMetric(MetricBase):
+    """One-line description of what this metric measures.
+
+    Formula or explanation (e.g., Accuracy = correct / total).
+    """
+
+    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate metric score between outputs and labels.
+
+        Args:
+            outputs: A list of model inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            Tuple containing:
+            - Aggregate scores dict (e.g., {"your_metric": 0.85})
+            - List of per-item scores
+
+        Raises:
+            ValueError: If outputs and labels have different lengths.
+        """
+        # Input validation
+        if len(outputs) != len(labels):
+            raise ValueError("Number of outputs must match number of labels")
+
+        if not outputs:
+            return {"your_metric": 0.0}, []
+
+        # Calculate per-item scores
+        item_scores = [calculate_score(out, lab) for out, lab in zip(outputs, labels)]
+
+        # Calculate aggregate score
+        aggregate_score = sum(item_scores) / len(item_scores)
+
+        return {"your_metric": aggregate_score}, item_scores
+```
+
+---
+
+## Documentation
+
+Each metric should have:
+
+1. **Module-level docstring**: Brief description at the top of the file
+2. **Class docstring**: What the metric measures, formula, and any limitations
+3. **Method docstring**: Args, Returns, and Raises sections
+
+---
+
+## Example
+
+See `src/scorebook/metrics/accuracy.py` for a complete reference implementation.
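The "File Naming" section of the README above describes how a requested metric name is normalized before the registry lazily imports the matching module. A minimal sketch of that normalization, assuming only the documented behavior (lowercase, drop underscores and spaces); the actual logic lives in scorebook/metrics/core/metric_registry.py and may differ:

```python
# Sketch of the name normalization described in the new metrics README.
import importlib


def normalize_metric_name(name: str) -> str:
    """Map 'F1Score', 'f1_score', or 'f1 score' to 'f1score'."""
    return name.lower().replace("_", "").replace(" ", "")


def lazy_import_metric_module(name: str):
    """Import scorebook.metrics.<normalized>, triggering @scorebook_metric registration."""
    return importlib.import_module(f"scorebook.metrics.{normalize_metric_name(name)}")


assert normalize_metric_name("F1 Score") == "f1score"
```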
scorebook/metrics/__init__.py
CHANGED
scorebook/metrics/accuracy.py
CHANGED
@@ -2,11 +2,10 @@

 from typing import Any, Dict, List, Tuple

-from scorebook.metrics
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics import MetricBase, scorebook_metric


-@
+@scorebook_metric
 class Accuracy(MetricBase):
     """Accuracy metric for evaluating model predictions of any type.

@@ -25,9 +24,6 @@ class Accuracy(MetricBase):
         The aggregate accuracy score for all items (correct predictions / total predictions).
         The item scores for each output-label pair (true/false).
         """
-        if len(outputs) != len(labels):
-            raise ValueError("Number of outputs must match number of labels")
-
         if not outputs:  # Handle empty lists
             return {"accuracy": 0.0}, []

scorebook/metrics/bertscore.py
ADDED
@@ -0,0 +1,50 @@
+"""BertScore implementation for Scorebook."""
+
+from typing import Any, Dict, List, Tuple
+
+import bert_score
+
+from scorebook.metrics import scorebook_metric
+from scorebook.metrics.core.metric_base import MetricBase
+
+
+@scorebook_metric
+class BertScore(MetricBase):
+    """BertScore metric for evaluating model predictions against reference text."""
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize BertScore metric."""
+        defaults = {"lang": "en", "verbose": False}
+        self.kwargs = {**defaults, **kwargs}  # User kwargs override defaults
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate bert score between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            A tuple containing:
+            - aggregate_scores (Dict[str, float]): Dictionary with average precision,
+              recall, and F1 scores for all items.
+            - item_scores (List[Dict[str, float]]): List of dictionaries with precision,
+              recall, and F1 scores for each output-label pair.
+        """
+        if not outputs:  # Handle empty lists
+            return {"precision": 0.0, "recall": 0.0, "F1": 0.0}, []
+
+        # Calculate item scores
+        p_scores, r_scores, f1_scores = bert_score.score(outputs, labels, **self.kwargs)
+
+        item_scores = [
+            {"precision": p, "recall": r, "F1": f1}
+            for p, r, f1 in zip(p_scores.tolist(), r_scores.tolist(), f1_scores.tolist())
+        ]
+        aggregate_scores = {
+            "precision": p_scores.mean().item(),
+            "recall": r_scores.mean().item(),
+            "F1": f1_scores.mean().item(),
+        }
+
+        return aggregate_scores, item_scores
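A hedged usage sketch for the new BertScore metric, assuming only what the diff above shows (keyword arguments are forwarded to `bert_score.score()`). Note that bert-score downloads a pretrained model on first use, so a call like this belongs in the extended rather than the unit test suite:

```python
# Sketch: scoring a single prediction/reference pair with the new BertScore class.
from scorebook.metrics.bertscore import BertScore

metric = BertScore(lang="en")  # kwargs are passed through to bert_score.score()
aggregate, per_item = metric.score(
    outputs=["The cat sat on the mat."],
    labels=["A cat was sitting on the mat."],
)
print(aggregate)   # e.g. {"precision": ..., "recall": ..., "F1": ...}
print(per_item[0])  # per-item precision/recall/F1 for the single pair
```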
scorebook/metrics/bleu.py
ADDED
@@ -0,0 +1,82 @@
+"""BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+from typing import Any, Dict, List, Tuple
+
+import sacrebleu
+
+from scorebook.metrics import MetricBase, scorebook_metric
+
+
+@scorebook_metric
+class BLEU(MetricBase):
+    """BLEU metric implementation for Scorebook, based on sacrebleu."""
+
+    def __init__(self, compact: bool = True, **kwargs: Any) -> None:
+        """
+        Generate BLEU metric.
+
+        :param compact: if True, returns only the BLEU metric; if False,
+            returns the full signature of BLEU.
+        :param kwargs: additional arguments passed to BLEU.
+        """
+
+        self.compact = compact
+        self.corpus_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+        # Overwrite effective order for sentence level scores
+        kwargs["effective_order"] = True
+        self.sentence_bleu = sacrebleu.metrics.BLEU(**kwargs)
+
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+        """Calculate BLEU scores between predictions and references.
+
+        Args:
+            outputs: A list of inference outputs.
+            labels: A list of ground truth labels.
+
+        Returns:
+            The aggregate (corpus-level) BLEU score for all items.
+            The item (sentence-level) BLEU scores for each output-label pair.
+        """
+
+        if not outputs:  # Handle empty lists
+            return {"BLEU": 0.0}, []
+
+        item_scores = []
+        # Calculate item scores
+        for output, label in zip(outputs, labels):
+            item_bleu: sacrebleu.metrics.BLEUScore = self.sentence_bleu.sentence_score(
+                output, [label]
+            )
+            item_score = {
+                "BLEU": item_bleu.score,
+            }
+
+            if not self.compact:
+                item_score["1-gram"] = item_bleu.precisions[0]
+                item_score["2-gram"] = item_bleu.precisions[1]
+                item_score["3-gram"] = item_bleu.precisions[2]
+                item_score["4-gram"] = item_bleu.precisions[3]
+                item_score["BP"] = item_bleu.bp
+                item_score["ratio"] = item_bleu.ratio
+                item_score["hyp_len"] = item_bleu.sys_len
+                item_score["ref_len"] = item_bleu.ref_len
+
+            item_scores.append(item_score)
+
+        # Calculate aggregate score
+
+        corpus_bleu: sacrebleu.metrics.BLEUScore = self.corpus_bleu.corpus_score(outputs, [labels])
+        aggregate_scores = {"BLEU": corpus_bleu.score}
+
+        if not self.compact:
+            aggregate_scores["1-gram"] = corpus_bleu.precisions[0]
+            aggregate_scores["2-gram"] = corpus_bleu.precisions[1]
+            aggregate_scores["3-gram"] = corpus_bleu.precisions[2]
+            aggregate_scores["4-gram"] = corpus_bleu.precisions[3]
+            aggregate_scores["BP"] = corpus_bleu.bp
+            aggregate_scores["ratio"] = corpus_bleu.ratio
+            aggregate_scores["hyp_len"] = corpus_bleu.sys_len
+            aggregate_scores["ref_len"] = corpus_bleu.ref_len
+
+        return aggregate_scores, item_scores
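A hedged usage sketch for the new BLEU metric, assuming only what the diff above shows: `compact=True` returns just the BLEU score, while `compact=False` adds n-gram precisions, the brevity penalty (BP), the length ratio, and hypothesis/reference lengths:

```python
# Sketch: corpus-level and per-sentence BLEU with the full (non-compact) signature.
from scorebook.metrics.bleu import BLEU

metric = BLEU(compact=False)
aggregate, per_item = metric.score(
    outputs=["the cat is on the mat"],
    labels=["there is a cat on the mat"],
)
print(aggregate["BLEU"], aggregate["BP"])  # corpus BLEU and brevity penalty
print(per_item[0]["1-gram"])               # unigram precision for the first item
```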
scorebook/metrics/core/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Core metric framework components."""
scorebook/metrics/{metric_base.py → core/metric_base.py}
RENAMED
@@ -12,9 +12,8 @@ class MetricBase(ABC):
         """Return the metric name based on the class name."""
         return self.__class__.__name__.lower()

-    @staticmethod
     @abstractmethod
-    def score(outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
+    def score(self, outputs: List[Any], labels: List[Any]) -> Tuple[Dict[str, Any], List[Any]]:
         """Calculate the metric score for a list of outputs and labels.

         Args: