scorebook-0.0.1-py3-none-any.whl → scorebook-0.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -1
- scorebook/evaluator.py +269 -118
- scorebook/exceptions.py +54 -0
- scorebook/inference/__init__.py +0 -4
- scorebook/inference/bedrock.py +305 -0
- scorebook/inference/openai.py +75 -37
- scorebook/inference/vertex.py +295 -0
- scorebook/types/__init__.py +2 -1
- scorebook/types/eval_dataset.py +56 -0
- scorebook/types/eval_result.py +7 -3
- scorebook/types/eval_run_spec.py +28 -0
- scorebook/types/inference_pipeline.py +5 -2
- scorebook/utils/__init__.py +2 -1
- scorebook/utils/build_prompt.py +52 -0
- scorebook/utils/jinja_helpers.py +146 -0
- scorebook/utils/logging_utils.py +1 -0
- scorebook/utils/progress_bars.py +91 -34
- {scorebook-0.0.1.dist-info → scorebook-0.0.3.dist-info}/METADATA +11 -1
- scorebook-0.0.3.dist-info/RECORD +31 -0
- scorebook-0.0.1.dist-info/RECORD +0 -24
- {scorebook-0.0.1.dist-info → scorebook-0.0.3.dist-info}/LICENSE +0 -0
- {scorebook-0.0.1.dist-info → scorebook-0.0.3.dist-info}/WHEEL +0 -0
scorebook/__init__.py
CHANGED
@@ -11,5 +11,6 @@ __version__ = importlib.metadata.version(__package__ or __name__)

 from scorebook.evaluator import evaluate
 from scorebook.types.eval_dataset import EvalDataset
+from scorebook.utils.build_prompt import build_prompt

-__all__ = ["EvalDataset", "evaluate"]
+__all__ = ["EvalDataset", "evaluate", "build_prompt"]
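Note: the package root now re-exports build_prompt alongside EvalDataset and evaluate. A minimal sketch of the widened import surface; build_prompt's own signature lives in the new scorebook/utils/build_prompt.py and is not shown in this diff:

    import scorebook

    print(scorebook.__all__)  # ["EvalDataset", "evaluate", "build_prompt"] as of 0.0.3

    # All three names can now be imported from the package root.
    from scorebook import EvalDataset, evaluate, build_prompt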
scorebook/evaluator.py
CHANGED
@@ -14,74 +14,33 @@ models on datasets and computing metric scores.
 """

 import asyncio
-
-
-
-from scorebook.
+import logging
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from scorebook.exceptions import (
+    DataMismatchError,
+    MetricComputationError,
+    ParallelExecutionError,
+    ParameterValidationError,
+)
+from scorebook.types import EvalDataset, EvalResult, EvalRunSpec
 from scorebook.utils import evaluation_progress, expand_dict, is_awaitable

-
-async def _evaluate_async(
-    inference_callable: Callable,
-    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    hyperparameters: Optional[Dict[str, Any]] = None,
-    experiment_id: Optional[str] = None,
-    item_limit: Optional[int] = None,
-    return_type: str = "dict",
-    score_type: str = "aggregate",
-) -> Union[Dict, List]:
-    """Run inference across datasets/hyperparams, compute metrics, and format results."""
-    _validate_score_type(score_type)
-
-    normalized_datasets = _normalize_datasets(eval_datasets)
-    hyperparam_grid = _expand_hyperparams(hyperparameters)
-
-    eval_results: List[EvalResult] = []
-
-    with evaluation_progress(normalized_datasets, len(hyperparam_grid)) as progress_bars:
-        # Loop through datasets, then hyperparameters for clear progress tracking
-        for dataset_idx, eval_dataset in enumerate(normalized_datasets):
-            with progress_bars.hyperparam_progress_context():
-                # Run inference for each hyperparameter configuration on this dataset
-                for hp_idx, hyperparam_config in enumerate(hyperparam_grid):
-                    items = _clip_items(eval_dataset.items, item_limit)
-                    labels = _labels_for(items, eval_dataset.label)
-
-                    # 1) Run inference
-                    outputs = await _run_inference_callable(
-                        inference_callable, items, hyperparam_config
-                    )
-
-                    # 2) Score metrics
-                    metric_scores = _score_metrics(eval_dataset, outputs, labels)
-
-                    # 3) Wrap into EvalResult
-                    eval_results.append(
-                        EvalResult(eval_dataset, outputs, metric_scores, hyperparam_config)
-                    )
-
-                    # Update inner progress bar
-                    progress_bars.update_hyperparam_progress()
-
-            # Update the outer progress bar
-            progress_bars.update_dataset_progress()
-
-    # TODO: experiment_id handling (left as passthrough to preserve behavior)
-    if experiment_id:
-        pass
-
-    # 4) Format as requested
-    return _format_results(eval_results, return_type, score_type)
+logger = logging.getLogger(__name__)


 def evaluate(
     inference_callable: Callable,
     eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    hyperparameters: Optional[Dict[str, Any]] = None,
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     experiment_id: Optional[str] = None,
-
-
-
+    project_id: Optional[str] = None,
+    parallel: bool = False,
+    return_dict: bool = True,
+    return_aggregates: bool = True,
+    return_items: bool = False,
+    return_output: bool = False,
+    sample_size: Optional[int] = None,
 ) -> Union[Dict, List]:
     """
     Evaluate model predictions using specified metrics on given datasets.
@@ -101,12 +60,12 @@ def evaluate(
             - A list of string identifiers
         hyperparameters: Optional dictionary containing hyperparameter sweep configuration.
         experiment_id: Optional string identifier for tracking multiple evaluation runs.
-
-
-
-
-
-
+        return_dict: If True, returns eval results as a dict
+        return_aggregates: If True, returns aggregate scores for each dataset
+        return_items: If True, returns individual items for each dataset
+        return_output: If True, returns model outputs for each dataset item evaluated
+        sample_size: If set, only return a sample of the dataset items (for debugging)
+        parallel: If True, run inference functions in parallel (requires all functions to be async)

     Returns:
         Dictionary mapping dataset names to their evaluation results. For each dataset,
@@ -124,46 +83,194 @@ def evaluate(

         results = evaluate(inference_fn, dataset, item_limit=100)
     """
+
+    logger.info(
+        "Starting evaluation: experiment_id=%s, project_id=%s, parallel=%s",
+        experiment_id,
+        project_id,
+        parallel,
+    )
+
     return asyncio.run(
         _evaluate_async(
             inference_callable=inference_callable,
             eval_datasets=eval_datasets,
             hyperparameters=hyperparameters,
             experiment_id=experiment_id,
-
-
-
+            project_id=project_id,
+            parallel=parallel,
+            return_dict=return_dict,
+            return_aggregates=return_aggregates,
+            return_items=return_items,
+            return_output=return_output,
+            sample_size=sample_size,
         )
     )


-
+async def _evaluate_async(
+    inference_callable: Callable,
+    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    experiment_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+    return_dict: bool = True,
+    return_aggregates: bool = True,
+    return_items: bool = False,
+    return_output: bool = False,
+    parallel: bool = False,
+    sample_size: Optional[int] = None,
+) -> Union[Dict, List]:
+    _validate_parameters(locals())
+    datasets, adaptive_datasets = _prepare_datasets(eval_datasets, sample_size)
+    hyperparameters = _prepare_hyperparameters(hyperparameters)
+
+    logger.info(
+        "Prepared %d datasets and %d hyperparameter configurations",
+        len(datasets),
+        len(hyperparameters),
+    )
+
+    runs = _build_runs(datasets, hyperparameters)
+    runs.sort(key=lambda run: (run.dataset_idx, run.hp_idx))
+
+    logger.info("Created %d evaluation runs", len(runs))
+
+    with evaluation_progress(datasets, len(hyperparameters), parallel, len(runs)) as progress_bars:
+        if parallel:
+            eval_results = await _run_parallel(inference_callable, runs, progress_bars)
+        else:
+            eval_results = await _run_sequential(inference_callable, runs, progress_bars)
+
+    logger.info("Evaluation completed successfully")
+
+    return _format_results(
+        eval_results, return_dict, return_aggregates, return_items, return_output
+    )
+
+
+# ===== ORCHESTRATION PATHS =====
+

+async def _run_parallel(
+    inference_callable: Callable,
+    runs: List[EvalRunSpec],
+    progress_bars: Any,
+) -> List[EvalResult]:
+    logger.debug("Running inference in parallel")
+
+    async def worker(run: EvalRunSpec) -> Tuple[EvalRunSpec, EvalResult]:
+        er = await _execute_run(inference_callable, run)
+        progress_bars.on_eval_run_completed(run.dataset_idx)
+        return run, er
+
+    pairs = await asyncio.gather(*[worker(r) for r in runs])
+    # Return in canonical (dataset_idx, hp_idx) order for stability
+    pairs.sort(key=lambda p: (p[0].dataset_idx, p[0].hp_idx))
+    return [er for _, er in pairs]
+
+
+async def _run_sequential(
+    inference_callable: Callable,
+    runs: List[EvalRunSpec],
+    progress_bars: Any,
+) -> List[EvalResult]:
+    logger.debug("Running inference sequentially")
+    results: List[EvalResult] = []
+    for run in runs:
+        er = await _execute_run(inference_callable, run)
+        results.append(er)
+        progress_bars.on_hyperparam_completed(run.dataset_idx)
+    return results
+
+
+# ===== EVALUATION EXECUTIONS =====
+
+
+async def _execute_run(inference_callable: Callable, run: EvalRunSpec) -> EvalResult:
+    logger.debug("Executing run for %s", run)
+
+    outputs = await _run_inference_callable(inference_callable, run.items, run.hyperparams)
+    logger.debug("Inference completed for run %s", run)
+
+    metric_scores = _score_metrics(run.eval_dataset, outputs, run.labels)
+    logger.debug("Metrics computed for run %s. - scores: %s", run, list(metric_scores.keys()))
+
+    return EvalResult(run.eval_dataset, outputs, metric_scores, run.hyperparams)
+
+
+# ===== HELPER FUNCTIONS =====
+
+
+def _validate_parameters(params: Dict[str, Any]) -> None:
+    """Validate all parameters for evaluation."""

-
-
-
+    if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
+        raise ParameterValidationError(
+            "When return_dict=True, at least one of return_aggregates or return_items must be True"
+        )
+
+    if params["parallel"] and not is_awaitable(params["inference_callable"]):
+        raise ParallelExecutionError(
+            "parallel=True requires the inference_callable to be async. "
+            "Please make your inference function async or set parallel=False."
+        )
+
+
+def _prepare_datasets(
+    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    sample_size: Optional[int] = None,
+) -> Tuple[List[EvalDataset], List[str]]:
+    """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
+
+    # Ensure datasets is always a list for consistent processing
     if not isinstance(datasets, list):
         datasets = [datasets]
-    # TODO: handle other types (string registry, etc.)
-    return [d for d in datasets if isinstance(d, EvalDataset)]

+    # Extract classical datasets TODO: handle other types (string registry)
+    classic_eval_datasets = [dataset for dataset in datasets if isinstance(dataset, EvalDataset)]

-
-    if
-
+    # Reduce datasets to a random sample
+    if sample_size:
+        logger.info("Sampling datasets to %d items each", sample_size)
+        for dataset in classic_eval_datasets:
+            dataset.shuffle()
+            if len(dataset) > sample_size:
+                original_size = len(dataset)
+                dataset._hf_dataset = dataset._hf_dataset.select(range(sample_size))
+                logger.debug(
+                    "Sampled dataset '%s' from %d to %d items",
+                    dataset.name,
+                    original_size,
+                    sample_size,
+                )

+    # Extract adaptive dataset strings
+    adaptive_eval_datasets = [
+        dataset.replace(":adaptive", "")
+        for dataset in datasets
+        if isinstance(dataset, str) and dataset.endswith(":adaptive")
+    ]

-
-
+    logger.info("Evaluating on classic datasets: %s", [ds.name for ds in classic_eval_datasets])
+    logger.info("Evaluating on adaptive datasets: %s", adaptive_eval_datasets)

+    return classic_eval_datasets, adaptive_eval_datasets

-def _clip_items(items: List[Dict[str, Any]], item_limit: Optional[int]) -> List[Dict[str, Any]]:
-    return items[:item_limit] if item_limit else items

+def _prepare_hyperparameters(
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
+) -> List[Dict[str, Any]]:
+    """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
+    if hyperparameters is None:
+        return [{}]
+    if not isinstance(hyperparameters, list):
+        expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
+        return expanded

-
-
+    logger.info("Evaluating with hyperparameters: %s", hyperparameters)
+
+    return hyperparameters


 async def _run_inference_callable(
@@ -177,52 +284,96 @@ async def _run_inference_callable(
     return inference_callable(items, **hyperparams)


-
-def _iter_dataset_jobs(
+def _build_runs(
     datasets: List[EvalDataset],
-
-
-
-
-
-
-
-
+    hyperparameters: List[Dict[str, Any]],
+) -> List[EvalRunSpec]:
+    """Build RunSpec objects for each dataset/hyperparameter combination."""
+    runs: List[EvalRunSpec] = []
+    for d_idx, ds in enumerate(datasets):
+        items = ds.items
+        labels = [item.get(ds.label) for item in items]
+        for hp_idx, hp in enumerate(hyperparameters):
+            run_spec = EvalRunSpec(d_idx, ds, items, labels, hp, hp_idx)
+            logger.debug("Built RunSpec: %s", run_spec)
+            runs.append(run_spec)
+    return runs


 def _score_metrics(
     eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
 ) -> Dict[str, Dict[str, Any]]:
+    """Compute metric scores for a given dataset and inference outputs."""
     metric_scores: Dict[str, Dict[str, Any]] = {}
+
+    if len(outputs) != len(labels):
+        raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
+
     for metric in eval_dataset.metrics:
-
-
-
-
-
+        try:
+            aggregate_scores, item_scores = metric.score(outputs, labels)
+            metric_scores[metric.name] = {
+                "aggregate_scores": aggregate_scores,
+                "item_scores": item_scores,
+            }
+        except Exception as e:
+            logger.error(
+                "Failed to compute metric '%s' for dataset '%s': %s",
+                metric.name,
+                eval_dataset.name,
+                str(e),
+            )
+            raise MetricComputationError(metric.name, eval_dataset.name, e)
+
     return metric_scores


 def _format_results(
-    eval_results: List[EvalResult],
+    eval_results: List[EvalResult],
+    return_dict: bool,
+    return_aggregates: bool,
+    return_items: bool,
+    return_output: bool,
 ) -> Union[Dict, List]:

-
-
-
-
-
+    # Return results as a dict
+    if return_dict:
+
+        # Include both aggregate and item scores in dict returned
+        if return_aggregates and return_items:
+            results: Dict[str, List[Dict[str, Any]]] = {"aggregate_results": [], "item_results": []}
+            for eval_result in eval_results:
+                eval_result_dict = eval_result.to_dict()
+                results["aggregate_results"].extend(eval_result_dict["aggregate_results"])
+                if return_output:
+                    results["item_results"].extend(eval_result_dict["item_results"])
+                else:
+                    results["item_results"].extend(
+                        [
+                            {k: v for k, v in item.items() if k != "inference_output"}
+                            for item in eval_result_dict["item_results"]
+                        ]
+                    )
+            return results
+
+        # Include only aggregate scores in dict returned
+        elif return_aggregates:
+            return [eval_result.aggregate_scores for eval_result in eval_results]
+
+        # Include only item scores in dict returned
+        else:
+            if return_output:
+                return [item for eval_result in eval_results for item in eval_result.item_scores]
+            else:
+                return [
+                    {k: v for k, v in item.items() if k != "inference_output"}
+                    for eval_result in eval_results
+                    for item in eval_result.item_scores
+                ]
+
+    # Return results as an EvalResult object
+    else:
+        out: Dict[str, List[EvalResult]] = {}
         for er in eval_results:
-
-
-            combined["per_sample"].extend(d["per_sample"])
-        return combined
-
-    if score_type == "aggregate":
-        return [er.aggregate_scores for er in eval_results]
-
-    if score_type == "item":
-        return [item for er in eval_results for item in er.item_scores]
-
-    # Should be unreachable due to validation
-    return {}
+            out.setdefault(er.eval_dataset.name, []).append(er)
+        return out
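Taken together, these hunks replace the old item_limit/return_type/score_type knobs with boolean result flags, a sample_size cap, and an opt-in parallel mode that requires an async inference callable. A usage sketch pieced together from the hunks above; the dataset construction, the hyperparameter values, and the shape of the inference outputs are assumptions, not part of this diff:

    from typing import Any, Dict, List

    from scorebook import EvalDataset, evaluate

    # Hypothetical dataset: EvalDataset construction lives in
    # scorebook/types/eval_dataset.py and is not shown in this diff.
    dataset: EvalDataset = ...

    # parallel=True requires an async callable, otherwise _validate_parameters
    # raises ParallelExecutionError.
    async def infer(items: List[Dict[str, Any]], **hyperparams: Any) -> List[Any]:
        # Call the model here and return one output per input item.
        return [item.get("input") for item in items]

    results = evaluate(
        infer,
        dataset,
        hyperparameters={"temperature": [0.0, 0.7]},  # assumed to be expanded into a grid by expand_dict
        parallel=True,           # run all dataset/hyperparameter runs concurrently via asyncio.gather
        return_dict=True,
        return_aggregates=True,  # aggregate metric scores
        return_items=True,       # per-item scores as well
        return_output=False,     # strip raw "inference_output" from item results
        sample_size=100,         # shuffle each dataset and keep at most 100 items
    )
    # With both return_aggregates and return_items set, the returned dict has
    # "aggregate_results" and "item_results" keys.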
scorebook/exceptions.py
ADDED
@@ -0,0 +1,54 @@
+"""
+Custom exceptions for the Scorebook framework.
+
+This module defines specific exception types used throughout the Scorebook
+evaluation framework to provide clear error handling and debugging information.
+"""
+
+
+class ScoreBookError(Exception):
+    """Base exception class for all Scorebook-related errors."""
+
+
+class EvaluationError(ScoreBookError):
+    """Raised when there are errors during model evaluation."""
+
+
+class ParameterValidationError(ScoreBookError):
+    """Raised when invalid parameters are provided to evaluation functions."""
+
+
+class InferenceError(EvaluationError):
+    """Raised when there are errors during model inference."""
+
+
+class MetricComputationError(EvaluationError):
+    """Raised when metric computation fails."""
+
+    def __init__(self, metric_name: str, dataset_name: str, original_error: Exception):
+        """Initialize metric computation error."""
+        self.metric_name = metric_name
+        self.dataset_name = dataset_name
+        self.original_error = original_error
+        super().__init__(
+            f"Failed to compute metric '{metric_name}' for dataset "
+            f"'{dataset_name}': {original_error}"
+        )
+
+
+class DataMismatchError(EvaluationError):
+    """Raised when there's a mismatch between outputs and expected labels."""
+
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+        """Initialize data mismatch error."""
+        self.outputs_count = outputs_count
+        self.labels_count = labels_count
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Output count ({outputs_count}) doesn't match label count ({labels_count}) "
+            f"for dataset '{dataset_name}'"
+        )
+
+
+class ParallelExecutionError(ScoreBookError):
+    """Raised when parallel execution requirements are not met."""
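All of these are raised from the reworked evaluator: ParameterValidationError and ParallelExecutionError during validation, DataMismatchError and MetricComputationError during scoring. A hedged sketch of handling them around an evaluation call (inference_fn and dataset are placeholders):

    from scorebook import evaluate
    from scorebook.exceptions import (
        MetricComputationError,
        ParallelExecutionError,
        ScoreBookError,
    )

    try:
        results = evaluate(inference_fn, dataset, parallel=True)
    except ParallelExecutionError:
        # parallel=True with a non-async callable; retry sequentially.
        results = evaluate(inference_fn, dataset, parallel=False)
    except MetricComputationError as exc:
        # The failing metric, dataset, and original error are kept as attributes.
        print(exc.metric_name, exc.dataset_name, exc.original_error)
        raise
    except ScoreBookError:
        # Base class for every Scorebook-specific error, including
        # ParameterValidationError and DataMismatchError.
        raise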
scorebook/inference/__init__.py
CHANGED
@@ -5,7 +5,3 @@ This module provides functionality for running inference with various models
 and processing their responses. It includes utilities for both single and
 batch inference operations.
 """
-
-from scorebook.inference.openai import batch, responses
-
-__all__ = ["responses", "batch"]
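With the subpackage re-export removed, importing batch and responses from scorebook.inference no longer works; callers presumably import the helpers from the concrete module instead (the new bedrock.py and vertex.py modules listed above would follow the same pattern, though their contents are not shown in this diff):

    # 0.0.1:
    # from scorebook.inference import batch, responses

    # 0.0.3: import from the provider module directly.
    from scorebook.inference.openai import batch, responses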
|