scorebook 0.0.1-py3-none-any.whl → 0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scorebook/__init__.py CHANGED
@@ -11,5 +11,6 @@ __version__ = importlib.metadata.version(__package__ or __name__)
 
 from scorebook.evaluator import evaluate
 from scorebook.types.eval_dataset import EvalDataset
+from scorebook.utils.build_prompt import build_prompt
 
-__all__ = ["EvalDataset", "evaluate"]
+__all__ = ["EvalDataset", "evaluate", "build_prompt"]
scorebook/evaluator.py CHANGED
@@ -14,74 +14,33 @@ models on datasets and computing metric scores.
 """
 
 import asyncio
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-
-from scorebook.types.eval_dataset import EvalDataset
-from scorebook.types.eval_result import EvalResult
+import logging
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from scorebook.exceptions import (
+    DataMismatchError,
+    MetricComputationError,
+    ParallelExecutionError,
+    ParameterValidationError,
+)
+from scorebook.types import EvalDataset, EvalResult, EvalRunSpec
 from scorebook.utils import evaluation_progress, expand_dict, is_awaitable
 
-
-async def _evaluate_async(
-    inference_callable: Callable,
-    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    hyperparameters: Optional[Dict[str, Any]] = None,
-    experiment_id: Optional[str] = None,
-    item_limit: Optional[int] = None,
-    return_type: str = "dict",
-    score_type: str = "aggregate",
-) -> Union[Dict, List]:
-    """Run inference across datasets/hyperparams, compute metrics, and format results."""
-    _validate_score_type(score_type)
-
-    normalized_datasets = _normalize_datasets(eval_datasets)
-    hyperparam_grid = _expand_hyperparams(hyperparameters)
-
-    eval_results: List[EvalResult] = []
-
-    with evaluation_progress(normalized_datasets, len(hyperparam_grid)) as progress_bars:
-        # Loop through datasets, then hyperparameters for clear progress tracking
-        for dataset_idx, eval_dataset in enumerate(normalized_datasets):
-            with progress_bars.hyperparam_progress_context():
-                # Run inference for each hyperparameter configuration on this dataset
-                for hp_idx, hyperparam_config in enumerate(hyperparam_grid):
-                    items = _clip_items(eval_dataset.items, item_limit)
-                    labels = _labels_for(items, eval_dataset.label)
-
-                    # 1) Run inference
-                    outputs = await _run_inference_callable(
-                        inference_callable, items, hyperparam_config
-                    )
-
-                    # 2) Score metrics
-                    metric_scores = _score_metrics(eval_dataset, outputs, labels)
-
-                    # 3) Wrap into EvalResult
-                    eval_results.append(
-                        EvalResult(eval_dataset, outputs, metric_scores, hyperparam_config)
-                    )
-
-                    # Update inner progress bar
-                    progress_bars.update_hyperparam_progress()
-
-            # Update the outer progress bar
-            progress_bars.update_dataset_progress()
-
-    # TODO: experiment_id handling (left as passthrough to preserve behavior)
-    if experiment_id:
-        pass
-
-    # 4) Format as requested
-    return _format_results(eval_results, return_type, score_type)
+logger = logging.getLogger(__name__)
 
 
 def evaluate(
     inference_callable: Callable,
     eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    hyperparameters: Optional[Dict[str, Any]] = None,
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     experiment_id: Optional[str] = None,
-    item_limit: Optional[int] = None,
-    return_type: str = "dict",
-    score_type: str = "aggregate",
+    project_id: Optional[str] = None,
+    parallel: bool = False,
+    return_dict: bool = True,
+    return_aggregates: bool = True,
+    return_items: bool = False,
+    return_output: bool = False,
+    sample_size: Optional[int] = None,
 ) -> Union[Dict, List]:
     """
     Evaluate model predictions using specified metrics on given datasets.
@@ -101,12 +60,12 @@ def evaluate(
             - A list of string identifiers
         hyperparameters: Optional dictionary containing hyperparameter sweep configuration.
         experiment_id: Optional string identifier for tracking multiple evaluation runs.
-        item_limit: Optional integer limiting the number of items to evaluate per dataset.
-        return_type: Format of the return value. Currently only "dict" is supported.
-        score_type: Type of score aggregation to return. Options:
-            - "aggregate": Return aggregated metrics
-            - "item": Return per-item scores
-            - "all": Return both aggregate and per-item scores
+        return_dict: If True, returns eval results as a dict
+        return_aggregates: If True, returns aggregate scores for each dataset
+        return_items: If True, returns individual items for each dataset
+        return_output: If True, returns model outputs for each dataset item evaluated
+        sample_size: If set, only return a sample of the dataset items (for debugging)
+        parallel: If True, run inference functions in parallel (requires all functions to be async)
 
     Returns:
         Dictionary mapping dataset names to their evaluation results. For each dataset,
@@ -124,46 +83,194 @@ def evaluate(
 
         results = evaluate(inference_fn, dataset, item_limit=100)
     """
+
+    logger.info(
+        "Starting evaluation: experiment_id=%s, project_id=%s, parallel=%s",
+        experiment_id,
+        project_id,
+        parallel,
+    )
+
     return asyncio.run(
         _evaluate_async(
             inference_callable=inference_callable,
             eval_datasets=eval_datasets,
             hyperparameters=hyperparameters,
             experiment_id=experiment_id,
-            item_limit=item_limit,
-            return_type=return_type,
-            score_type=score_type,
+            project_id=project_id,
+            parallel=parallel,
+            return_dict=return_dict,
+            return_aggregates=return_aggregates,
+            return_items=return_items,
+            return_output=return_output,
+            sample_size=sample_size,
         )
     )
 
 
-# ===== Helper Functions =====
+async def _evaluate_async(
+    inference_callable: Callable,
+    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    experiment_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+    return_dict: bool = True,
+    return_aggregates: bool = True,
+    return_items: bool = False,
+    return_output: bool = False,
+    parallel: bool = False,
+    sample_size: Optional[int] = None,
+) -> Union[Dict, List]:
+    _validate_parameters(locals())
+    datasets, adaptive_datasets = _prepare_datasets(eval_datasets, sample_size)
+    hyperparameters = _prepare_hyperparameters(hyperparameters)
+
+    logger.info(
+        "Prepared %d datasets and %d hyperparameter configurations",
+        len(datasets),
+        len(hyperparameters),
+    )
+
+    runs = _build_runs(datasets, hyperparameters)
+    runs.sort(key=lambda run: (run.dataset_idx, run.hp_idx))
+
+    logger.info("Created %d evaluation runs", len(runs))
+
+    with evaluation_progress(datasets, len(hyperparameters), parallel, len(runs)) as progress_bars:
+        if parallel:
+            eval_results = await _run_parallel(inference_callable, runs, progress_bars)
+        else:
+            eval_results = await _run_sequential(inference_callable, runs, progress_bars)
+
+    logger.info("Evaluation completed successfully")
+
+    return _format_results(
+        eval_results, return_dict, return_aggregates, return_items, return_output
+    )
+
+
+# ===== ORCHESTRATION PATHS =====
+
 
+async def _run_parallel(
+    inference_callable: Callable,
+    runs: List[EvalRunSpec],
+    progress_bars: Any,
+) -> List[EvalResult]:
+    logger.debug("Running inference in parallel")
+
+    async def worker(run: EvalRunSpec) -> Tuple[EvalRunSpec, EvalResult]:
+        er = await _execute_run(inference_callable, run)
+        progress_bars.on_eval_run_completed(run.dataset_idx)
+        return run, er
+
+    pairs = await asyncio.gather(*[worker(r) for r in runs])
+    # Return in canonical (dataset_idx, hp_idx) order for stability
+    pairs.sort(key=lambda p: (p[0].dataset_idx, p[0].hp_idx))
+    return [er for _, er in pairs]
+
+
+async def _run_sequential(
+    inference_callable: Callable,
+    runs: List[EvalRunSpec],
+    progress_bars: Any,
+) -> List[EvalResult]:
+    logger.debug("Running inference sequentially")
+    results: List[EvalResult] = []
+    for run in runs:
+        er = await _execute_run(inference_callable, run)
+        results.append(er)
+        progress_bars.on_hyperparam_completed(run.dataset_idx)
+    return results
+
+
+# ===== EVALUATION EXECUTIONS =====
+
+
+async def _execute_run(inference_callable: Callable, run: EvalRunSpec) -> EvalResult:
+    logger.debug("Executing run for %s", run)
+
+    outputs = await _run_inference_callable(inference_callable, run.items, run.hyperparams)
+    logger.debug("Inference completed for run %s", run)
+
+    metric_scores = _score_metrics(run.eval_dataset, outputs, run.labels)
+    logger.debug("Metrics computed for run %s. - scores: %s", run, list(metric_scores.keys()))
+
+    return EvalResult(run.eval_dataset, outputs, metric_scores, run.hyperparams)
+
+
+# ===== HELPER FUNCTIONS =====
+
+
+def _validate_parameters(params: Dict[str, Any]) -> None:
+    """Validate all parameters for evaluation."""
 
-def _normalize_datasets(
-    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]]
-) -> List[EvalDataset]:
+    if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
+        raise ParameterValidationError(
+            "When return_dict=True, at least one of return_aggregates or return_items must be True"
+        )
+
+    if params["parallel"] and not is_awaitable(params["inference_callable"]):
+        raise ParallelExecutionError(
+            "parallel=True requires the inference_callable to be async. "
+            "Please make your inference function async or set parallel=False."
+        )
+
+
+def _prepare_datasets(
+    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    sample_size: Optional[int] = None,
+) -> Tuple[List[EvalDataset], List[str]]:
+    """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
+
+    # Ensure datasets is always a list for consistent processing
     if not isinstance(datasets, list):
         datasets = [datasets]
-    # TODO: handle other types (string registry, etc.)
-    return [d for d in datasets if isinstance(d, EvalDataset)]
 
+    # Extract classical datasets TODO: handle other types (string registry)
+    classic_eval_datasets = [dataset for dataset in datasets if isinstance(dataset, EvalDataset)]
 
-def _validate_score_type(score_type: str) -> None:
-    if score_type not in {"aggregate", "item", "all"}:
-        raise ValueError("score_type must be 'aggregate', 'item', or 'all'")
+    # Reduce datasets to a random sample
+    if sample_size:
+        logger.info("Sampling datasets to %d items each", sample_size)
+        for dataset in classic_eval_datasets:
+            dataset.shuffle()
+            if len(dataset) > sample_size:
+                original_size = len(dataset)
+                dataset._hf_dataset = dataset._hf_dataset.select(range(sample_size))
+                logger.debug(
+                    "Sampled dataset '%s' from %d to %d items",
+                    dataset.name,
+                    original_size,
+                    sample_size,
+                )
 
+    # Extract adaptive dataset strings
+    adaptive_eval_datasets = [
+        dataset.replace(":adaptive", "")
+        for dataset in datasets
+        if isinstance(dataset, str) and dataset.endswith(":adaptive")
+    ]
 
-def _expand_hyperparams(hyperparameters: Optional[Dict[str, Any]]) -> Any:
-    return expand_dict(hyperparameters or {})
+    logger.info("Evaluating on classic datasets: %s", [ds.name for ds in classic_eval_datasets])
+    logger.info("Evaluating on adaptive datasets: %s", adaptive_eval_datasets)
 
+    return classic_eval_datasets, adaptive_eval_datasets
 
-def _clip_items(items: List[Dict[str, Any]], item_limit: Optional[int]) -> List[Dict[str, Any]]:
-    return items[:item_limit] if item_limit else items
 
+def _prepare_hyperparameters(
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
+) -> List[Dict[str, Any]]:
+    """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
+    if hyperparameters is None:
+        return [{}]
+    if not isinstance(hyperparameters, list):
+        expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
+        return expanded
 
-def _labels_for(items: List[Dict[str, Any]], label_key: str) -> List[Any]:
-    return [item.get(label_key) for item in items]
+    logger.info("Evaluating with hyperparameters: %s", hyperparameters)
+
+    return hyperparameters
 
 
 async def _run_inference_callable(
@@ -177,52 +284,96 @@ async def _run_inference_callable(
     return inference_callable(items, **hyperparams)
 
 
-# Yields (eval_dataset, items, labels, hyperparams) for every dataset x hyperparam combo.
-def _iter_dataset_jobs(
+def _build_runs(
     datasets: List[EvalDataset],
-    hyperparam_grid: List[Dict[str, Any]],
-    item_limit: Optional[int],
-) -> Iterable[Tuple[EvalDataset, List[Dict[str, Any]], List[Any], Dict[str, Any]]]:
-    for eval_dataset in datasets:
-        for hp in hyperparam_grid:
-            items = _clip_items(eval_dataset.items, item_limit)
-            labels = _labels_for(items, eval_dataset.label)
-            yield eval_dataset, items, labels, hp
+    hyperparameters: List[Dict[str, Any]],
+) -> List[EvalRunSpec]:
+    """Build RunSpec objects for each dataset/hyperparameter combination."""
+    runs: List[EvalRunSpec] = []
+    for d_idx, ds in enumerate(datasets):
+        items = ds.items
+        labels = [item.get(ds.label) for item in items]
+        for hp_idx, hp in enumerate(hyperparameters):
+            run_spec = EvalRunSpec(d_idx, ds, items, labels, hp, hp_idx)
+            logger.debug("Built RunSpec: %s", run_spec)
+            runs.append(run_spec)
+    return runs
 
 
 def _score_metrics(
     eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
 ) -> Dict[str, Dict[str, Any]]:
+    """Compute metric scores for a given dataset and inference outputs."""
     metric_scores: Dict[str, Dict[str, Any]] = {}
+
+    if len(outputs) != len(labels):
+        raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
+
     for metric in eval_dataset.metrics:
-        aggregate_scores, item_scores = metric.score(outputs, labels)
-        metric_scores[metric.name] = {
-            "aggregate_scores": aggregate_scores,
-            "item_scores": item_scores,
-        }
+        try:
+            aggregate_scores, item_scores = metric.score(outputs, labels)
+            metric_scores[metric.name] = {
+                "aggregate_scores": aggregate_scores,
+                "item_scores": item_scores,
+            }
+        except Exception as e:
+            logger.error(
+                "Failed to compute metric '%s' for dataset '%s': %s",
+                metric.name,
+                eval_dataset.name,
+                str(e),
+            )
+            raise MetricComputationError(metric.name, eval_dataset.name, e)
+
     return metric_scores
 
 
 def _format_results(
-    eval_results: List[EvalResult], return_type: str, score_type: str
+    eval_results: List[EvalResult],
+    return_dict: bool,
+    return_aggregates: bool,
+    return_items: bool,
+    return_output: bool,
 ) -> Union[Dict, List]:
 
-    if return_type != "dict":
-        return {er.eval_dataset.name: er for er in eval_results}
-
-    if score_type == "all":
-        combined: Dict[str, List[Dict[str, Any]]] = {"aggregate": [], "per_sample": []}
+    # Return results as a dict
+    if return_dict:
+
+        # Include both aggregate and item scores in dict returned
+        if return_aggregates and return_items:
+            results: Dict[str, List[Dict[str, Any]]] = {"aggregate_results": [], "item_results": []}
+            for eval_result in eval_results:
+                eval_result_dict = eval_result.to_dict()
+                results["aggregate_results"].extend(eval_result_dict["aggregate_results"])
+                if return_output:
+                    results["item_results"].extend(eval_result_dict["item_results"])
+                else:
+                    results["item_results"].extend(
+                        [
+                            {k: v for k, v in item.items() if k != "inference_output"}
+                            for item in eval_result_dict["item_results"]
+                        ]
+                    )
+            return results
+
+        # Include only aggregate scores in dict returned
+        elif return_aggregates:
+            return [eval_result.aggregate_scores for eval_result in eval_results]
+
+        # Include only item scores in dict returned
+        else:
+            if return_output:
+                return [item for eval_result in eval_results for item in eval_result.item_scores]
+            else:
+                return [
+                    {k: v for k, v in item.items() if k != "inference_output"}
+                    for eval_result in eval_results
+                    for item in eval_result.item_scores
+                ]
+
+    # Return results as an EvalResult object
+    else:
+        out: Dict[str, List[EvalResult]] = {}
         for er in eval_results:
-            d = er.to_dict()
-            combined["aggregate"].extend(d["aggregate"])
-            combined["per_sample"].extend(d["per_sample"])
-        return combined
-
-    if score_type == "aggregate":
-        return [er.aggregate_scores for er in eval_results]
-
-    if score_type == "item":
-        return [item for er in eval_results for item in er.item_scores]
-
-    # Should be unreachable due to validation
-    return {}
+            out.setdefault(er.eval_dataset.name, []).append(er)
+        return out
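Taken together, the evaluator changes swap item_limit / return_type / score_type for boolean return flags plus sample_size and parallel. The sketch below shows one way the 0.0.3 signature might be driven; my_dataset stands in for an EvalDataset built elsewhere (its loaders are not part of this diff), classify is a hypothetical inference function, and the hyperparameter comment assumes expand_dict produces one config per value combination:

    from scorebook import evaluate

    async def classify(items, **hyperparams):
        # Hypothetical async callable; parallel=True requires an async function,
        # otherwise _validate_parameters raises ParallelExecutionError.
        return [item.get("text", "") for item in items]  # placeholder predictions

    results = evaluate(
        classify,
        my_dataset,                     # placeholder EvalDataset, built elsewhere
        hyperparameters={"temperature": [0.0, 0.7]},  # presumably expanded into a config grid
        parallel=True,                  # runs are scheduled with asyncio.gather
        return_dict=True,
        return_aggregates=True,
        return_items=True,
        return_output=False,            # strips "inference_output" from item rows
        sample_size=100,                # shuffles each dataset and keeps 100 items
    )
    # With both return flags set, results carries "aggregate_results" and "item_results".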
scorebook/exceptions.py ADDED
@@ -0,0 +1,54 @@
+"""
+Custom exceptions for the Scorebook framework.
+
+This module defines specific exception types used throughout the Scorebook
+evaluation framework to provide clear error handling and debugging information.
+"""
+
+
+class ScoreBookError(Exception):
+    """Base exception class for all Scorebook-related errors."""
+
+
+class EvaluationError(ScoreBookError):
+    """Raised when there are errors during model evaluation."""
+
+
+class ParameterValidationError(ScoreBookError):
+    """Raised when invalid parameters are provided to evaluation functions."""
+
+
+class InferenceError(EvaluationError):
+    """Raised when there are errors during model inference."""
+
+
+class MetricComputationError(EvaluationError):
+    """Raised when metric computation fails."""
+
+    def __init__(self, metric_name: str, dataset_name: str, original_error: Exception):
+        """Initialize metric computation error."""
+        self.metric_name = metric_name
+        self.dataset_name = dataset_name
+        self.original_error = original_error
+        super().__init__(
+            f"Failed to compute metric '{metric_name}' for dataset "
+            f"'{dataset_name}': {original_error}"
+        )
+
+
+class DataMismatchError(EvaluationError):
+    """Raised when there's a mismatch between outputs and expected labels."""
+
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+        """Initialize data mismatch error."""
+        self.outputs_count = outputs_count
+        self.labels_count = labels_count
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Output count ({outputs_count}) doesn't match label count ({labels_count}) "
+            f"for dataset '{dataset_name}'"
+        )
+
+
+class ParallelExecutionError(ScoreBookError):
+    """Raised when parallel execution requirements are not met."""
scorebook/inference/__init__.py CHANGED
@@ -5,7 +5,3 @@ This module provides functionality for running inference with various models
 and processing their responses. It includes utilities for both single and
 batch inference operations.
 """
-
-from scorebook.inference.openai import batch, responses
-
-__all__ = ["responses", "batch"]