scorebook 0.0.4-py3-none-any.whl → 0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scorebook/evaluate.py ADDED
@@ -0,0 +1,531 @@
+ """
+ Model evaluation functionality for the Scorebook framework.
+
+ This module provides the core evaluation logic to assess model predictions
+ against ground truth labels using configurable metrics. It supports:
+
+ - Batch evaluation of models across multiple datasets
+ - Flexible metric computation and aggregation
+ - Optional parameter sweeping and experiment tracking
+ - Customizable inference functions
+
+ The main entry point is the `evaluate()` function which handles running
+ models on datasets and computing metric scores.
+ """
+
+ import asyncio
+ import logging
+ from typing import Any, Callable, Dict, List, Optional, Union
+
+ from scorebook.eval_dataset import EvalDataset
+ from scorebook.exceptions import (
+     DataMismatchError,
+     MetricComputationError,
+     ParallelExecutionError,
+     ParameterValidationError,
+ )
+ from scorebook.trismik import run_adaptive_evaluation
+ from scorebook.types import (
+     AdaptiveEvalDataset,
+     AdaptiveEvalRunResult,
+     AdaptiveEvalRunSpec,
+     ClassicEvalRunResult,
+     EvalResult,
+     EvalRunSpec,
+ )
+ from scorebook.utils import evaluation_progress, expand_dict, is_awaitable
+
+ logger = logging.getLogger(__name__)
+
+
+ def evaluate(
+     inference_callable: Callable,
+     eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     parallel: bool = False,
+     return_dict: bool = True,
+     return_aggregates: bool = True,
+     return_items: bool = False,
+     return_output: bool = False,
+     sample_size: Optional[int] = None,
+ ) -> Union[Dict, List]:
+     """
+     Evaluate model predictions using specified metrics on given datasets.
+
+     This function runs the provided inference callable on one or more evaluation datasets,
+     computes metric scores, and returns the evaluation results. It supports batch processing,
+     parameter sweeping, and different result formatting options.
+
+     Args:
+         inference_callable: A callable that takes (items, **hyperparameters) and returns
+             predictions. Can be a regular function, an async function, or a callable
+             instance (such as a class with a __call__ method).
+         eval_datasets: One or more evaluation datasets to run evaluation on. Can be:
+             - A single EvalDataset instance
+             - A list of EvalDataset instances
+             - A string identifier (for future dataset registry support)
+             - A list of string identifiers
+         hyperparameters: Optional hyperparameter sweep configuration: a single dict
+             (expanded into one or more configurations) or a list of configuration dicts.
+         metadata: Optional dictionary containing evaluation metadata.
+         experiment_id: Optional string identifier for tracking multiple evaluation runs.
+             Required for adaptive evaluation runs.
+         project_id: Optional project identifier. Required for adaptive evaluation runs.
+         parallel: If True, run inference in parallel (requires an async inference callable).
+         return_dict: If True, return eval results as a dict; otherwise return an EvalResult.
+         return_aggregates: If True, include aggregate scores for each dataset.
+         return_items: If True, include individual item scores for each dataset.
+         return_output: If True, include model outputs for each evaluated dataset item.
+         sample_size: If set, evaluate only a sample of each dataset's items (for debugging).
+
+     Returns:
+         With the default flags, the aggregate metric scores for the evaluation runs.
+         If both return_aggregates and return_items are True, a dict with "aggregate_results"
+         and "item_results" entries; if only one is requested, only that part is returned.
+         If return_dict is False, the underlying EvalResult object is returned.
+
+     Example:
+
+         dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])
+
+         def inference_fn(items, **hyperparameters):
+             # Model inference logic here - process all items at once
+             return [predict(item) for item in items]
+
+         results = evaluate(inference_fn, dataset, sample_size=100)
+     """
+
+     logger.info(
+         "Starting evaluation: experiment_id=%s, project_id=%s, parallel=%s",
+         experiment_id,
+         project_id,
+         parallel,
+     )
+
+     return asyncio.run(
+         _evaluate_async(
+             inference_callable=inference_callable,
+             eval_datasets=eval_datasets,
+             hyperparameters=hyperparameters,
+             metadata=metadata,
+             experiment_id=experiment_id,
+             project_id=project_id,
+             parallel=parallel,
+             return_dict=return_dict,
+             return_aggregates=return_aggregates,
+             return_items=return_items,
+             return_output=return_output,
+             sample_size=sample_size,
+         )
+     )
+
+
+ async def _evaluate_async(
+     inference_callable: Callable,
+     eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     return_dict: bool = True,
+     return_aggregates: bool = True,
+     return_items: bool = False,
+     return_output: bool = False,
+     parallel: bool = False,
+     sample_size: Optional[int] = None,
+ ) -> Union[Dict, List]:
+     _validate_parameters(locals())
+     datasets = _prepare_datasets(eval_datasets, sample_size)
+     hyperparameter_configs = _prepare_hyperparameter_configs(hyperparameters)
+
+     logger.info(
+         "Prepared %d datasets and %d hyperparameter configurations",
+         len(datasets),
+         len(hyperparameter_configs),
+     )
+
+     eval_run_specs = _build_eval_run_specs(
+         datasets, hyperparameter_configs, experiment_id, project_id, metadata
+     )
+     eval_run_specs.sort(key=lambda run: (run.dataset_index, run.hyperparameters_index))
+
+     logger.info("Created %d evaluation run specs", len(eval_run_specs))
+
+     with evaluation_progress(
+         datasets, len(hyperparameter_configs), parallel, len(eval_run_specs)
+     ) as progress_bars:
+         if parallel:
+             eval_results = await _run_parallel(
+                 inference_callable,
+                 eval_run_specs,
+                 progress_bars,
+                 experiment_id,
+                 project_id,
+                 metadata,
+             )
+         else:
+             eval_results = await _run_sequential(
+                 inference_callable,
+                 eval_run_specs,
+                 progress_bars,
+                 experiment_id,
+                 project_id,
+                 metadata,
+             )
+
+     logger.info("Evaluation completed successfully")
+
+     return _format_results(
+         eval_results, return_dict, return_aggregates, return_items, return_output
+     )
+
+
+ # ===== ORCHESTRATION PATHS =====
+
+
+ async def _run_parallel(
+     inference: Callable,
+     runs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]],
+     progress_bars: Any,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> EvalResult:
+     logger.debug("Running inference in parallel")
+
+     async def worker(
+         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
+     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
+         progress_bars.on_eval_run_completed(run.dataset_index)
+         return run_result
+
+     run_results = await asyncio.gather(*[worker(run) for run in runs])
+     # Return in canonical (dataset_idx, hp_idx) order for stability
+     run_results.sort(
+         key=lambda result: (result.run_spec.dataset_index, result.run_spec.hyperparameters_index)
+     )
+     return EvalResult(run_results)
+
+
+ async def _run_sequential(
+     inference: Callable,
+     runs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]],
+     progress_bars: Any,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> EvalResult:
+     logger.debug("Running inference sequentially")
+     run_results: List[Union[ClassicEvalRunResult, AdaptiveEvalRunResult]] = []
+     for run in runs:
+         run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
+         run_results.append(run_result)
+         progress_bars.on_hyperparam_completed(run_result.run_spec.dataset_index)
+     return EvalResult(run_results)
+
+
+ # ===== EVALUATION RUN EXECUTIONS =====
+
+
+ async def _execute_run(
+     inference: Callable,
+     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+     """Execute a single evaluation run."""
+     if isinstance(run, EvalRunSpec):
+         return await _execute_classic_eval_run(inference, run)
+     elif isinstance(run, AdaptiveEvalRunSpec):
+         if experiment_id is None or project_id is None:
+             raise ParameterValidationError(
+                 "experiment_id and project_id are required for adaptive evaluation runs"
+             )
+         return await _execute_adaptive_eval_run(inference, run, experiment_id, project_id, metadata)
+     else:
+         raise ParameterValidationError(f"Unrecognized run type: {type(run)}")
+
+
+ async def _execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
+     """Execute a classic evaluation run."""
+     logger.debug("Executing classic eval run for %s", run)
+
+     inference_outputs = await _run_inference_callable(
+         inference, run.dataset.items, run.hyperparameter_config
+     )
+     metric_scores = _score_metrics(run.dataset, inference_outputs, run.labels)
+
+     logger.debug("Classic evaluation completed for run %s", run)
+     return ClassicEvalRunResult(run, inference_outputs, metric_scores)
+
+
+ async def _execute_adaptive_eval_run(
+     inference: Callable,
+     run: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> AdaptiveEvalRunResult:
+     """Execute an adaptive evaluation run."""
+     logger.debug("Executing adaptive run for %s", run)
+
+     adaptive_eval_run_result = await run_adaptive_evaluation(
+         inference, run, experiment_id, project_id, metadata
+     )
+     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
+
+     return adaptive_eval_run_result
+
+
+ # ===== HELPER FUNCTIONS =====
+
+
+ def _validate_parameters(params: Dict[str, Any]) -> None:
+     """Validate all parameters for evaluation."""
+
+     if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
+         raise ParameterValidationError(
+             "When return_dict=True, at least one of return_aggregates or return_items must be True"
+         )
+
+     if params["parallel"] and not is_awaitable(params["inference_callable"]):
+         raise ParallelExecutionError(
+             "parallel=True requires the inference_callable to be async. "
+             "Please make your inference function async or set parallel=False."
+         )
+
+
+ def _prepare_datasets(
+     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     sample_size: Optional[int] = None,
+ ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
+     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
+
+     # Ensure datasets is always a list for consistent processing
+     if not isinstance(datasets, list):
+         datasets = [datasets]
+
+     datasets_out: List[Union[EvalDataset, AdaptiveEvalDataset]] = []
+     for dataset in datasets:
+
+         # Prepare classic datasets
+         if isinstance(dataset, EvalDataset):
+
+             if sample_size is not None:
+                 dataset = dataset.sample(sample_size)
+
+             datasets_out.append(dataset)
+
+         # Prepare adaptive datasets
+         elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
+             datasets_out.append(AdaptiveEvalDataset(dataset.replace(":adaptive", "")))
+
+         # TODO: dataset name string registry
+         elif isinstance(dataset, str):
+             pass
+
+         else:
+             raise ParameterValidationError(f"Unrecognized dataset type: {type(dataset)}")
+
+     return datasets_out
+
+
+ def _prepare_hyperparameter_configs(
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
+ ) -> List[Dict[str, Any]]:
+     """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
+     if hyperparameters is None:
+         return [{}]
+     if not isinstance(hyperparameters, list):  # TODO: THIS LOOKS BROKEN
+         expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
+         return expanded
+
+     logger.info("Evaluating with hyperparameters: %s", hyperparameters)
+
+     return hyperparameters
+
+
+ def _build_eval_run_specs(
+     datasets: List[Union[EvalDataset, str]],
+     hyperparameters: Any,
+     experiment_id: Optional[str],
+     project_id: Optional[str],
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> List[Union[EvalRunSpec, AdaptiveEvalRunSpec]]:
+     """Build RunSpec objects for each dataset/hyperparameter combination."""
+     eval_run_specs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]] = []
+     for dataset_index, dataset in enumerate(datasets):
+         for hyperparameters_index, hyperparameter_config in enumerate(hyperparameters):
+
+             # Create classic eval run spec
+             if isinstance(dataset, EvalDataset):
+                 eval_run_specs.append(
+                     _build_classic_eval_run_spec(
+                         dataset, dataset_index, hyperparameter_config, hyperparameters_index
+                     )
+                 )
+
+             # Create adaptive eval run spec from string
+             elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
+                 if experiment_id is None or project_id is None:
+                     raise ParameterValidationError(
+                         "experiment_id and project_id are required for adaptive evaluation"
+                     )
+                 eval_run_specs.append(
+                     _build_adaptive_eval_run_spec(
+                         dataset,
+                         dataset_index,
+                         hyperparameter_config,
+                         hyperparameters_index,
+                         experiment_id,
+                         project_id,
+                         metadata,
+                     )
+                 )
+
+             # Create adaptive eval run spec from AdaptiveEvalDataset
+             elif isinstance(dataset, AdaptiveEvalDataset):
+                 if experiment_id is None or project_id is None:
+                     raise ParameterValidationError(
+                         "experiment_id and project_id are required for adaptive evaluation"
+                     )
+                 eval_run_specs.append(
+                     _build_adaptive_eval_run_spec(
+                         dataset.name,
+                         dataset_index,
+                         hyperparameter_config,
+                         hyperparameters_index,
+                         experiment_id,
+                         project_id,
+                         metadata,
+                     )
+                 )
+
+             # Log warning - should never happen
+             else:
+                 logger.warning("Unrecognized dataset type: %s", dataset)
+
+     return eval_run_specs
+
+
+ def _build_classic_eval_run_spec(
+     dataset: EvalDataset,
+     dataset_index: int,
+     hyperparameters: Dict[str, Any],
+     hyperparameters_index: int,
+ ) -> EvalRunSpec:
+     """Build the EvalRunSpec for a single dataset/hyperparameter combination."""
+     items = dataset.items
+     labels = [item.get(dataset.label) for item in items]
+     eval_run_spec = EvalRunSpec(
+         dataset,
+         dataset_index,
+         hyperparameters,
+         hyperparameters_index,
+         items,
+         labels,
+     )
+     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
+     return eval_run_spec
+
+
+ def _build_adaptive_eval_run_spec(
+     adaptive_dataset: str,
+     dataset_index: int,
+     hyperparameter_config: Dict[str, Any],
+     hyperparameter_config_index: int,
+     experiment_id: str,
+     project_id: str,
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> AdaptiveEvalRunSpec:
+     dataset = adaptive_dataset.replace(":adaptive", "")
+     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
+         dataset,
+         dataset_index,
+         hyperparameter_config,
+         hyperparameter_config_index,
+         experiment_id,
+         project_id,
+         metadata,
+     )
+     logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
+     return adaptive_eval_run_spec
+
+
+ async def _run_inference_callable(
+     inference: Callable,
+     items: List[Dict[str, Any]],
+     hyperparameter_config: Dict[str, Any],
+ ) -> Any:
+     if is_awaitable(inference):
+         return await inference(items, **hyperparameter_config)
+     else:
+         return inference(items, **hyperparameter_config)
+
+
+ def _score_metrics(
+     eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
+ ) -> Dict[str, Dict[str, Any]]:
+     """Compute metric scores for a given dataset and inference outputs."""
+     metric_scores: Dict[str, Dict[str, Any]] = {}
+
+     if len(outputs) != len(labels):
+         raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
+
+     for metric in eval_dataset.metrics:
+         try:
+             aggregate_scores, item_scores = metric.score(outputs, labels)
+             metric_scores[metric.name] = {
+                 "aggregate_scores": aggregate_scores,
+                 "item_scores": item_scores,
+             }
+         except Exception as e:
+             logger.error(
+                 "Failed to compute metric '%s' for dataset '%s': %s",
+                 metric.name,
+                 eval_dataset.name,
+                 str(e),
+             )
+             raise MetricComputationError(metric.name, eval_dataset.name, e)
+
+     return metric_scores
+
+
+ def _format_results(
+     eval_result: EvalResult,
+     return_dict: bool,
+     return_aggregates: bool,
+     return_items: bool,
+     return_output: bool,
+ ) -> Union[EvalResult, Dict, List]:
+
+     # Return results as a dict
+     if return_dict:
+         results = {}
+
+         if return_aggregates:
+             results["aggregate_results"] = eval_result.aggregate_scores
+
+         if return_items:
+             item_scores = eval_result.item_scores
+             # Remove inference output if not requested
+             if not return_output:
+                 for item in item_scores:
+                     item.pop("inference_output", None)
+             results["item_results"] = item_scores
+
+         # If both are requested, return the combined structure
+         if return_aggregates and return_items:
+             return results
+         # If only aggregates requested, return just the list
+         elif return_aggregates:
+             return results["aggregate_results"]
+         # If only items requested, return just the list
+         else:
+             return results["item_results"]
+
+     # Return results as an EvalResult object
+     else:
+         return eval_result
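
For orientation, the sketch below shows one way the `evaluate()` entry point added above might be driven. It is an illustrative sketch only: the dataset name and hyperparameter values are placeholders, the `Precision` import path is assumed (only the class name appears in the docstring above), and the inference function returns dummy predictions rather than calling a real model.

    from scorebook.eval_dataset import EvalDataset
    from scorebook.evaluate import evaluate
    from scorebook.metrics import Precision  # import path assumed, not shown in this diff

    # Build a dataset whose "answer" field holds the ground-truth labels.
    dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])

    def inference_fn(items, **hyperparameters):
        # Replace with a real model call; hyperparameters arrive from the sweep config
        # because evaluate() calls inference_fn(items, **hyperparameter_config).
        return ["placeholder answer" for _ in items]

    # Assuming expand_dict turns a dict of lists into one configuration per combination,
    # this runs the dataset once per temperature; sample_size limits items for quick checks.
    results = evaluate(
        inference_fn,
        dataset,
        hyperparameters={"temperature": [0.0, 0.7]},
        sample_size=50,
    )
    print(results)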
@@ -76,7 +76,29 @@ async def responses(
          logger.debug("Item %d converted to fallback format", i)
  
          logger.debug("Creating OpenAI task %d with messages: %s", i, messages)
-         task = client.chat.completions.create(model=model, messages=messages, **hyperparameters)
+         # Filter to only include valid OpenAI chat completions parameters
+         valid_params = {
+             "temperature",
+             "max_tokens",
+             "top_p",
+             "frequency_penalty",
+             "presence_penalty",
+             "stop",
+             "stream",
+             "logit_bias",
+             "user",
+             "seed",
+             "tools",
+             "tool_choice",
+             "response_format",
+             "n",
+             "logprobs",
+             "top_logprobs",
+         }
+         filtered_hyperparameters = {k: v for k, v in hyperparameters.items() if k in valid_params}
+         task = client.chat.completions.create(
+             model=model, messages=messages, **filtered_hyperparameters
+         )
          tasks.append(task)
  
      logger.debug("Created %d tasks, waiting for OpenAI responses...", len(tasks))
@@ -0,0 +1,6 @@
+ """Trismik authentication and API integration."""
+
+ from .adaptive_testing_service import run_adaptive_evaluation
+ from .login import get_stored_token, get_token, login, logout, whoami
+
+ __all__ = ["login", "logout", "whoami", "get_stored_token", "get_token", "run_adaptive_evaluation"]