scorebook 0.0.4-py3-none-any.whl → 0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +5 -3
- scorebook/cli/__init__.py +1 -0
- scorebook/cli/auth.py +98 -0
- scorebook/cli/main.py +57 -0
- scorebook/{types/eval_dataset.py → eval_dataset.py} +38 -0
- scorebook/evaluate.py +531 -0
- scorebook/inference/openai.py +23 -1
- scorebook/trismik/__init__.py +6 -0
- scorebook/trismik/adaptive_testing_service.py +141 -0
- scorebook/trismik/login.py +120 -0
- scorebook/types.py +165 -0
- {scorebook-0.0.4.dist-info → scorebook-0.0.6.dist-info}/METADATA +2 -1
- {scorebook-0.0.4.dist-info → scorebook-0.0.6.dist-info}/RECORD +17 -13
- scorebook-0.0.6.dist-info/entry_points.txt +3 -0
- scorebook/evaluator.py +0 -379
- scorebook/types/__init__.py +0 -12
- scorebook/types/eval_result.py +0 -133
- scorebook/types/eval_run_spec.py +0 -28
- scorebook/utils/logging_utils.py +0 -1
- /scorebook/{types/inference_pipeline.py → inference_pipeline.py} +0 -0
- {scorebook-0.0.4.dist-info → scorebook-0.0.6.dist-info}/LICENSE +0 -0
- {scorebook-0.0.4.dist-info → scorebook-0.0.6.dist-info}/WHEEL +0 -0
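The types subpackage from 0.0.4 is flattened in 0.0.6 (scorebook/types/ becomes scorebook/types.py plus a top-level scorebook/eval_dataset.py), so import paths change. A minimal migration sketch follows; the 0.0.6 paths are assumptions inferred from the renamed files above, since the new module bodies are not shown in this excerpt:

```python
# 0.0.4 layout - imports go through the scorebook.types subpackage:
from scorebook.types.eval_dataset import EvalDataset

# 0.0.6 layout (assumed from the file renames above, not confirmed here):
from scorebook.eval_dataset import EvalDataset  # module moved to the package root
# or, if the expanded scorebook/__init__.py re-exports it:
# from scorebook import EvalDataset
```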
scorebook/evaluator.py
DELETED
@@ -1,379 +0,0 @@
-"""
-Model evaluation functionality for the Scorebook framework.
-
-This module provides the core evaluation logic to assess model predictions
-against ground truth labels using configurable metrics. It supports:
-
-- Batch evaluation of models across multiple datasets
-- Flexible metric computation and aggregation
-- Optional parameter sweeping and experiment tracking
-- Customizable inference functions
-
-The main entry point is the `evaluate()` function which handles running
-models on datasets and computing metric scores.
-"""
-
-import asyncio
-import logging
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-from scorebook.exceptions import (
-    DataMismatchError,
-    MetricComputationError,
-    ParallelExecutionError,
-    ParameterValidationError,
-)
-from scorebook.types import EvalDataset, EvalResult, EvalRunSpec
-from scorebook.utils import evaluation_progress, expand_dict, is_awaitable
-
-logger = logging.getLogger(__name__)
-
-
-def evaluate(
-    inference_callable: Callable,
-    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
-    experiment_id: Optional[str] = None,
-    project_id: Optional[str] = None,
-    parallel: bool = False,
-    return_dict: bool = True,
-    return_aggregates: bool = True,
-    return_items: bool = False,
-    return_output: bool = False,
-    sample_size: Optional[int] = None,
-) -> Union[Dict, List]:
-    """
-    Evaluate model predictions using specified metrics on given datasets.
-
-    This function runs the provided inference callable on one or more evaluation datasets,
-    computes metric scores, and returns the evaluation results. It supports batch processing,
-    parameter sweeping, and different result formatting options.
-
-    Args:
-        inference_callable: A callable function or object that takes (items, hyperparameters)
-            and returns predictions. Can be a regular function, async function,
-            or callable instance (like a class with __call__ method).
-        eval_datasets: One or more evaluation datasets to run evaluation on. Can be:
-            - A single EvalDataset instance
-            - A list of EvalDataset instances
-            - A string identifier (for future dataset registry support)
-            - A list of string identifiers
-        hyperparameters: Optional dictionary containing hyperparameter sweep configuration.
-        experiment_id: Optional string identifier for tracking multiple evaluation runs.
-        return_dict: If True, returns eval results as a dict
-        return_aggregates: If True, returns aggregate scores for each dataset
-        return_items: If True, returns individual items for each dataset
-        return_output: If True, returns model outputs for each dataset item evaluated
-        sample_size: If set, only return a sample of the dataset items (for debugging)
-        parallel: If True, run inference functions in parallel (requires all functions to be async)
-
-    Returns:
-        Dictionary mapping dataset names to their evaluation results. For each dataset,
-        returns a dictionary containing:
-            - items: List of EvalResult objects with predictions and ground truth
-            - metrics: Dictionary mapping metric names to their computed scores
-
-    Example:
-
-        python
-        dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])
-        def inference_fn(items):
-            # Model inference logic here - process all items at once
-            return [prediction for item in items]
-
-        results = evaluate(inference_fn, dataset, item_limit=100)
-    """
-
-    logger.info(
-        "Starting evaluation: experiment_id=%s, project_id=%s, parallel=%s",
-        experiment_id,
-        project_id,
-        parallel,
-    )
-
-    return asyncio.run(
-        _evaluate_async(
-            inference_callable=inference_callable,
-            eval_datasets=eval_datasets,
-            hyperparameters=hyperparameters,
-            experiment_id=experiment_id,
-            project_id=project_id,
-            parallel=parallel,
-            return_dict=return_dict,
-            return_aggregates=return_aggregates,
-            return_items=return_items,
-            return_output=return_output,
-            sample_size=sample_size,
-        )
-    )
-
-
-async def _evaluate_async(
-    inference_callable: Callable,
-    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
-    experiment_id: Optional[str] = None,
-    project_id: Optional[str] = None,
-    return_dict: bool = True,
-    return_aggregates: bool = True,
-    return_items: bool = False,
-    return_output: bool = False,
-    parallel: bool = False,
-    sample_size: Optional[int] = None,
-) -> Union[Dict, List]:
-    _validate_parameters(locals())
-    datasets, adaptive_datasets = _prepare_datasets(eval_datasets, sample_size)
-    hyperparameters = _prepare_hyperparameters(hyperparameters)
-
-    logger.info(
-        "Prepared %d datasets and %d hyperparameter configurations",
-        len(datasets),
-        len(hyperparameters),
-    )
-
-    runs = _build_runs(datasets, hyperparameters)
-    runs.sort(key=lambda run: (run.dataset_idx, run.hp_idx))
-
-    logger.info("Created %d evaluation runs", len(runs))
-
-    with evaluation_progress(datasets, len(hyperparameters), parallel, len(runs)) as progress_bars:
-        if parallel:
-            eval_results = await _run_parallel(inference_callable, runs, progress_bars)
-        else:
-            eval_results = await _run_sequential(inference_callable, runs, progress_bars)
-
-    logger.info("Evaluation completed successfully")
-
-    return _format_results(
-        eval_results, return_dict, return_aggregates, return_items, return_output
-    )
-
-
-# ===== ORCHESTRATION PATHS =====
-
-
-async def _run_parallel(
-    inference_callable: Callable,
-    runs: List[EvalRunSpec],
-    progress_bars: Any,
-) -> List[EvalResult]:
-    logger.debug("Running inference in parallel")
-
-    async def worker(run: EvalRunSpec) -> Tuple[EvalRunSpec, EvalResult]:
-        er = await _execute_run(inference_callable, run)
-        progress_bars.on_eval_run_completed(run.dataset_idx)
-        return run, er
-
-    pairs = await asyncio.gather(*[worker(r) for r in runs])
-    # Return in canonical (dataset_idx, hp_idx) order for stability
-    pairs.sort(key=lambda p: (p[0].dataset_idx, p[0].hp_idx))
-    return [er for _, er in pairs]
-
-
-async def _run_sequential(
-    inference_callable: Callable,
-    runs: List[EvalRunSpec],
-    progress_bars: Any,
-) -> List[EvalResult]:
-    logger.debug("Running inference sequentially")
-    results: List[EvalResult] = []
-    for run in runs:
-        er = await _execute_run(inference_callable, run)
-        results.append(er)
-        progress_bars.on_hyperparam_completed(run.dataset_idx)
-    return results
-
-
-# ===== EVALUATION EXECUTIONS =====
-
-
-async def _execute_run(inference_callable: Callable, run: EvalRunSpec) -> EvalResult:
-    logger.debug("Executing run for %s", run)
-
-    outputs = await _run_inference_callable(inference_callable, run.items, run.hyperparams)
-    logger.debug("Inference completed for run %s", run)
-
-    metric_scores = _score_metrics(run.eval_dataset, outputs, run.labels)
-    logger.debug("Metrics computed for run %s. - scores: %s", run, list(metric_scores.keys()))
-
-    return EvalResult(run.eval_dataset, outputs, metric_scores, run.hyperparams)
-
-
-# ===== HELPER FUNCTIONS =====
-
-
-def _validate_parameters(params: Dict[str, Any]) -> None:
-    """Validate all parameters for evaluation."""
-
-    if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
-        raise ParameterValidationError(
-            "When return_dict=True, at least one of return_aggregates or return_items must be True"
-        )
-
-    if params["parallel"] and not is_awaitable(params["inference_callable"]):
-        raise ParallelExecutionError(
-            "parallel=True requires the inference_callable to be async. "
-            "Please make your inference function async or set parallel=False."
-        )
-
-
-def _prepare_datasets(
-    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    sample_size: Optional[int] = None,
-) -> Tuple[List[EvalDataset], List[str]]:
-    """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
-
-    # Ensure datasets is always a list for consistent processing
-    if not isinstance(datasets, list):
-        datasets = [datasets]
-
-    # Extract classical datasets TODO: handle other types (string registry)
-    classic_eval_datasets = [dataset for dataset in datasets if isinstance(dataset, EvalDataset)]
-
-    # Reduce datasets to a random sample
-    if sample_size:
-        logger.info("Sampling datasets to %d items each", sample_size)
-        for dataset in classic_eval_datasets:
-            dataset.shuffle()
-            if len(dataset) > sample_size:
-                original_size = len(dataset)
-                dataset._hf_dataset = dataset._hf_dataset.select(range(sample_size))
-                logger.debug(
-                    "Sampled dataset '%s' from %d to %d items",
-                    dataset.name,
-                    original_size,
-                    sample_size,
-                )
-
-    # Extract adaptive dataset strings
-    adaptive_eval_datasets = [
-        dataset.replace(":adaptive", "")
-        for dataset in datasets
-        if isinstance(dataset, str) and dataset.endswith(":adaptive")
-    ]
-
-    logger.info("Evaluating on classic datasets: %s", [ds.name for ds in classic_eval_datasets])
-    logger.info("Evaluating on adaptive datasets: %s", adaptive_eval_datasets)
-
-    return classic_eval_datasets, adaptive_eval_datasets
-
-
-def _prepare_hyperparameters(
-    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
-) -> List[Dict[str, Any]]:
-    """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
-    if hyperparameters is None:
-        return [{}]
-    if not isinstance(hyperparameters, list):
-        expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
-        return expanded
-
-    logger.info("Evaluating with hyperparameters: %s", hyperparameters)
-
-    return hyperparameters
-
-
-async def _run_inference_callable(
-    inference_callable: Callable,
-    items: List[Dict[str, Any]],
-    hyperparams: Dict[str, Any],
-) -> Any:
-    if is_awaitable(inference_callable):
-        return await inference_callable(items, **hyperparams)
-    else:
-        return inference_callable(items, **hyperparams)
-
-
-def _build_runs(
-    datasets: List[EvalDataset],
-    hyperparameters: List[Dict[str, Any]],
-) -> List[EvalRunSpec]:
-    """Build RunSpec objects for each dataset/hyperparameter combination."""
-    runs: List[EvalRunSpec] = []
-    for d_idx, ds in enumerate(datasets):
-        items = ds.items
-        labels = [item.get(ds.label) for item in items]
-        for hp_idx, hp in enumerate(hyperparameters):
-            run_spec = EvalRunSpec(d_idx, ds, items, labels, hp, hp_idx)
-            logger.debug("Built RunSpec: %s", run_spec)
-            runs.append(run_spec)
-    return runs
-
-
-def _score_metrics(
-    eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
-) -> Dict[str, Dict[str, Any]]:
-    """Compute metric scores for a given dataset and inference outputs."""
-    metric_scores: Dict[str, Dict[str, Any]] = {}
-
-    if len(outputs) != len(labels):
-        raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
-
-    for metric in eval_dataset.metrics:
-        try:
-            aggregate_scores, item_scores = metric.score(outputs, labels)
-            metric_scores[metric.name] = {
-                "aggregate_scores": aggregate_scores,
-                "item_scores": item_scores,
-            }
-        except Exception as e:
-            logger.error(
-                "Failed to compute metric '%s' for dataset '%s': %s",
-                metric.name,
-                eval_dataset.name,
-                str(e),
-            )
-            raise MetricComputationError(metric.name, eval_dataset.name, e)
-
-    return metric_scores
-
-
-def _format_results(
-    eval_results: List[EvalResult],
-    return_dict: bool,
-    return_aggregates: bool,
-    return_items: bool,
-    return_output: bool,
-) -> Union[Dict, List]:
-
-    # Return results as a dict
-    if return_dict:
-
-        # Include both aggregate and item scores in dict returned
-        if return_aggregates and return_items:
-            results: Dict[str, List[Dict[str, Any]]] = {"aggregate_results": [], "item_results": []}
-            for eval_result in eval_results:
-                eval_result_dict = eval_result.to_dict()
-                results["aggregate_results"].extend(eval_result_dict["aggregate_results"])
-                if return_output:
-                    results["item_results"].extend(eval_result_dict["item_results"])
-                else:
-                    results["item_results"].extend(
-                        [
-                            {k: v for k, v in item.items() if k != "inference_output"}
-                            for item in eval_result_dict["item_results"]
-                        ]
-                    )
-            return results
-
-        # Include only aggregate scores in dict returned
-        elif return_aggregates:
-            return [eval_result.aggregate_scores for eval_result in eval_results]
-
-        # Include only item scores in dict returned
-        else:
-            if return_output:
-                return [item for eval_result in eval_results for item in eval_result.item_scores]
-            else:
-                return [
-                    {k: v for k, v in item.items() if k != "inference_output"}
-                    for eval_result in eval_results
-                    for item in eval_result.item_scores
-                ]
-
-    # Return results as an EvalResult object
-    else:
-        out: Dict[str, List[EvalResult]] = {}
-        for er in eval_results:
-            out.setdefault(er.eval_dataset.name, []).append(er)
-        return out
scorebook/types/__init__.py
DELETED
@@ -1,12 +0,0 @@
-"""
-Types package containing data structures and type definitions for the Scorebook framework.
-
-This module provides core data types used throughout the framework for dataset handling
-and evaluation results.
-"""
-
-from scorebook.types.eval_dataset import EvalDataset
-from scorebook.types.eval_result import EvalResult
-from scorebook.types.eval_run_spec import EvalRunSpec
-
-__all__ = ["EvalDataset", "EvalResult", "EvalRunSpec"]
scorebook/types/eval_result.py
DELETED
@@ -1,133 +0,0 @@
-"""
-This module defines the data structures used to represent evaluation results.
-
-including individual prediction outcomes and aggregated dataset metrics.
-"""
-
-import csv
-import json
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Dict, List
-
-from scorebook.types.eval_dataset import EvalDataset
-
-
-@dataclass
-class EvalResult:
-    """
-    Container for evaluation results from an entire dataset.
-
-    Attributes:
-        eval_dataset: The dataset used for evaluation.
-        inference_outputs: A list of model predictions or outputs.
-        metric_scores: A dictionary mapping metric names to their scores.
-    """
-
-    eval_dataset: EvalDataset
-    inference_outputs: List[Any]
-    metric_scores: Dict[str, Dict[str, Any]]
-    hyperparams: Dict[str, Any]
-
-    @property
-    def item_scores(self) -> List[Dict[str, Any]]:
-        """Return a list of dictionaries containing scores for each evaluated item."""
-        results = []
-        metric_names = list(self.metric_scores.keys()) if self.metric_scores else []
-
-        for idx, item in enumerate(self.eval_dataset.items):
-            if idx >= len(self.inference_outputs):
-                break
-
-            result = {
-                "item_id": idx,
-                "dataset_name": self.eval_dataset.name,
-                "inference_output": self.inference_outputs[idx],
-                **{
-                    metric: self.metric_scores[metric]["item_scores"][idx]
-                    for metric in metric_names
-                },
-                **self.hyperparams,
-            }
-            results.append(result)
-
-        return results
-
-    @property
-    def aggregate_scores(self) -> Dict[str, Any]:
-        """Return the aggregated scores across all evaluated items."""
-        result: Dict[str, Any] = {"dataset_name": self.eval_dataset.name}
-        if not self.metric_scores:
-            return result
-
-        for metric, scores in self.metric_scores.items():
-            # Flatten the aggregate scores from each metric into the result
-            result.update(
-                {
-                    key if key == metric else f"{metric}_{key}": value
-                    for key, value in scores["aggregate_scores"].items()
-                }
-            )
-        for hyperparam, value in self.hyperparams.items():
-            result[hyperparam] = value
-        return result
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Return a dictionary representing the evaluation results."""
-        return {
-            "aggregate_results": [
-                {
-                    **getattr(self.eval_dataset, "hyperparams", {}),
-                    **self.aggregate_scores,
-                }
-            ],
-            "item_results": [item for item in self.item_scores],
-        }
-
-    def to_csv(self, file_path: str) -> None:
-        """Save evaluation results to a CSV file.
-
-        The CSV will contain item-level results.
-        """
-        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
-
-        with open(file_path, "w", newline="") as f:
-            writer = csv.writer(f)
-
-            # Write a header with all possible metric names
-            item_fields = list(self.eval_dataset.items[0].keys()) if self.eval_dataset.items else []
-            metric_names = list(self.metric_scores.keys()) if self.metric_scores else []
-            header = ["item_id"] + item_fields + ["inference_output"] + metric_names
-            writer.writerow(header)
-
-            # Write item data
-            for idx, item in enumerate(self.eval_dataset.items):
-                if idx >= len(self.inference_outputs):
-                    break
-
-                row = (
-                    [idx]
-                    + list(item.values())
-                    + [self.inference_outputs[idx]]
-                    + [self.metric_scores[metric]["item_scores"][idx] for metric in metric_names]
-                )
-                writer.writerow(row)
-
-    def to_json(self, file_path: str) -> None:
-        """Save evaluation results to a JSON file in a structured format.
-
-        The JSON file will contain both aggregate & item results, produced by the to_dict() method.
-        """
-        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
-        with open(file_path, "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
-
-    def __str__(self) -> str:
-        """Return a formatted string representation of the evaluation results."""
-        result = [
-            f"Eval Dataset: {self.eval_dataset.name}",
-            "\nAggregate Scores:",
-        ]
-        for metric_name, score in self.aggregate_scores.items():
-            result.append(f"\n  {metric_name}: {score:.4f}")
-        return "".join(result)
scorebook/types/eval_run_spec.py
DELETED
@@ -1,28 +0,0 @@
-"""Evaluation run specification types for Scorebook."""
-
-from typing import Any, Dict, List, NamedTuple
-
-from scorebook.types import EvalDataset
-
-
-class EvalRunSpec(NamedTuple):
-    """Represents a single evaluation run configuration."""
-
-    dataset_idx: int
-    eval_dataset: EvalDataset
-    items: List[Dict[str, Any]]
-    labels: List[Any]
-    hyperparams: Dict[str, Any]
-    hp_idx: int
-
-    def __str__(self) -> str:
-        """Return a formatted string summary of the evaluation run specification."""
-        hyperparams_str = ", ".join([f"{k}={v}" for k, v in self.hyperparams.items()])
-
-        return (
-            f"EvalRunSpec(dataset_idx={self.dataset_idx},"
-            f" hp_idx={self.hp_idx},"
-            f" dataset_name='{self.eval_dataset.name}',"
-            f" hyperparams=[{hyperparams_str}]"
-            f")"
-        )
scorebook/utils/logging_utils.py
DELETED
@@ -1 +0,0 @@
-"""Logging utilities for Scorebook evaluation framework."""
/scorebook/{types/inference_pipeline.py → inference_pipeline.py}
File without changes

{scorebook-0.0.4.dist-info → scorebook-0.0.6.dist-info}/LICENSE
File without changes

{scorebook-0.0.4.dist-info → scorebook-0.0.6.dist-info}/WHEEL
File without changes