scorebook 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +11 -4
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/_async/evaluate_async.py +135 -130
- scorebook/evaluate/_sync/evaluate.py +135 -131
- scorebook/evaluate/evaluate_helpers.py +46 -23
- scorebook/exceptions.py +54 -2
- scorebook/inference/clients/bedrock.py +1 -1
- scorebook/inference/clients/portkey.py +1 -1
- scorebook/inference/clients/vertex.py +1 -1
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/settings.py +3 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +36 -54
- scorebook/utils/__init__.py +11 -4
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +819 -70
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/METADATA +3 -2
- scorebook-0.0.12.dist-info/RECORD +50 -0
- scorebook/eval_dataset.py +0 -404
- scorebook-0.0.10.dist-info/RECORD +0 -41
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0
scorebook/eval_datasets/eval_dataset.py (new file)
@@ -0,0 +1,719 @@
"""Evaluation Dataset implementation for scorebook."""

import csv
import json
import random
from typing import Any, Dict, Iterator, List, Optional, Type, Union

import yaml
from datasets import Dataset as HuggingFaceDataset
from datasets import DatasetDict as HuggingFaceDatasetDict
from datasets import load_dataset

from scorebook.exceptions import (
    DatasetConfigurationError,
    DatasetLoadError,
    DatasetNotInitializedError,
    DatasetParseError,
    DatasetSampleError,
    MissingFieldError,
)
from scorebook.metrics import MetricBase, MetricRegistry
from scorebook.utils import render_template, validate_path


class EvalDataset:
    """Evaluation Dataset for model evaluation and scoring.

    An evaluation dataset defines explicit input and label features.
    During evaluation, each input is passed to the model, and the resulting
    output is compared against the corresponding label using the configured
    metrics.

    Do not instantiate directly. Use a factory constructor:
    - from_list
    - from_csv
    - from_json
    - from_huggingface
    - from_yaml

    Attributes:
        name: Human-readable dataset name.
        metrics: List of MetricBase instances used for scoring.
        input: Column name used as the model input.
        label: Column name used as the ground-truth label.
        input_template: Optional Jinja2 template that renders the input from item features.
        label_template: Optional Jinja2 template that renders the label from item features.
    """

    def __init__(
        self,
        name: str,
        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
        hf_dataset: HuggingFaceDataset,
        input: Optional[str] = None,
        label: Optional[str] = None,
        input_template: Optional[str] = None,
        label_template: Optional[str] = None,
    ):
        """Create a new scorebook evaluation dataset instance.

        Args:
            name: The name of the evaluation dataset.
            metrics: The metrics used for scoring.
            hf_dataset: Evaluation items.
            input: Dataset feature containing input values.
            label: Dataset feature containing label values.
            input_template: Jinja2 template for input.
            label_template: Jinja2 template for label.

        Raises:
            DatasetConfigurationError:
                If both/neither of input and input_template,
                or both/neither of label and label_template are provided.
            MissingFieldError:
                If the resolved input or label column is not present in the HF dataset.
        """

        # Validate mutual exclusivity for input and input_template
        if (input is None) == (input_template is None):
            raise DatasetConfigurationError(
                "Exactly one of 'input' or 'input_template' must be provided, not both or neither."
            )

        # Validate mutual exclusivity for label and label_template
        if (label is None) == (label_template is None):
            raise DatasetConfigurationError(
                "Exactly one of 'label' or 'label_template' must be provided, not both or neither."
            )

        # Determine the feature to be used as inputs
        input_column: str = (
            "*input" if input_template is not None else input  # type: ignore[assignment]
        )

        # Determine the feature to be used as labels
        label_column: str = (
            "*label" if label_template is not None else label  # type: ignore[assignment]
        )

        # Validate that dataset has the required columns
        column_names = list(hf_dataset.column_names)
        actual_columns = set(column_names)

        if input_column not in actual_columns:
            raise MissingFieldError(
                field_name=input_column, field_type="input", available_fields=column_names
            )

        if label_column not in actual_columns:
            raise MissingFieldError(
                field_name=label_column, field_type="label", available_fields=column_names
            )

        self.name: str = name
        self.metrics: List[MetricBase] = self._resolve_metrics(metrics)
        self._hf_dataset: Optional[HuggingFaceDataset] = hf_dataset

        # Store which columns to use for input/label
        self.input: str = input_column
        self.label: str = label_column

        # Store templates for transparency (optional, for debugging)
        self.input_template: Optional[str] = input_template
        self.label_template: Optional[str] = label_template

    @property
    def items(self) -> List[Any]:
        """Return a list of all examples in the dataset."""
        if self._hf_dataset is None:
            raise DatasetNotInitializedError("Dataset is not initialized")
        return list(self._hf_dataset)

    @property
    def column_names(self) -> List[str]:
        """Return a list of column/feature names available in the dataset."""
        if self._hf_dataset is None:
            raise DatasetNotInitializedError("Dataset is not initialized")
        return list(map(str, self._hf_dataset.column_names))

    def shuffle(self) -> None:
        """Randomly shuffle the dataset items."""
        if self._hf_dataset is None:
            raise DatasetNotInitializedError("Dataset is not initialized")
        self._hf_dataset.shuffle()

    def sample(self, sample_size: int) -> "EvalDataset":
        """Create a new dataset with randomly sampled items from this dataset.

        Args:
            sample_size: The number of items to sample from the dataset.

        Returns:
            A new EvalDataset with randomly sampled items.

        Raises:
            DatasetSampleError: If the sample size is larger than the dataset size.
        """

        # Validate requested sample size against available items
        dataset_size = len(self.items)
        if sample_size > dataset_size:
            raise DatasetSampleError(
                sample_size=sample_size, dataset_size=dataset_size, dataset_name=self.name
            )

        # Create randomly sampled items
        sampled_items = random.sample(self.items, sample_size)

        # Create HuggingFace dataset from sampled items
        sampled_hf_dataset = HuggingFaceDataset.from_list(sampled_items)

        # Preserve original input/label spec; omit field names when templates are used
        input_param = None if self.input_template else self.input
        label_param = None if self.label_template else self.label

        return EvalDataset(
            name=self.name,
            metrics=self.metrics,
            hf_dataset=sampled_hf_dataset,
            input=input_param,
            label=label_param,
            input_template=self.input_template,
            label_template=self.label_template,
        )

    # === Factory Methods ===

    @classmethod
    def from_list(
        cls,
        name: str,
        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
        items: List[Dict[str, Any]],
        input: str,
        label: str,
    ) -> "EvalDataset":
        """Instantiate an EvalDataset from a list of dictionaries.

        Args:
            name: The name of the evaluation dataset.
            metrics: The specified metrics associated with the dataset.
            items: List of dictionaries containing the dataset examples.
            input: The field name containing the input data.
            label: The field name containing the label.

        Returns:
            A scorebook EvalDataset.

        Raises:
            MissingFieldError: If the input or label feature is not present in the first item.
        """

        if items and items[0]:
            available_fields = list(items[0].keys())

            # Raise an error if the input feature is missing from the first item
            if input not in items[0]:
                raise MissingFieldError(
                    field_name=input, field_type="input", available_fields=available_fields
                )

            # Raise an error if the label feature is missing from the first item
            if label not in items[0]:
                raise MissingFieldError(
                    field_name=label, field_type="label", available_fields=available_fields
                )

        return cls(
            name=name,
            metrics=metrics,
            hf_dataset=HuggingFaceDataset.from_list(items),
            input=input,
            label=label,
            input_template=None,
            label_template=None,
        )

    @classmethod
    def from_csv(
        cls,
        path: str,
        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
        input: str,
        label: str,
        name: Optional[str] = None,
        encoding: str = "utf-8",
        newline: str = "",
        **reader_kwargs: Any,
    ) -> "EvalDataset":
        """Instantiate an EvalDataset from a CSV file.

        Args:
            path: Path to the CSV file.
            metrics: The specified metrics associated with the dataset.
            input: The field name containing the input data.
            label: The field name containing the label.
            name: Optional name for the eval dataset; if not provided, the path is used.
            encoding: Encoding of the CSV file.
            newline: Newline character of the CSV file.
            reader_kwargs: Dict of kwargs passed to csv.DictReader.

        Returns:
            A scorebook EvalDataset.

        Raises:
            DatasetParseError: If CSV parsing fails.
            DatasetLoadError: If the CSV file does not contain evaluation items.
            MissingFieldError: If the input or label feature is not present in the first item.
        """
        reader_kwargs = reader_kwargs or {}
        validated_path = validate_path(path, expected_suffix=".csv")

        try:
            with open(validated_path, encoding=encoding, newline=newline) as csvfile:
                items = list(csv.DictReader(csvfile, **reader_kwargs))
        except csv.Error as e:
            raise DatasetParseError(f"Failed to parse CSV file {path}: {e}") from e

        if not items:
            raise DatasetLoadError(f"CSV file {path} is empty or contains only headers.")

        available_fields = list(items[0].keys())
        if input not in items[0]:
            raise MissingFieldError(
                field_name=input, field_type="input", available_fields=available_fields
            )
        if label not in items[0]:
            raise MissingFieldError(
                field_name=label, field_type="label", available_fields=available_fields
            )

        name = name if name else validated_path.stem
        return cls(
            name=name,
            metrics=metrics,
            hf_dataset=HuggingFaceDataset.from_list(items),
            input=input,
            label=label,
            input_template=None,
            label_template=None,
        )

    @classmethod
    def from_json(
        cls,
        path: str,
        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
        input: str,
        label: str,
        name: Optional[str] = None,
        split: Optional[str] = None,
    ) -> "EvalDataset":
        """Instantiate an EvalDataset from a JSON file.

        The JSON file must follow one of two supported formats:

        1. Flat format – a list of dictionaries:
            [
                {"input": ..., "output": ...},
                {"input": ..., "output": ...},
            ]

        2. Split format – a dictionary of named splits:
            {
                "train": [{"input": ..., "output": ...}],
                "test": [{"input": ..., "output": ...}]
            }

        Args:
            path: Path to the JSON file on disk.
            metrics: The specified metrics associated with the dataset.
            input: The field name containing the input data.
            label: The field name containing the label.
            name: Optional name for the eval dataset; if not provided, the path is used.
            split: If the JSON uses a split structure, this is the split name to load.

        Returns:
            A scorebook EvalDataset.

        Raises:
            DatasetParseError: If JSON parsing fails.
            DatasetConfigurationError: If an invalid split is provided.
            MissingFieldError: If the input or label feature is not present in the first item.
        """
        validated_path = validate_path(path, expected_suffix=".json")

        try:
            with validated_path.open("r", encoding="utf-8") as f:
                json_data = json.load(f)
        except json.JSONDecodeError as e:
            raise DatasetParseError(f"Invalid JSON in {path}: {e}") from e

        if isinstance(json_data, dict):

            if split is None:
                raise DatasetConfigurationError(
                    f"Split name must be provided for split-style JSON: {path}"
                )

            items = json_data.get(split)
            if items is None:
                raise DatasetConfigurationError(f"Split '{split}' not found in JSON file: {path}")
            if not isinstance(items, list):
                raise DatasetConfigurationError(
                    f"Split '{split}' is not a list of examples in: {path}"
                )

        elif isinstance(json_data, list):
            items = json_data

        else:
            raise DatasetConfigurationError(
                f"Unsupported JSON structure in {path}. Expected list or dict."
            )

        # Validate that fields exist
        if items and items[0]:
            available_fields = list(items[0].keys())
            if input not in items[0]:
                raise MissingFieldError(
                    field_name=input, field_type="input", available_fields=available_fields
                )
            if label not in items[0]:
                raise MissingFieldError(
                    field_name=label, field_type="label", available_fields=available_fields
                )

        name = name if name else validated_path.stem
        return cls(
            name=name,
            metrics=metrics,
            hf_dataset=HuggingFaceDataset.from_list(items),
            input=input,
            label=label,
            input_template=None,
            label_template=None,
        )

    @classmethod
    def from_huggingface(
        cls,
        path: str,
        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
        input: Optional[str] = None,
        input_template: Optional[str] = None,
        label: Optional[str] = None,
        label_template: Optional[str] = None,
        name: Optional[str] = None,
        split: Optional[str] = None,
        config: Optional[str] = None,
    ) -> "EvalDataset":
        """Instantiate an EvalDataset from a dataset available on Hugging Face Hub.

        If a specific split is provided (e.g., "train" or "test"), it will be loaded directly.
        If no split is specified, the method attempts to load the full dataset. If the dataset
        is split into multiple subsets (i.e., a DatasetDict), it defaults to loading the "test"
        split.

        For datasets where the input/label is already in a single column, use the input/label
        parameters to specify the feature names. For datasets where the input/label needs to be
        constructed from multiple columns, use the input_template/label_template parameters
        with Jinja2 template strings.

        Args:
            path: The path of the dataset on the Hugging Face Hub.
            metrics: The specified metrics associated with the dataset.
            input: Field name containing the input data (mutually exclusive with input_template).
            input_template:
                Jinja2 template to construct the input from multiple fields
                (mutually exclusive with input).
            label: Field name containing the label
                (mutually exclusive with label_template).
            label_template:
                Jinja2 template to construct the label from multiple fields
                (mutually exclusive with label).
            name: Optional name for the eval dataset; by default HF "path:split:config" is used.
            split: Optional name of the split to load.
            config: Optional dataset configuration name.

        Returns:
            A scorebook EvalDataset.

        Raises:
            DatasetConfigurationError:
                If both/neither of input and input_template,
                or both/neither of label and label_template are provided.
            DatasetLoadError: If the HF dataset cannot be loaded.
        """

        # Validate mutual exclusivity for input and input_template
        if (input is None) == (input_template is None):
            raise DatasetConfigurationError(
                "Exactly one of 'input' or 'input_template' must be provided, not both or neither."
            )

        # Validate mutual exclusivity for label and label_template
        if (label is None) == (label_template is None):
            raise DatasetConfigurationError(
                "Exactly one of 'label' or 'label_template' must be provided, not both or neither."
            )

        try:
            kwargs = {}
            if split is not None:
                kwargs["split"] = split
            if config is not None:
                kwargs["name"] = (
                    config  # Hugging Face's load_dataset method param for config is "name"
                )
            ds = load_dataset(path, **kwargs)
        except Exception as e:
            raise DatasetLoadError(f"Failed to load dataset '{path}' from Hugging Face: {e}") from e

        if isinstance(ds, HuggingFaceDataset):
            hf_dataset = ds
        elif isinstance(ds, HuggingFaceDatasetDict):
            if "test" in ds:
                hf_dataset = ds["test"]
            else:
                raise DatasetConfigurationError(
                    f"Split not specified and no 'test' split found in dataset '{path}'."
                )
        else:
            raise DatasetConfigurationError(f"Unexpected dataset type for '{path}': {type(ds)}")

        # Only transform if templates are used
        if input_template is not None or label_template is not None:

            def transform_row(row: Dict[str, Any]) -> Dict[str, Any]:
                """Add computed columns (*input, *label) when templates are used."""
                # Start with all original data
                result = dict(row)

                # Add *input if template is used
                if input_template is not None:
                    result["*input"] = render_template(input_template, row)

                # Add *label if template is used
                if label_template is not None:
                    result["*label"] = render_template(label_template, row)

                return result

            transformed_dataset = hf_dataset.map(transform_row)
        else:

            transformed_dataset = hf_dataset

        dataset_name = name if name else ":".join(filter(None, [path, split, config]))
        return cls(
            name=dataset_name,
            metrics=metrics,
            hf_dataset=transformed_dataset,
            input=input,
            label=label,
            input_template=input_template,
            label_template=label_template,
        )

    @classmethod
    def from_yaml(cls, path: str) -> "EvalDataset":
        r"""Instantiate an EvalDataset from Hugging Face using a YAML config file.

        The YAML file should contain configuration for loading a dataset from Hugging Face.

        Required fields:
            - path: Hugging Face dataset path
            - name: Name for the evaluation dataset
            - metrics: List of metrics to evaluate

        The input / label features must be specified / constructed by one of the following:

        1. Feature Specification:
            input: "question"
            label: "answer"

        2. Mapping Templates:
            templates:
                input: "{{ question }}\nOptions: {{ options }}"
                label: "{{ answer }}"

        Optional fields:
            - split: Dataset split to load.
            - config: Dataset configuration name.
            - metadata: Any additional metadata.

        Args:
            path: The path of the YAML configuration file.

        Returns:
            An EvalDataset.

        Raises:
            DatasetParseError: If the YAML configuration file cannot be parsed.
            DatasetConfigurationError: If the YAML configuration is invalid.
        """
        validated_path = validate_path(path, expected_suffix=(".yaml", ".yml"))

        try:
            with validated_path.open("r", encoding="utf-8") as f:
                yaml_config = yaml.safe_load(f)
        except yaml.YAMLError as e:
            raise DatasetParseError(f"Invalid YAML in {path}: {e}") from e

        # Validate required fields
        required_fields = ["path", "name", "metrics"]
        missing_fields = [field for field in required_fields if field not in yaml_config]
        if missing_fields:
            raise DatasetConfigurationError(
                f"Missing required fields in YAML config: {', '.join(missing_fields)}"
            )

        # Validate metrics exist before calling from_huggingface
        metrics_to_validate = yaml_config["metrics"]
        if not isinstance(metrics_to_validate, list):
            metrics_to_validate = [metrics_to_validate]

        for metric in metrics_to_validate:
            try:
                MetricRegistry.get(metric)
            except Exception as e:
                raise DatasetConfigurationError(f"Invalid metric '{metric}' in YAML config: {e}")

        # Determine input/label specification
        has_templates = "templates" in yaml_config
        has_direct_input = "input" in yaml_config
        has_direct_label = "label" in yaml_config

        # Validate that we have proper input/label specification
        if has_templates:
            templates = yaml_config["templates"]
            if not isinstance(templates, dict):
                raise DatasetConfigurationError("'templates' must be a dictionary")
            if "input" not in templates or "label" not in templates:
                raise DatasetConfigurationError(
                    "'templates' must contain both 'input' and 'label' keys"
                )
            if has_direct_input or has_direct_label:
                raise DatasetConfigurationError(
                    "Cannot specify both 'templates' and direct 'input'/'label' fields"
                )
            input_template = templates["input"]
            label_template = templates["label"]
            input_field = None
            label_field = None
        else:
            if not has_direct_input or not has_direct_label:
                raise DatasetConfigurationError(
                    "Must specify either 'templates' or both 'input' and 'label' fields"
                )
            input_field = yaml_config["input"]
            label_field = yaml_config["label"]
            input_template = None
            label_template = None

        # Load the dataset from Hugging Face
        return cls.from_huggingface(
            path=yaml_config["path"],
            metrics=yaml_config["metrics"],
            input=input_field,
            input_template=input_template,
            label=label_field,
            label_template=label_template,
            name=yaml_config.get("name"),
            split=yaml_config.get("split"),
            config=yaml_config.get("config"),
        )

    # === Helper Methods ===

    @staticmethod
    def _resolve_metrics(
        metrics: Union[
            str, Type[MetricBase], MetricBase, List[Union[str, Type[MetricBase], MetricBase]]
        ]
    ) -> List[MetricBase]:
        """Normalize the metrics parameter to a list of MetricBase instances."""

        if not isinstance(metrics, list):
            metrics = [metrics]

        resolved: List[MetricBase] = []
        for m in metrics:
            if isinstance(m, MetricBase):
                resolved.append(m)  # Already an instance
            else:
                resolved.append(MetricRegistry.get(m))  # Use registry for str or class

        return resolved

    # === Dunder Methods ===

    def __len__(self) -> int:
        """Return the number of items in the dataset."""
        if self._hf_dataset is None:
            raise DatasetNotInitializedError("Dataset is not initialized")
        return len(self._hf_dataset)

    def __getitem__(self, key: Union[int, str]) -> Union[Dict[str, Any], List[Any]]:
        """
        Allow item access by index (int) or by column name (str).

        - eval_dataset[i] returns the i-th example (dict).
        - eval_dataset["feature"] returns a list of values for that feature.
        """
        if self._hf_dataset is None:
            raise DatasetNotInitializedError("Dataset is not initialized")
        if isinstance(key, int):
            return dict(self._hf_dataset[key])  # Ensure we return a Dict[str, Any]
        elif isinstance(key, str):
            return list(self._hf_dataset[key])  # Ensure we return a List[Any]
        else:
            raise TypeError(f"Invalid key type: {type(key)}. Must be int or str.")

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Return an iterator over all examples in the dataset."""
        if self._hf_dataset is None:
            raise DatasetNotInitializedError("Dataset is not initialized")
        return iter(self._hf_dataset)

    def __str__(self) -> str:
        """Return a formatted string summary of the evaluation dataset."""
        if self._hf_dataset is None:
            return f"EvalDataset(name='{self.name}', status='uninitialized')"

        num_rows = len(self._hf_dataset)
        fields = ", ".join(self.column_names)
        metrics = ", ".join([metric.name for metric in self.metrics])

        # Build template info string
        template_info = []
        if self.input_template:
            template_preview = (
                self.input_template[:40] + "..."
                if len(self.input_template) > 40
                else self.input_template
            )
            template_info.append(f"input_template='{template_preview}'")

        if self.label_template:
            template_preview = (
                self.label_template[:40] + "..."
                if len(self.label_template) > 40
                else self.label_template
            )
            template_info.append(f"label_template='{template_preview}'")

        template_str = ", " + ", ".join(template_info) if template_info else ""

        return (
            f"EvalDataset(\n"
            f"    name='{self.name}',\n"
            f"    rows={num_rows},\n"
            f"    fields=[{fields}],\n"
            f"    metrics=[{metrics}],\n"
            f"    input='{self.input}',\n"
            f"    label='{self.label}'{template_str}\n"
            f")"
        )