scorebook 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

@@ -0,0 +1,719 @@
1
+ """Evaluation Dataset implementation for scorebook."""
2
+
3
+ import csv
4
+ import json
5
+ import random
6
+ from typing import Any, Dict, Iterator, List, Optional, Type, Union
7
+
8
+ import yaml
9
+ from datasets import Dataset as HuggingFaceDataset
10
+ from datasets import DatasetDict as HuggingFaceDatasetDict
11
+ from datasets import load_dataset
12
+
13
+ from scorebook.exceptions import (
14
+ DatasetConfigurationError,
15
+ DatasetLoadError,
16
+ DatasetNotInitializedError,
17
+ DatasetParseError,
18
+ DatasetSampleError,
19
+ MissingFieldError,
20
+ )
21
+ from scorebook.metrics import MetricBase, MetricRegistry
22
+ from scorebook.utils import render_template, validate_path
23
+
24
+
25
+ class EvalDataset:
26
+ """Evaluation Dataset for model evaluation and scoring.
27
+
28
+ An evaluation dataset defines explicit input and label features.
29
+ During evaluation, each input is passed to the model,
30
+ and the resulting output is compared against the
31
+ corresponding label using the configured metrics.
32
+
33
+ Do not instantiate directly. Use a factory constructor:
34
+ - from_list
35
+ - from_csv
36
+ - from_json
37
+ - from_huggingface
38
+ - from_yaml
39
+
40
+ Attributes:
41
+ name: Human-readable dataset name.
42
+ metrics: List of MetricBase instances used for scoring.
43
+ input: Column name used as the model input.
44
+ label: Column name used as the ground-truth label.
45
+ input_template: Optional Jinja2 template that renders the input from item features.
46
+ label_template: Optional Jinja2 template that renders the label from item features.
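+
+ Example (illustrative sketch; assumes an "accuracy" metric is registered
+ with MetricRegistry):
+ >>> dataset = EvalDataset.from_list(
+ ... name="sentiment",
+ ... metrics="accuracy",
+ ... items=[{"text": "Great movie!", "sentiment": "positive"}],
+ ... input="text",
+ ... label="sentiment",
+ ... )
+ >>> dataset.input, dataset.label
+ ('text', 'sentiment')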
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ name: str,
52
+ metrics: Union[str, Type[MetricBase], MetricBase, List[Union[str, Type[MetricBase], MetricBase]]],
53
+ hf_dataset: HuggingFaceDataset,
54
+ input: Optional[str] = None,
55
+ label: Optional[str] = None,
56
+ input_template: Optional[str] = None,
57
+ label_template: Optional[str] = None,
58
+ ):
59
+ """Create a new scorebook evaluation dataset instance.
60
+
61
+ Args:
62
+ name: The name of the evaluation dataset.
63
+ metrics: The metrics used for scoring.
64
+ hf_dataset: Hugging Face dataset containing the evaluation items.
65
+ input: Dataset feature containing input values.
66
+ label: Dataset feature containing label values.
67
+ input_template: Jinja2 template for input.
68
+ label_template: Jinja2 template for label.
69
+
70
+ Raises:
71
+ DatasetConfigurationError:
72
+ If both/neither of input and input_template,
73
+ or both/neither of label and label_template are provided.
74
+ MissingFieldError:
75
+ If the resolved input or label column is not present in the HF dataset.
76
+ """
77
+
78
+ # Validate mutual exclusivity for input and input_template
79
+ if (input is None) == (input_template is None):
80
+ raise DatasetConfigurationError(
81
+ "Exactly one of 'input' or 'input_template' must be provided, not both or neither."
82
+ )
83
+
84
+ # Validate mutual exclusivity for label and label_template
85
+ if (label is None) == (label_template is None):
86
+ raise DatasetConfigurationError(
87
+ "Exactly one of 'label' or 'label_template' must be provided, not both or neither."
88
+ )
89
+
90
+ # Determine the feature to be used as inputs
91
+ input_column: str = (
92
+ "*input" if input_template is not None else input # type: ignore[assignment]
93
+ )
94
+
95
+ # Determine the feature to be used as labels
96
+ label_column: str = (
97
+ "*label" if label_template is not None else label # type: ignore[assignment]
98
+ )
99
+
100
+ # Validate that dataset has the required columns
101
+ column_names = list(hf_dataset.column_names)
102
+ actual_columns = set(column_names)
103
+
104
+ if input_column not in actual_columns:
105
+ raise MissingFieldError(
106
+ field_name=input_column, field_type="input", available_fields=column_names
107
+ )
108
+
109
+ if label_column not in actual_columns:
110
+ raise MissingFieldError(
111
+ field_name=label_column, field_type="label", available_fields=column_names
112
+ )
113
+
114
+ self.name: str = name
115
+ self.metrics: List[MetricBase] = self._resolve_metrics(metrics)
116
+ self._hf_dataset: Optional[HuggingFaceDataset] = hf_dataset
117
+
118
+ # Store which columns to use for input/label
119
+ self.input: str = input_column
120
+ self.label: str = label_column
121
+
122
+ # Store templates for transparency (optional, for debugging)
123
+ self.input_template: Optional[str] = input_template
124
+ self.label_template: Optional[str] = label_template
125
+
126
+ @property
127
+ def items(self) -> List[Any]:
128
+ """Return a list of all examples in the dataset."""
129
+ if self._hf_dataset is None:
130
+ raise DatasetNotInitializedError("Dataset is not initialized")
131
+ return list(self._hf_dataset)
132
+
133
+ @property
134
+ def column_names(self) -> List[str]:
135
+ """Return a list of column/feature names available in the dataset."""
136
+ if self._hf_dataset is None:
137
+ raise DatasetNotInitializedError("Dataset is not initialized")
138
+ return list(map(str, self._hf_dataset.column_names))
139
+
140
+ def shuffle(self) -> None:
141
+ """Randomly shuffle the dataset items."""
142
+ if self._hf_dataset is None:
143
+ raise DatasetNotInitializedError("Dataset is not initialized")
144
+ self._hf_dataset = self._hf_dataset.shuffle()
145
+
146
+ def sample(self, sample_size: int) -> "EvalDataset":
147
+ """Create a new dataset with randomly sampled items from this dataset.
148
+
149
+ Args:
150
+ sample_size: The number of items to sample from the dataset.
151
+
152
+ Returns:
153
+ A new EvalDataset with randomly sampled items.
154
+
155
+ Raises:
156
+ DatasetSampleError: If the requested sample size exceeds the dataset size.
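+
+ Example (illustrative; assumes ``dataset`` is an EvalDataset with at least 100 items):
+ >>> subset = dataset.sample(100)
+ >>> len(subset)
+ 100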
157
+ """
158
+
159
+ # Validate requested sample size against available items
160
+ dataset_size = len(self.items)
161
+ if sample_size > dataset_size:
162
+ raise DatasetSampleError(
163
+ sample_size=sample_size, dataset_size=dataset_size, dataset_name=self.name
164
+ )
165
+
166
+ # Create randomly sampled items
167
+ sampled_items = random.sample(self.items, sample_size)
168
+
169
+ # Create HuggingFace dataset from sampled items
170
+ sampled_hf_dataset = HuggingFaceDataset.from_list(sampled_items)
171
+
172
+ # Preserve original input/label spec; omit field names when templates are used
173
+ input_param = None if self.input_template else self.input
174
+ label_param = None if self.label_template else self.label
175
+
176
+ return EvalDataset(
177
+ name=self.name,
178
+ metrics=self.metrics,
179
+ hf_dataset=sampled_hf_dataset,
180
+ input=input_param,
181
+ label=label_param,
182
+ input_template=self.input_template,
183
+ label_template=self.label_template,
184
+ )
185
+
186
+ # === Factory Methods ===
187
+
188
+ @classmethod
189
+ def from_list(
190
+ cls,
191
+ name: str,
192
+ metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
193
+ items: List[Dict[str, Any]],
194
+ input: str,
195
+ label: str,
196
+ ) -> "EvalDataset":
197
+ """Instantiate an EvalDataset from a list of dictionaries.
198
+
199
+ Args:
200
+ name: The name of the evaluation dataset.
201
+ metrics: The specified metrics associated with the dataset.
202
+ items: List of dictionaries containing the dataset examples.
203
+ input: The field name containing the input data.
204
+ label: The field name containing the label.
205
+
206
+ Returns:
207
+ A scorebook EvalDataset.
208
+
209
+ Raises:
210
+ MissingFieldError: If the input or label feature is not present in the first item.
211
+ """
212
+
213
+ if items and items[0]:
214
+ available_fields = list(items[0].keys())
215
+
216
+ # Raise an error if the input feature is missing from the first item
217
+ if input not in items[0]:
218
+ raise MissingFieldError(
219
+ field_name=input, field_type="input", available_fields=available_fields
220
+ )
221
+
222
+ # Raise an error if the label feature is missing from the first item
223
+ if label not in items[0]:
224
+ raise MissingFieldError(
225
+ field_name=label, field_type="label", available_fields=available_fields
226
+ )
227
+
228
+ return cls(
229
+ name=name,
230
+ metrics=metrics,
231
+ hf_dataset=HuggingFaceDataset.from_list(items),
232
+ input=input,
233
+ label=label,
234
+ input_template=None,
235
+ label_template=None,
236
+ )
237
+
238
+ @classmethod
239
+ def from_csv(
240
+ cls,
241
+ path: str,
242
+ metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
243
+ input: str,
244
+ label: str,
245
+ name: Optional[str] = None,
246
+ encoding: str = "utf-8",
247
+ newline: str = "",
248
+ **reader_kwargs: Any,
249
+ ) -> "EvalDataset":
250
+ """Instantiate an EvalDataset from a CSV file.
251
+
252
+ Args:
253
+ path: Path to the CSV file.
254
+ metrics: The specified metrics associated with the dataset.
255
+ input: The field name containing the input data.
256
+ label: The field name containing the label.
257
+ name: Optional name for the eval dataset; if not provided, the file stem is used.
258
+ encoding: Encoding of the CSV file.
259
+ newline: Newline character of the CSV file.
260
+ reader_kwargs: Dict of kwargs passed to csv.DictReader.
261
+
262
+ Returns:
263
+ A scorebook EvalDataset.
264
+
265
+ Raises:
266
+ DatasetParseError: If CSV parsing fails.
267
+ DatasetLoadError: If the CSV file does not contain evaluation items.
268
+ MissingFieldError: If the input or label feature is not present in the first item.
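+
+ Example (illustrative sketch; "reviews.csv", its "text"/"sentiment" columns,
+ and the "accuracy" metric are placeholders):
+ >>> dataset = EvalDataset.from_csv(
+ ... path="reviews.csv",
+ ... metrics="accuracy",
+ ... input="text",
+ ... label="sentiment",
+ ... )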
269
+ """
270
+ reader_kwargs = reader_kwargs or {}
271
+ validated_path = validate_path(path, expected_suffix=".csv")
272
+
273
+ try:
274
+ with open(validated_path, encoding=encoding, newline=newline) as csvfile:
275
+ items = list(csv.DictReader(csvfile, **reader_kwargs))
276
+ except csv.Error as e:
277
+ raise DatasetParseError(f"Failed to parse CSV file {path}: {e}") from e
278
+
279
+ if not items:
280
+ raise DatasetLoadError(f"CSV file {path} is empty or contains only headers.")
281
+
282
+ available_fields = list(items[0].keys())
283
+ if input not in items[0]:
284
+ raise MissingFieldError(
285
+ field_name=input, field_type="input", available_fields=available_fields
286
+ )
287
+ if label not in items[0]:
288
+ raise MissingFieldError(
289
+ field_name=label, field_type="label", available_fields=available_fields
290
+ )
291
+
292
+ name = name if name else validated_path.stem
293
+ return cls(
294
+ name=name,
295
+ metrics=metrics,
296
+ hf_dataset=HuggingFaceDataset.from_list(items),
297
+ input=input,
298
+ label=label,
299
+ input_template=None,
300
+ label_template=None,
301
+ )
302
+
303
+ @classmethod
304
+ def from_json(
305
+ cls,
306
+ path: str,
307
+ metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
308
+ input: str,
309
+ label: str,
310
+ name: Optional[str] = None,
311
+ split: Optional[str] = None,
312
+ ) -> "EvalDataset":
313
+ """Instantiate an EvalDataset from a JSON file.
314
+
315
+ The JSON file must follow one of two supported formats:
316
+
317
+ 1. Flat format – a list of dictionaries:
318
+ [
319
+ {"input": ..., "output": ...},
320
+ {"input": ..., "output": ...},
321
+ ]
322
+
323
+ 2. Split format – a dictionary of named splits:
324
+ {
325
+ "train": [{"input": ..., "output": ...}],
326
+ "test": [{"input": ..., "output": ...}]
327
+ }
328
+
329
+ Args:
330
+ path: Path to the JSON file on disk.
331
+ metrics: The specified metrics associated with the dataset.
332
+ input: The field name containing the input data.
333
+ label: The field name containing the label.
334
+ name: Optional name for the eval dataset; if not provided, the file stem is used.
335
+ split: If the JSON uses a split structure, this is the split name to load.
336
+
337
+ Returns:
338
+ A Scorebook EvalDataset.
339
+
340
+ Raises:
341
+ DatasetParseError: If JSON parsing fails.
342
+ DatasetConfigurationError: If an invalid split is provided.
343
+ MissingFieldError: If the input or label feature is not present in the first item.
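+
+ Example (illustrative sketch; "qa.json", its "question"/"answer" fields,
+ and the "accuracy" metric are placeholders):
+ >>> dataset = EvalDataset.from_json(
+ ... path="qa.json",
+ ... metrics="accuracy",
+ ... input="question",
+ ... label="answer",
+ ... split="test",
+ ... )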
344
+ """
345
+ validated_path = validate_path(path, expected_suffix=".json")
346
+
347
+ try:
348
+ with validated_path.open("r", encoding="utf-8") as f:
349
+ json_data = json.load(f)
350
+ except json.JSONDecodeError as e:
351
+ raise DatasetParseError(f"Invalid JSON in {path}: {e}") from e
352
+
353
+ if isinstance(json_data, dict):
354
+
355
+ if split is None:
356
+ raise DatasetConfigurationError(
357
+ f"Split name must be provided for split-style JSON: {path}"
358
+ )
359
+
360
+ items = json_data.get(split)
361
+ if items is None:
362
+ raise DatasetConfigurationError(f"Split '{split}' not found in JSON file: {path}")
363
+ if not isinstance(items, list):
364
+ raise DatasetConfigurationError(
365
+ f"Split '{split}' is not a list of examples in: {path}"
366
+ )
367
+
368
+ elif isinstance(json_data, list):
369
+ items = json_data
370
+
371
+ else:
372
+ raise DatasetConfigurationError(
373
+ f"Unsupported JSON structure in {path}. Expected list or dict."
374
+ )
375
+
376
+ # Validate that fields exist
377
+ if items and items[0]:
378
+ available_fields = list(items[0].keys())
379
+ if input not in items[0]:
380
+ raise MissingFieldError(
381
+ field_name=input, field_type="input", available_fields=available_fields
382
+ )
383
+ if label not in items[0]:
384
+ raise MissingFieldError(
385
+ field_name=label, field_type="label", available_fields=available_fields
386
+ )
387
+
388
+ name = name if name else validated_path.stem
389
+ return cls(
390
+ name=name,
391
+ metrics=metrics,
392
+ hf_dataset=HuggingFaceDataset.from_list(items),
393
+ input=input,
394
+ label=label,
395
+ input_template=None,
396
+ label_template=None,
397
+ )
398
+
399
+ @classmethod
400
+ def from_huggingface(
401
+ cls,
402
+ path: str,
403
+ metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
404
+ input: Optional[str] = None,
405
+ input_template: Optional[str] = None,
406
+ label: Optional[str] = None,
407
+ label_template: Optional[str] = None,
408
+ name: Optional[str] = None,
409
+ split: Optional[str] = None,
410
+ config: Optional[str] = None,
411
+ ) -> "EvalDataset":
412
+ """Instantiate an EvalDataset from a dataset available on Hugging Face Hub.
413
+
414
+ If a specific split is provided (e.g., "train" or "test"), it will be loaded directly.
415
+ If no split is specified, the method attempts to load the full dataset. If the dataset
416
+ is split into multiple subsets (i.e., a DatasetDict), it defaults to loading the "test"
417
+ split.
418
+
419
+ For datasets where the input/label is already in a single column, use the input/label
420
+ parameters to specify the feature names. For datasets where the input/label needs to be
421
+ constructed from multiple columns, use the input_template/label_template parameters
422
+ with Jinja2 template strings.
423
+
424
+ Args:
425
+ path: The path of the dataset on the Hugging Face Hub.
426
+ metrics: The specified metrics associated with the dataset.
427
+ input: Field name containing the input data (mutually exclusive with input_template).
428
+ input_template:
429
+ Jinja2 template to construct input from multiple fields
430
+ (mutually exclusive with input).
431
+ label: Field name containing the label
432
+ (mutually exclusive with label_template).
433
+ label_template:
434
+ Jinja2 template to construct label from multiple fields
435
+ (mutually exclusive with label).
436
+ name: Optional name for the eval dataset; by default, "path:split:config" from the Hugging Face arguments is used.
437
+ split: Optional name of the split to load.
438
+ config: Optional dataset configuration name.
439
+
440
+ Returns:
441
+ A Scorebook EvalDataset.
442
+
443
+ Raises:
444
+ DatasetConfigurationError:
445
+ If both/neither of input and input_template,
446
+ or both/neither of label and label_template are provided.
447
+ DatasetLoadError: If HF dataset cannot be loaded.
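+
+ Example (illustrative sketch; the dataset path, field names, and "accuracy"
+ metric are placeholders and may not exist on the Hub):
+ >>> dataset = EvalDataset.from_huggingface(
+ ... path="org/benchmark",
+ ... metrics="accuracy",
+ ... input_template="Question: {{ question }} Options: {{ options }}",
+ ... label="answer",
+ ... split="test",
+ ... )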
448
+ """
449
+
450
+ # Validate mutual exclusivity for input and input_template
451
+ if (input is None) == (input_template is None):
452
+ raise DatasetConfigurationError(
453
+ "Exactly one of 'input' or 'input_template' must be provided, not both or neither."
454
+ )
455
+
456
+ # Validate mutual exclusivity for label and label_template
457
+ if (label is None) == (label_template is None):
458
+ raise DatasetConfigurationError(
459
+ "Exactly one of 'label' or 'label_template' must be provided, not both or neither."
460
+ )
461
+
462
+ try:
463
+ kwargs = {}
464
+ if split is not None:
465
+ kwargs["split"] = split
466
+ if config is not None:
467
+ kwargs["name"] = (
468
+ config # Hugging Face's load_dataset method param for config is "name"
469
+ )
470
+ ds = load_dataset(path, **kwargs)
471
+ except Exception as e:
472
+ raise DatasetLoadError(f"Failed to load dataset '{path}' from Hugging Face: {e}") from e
473
+
474
+ if isinstance(ds, HuggingFaceDataset):
475
+ hf_dataset = ds
476
+ elif isinstance(ds, HuggingFaceDatasetDict):
477
+ if "test" in ds:
478
+ hf_dataset = ds["test"]
479
+ else:
480
+ raise DatasetConfigurationError(
481
+ f"Split not specified and no 'test' split found in dataset '{path}'."
482
+ )
483
+ else:
484
+ raise DatasetConfigurationError(f"Unexpected dataset type for '{path}': {type(ds)}")
485
+
486
+ # Only transform if templates are used
487
+ if input_template is not None or label_template is not None:
488
+
489
+ def transform_row(row: Dict[str, Any]) -> Dict[str, Any]:
490
+ """Add computed columns (*input, *label) when templates are used."""
491
+ # Start with all original data
492
+ result = dict(row)
493
+
494
+ # Add *input if template is used
495
+ if input_template is not None:
496
+ result["*input"] = render_template(input_template, row)
497
+
498
+ # Add *label if template is used
499
+ if label_template is not None:
500
+ result["*label"] = render_template(label_template, row)
501
+
502
+ return result
503
+
504
+ transformed_dataset = hf_dataset.map(transform_row)
505
+ else:
506
+
507
+ transformed_dataset = hf_dataset
508
+
509
+ dataset_name = name if name else ":".join(filter(None, [path, split, config]))
510
+ return cls(
511
+ name=dataset_name,
512
+ metrics=metrics,
513
+ hf_dataset=transformed_dataset,
514
+ input=input,
515
+ label=label,
516
+ input_template=input_template,
517
+ label_template=label_template,
518
+ )
519
+
520
+ @classmethod
521
+ def from_yaml(cls, path: str) -> "EvalDataset":
522
+ r"""Instantiate an EvalDataset from Huggingface with a YAML Config file.
523
+
524
+ The YAML file should contain configuration for loading a dataset from Hugging Face.
525
+
526
+ Required fields:
527
+ - path: Hugging Face dataset path
528
+ - name: Name for the evaluation dataset
529
+ - metrics: List of metrics to evaluate
530
+
531
+ The input and label features must be specified or constructed in one of two ways:
532
+
533
+ 1. Feature Specification:
534
+ input: "question"
535
+ label: "answer"
536
+
537
+ 2. Mapping Templates:
538
+ templates:
539
+ input: "{{ question }}\nOptions: {{ options }}"
540
+ label: "{{ answer }}"
541
+
542
+ Optional fields:
543
+ - split: Dataset split to load.
544
+ - config: Dataset configuration name.
545
+ - metadata: Any additional metadata.
546
+
547
+ Args:
548
+ path: The path of YAML configuration file.
549
+
550
+ Returns:
551
+ An EvalDataset.
552
+
553
+ Raises:
554
+ DatasetParseError: If YAML configuration file cannot be parsed.
555
+ DatasetConfigurationError: If the YAML configuration file is invalid.
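+
+ Example config (illustrative; the dataset path and metric name are placeholders):
+
+ path: "org/benchmark"
+ name: "benchmark-eval"
+ metrics:
+ - accuracy
+ split: "test"
+ input: "question"
+ label: "answer"
+
+ >>> dataset = EvalDataset.from_yaml("benchmark.yaml")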
556
+ """
557
+ validated_path = validate_path(path, expected_suffix=(".yaml", ".yml"))
558
+
559
+ try:
560
+ with validated_path.open("r", encoding="utf-8") as f:
561
+ yaml_config = yaml.safe_load(f)
562
+ except yaml.YAMLError as e:
563
+ raise DatasetParseError(f"Invalid YAML in {path}: {e}") from e
564
+
565
+ # Validate required fields
566
+ required_fields = ["path", "name", "metrics"]
567
+ missing_fields = [field for field in required_fields if field not in yaml_config]
568
+ if missing_fields:
569
+ raise DatasetConfigurationError(
570
+ f"Missing required fields in YAML config: {', '.join(missing_fields)}"
571
+ )
572
+
573
+ # Validate metrics exist before calling from_huggingface
574
+ metrics_to_validate = yaml_config["metrics"]
575
+ if not isinstance(metrics_to_validate, list):
576
+ metrics_to_validate = [metrics_to_validate]
577
+
578
+ for metric in metrics_to_validate:
579
+ try:
580
+ MetricRegistry.get(metric)
581
+ except Exception as e:
582
+ raise DatasetConfigurationError(f"Invalid metric '{metric}' in YAML config: {e}")
583
+
584
+ # Determine input/label specification
585
+ has_templates = "templates" in yaml_config
586
+ has_direct_input = "input" in yaml_config
587
+ has_direct_label = "label" in yaml_config
588
+
589
+ # Validate that we have proper input/label specification
590
+ if has_templates:
591
+ templates = yaml_config["templates"]
592
+ if not isinstance(templates, dict):
593
+ raise DatasetConfigurationError("'templates' must be a dictionary")
594
+ if "input" not in templates or "label" not in templates:
595
+ raise DatasetConfigurationError(
596
+ "'templates' must contain both 'input' and 'label' keys"
597
+ )
598
+ if has_direct_input or has_direct_label:
599
+ raise DatasetConfigurationError(
600
+ "Cannot specify both 'templates' and direct 'input'/'label' fields"
601
+ )
602
+ input_template = templates["input"]
603
+ label_template = templates["label"]
604
+ input_field = None
605
+ label_field = None
606
+ else:
607
+ if not has_direct_input or not has_direct_label:
608
+ raise DatasetConfigurationError(
609
+ "Must specify either 'templates' or both 'input' and 'label' fields"
610
+ )
611
+ input_field = yaml_config["input"]
612
+ label_field = yaml_config["label"]
613
+ input_template = None
614
+ label_template = None
615
+
616
+ # Load the dataset from Hugging Face
617
+ return cls.from_huggingface(
618
+ path=yaml_config["path"],
619
+ metrics=yaml_config["metrics"],
620
+ input=input_field,
621
+ input_template=input_template,
622
+ label=label_field,
623
+ label_template=label_template,
624
+ name=yaml_config.get("name"),
625
+ split=yaml_config.get("split"),
626
+ config=yaml_config.get("config"),
627
+ )
628
+
629
+ # === Helper Methods ===
630
+
631
+ @staticmethod
632
+ def _resolve_metrics(
633
+ metrics: Union[
634
+ str, Type[MetricBase], MetricBase, List[Union[str, Type[MetricBase], MetricBase]]
635
+ ]
636
+ ) -> List[MetricBase]:
637
+ """Normalize metrics params to a metric type."""
638
+
639
+ if not isinstance(metrics, list):
640
+ metrics = [metrics]
641
+
642
+ resolved: List[MetricBase] = []
643
+ for m in metrics:
644
+ if isinstance(m, MetricBase):
645
+ resolved.append(m) # Already an instance
646
+ else:
647
+ resolved.append(MetricRegistry.get(m)) # Use registry for str or class
648
+
649
+ return resolved
650
+
651
+ # === Dunder Methods ===
652
+
653
+ def __len__(self) -> int:
654
+ """Return the number of items in the dataset."""
655
+ if self._hf_dataset is None:
656
+ raise DatasetNotInitializedError("Dataset is not initialized")
657
+ return len(self._hf_dataset)
658
+
659
+ def __getitem__(self, key: Union[int, str]) -> Union[Dict[str, Any], List[Any]]:
660
+ """
661
+ Allow item access by index (int) or by column name (str).
662
+
663
+ - eval_dataset[i] returns the i-th example (dict).
664
+ - eval_dataset["feature"] returns a list of values for that feature.
665
+ """
666
+ if self._hf_dataset is None:
667
+ raise DatasetNotInitializedError("Dataset is not initialized")
668
+ if isinstance(key, int):
669
+ return dict(self._hf_dataset[key]) # Ensure we return a Dict[str, Any]
670
+ elif isinstance(key, str):
671
+ return list(self._hf_dataset[key]) # Ensure we return a List[Any]
672
+ else:
673
+ raise TypeError(f"Invalid key type: {type(key)}. Must be int or str.")
674
+
675
+ def __iter__(self) -> Iterator[Dict[str, Any]]:
676
+ """Return an iterator over all examples in the dataset."""
677
+ if self._hf_dataset is None:
678
+ raise DatasetNotInitializedError("Dataset is not initialized")
679
+ return iter(self._hf_dataset)
680
+
681
+ def __str__(self) -> str:
682
+ """Return a formatted string summary of the evaluation dataset."""
683
+ if self._hf_dataset is None:
684
+ return f"EvalDataset(name='{self.name}', status='uninitialized')"
685
+
686
+ num_rows = len(self._hf_dataset)
687
+ fields = ", ".join(self.column_names)
688
+ metrics = ", ".join([metric.name for metric in self.metrics])
689
+
690
+ # Build template info string
691
+ template_info = []
692
+ if self.input_template:
693
+ template_preview = (
694
+ self.input_template[:40] + "..."
695
+ if len(self.input_template) > 40
696
+ else self.input_template
697
+ )
698
+ template_info.append(f"input_template='{template_preview}'")
699
+
700
+ if self.label_template:
701
+ template_preview = (
702
+ self.label_template[:40] + "..."
703
+ if len(self.label_template) > 40
704
+ else self.label_template
705
+ )
706
+ template_info.append(f"label_template='{template_preview}'")
707
+
708
+ template_str = ", " + ", ".join(template_info) if template_info else ""
709
+
710
+ return (
711
+ f"EvalDataset(\n"
712
+ f" name='{self.name}',\n"
713
+ f" rows={num_rows},\n"
714
+ f" fields=[{fields}],\n"
715
+ f" metrics=[{metrics}],\n"
716
+ f" input='{self.input}',\n"
717
+ f" label='{self.label}'{template_str}\n"
718
+ f")"
719
+ )