scorebook 0.0.9-py3-none-any.whl → 0.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. scorebook/__init__.py +14 -6
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/eval_datasets/__init__.py +5 -0
  4. scorebook/eval_datasets/eval_dataset.py +719 -0
  5. scorebook/evaluate/__init__.py +15 -0
  6. scorebook/evaluate/_async/__init__.py +0 -0
  7. scorebook/evaluate/_async/evaluate_async.py +443 -0
  8. scorebook/evaluate/_sync/__init__.py +0 -0
  9. scorebook/evaluate/_sync/evaluate.py +443 -0
  10. scorebook/evaluate/evaluate_helpers.py +388 -0
  11. scorebook/exceptions.py +48 -0
  12. scorebook/inference/__init__.py +4 -0
  13. scorebook/inference/clients/__init__.py +8 -0
  14. scorebook/inference/{bedrock.py → clients/bedrock.py} +1 -1
  15. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  16. scorebook/inference/{portkey.py → clients/portkey.py} +1 -1
  17. scorebook/inference/{vertex.py → clients/vertex.py} +1 -1
  18. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  19. scorebook/settings.py +21 -0
  20. scorebook/trismik/__init__.py +10 -0
  21. scorebook/types.py +8 -5
  22. scorebook/utils/__init__.py +11 -4
  23. scorebook/utils/async_utils.py +20 -1
  24. scorebook/utils/io_helpers.py +18 -5
  25. scorebook/utils/progress_bars.py +739 -96
  26. scorebook/utils/{build_prompt.py → render_template.py} +13 -12
  27. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/METADATA +4 -4
  28. scorebook-0.0.11.dist-info/RECORD +42 -0
  29. scorebook/eval_dataset.py +0 -404
  30. scorebook/evaluate.py +0 -623
  31. scorebook/trismik_services/__init__.py +0 -6
  32. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  33. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  34. scorebook-0.0.9.dist-info/RECORD +0 -36
  35. /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  36. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/WHEEL +0 -0
  37. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/entry_points.txt +0 -0
  38. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/licenses/LICENSE +0 -0
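
Most of the changes above are a package restructuring rather than new features: the provider clients move under scorebook/inference/clients/, evaluation is split into _sync and _async implementations, trismik_services becomes scorebook/trismik, and the dataset code moves from scorebook/eval_dataset.py to scorebook/eval_datasets/eval_dataset.py. As a rough orientation, a minimal sketch of the resulting import paths is shown below; the module locations come straight from the rename list, but whether the old paths keep compatibility re-exports (for example via scorebook/__init__.py) is an assumption to verify against 0.0.11 itself.

    # Old module path (0.0.9)           ->  new module path (0.0.11), per the rename list above
    # scorebook.eval_dataset            ->  scorebook.eval_datasets.eval_dataset
    # scorebook.inference.openai        ->  scorebook.inference.clients.openai
    # scorebook.inference_pipeline      ->  scorebook.inference.inference_pipeline
    # scorebook.utils.build_prompt      ->  scorebook.utils.render_template
    # scorebook.trismik_services.login  ->  scorebook.trismik.credentials

    # The two names below are taken from diffs shown on this page; other exports are not guaranteed.
    from scorebook.utils.render_template import render_template
    from scorebook.eval_datasets.eval_dataset import EvalDataset  # assumed to keep the 0.0.9 class name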
scorebook/utils/{build_prompt.py → render_template.py} RENAMED
@@ -7,36 +7,37 @@ and global variables, using strict undefined handling for better error detection

  from typing import Any, Dict, Optional

- from jinja2 import BaseLoader, Environment, StrictUndefined
+ from jinja2 import BaseLoader, StrictUndefined
+ from jinja2.sandbox import SandboxedEnvironment

  from scorebook.utils.jinja_helpers import default_jinja_filters, default_jinja_globals


- def build_prompt(
-     prompt_template: str,
-     prompt_args: Dict[str, Any],
+ def render_template(
+     template: str,
+     args: Dict[str, Any],
      filters: Optional[Dict[str, Any]] = None,
      globals_dict: Optional[Dict[str, Any]] = None,
  ) -> str:
      """
-     Build a prompt string from a template and arguments.
+     Render a Jinja2 template string with the provided arguments.

      Args:
-         prompt_template: Jinja2 template string
-         prompt_args: Dictionary of arguments to pass to the template
+         template: Jinja2 template string
+         args: Dictionary of arguments to pass to the template
          filters: Dictionary of Jinja2 filters. Defaults to default_jinja_filters().
          globals_dict: Dictionary of global functions/variables. Defaults to default_jinja_globals().

      Returns:
-         str: Rendered prompt string
+         str: Rendered template string
      """

      # Use defaults if not provided
      filters = filters or default_jinja_filters()
      globals_dict = globals_dict or default_jinja_globals()

-     # Create a Jinja2 environment with strict undefined handling
-     env = Environment(
+     # Create a sandboxed Jinja2 environment with strict undefined handling
+     env = SandboxedEnvironment(
          loader=BaseLoader(),
          undefined=StrictUndefined,
          trim_blocks=True,
@@ -48,5 +49,5 @@ def build_prompt(
      env.globals.update(globals_dict)

      # Render the template
-     template = env.from_string(prompt_template)
-     return str(template.render(**prompt_args))
+     jinja_template = env.from_string(template)
+     return str(jinja_template.render(**args))
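
Beyond the rename, the functional change in this file is the switch from a plain Jinja2 Environment to jinja2.sandbox.SandboxedEnvironment, which rejects unsafe attribute and method access inside templates, while StrictUndefined still raises on missing variables. A minimal usage sketch of the new signature follows; the template text and arguments are invented for illustration.

    from scorebook.utils.render_template import render_template

    # Hypothetical template and args; in scorebook these would normally come from a dataset item.
    prompt = render_template(
        template="Question: {{ question }}\nAnswer in one word.",
        args={"question": "What is the capital of France?"},
    )
    # 0.0.9 equivalent: build_prompt(prompt_template=..., prompt_args=...)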
{scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/METADATA CHANGED
@@ -1,18 +1,17 @@
  Metadata-Version: 2.4
  Name: scorebook
- Version: 0.0.9
+ Version: 0.0.11
  Summary: A Python project for LLM evaluation.
  License-File: LICENSE
  Author: Euan Campbell
  Author-email: euan@trismik.com
- Requires-Python: >=3.9
+ Requires-Python: >=3.9,<3.14
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
- Classifier: Programming Language :: Python :: 3.14
  Provides-Extra: bedrock
  Provides-Extra: examples
  Provides-Extra: openai
@@ -24,6 +23,7 @@ Requires-Dist: datasets (>=3.6.0)
  Requires-Dist: fsspec[gcs] ; extra == "vertex"
  Requires-Dist: google-cloud-storage ; extra == "vertex"
  Requires-Dist: google-genai ; extra == "vertex"
+ Requires-Dist: ipywidgets (>=8.0.0)
  Requires-Dist: notebook (>=7.4.5,<8.0.0)
  Requires-Dist: notebook ; extra == "examples"
  Requires-Dist: openai ; extra == "openai"
@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
  Requires-Dist: torchaudio ; extra == "examples"
  Requires-Dist: torchvision ; extra == "examples"
  Requires-Dist: transformers ; extra == "examples"
- Requires-Dist: trismik (>=0.9.12)
+ Requires-Dist: trismik (>=1.0.1,<2.0.0)
  Description-Content-Type: text/markdown

  # Scorebook
scorebook-0.0.11.dist-info/RECORD ADDED
@@ -0,0 +1,42 @@
+ scorebook/__init__.py,sha256=uifumovUYn62n_VRFTeieEP3eVP8ycZ3lDp44sVwgeI,639
+ scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+ scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
+ scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+ scorebook/eval_datasets/__init__.py,sha256=9YPjxjdaMaOrBUzJwvsUlFPl-KdYMgUGTV3WNd7OCU0,128
+ scorebook/eval_datasets/eval_dataset.py,sha256=6GgrAaWelU5dK6I-x9zXHCxVSfvo41yyYNPF0ue4zbo,27200
+ scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
+ scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scorebook/evaluate/_async/evaluate_async.py,sha256=kcUrxfXdleqjn-Ajn06jGAfMvftmwQLzbB2zLu2-Q3U,16983
+ scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scorebook/evaluate/_sync/evaluate.py,sha256=qBOQZQYfpHSz4i_uRgH7QdZvhw0isOmIYPdqcLIWvwQ,16821
+ scorebook/evaluate/evaluate_helpers.py,sha256=sSWWdyH5h1jJsQ_ABzRzgW0-Dy21BQ2PzdCJ0-uROC8,13829
+ scorebook/exceptions.py,sha256=i494gw1TnSV-vS3RWGGZ6FYdlldjcY1qQuXO1htukuQ,3551
+ scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
+ scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
+ scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
+ scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
+ scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
+ scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
+ scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
+ scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
+ scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
+ scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
+ scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
+ scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+ scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
+ scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
+ scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
+ scorebook/types.py,sha256=8weXPYzUw9m4fy2O_TleqThg1OI4z_lQyx78brJigJU,5955
+ scorebook/utils/__init__.py,sha256=IhL8bss1s-QkguwYp7T3jamk5GmsJRFjf9MynAN1W5g,544
+ scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
+ scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
+ scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
+ scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+ scorebook/utils/progress_bars.py,sha256=bkKZBSUt5kJi_ZyRhEE0UWrgB2NZpua7jxZ8xqsNyuk,29792
+ scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
+ scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+ scorebook-0.0.11.dist-info/METADATA,sha256=FTV_4abv4NK1z9KbKx4sL8Qto74pMcHhCKZnskGWolo,11515
+ scorebook-0.0.11.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ scorebook-0.0.11.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+ scorebook-0.0.11.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+ scorebook-0.0.11.dist-info/RECORD,,
scorebook/eval_dataset.py DELETED
@@ -1,404 +0,0 @@
- """Eval Dataset implementation for scorebook."""
-
- import csv
- import json
- import random
- from typing import Any, Dict, Iterator, List, Optional, Type, Union
-
- import yaml
- from datasets import Dataset as HuggingFaceDataset
- from datasets import DatasetDict as HuggingFaceDatasetDict
- from datasets import load_dataset
-
- from scorebook.metrics import MetricBase, MetricRegistry
- from scorebook.utils import validate_path
-
-
- class EvalDataset:
-     """Eval Dataset implementation for scorebook."""
-
-     def __init__(
-         self,
-         name: str,
-         label: str,
-         metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-         hf_dataset: HuggingFaceDataset,
-         prompt_template: Optional[str] = None,
-     ):
-         """
-         Create a new scorebook evaluation dataset instance.
-
-         :param name: The name of the evaluation dataset.
-         :param label: The label field of the dataset.
-         :param metrics: The specified metrics associated with the dataset.
-         :param hf_dataset: The dataset as a hugging face dataset object.
-         :param prompt_template: Optional prompt template for building prompts from dataset items.
-         """
-         self.name: str = name
-         self.label: str = label
-         self.metrics: List[MetricBase] = self._resolve_metrics(metrics)
-         self._hf_dataset: Optional[HuggingFaceDataset] = hf_dataset
-         self.prompt_template: Optional[str] = prompt_template
-
-     def __len__(self) -> int:
-         """Return the number of items in the dataset."""
-         if self._hf_dataset is None:
-             raise ValueError("Dataset is not initialized")
-         return len(self._hf_dataset)
-
-     def __getitem__(self, key: Union[int, str]) -> Union[Dict[str, Any], List[Any]]:
-         """
-         Allow item access by index (int) or by column name (str).
-
-         - eval_dataset[i] returns the i-th example (dict).
-         - eval_dataset["feature"] returns a list of values for that feature.
-         """
-         if self._hf_dataset is None:
-             raise ValueError("Dataset is not initialized")
-         if isinstance(key, int):
-             return dict(self._hf_dataset[key])  # Ensure we return a Dict[str, Any]
-         elif isinstance(key, str):
-             return list(self._hf_dataset[key])  # Ensure we return a List[Any]
-         else:
-             raise TypeError(f"Invalid key type: {type(key)}. Must be int or str.")
-
-     def __str__(self) -> str:
-         """Return a formatted string summary of the evaluation dataset."""
-         if self._hf_dataset is None:
-             return f"EvalDataset(name='{self.name}', status='uninitialized')"
-
-         num_rows = len(self._hf_dataset)
-         fields = ", ".join(self.column_names)
-         metrics = ", ".join([metric.name for metric in self.metrics])
-
-         return (
-             f"EvalDataset(\n"
-             f" name='{self.name}',\n"
-             f" rows={num_rows},\n"
-             f" label='{self.label}',\n"
-             f" fields=[{fields}],\n"
-             f" metrics=[{metrics}]\n"
-             f")"
-         )
-
-     def __iter__(self) -> Iterator[Dict[str, Any]]:
-         """Return an iterator over all examples in the dataset."""
-         if self._hf_dataset is None:
-             raise ValueError("Dataset is not initialized")
-         return iter(self._hf_dataset)
-
-     def shuffle(self) -> None:
-         """Randomly shuffle the dataset items."""
-         if self._hf_dataset is None:
-             raise ValueError("Dataset is not initialized")
-         self._hf_dataset.shuffle()
-
-     @property
-     def items(self) -> List[Any]:
-         """Return a list of all examples in the dataset."""
-         if self._hf_dataset is None:
-             raise ValueError("Dataset is not initialized")
-         return list(self._hf_dataset)
-
-     @property
-     def column_names(self) -> List[str]:
-         """Return a list of column/feature names available in the dataset."""
-         if self._hf_dataset is None:
-             raise ValueError("Dataset is not initialized")
-         return list(map(str, self._hf_dataset.column_names))
-
-     @classmethod
-     def from_list(
-         cls,
-         name: str,
-         label: str,
-         metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-         data: List[Dict[str, Any]],
-     ) -> "EvalDataset":
-         """Instantiate an EvalDataset from a list of dictionaries.
-
-         Args:
-             name: The name of the evaluation dataset.
-             label: The field used as the evaluation label (ground truth).
-             metrics: The specified metrics associated with the dataset.
-             data: List of dictionaries containing the dataset examples.
-
-         Returns:
-             A scorebook EvalDataset wrapping a Hugging Face dataset.
-         """
-         return cls(
-             name=name, label=label, metrics=metrics, hf_dataset=HuggingFaceDataset.from_list(data)
-         )
-
-     @classmethod
-     def from_csv(
-         cls,
-         file_path: str,
-         label: str,
-         metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-         name: Optional[str] = None,
-         encoding: str = "utf-8",
-         newline: str = "",
-         **reader_kwargs: Any,
-     ) -> "EvalDataset":
-         """Instantiate a scorebook dataset from a CSV file.
-
-         Args:
-             file_path: Path to the CSV file.
-             label: The field used as the evaluation label (ground truth).
-             metrics: The specified metrics associated with the dataset.
-             name: Optional name for the eval dataset, if not provided, the path is used
-             encoding: Encoding of the CSV file.
-             newline: Newline character of the CSV file.
-             reader_kwargs: Dict of kwargs passed to `csv.DictReader`.
-
-         Returns:
-             A scorebook EvalDataset.
-
-         Raises:
-             FileNotFoundError: If the file does not exist at the given path.
-             ValueError: If the CSV file cannot be parsed or is empty.
-         """
-         reader_kwargs = reader_kwargs or {}
-         path = validate_path(file_path, expected_suffix=".csv")
-
-         try:
-             with open(path, encoding=encoding, newline=newline) as csvfile:
-                 reader = csv.DictReader(csvfile, **reader_kwargs)
-                 data = [row for row in reader]
-         except csv.Error as e:
-             raise ValueError(f"Failed to parse CSV file {file_path}: {e}") from e
-
-         if not data:
-             raise ValueError(f"CSV file {file_path} is empty or contains only headers.")
-
-         name = name if name else path.stem
-         return cls(
-             name=name,
-             label=label,
-             metrics=metrics,
-             hf_dataset=HuggingFaceDataset.from_list(data),
-         )
-
-     @classmethod
-     def from_json(
-         cls,
-         file_path: str,
-         label: str,
-         metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-         name: Optional[str] = None,
-         split: Optional[str] = None,
-     ) -> "EvalDataset":
-         """Instantiate an EvalDataset from a JSON file.
-
-         The JSON file must follow one of two supported formats:
-
-         1. **Flat format** – a list of dictionaries:
-             [
-                 {"input": "What is 2+2?", "label": "4"},
-                 {"input": "Capital of France?", "label": "Paris"}
-             ]
-
-         2. **Split format** – a dictionary of named splits:
-             {
-                 "train": [{"input": ..., "label": ...}],
-                 "test": [{"input": ..., "label": ...}]
-             }
-
-         Args:
-             file_path: Path to the JSON file on disk.
-             label: The field used as the evaluation label (ground truth).
-             metrics: The specified metrics associated with the dataset.
-             name: Optional name for the eval dataset, if not provided, the path is used
-             split: If the JSON uses a split structure, this is the split name to load.
-
-         Returns:
-             A scorebook EvalDataset wrapping a Hugging Face dataset.
-
-         Raises:
-             FileNotFoundError: If the file does not exist.
-             ValueError: If the JSON is invalid or the structure is unsupported.
-         """
-         path = validate_path(file_path, expected_suffix=".json")
-
-         try:
-             with path.open("r", encoding="utf-8") as f:
-                 data = json.load(f)
-         except json.JSONDecodeError as e:
-             raise ValueError(f"Invalid JSON in {file_path}: {e}") from e
-
-         if isinstance(data, dict):
-             if split is None:
-                 raise ValueError(f"Split name must be provided for split-style JSON: {file_path}")
-             split_data = data.get(split)
-             if split_data is None:
-                 raise ValueError(f"Split '{split}' not found in JSON file: {file_path}")
-             if not isinstance(split_data, list):
-                 raise ValueError(f"Split '{split}' is not a list of examples in: {file_path}")
-             hf_dataset = HuggingFaceDataset.from_list(split_data)
-         elif isinstance(data, list):
-             hf_dataset = HuggingFaceDataset.from_list(data)
-         else:
-             raise ValueError(f"Unsupported JSON structure in {file_path}. Expected list or dict.")
-
-         name = name if name else path.stem
-         return cls(name=name, label=label, metrics=metrics, hf_dataset=hf_dataset)
-
-     @classmethod
-     def from_huggingface(
-         cls,
-         path: str,
-         label: str,
-         metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-         split: Optional[str] = None,
-         name: Optional[str] = None,
-     ) -> "EvalDataset":
-         """Instantiate an EvalDataset from a dataset available on Hugging Face Hub.
-
-         If a specific split is provided (e.g., "train" or "test"), it will be loaded directly.
-         If no split is specified, the method attempts to load the full dataset. If the dataset
-         is split into multiple subsets (i.e., a DatasetDict), it defaults to loading the "test"
-         split.
-
-         Args:
-             path: The path of the dataset on the Hugging Face Hub.
-             label: The field used as the evaluation label (ground truth).
-             metrics: The specified metrics associated with the dataset.
-             split: Optional name of the split to load.
-             name: Optional dataset configuration name.
-
-         Returns:
-             An EvalDataset wrapping the selected Hugging Face dataset.
-
-         Raises:
-             ValueError: If the dataset cannot be loaded, or the expected split is missing.
-         """
-         try:
-             kwargs = {}
-             if split is not None:
-                 kwargs["split"] = split
-             if name is not None:
-                 kwargs["name"] = name
-             ds = load_dataset(path, **kwargs)
-         except Exception as e:
-             raise ValueError(f"Failed to load dataset '{path}' from Hugging Face: {e}") from e
-
-         if isinstance(ds, HuggingFaceDataset):
-             hf_dataset = ds
-         elif isinstance(ds, HuggingFaceDatasetDict):
-             if "test" in ds:
-                 hf_dataset = ds["test"]
-             else:
-                 raise ValueError(
-                     f"Split not specified and no 'test' split found in dataset '{path}'."
-                 )
-         else:
-             raise ValueError(f"Unexpected dataset type for '{path}': {type(ds)}")
-
-         return cls(name=path, label=label, metrics=metrics, hf_dataset=hf_dataset)
-
-     @classmethod
-     def from_yaml(cls, file_path: str) -> "EvalDataset":
-         """Instantiate an EvalDataset from a YAML file.
-
-         The YAML file should contain configuration for loading a dataset, including:
-         - name: Name of the dataset or Hugging Face dataset path
-         - label: The field used as the evaluation label
-         - metrics: List of metrics to evaluate
-         - split: Optional split name to load
-         - template: Optional prompt template
-
-         Returns:
-             An EvalDataset instance configured according to the YAML file.
-
-         Raises:
-             ValueError: If the YAML file is invalid or missing required fields.
-         """
-         path = validate_path(file_path, expected_suffix=".yaml")
-
-         try:
-             with path.open("r", encoding="utf-8") as f:
-                 config = yaml.safe_load(f)
-         except yaml.YAMLError as e:
-             raise ValueError(f"Invalid YAML in {file_path}: {e}") from e
-
-         # Validate required fields
-         required_fields = ["name", "label", "metrics"]
-         missing_fields = [field for field in required_fields if field not in config]
-         if missing_fields:
-             raise ValueError(f"Missing required fields in YAML config: {', '.join(missing_fields)}")
-
-         # Load the dataset from Hugging Face
-         dataset = cls.from_huggingface(
-             path=config["name"],
-             label=config["label"],
-             metrics=config["metrics"],
-             split=config.get("split"),  # Optional field
-             name=config.get("config"),  # Optional field
-         )
-
-         # Add template if provided
-         if "template" in config:
-             dataset.prompt_template = config["template"]
-
-         return dataset
-
-     @staticmethod
-     def _resolve_metrics(
-         metrics: Union[
-             str, Type[MetricBase], MetricBase, List[Union[str, Type[MetricBase], MetricBase]]
-         ]
-     ) -> List[MetricBase]:
-         """
-         Convert metric names/classes into a list of MetricBase instances using MetricRegistry.
-
-         Used to normalize metrics to a metric type.
-         """
-         if not isinstance(metrics, list):
-             metrics = [metrics]
-
-         resolved: List[MetricBase] = []
-         for m in metrics:
-             if isinstance(m, MetricBase):
-                 resolved.append(m)  # Already an instance
-             else:
-                 resolved.append(MetricRegistry.get(m))  # Use registry for str or class
-
-         return resolved
-
-     def sample(self, sample_size: int) -> "EvalDataset":
-         """Create a new dataset with randomly sampled items from this dataset.
-
-         Args:
-             sample_size: The number of items to sample from the dataset
-
-         Returns:
-             A new EvalDataset with randomly sampled items
-
-         Raises:
-             ValueError: If sample_size is larger than the dataset size
-         """
-         dataset_size = len(self.items)
-
-         if sample_size > dataset_size:
-             raise ValueError(
-                 f"Sample size {sample_size} is larger than dataset size {dataset_size} "
-                 f"for dataset '{self.name}'"
-             )
-
-         # Create randomly sampled items
-         sampled_items = random.sample(self.items, sample_size)
-
-         # Create a new EvalDataset instance with sampled items using from_list
-         sampled_dataset = self.from_list(
-             name=self.name,
-             label=self.label,
-             metrics=self.metrics,
-             data=sampled_items,
-         )
-
-         # Preserve the prompt template if it exists
-         if self.prompt_template is not None:
-             sampled_dataset.prompt_template = self.prompt_template
-
-         return sampled_dataset
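
The class deleted here is not removed from the package: per the file list it moves to scorebook/eval_datasets/eval_dataset.py and grows from 404 to roughly 719 lines. For orientation, a minimal sketch of the constructor API exactly as documented in this 0.0.9 module, updated only for the new module path; the 0.0.11 class may have extended or changed these signatures, and the "accuracy" metric name is an assumption based on scorebook/metrics/accuracy.py appearing in the RECORD above.

    from scorebook.eval_datasets.eval_dataset import EvalDataset  # new location in 0.0.11

    dataset = EvalDataset.from_list(
        name="toy-qa",
        label="label",
        metrics="accuracy",  # resolved via MetricRegistry in the 0.0.9 code above; name is assumed
        data=[
            {"input": "What is 2+2?", "label": "4"},
            {"input": "Capital of France?", "label": "Paris"},
        ],
    )
    subset = dataset.sample(1)  # random subsample; preserves prompt_template per the 0.0.9 code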