scorebook-0.0.10-py3-none-any.whl → scorebook-0.0.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,36 +7,37 @@ and global variables, using strict undefined handling for better error detection

 from typing import Any, Dict, Optional

-from jinja2 import BaseLoader, Environment, StrictUndefined
+from jinja2 import BaseLoader, StrictUndefined
+from jinja2.sandbox import SandboxedEnvironment

 from scorebook.utils.jinja_helpers import default_jinja_filters, default_jinja_globals


-def build_prompt(
-    prompt_template: str,
-    prompt_args: Dict[str, Any],
+def render_template(
+    template: str,
+    args: Dict[str, Any],
     filters: Optional[Dict[str, Any]] = None,
     globals_dict: Optional[Dict[str, Any]] = None,
 ) -> str:
     """
-    Build a prompt string from a template and arguments.
+    Render a Jinja2 template string with the provided arguments.

     Args:
-        prompt_template: Jinja2 template string
-        prompt_args: Dictionary of arguments to pass to the template
+        template: Jinja2 template string
+        args: Dictionary of arguments to pass to the template
         filters: Dictionary of Jinja2 filters. Defaults to default_jinja_filters().
         globals_dict: Dictionary of global functions/variables. Defaults to default_jinja_globals().

     Returns:
-        str: Rendered prompt string
+        str: Rendered template string
     """

     # Use defaults if not provided
     filters = filters or default_jinja_filters()
     globals_dict = globals_dict or default_jinja_globals()

-    # Create a Jinja2 environment with strict undefined handling
-    env = Environment(
+    # Create a sandboxed Jinja2 environment with strict undefined handling
+    env = SandboxedEnvironment(
         loader=BaseLoader(),
         undefined=StrictUndefined,
         trim_blocks=True,
@@ -48,5 +49,5 @@ def build_prompt(
     env.globals.update(globals_dict)

     # Render the template
-    template = env.from_string(prompt_template)
-    return str(template.render(**prompt_args))
+    jinja_template = env.from_string(template)
+    return str(jinja_template.render(**args))
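
The change above renames build_prompt to render_template and swaps Jinja2's Environment for SandboxedEnvironment while keeping StrictUndefined. A minimal sketch of what that combination means for callers, assuming the helper is importable from scorebook.utils.render_template as the new RECORD entry suggests; the import path and example templates are illustrative, not taken from the package:

from jinja2.exceptions import SecurityError, UndefinedError

from scorebook.utils.render_template import render_template  # path assumed from the RECORD entry

# Ordinary rendering works as before, just with the new parameter names.
prompt = render_template(
    template="Question: {{ question }}\nChoices: {{ choices | join(', ') }}",
    args={"question": "What is 2+2?", "choices": ["3", "4", "5"]},
)

# StrictUndefined: a missing variable raises instead of silently rendering as an empty string.
try:
    render_template(template="{{ missing_variable }}", args={})
except UndefinedError:
    pass

# SandboxedEnvironment: templates that reach for Python internals are rejected.
try:
    render_template(template="{{ ''.__class__.__mro__ }}", args={})
except SecurityError:
    pass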
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.10
+Version: 0.0.12
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell
@@ -23,6 +23,7 @@ Requires-Dist: datasets (>=3.6.0)
 Requires-Dist: fsspec[gcs] ; extra == "vertex"
 Requires-Dist: google-cloud-storage ; extra == "vertex"
 Requires-Dist: google-genai ; extra == "vertex"
+Requires-Dist: ipywidgets (>=8.0.0)
 Requires-Dist: notebook (>=7.4.5,<8.0.0)
 Requires-Dist: notebook ; extra == "examples"
 Requires-Dist: openai ; extra == "openai"
@@ -36,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (>=1.0.1,<2.0.0)
+Requires-Dist: trismik (==1.0.1)
 Description-Content-Type: text/markdown

 # Scorebook
@@ -0,0 +1,50 @@
+scorebook/__init__.py,sha256=dcaqd4-qxLHPCw6p-LS_0b8JumEpHDtEilgwP8qNKRY,868
+scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
+scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+scorebook/eval_datasets/__init__.py,sha256=9YPjxjdaMaOrBUzJwvsUlFPl-KdYMgUGTV3WNd7OCU0,128
+scorebook/eval_datasets/eval_dataset.py,sha256=6GgrAaWelU5dK6I-x9zXHCxVSfvo41yyYNPF0ue4zbo,27200
+scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
+scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_async/evaluate_async.py,sha256=qZ2y7-uQRT1b4saBoNaPO9fv4G2LhcP_ZyvkSsIEgHg,15629
+scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_sync/evaluate.py,sha256=xanhHu-CaY_WarNM1V64W2sHttkM18j42K0MKrdtrvE,15438
+scorebook/evaluate/evaluate_helpers.py,sha256=swbgB1LurWdufeiVIZZ7ildsYO-ptC7uF3x6AVgptkU,13809
+scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
+scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
+scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
+scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
+scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
+scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
+scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
+scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
+scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
+scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
+scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
+scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
+scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+scorebook/score/__init__.py,sha256=pwjSEb8Tc1edQpYDuu49wnupazISpRX3DQGD2cfiJek,208
+scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_async/score_async.py,sha256=GM84UcuFvW1x6ZIePEshG2cwVNB9GvwhhjouOduUwTA,6097
+scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_sync/score.py,sha256=rbJhYEhu8auHG4AwpZIkmzw_0ZK1bzbDiIK7Q0ApxhY,6043
+scorebook/score/score_helpers.py,sha256=lq0t5UrOgxa_pDiwL3yHbBlT2BL5B-SkWw1nyaXVoZU,7074
+scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
+scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
+scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
+scorebook/trismik/upload_results.py,sha256=jgT9EVFpuv6OmrYgZVi032cbRrcCOyX4ulLDeWPFBWU,9743
+scorebook/types.py,sha256=x5bD2DU-Xafh7pXwmaQQ1i1zoZDsniHJjE-UEfXySAg,4827
+scorebook/utils/__init__.py,sha256=crefSaTUWkhFF-w4kotUzcz9_GGZukQDgRit4HxJRHY,805
+scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
+scorebook/utils/common_helpers.py,sha256=jewPdQH4JqTWcYT31wn1WNucOPLtGbrGdViwwlYRhD4,1216
+scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
+scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
+scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+scorebook/utils/progress_bars.py,sha256=gdT6dJ9LMLYzs7TospP3wQNY9htm_FhVLdX0ueluC6E,31890
+scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
+scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+scorebook-0.0.12.dist-info/METADATA,sha256=bMjbT1e0GYExB1HcBkAfesaUcXK2-Pck5ox2oCUBXpE,11508
+scorebook-0.0.12.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+scorebook-0.0.12.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+scorebook-0.0.12.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.12.dist-info/RECORD,,
scorebook/eval_dataset.py DELETED
@@ -1,404 +0,0 @@
-"""Eval Dataset implementation for scorebook."""
-
-import csv
-import json
-import random
-from typing import Any, Dict, Iterator, List, Optional, Type, Union
-
-import yaml
-from datasets import Dataset as HuggingFaceDataset
-from datasets import DatasetDict as HuggingFaceDatasetDict
-from datasets import load_dataset
-
-from scorebook.metrics import MetricBase, MetricRegistry
-from scorebook.utils import validate_path
-
-
-class EvalDataset:
-    """Eval Dataset implementation for scorebook."""
-
-    def __init__(
-        self,
-        name: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        hf_dataset: HuggingFaceDataset,
-        prompt_template: Optional[str] = None,
-    ):
-        """
-        Create a new scorebook evaluation dataset instance.
-
-        :param name: The name of the evaluation dataset.
-        :param label: The label field of the dataset.
-        :param metrics: The specified metrics associated with the dataset.
-        :param hf_dataset: The dataset as a hugging face dataset object.
-        :param prompt_template: Optional prompt template for building prompts from dataset items.
-        """
-        self.name: str = name
-        self.label: str = label
-        self.metrics: List[MetricBase] = self._resolve_metrics(metrics)
-        self._hf_dataset: Optional[HuggingFaceDataset] = hf_dataset
-        self.prompt_template: Optional[str] = prompt_template
-
-    def __len__(self) -> int:
-        """Return the number of items in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return len(self._hf_dataset)
-
-    def __getitem__(self, key: Union[int, str]) -> Union[Dict[str, Any], List[Any]]:
-        """
-        Allow item access by index (int) or by column name (str).
-
-        - eval_dataset[i] returns the i-th example (dict).
-        - eval_dataset["feature"] returns a list of values for that feature.
-        """
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        if isinstance(key, int):
-            return dict(self._hf_dataset[key])  # Ensure we return a Dict[str, Any]
-        elif isinstance(key, str):
-            return list(self._hf_dataset[key])  # Ensure we return a List[Any]
-        else:
-            raise TypeError(f"Invalid key type: {type(key)}. Must be int or str.")
-
-    def __str__(self) -> str:
-        """Return a formatted string summary of the evaluation dataset."""
-        if self._hf_dataset is None:
-            return f"EvalDataset(name='{self.name}', status='uninitialized')"
-
-        num_rows = len(self._hf_dataset)
-        fields = ", ".join(self.column_names)
-        metrics = ", ".join([metric.name for metric in self.metrics])
-
-        return (
-            f"EvalDataset(\n"
-            f"  name='{self.name}',\n"
-            f"  rows={num_rows},\n"
-            f"  label='{self.label}',\n"
-            f"  fields=[{fields}],\n"
-            f"  metrics=[{metrics}]\n"
-            f")"
-        )
-
-    def __iter__(self) -> Iterator[Dict[str, Any]]:
-        """Return an iterator over all examples in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return iter(self._hf_dataset)
-
-    def shuffle(self) -> None:
-        """Randomly shuffle the dataset items."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        self._hf_dataset.shuffle()
-
-    @property
-    def items(self) -> List[Any]:
-        """Return a list of all examples in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return list(self._hf_dataset)
-
-    @property
-    def column_names(self) -> List[str]:
-        """Return a list of column/feature names available in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return list(map(str, self._hf_dataset.column_names))
-
-    @classmethod
-    def from_list(
-        cls,
-        name: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        data: List[Dict[str, Any]],
-    ) -> "EvalDataset":
-        """Instantiate an EvalDataset from a list of dictionaries.
-
-        Args:
-            name: The name of the evaluation dataset.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            data: List of dictionaries containing the dataset examples.
-
-        Returns:
-            A scorebook EvalDataset wrapping a Hugging Face dataset.
-        """
-        return cls(
-            name=name, label=label, metrics=metrics, hf_dataset=HuggingFaceDataset.from_list(data)
-        )
-
-    @classmethod
-    def from_csv(
-        cls,
-        file_path: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        name: Optional[str] = None,
-        encoding: str = "utf-8",
-        newline: str = "",
-        **reader_kwargs: Any,
-    ) -> "EvalDataset":
-        """Instantiate a scorebook dataset from a CSV file.
-
-        Args:
-            file_path: Path to the CSV file.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            name: Optional name for the eval dataset, if not provided, the path is used
-            encoding: Encoding of the CSV file.
-            newline: Newline character of the CSV file.
-            reader_kwargs: Dict of kwargs passed to `csv.DictReader`.
-
-        Returns:
-            A scorebook EvalDataset.
-
-        Raises:
-            FileNotFoundError: If the file does not exist at the given path.
-            ValueError: If the CSV file cannot be parsed or is empty.
-        """
-        reader_kwargs = reader_kwargs or {}
-        path = validate_path(file_path, expected_suffix=".csv")
-
-        try:
-            with open(path, encoding=encoding, newline=newline) as csvfile:
-                reader = csv.DictReader(csvfile, **reader_kwargs)
-                data = [row for row in reader]
-        except csv.Error as e:
-            raise ValueError(f"Failed to parse CSV file {file_path}: {e}") from e
-
-        if not data:
-            raise ValueError(f"CSV file {file_path} is empty or contains only headers.")
-
-        name = name if name else path.stem
-        return cls(
-            name=name,
-            label=label,
-            metrics=metrics,
-            hf_dataset=HuggingFaceDataset.from_list(data),
-        )
-
-    @classmethod
-    def from_json(
-        cls,
-        file_path: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        name: Optional[str] = None,
-        split: Optional[str] = None,
-    ) -> "EvalDataset":
-        """Instantiate an EvalDataset from a JSON file.
-
-        The JSON file must follow one of two supported formats:
-
-        1. **Flat format** – a list of dictionaries:
-            [
-                {"input": "What is 2+2?", "label": "4"},
-                {"input": "Capital of France?", "label": "Paris"}
-            ]
-
-        2. **Split format** – a dictionary of named splits:
-            {
-                "train": [{"input": ..., "label": ...}],
-                "test": [{"input": ..., "label": ...}]
-            }
-
-        Args:
-            file_path: Path to the JSON file on disk.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            name: Optional name for the eval dataset, if not provided, the path is used
-            split: If the JSON uses a split structure, this is the split name to load.
-
-        Returns:
-            A scorebook EvalDataset wrapping a Hugging Face dataset.
-
-        Raises:
-            FileNotFoundError: If the file does not exist.
-            ValueError: If the JSON is invalid or the structure is unsupported.
-        """
-        path = validate_path(file_path, expected_suffix=".json")
-
-        try:
-            with path.open("r", encoding="utf-8") as f:
-                data = json.load(f)
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON in {file_path}: {e}") from e
-
-        if isinstance(data, dict):
-            if split is None:
-                raise ValueError(f"Split name must be provided for split-style JSON: {file_path}")
-            split_data = data.get(split)
-            if split_data is None:
-                raise ValueError(f"Split '{split}' not found in JSON file: {file_path}")
-            if not isinstance(split_data, list):
-                raise ValueError(f"Split '{split}' is not a list of examples in: {file_path}")
-            hf_dataset = HuggingFaceDataset.from_list(split_data)
-        elif isinstance(data, list):
-            hf_dataset = HuggingFaceDataset.from_list(data)
-        else:
-            raise ValueError(f"Unsupported JSON structure in {file_path}. Expected list or dict.")
-
-        name = name if name else path.stem
-        return cls(name=name, label=label, metrics=metrics, hf_dataset=hf_dataset)
-
-    @classmethod
-    def from_huggingface(
-        cls,
-        path: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        split: Optional[str] = None,
-        name: Optional[str] = None,
-    ) -> "EvalDataset":
-        """Instantiate an EvalDataset from a dataset available on Hugging Face Hub.
-
-        If a specific split is provided (e.g., "train" or "test"), it will be loaded directly.
-        If no split is specified, the method attempts to load the full dataset. If the dataset
-        is split into multiple subsets (i.e., a DatasetDict), it defaults to loading the "test"
-        split.
-
-        Args:
-            path: The path of the dataset on the Hugging Face Hub.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            split: Optional name of the split to load.
-            name: Optional dataset configuration name.
-
-        Returns:
-            An EvalDataset wrapping the selected Hugging Face dataset.
-
-        Raises:
-            ValueError: If the dataset cannot be loaded, or the expected split is missing.
-        """
-        try:
-            kwargs = {}
-            if split is not None:
-                kwargs["split"] = split
-            if name is not None:
-                kwargs["name"] = name
-            ds = load_dataset(path, **kwargs)
-        except Exception as e:
-            raise ValueError(f"Failed to load dataset '{path}' from Hugging Face: {e}") from e
-
-        if isinstance(ds, HuggingFaceDataset):
-            hf_dataset = ds
-        elif isinstance(ds, HuggingFaceDatasetDict):
-            if "test" in ds:
-                hf_dataset = ds["test"]
-            else:
-                raise ValueError(
-                    f"Split not specified and no 'test' split found in dataset '{path}'."
-                )
-        else:
-            raise ValueError(f"Unexpected dataset type for '{path}': {type(ds)}")
-
-        return cls(name=path, label=label, metrics=metrics, hf_dataset=hf_dataset)
-
-    @classmethod
-    def from_yaml(cls, file_path: str) -> "EvalDataset":
-        """Instantiate an EvalDataset from a YAML file.
-
-        The YAML file should contain configuration for loading a dataset, including:
-        - name: Name of the dataset or Hugging Face dataset path
-        - label: The field used as the evaluation label
-        - metrics: List of metrics to evaluate
-        - split: Optional split name to load
-        - template: Optional prompt template
-
-        Returns:
-            An EvalDataset instance configured according to the YAML file.
-
-        Raises:
-            ValueError: If the YAML file is invalid or missing required fields.
-        """
-        path = validate_path(file_path, expected_suffix=".yaml")
-
-        try:
-            with path.open("r", encoding="utf-8") as f:
-                config = yaml.safe_load(f)
-        except yaml.YAMLError as e:
-            raise ValueError(f"Invalid YAML in {file_path}: {e}") from e
-
-        # Validate required fields
-        required_fields = ["name", "label", "metrics"]
-        missing_fields = [field for field in required_fields if field not in config]
-        if missing_fields:
-            raise ValueError(f"Missing required fields in YAML config: {', '.join(missing_fields)}")
-
-        # Load the dataset from Hugging Face
-        dataset = cls.from_huggingface(
-            path=config["name"],
-            label=config["label"],
-            metrics=config["metrics"],
-            split=config.get("split"),  # Optional field
-            name=config.get("config"),  # Optional field
-        )
-
-        # Add template if provided
-        if "template" in config:
-            dataset.prompt_template = config["template"]
-
-        return dataset
-
-    @staticmethod
-    def _resolve_metrics(
-        metrics: Union[
-            str, Type[MetricBase], MetricBase, List[Union[str, Type[MetricBase], MetricBase]]
-        ]
-    ) -> List[MetricBase]:
-        """
-        Convert metric names/classes into a list of MetricBase instances using MetricRegistry.
-
-        Used to normalize metrics to a metric type.
-        """
-        if not isinstance(metrics, list):
-            metrics = [metrics]
-
-        resolved: List[MetricBase] = []
-        for m in metrics:
-            if isinstance(m, MetricBase):
-                resolved.append(m)  # Already an instance
-            else:
-                resolved.append(MetricRegistry.get(m))  # Use registry for str or class
-
-        return resolved
-
-    def sample(self, sample_size: int) -> "EvalDataset":
-        """Create a new dataset with randomly sampled items from this dataset.
-
-        Args:
-            sample_size: The number of items to sample from the dataset
-
-        Returns:
-            A new EvalDataset with randomly sampled items
-
-        Raises:
-            ValueError: If sample_size is larger than the dataset size
-        """
-        dataset_size = len(self.items)
-
-        if sample_size > dataset_size:
-            raise ValueError(
-                f"Sample size {sample_size} is larger than dataset size {dataset_size} "
-                f"for dataset '{self.name}'"
-            )
-
-        # Create randomly sampled items
-        sampled_items = random.sample(self.items, sample_size)
-
-        # Create a new EvalDataset instance with sampled items using from_list
-        sampled_dataset = self.from_list(
-            name=self.name,
-            label=self.label,
-            metrics=self.metrics,
-            data=sampled_items,
-        )
-
-        # Preserve the prompt template if it exists
-        if self.prompt_template is not None:
-            sampled_dataset.prompt_template = self.prompt_template
-
-        return sampled_dataset
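
The deletion above looks like a move rather than a removal: the new RECORD lists scorebook/eval_datasets/eval_dataset.py with a much larger file, so the EvalDataset class appears to live there in 0.0.12. A minimal sketch of the API shown in the deleted file, assuming the class is still re-exported from the top-level scorebook package (an assumption not confirmed by this diff):

from scorebook import EvalDataset  # re-export assumed; otherwise import from scorebook.eval_datasets

# Build a small dataset in memory; "accuracy" resolves through the metric registry.
dataset = EvalDataset.from_list(
    name="toy-qa",
    label="answer",
    metrics="accuracy",
    data=[
        {"question": "What is 2+2?", "answer": "4"},
        {"question": "Capital of France?", "answer": "Paris"},
    ],
)

print(len(dataset))         # number of items
print(dataset["answer"])    # column access returns a list of label values
subset = dataset.sample(1)  # random sample; raises ValueError if larger than the dataset
for item in subset:         # iteration yields plain dicts
    print(item)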
@@ -1,41 +0,0 @@
-scorebook/__init__.py,sha256=tAe8v8xyiNcl7P4SUIM5dPVMqU8GQ8dKzJ1pfF6B-Ms,629
-scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
-scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
-scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
-scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
-scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
-scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_async/evaluate_async.py,sha256=vn8rjjveCCF6ItZWngqAP3RhfScHV_LlIomqh-z5-UU,15509
-scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_sync/evaluate.py,sha256=4LVdXvCsPmSbkBxphJ9in5l17GL9Zqn66bZm9a8w9nc,15347
-scorebook/evaluate/evaluate_helpers.py,sha256=rAXUroMXfPkWqufMnA97bfscgPik38s3eeepe2RkchA,13026
-scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
-scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
-scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
-scorebook/inference/clients/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
-scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
-scorebook/inference/clients/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
-scorebook/inference/clients/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
-scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
-scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
-scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
-scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
-scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
-scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
-scorebook/settings.py,sha256=CgaumN98QpU7XKMugUG41UAO8oZVuWDco4uooSagFZY,596
-scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
-scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
-scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
-scorebook/utils/__init__.py,sha256=3xdIXJzYEp9k23z4_49VWZtasoZN8tJxVPieE_HOuww,519
-scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
-scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
-scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
-scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
-scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
-scorebook/utils/progress_bars.py,sha256=uLG_0s_QEHGgjZcVaDJ7wp14Rd3GY5dWu-F4FL8isJg,3783
-scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
-scorebook-0.0.10.dist-info/METADATA,sha256=wJXBm9ZzeNYIrhUOz4Uc4D_5_1J8arUnMiOtR5BNeOA,11479
-scorebook-0.0.10.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-scorebook-0.0.10.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
-scorebook-0.0.10.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
-scorebook-0.0.10.dist-info/RECORD,,