scorebook-0.0.10-py3-none-any.whl → scorebook-0.0.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +11 -4
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/_async/evaluate_async.py +135 -130
- scorebook/evaluate/_sync/evaluate.py +135 -131
- scorebook/evaluate/evaluate_helpers.py +46 -23
- scorebook/exceptions.py +54 -2
- scorebook/inference/clients/bedrock.py +1 -1
- scorebook/inference/clients/portkey.py +1 -1
- scorebook/inference/clients/vertex.py +1 -1
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/settings.py +3 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +36 -54
- scorebook/utils/__init__.py +11 -4
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +819 -70
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/METADATA +3 -2
- scorebook-0.0.12.dist-info/RECORD +50 -0
- scorebook/eval_dataset.py +0 -404
- scorebook-0.0.10.dist-info/RECORD +0 -41
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0
scorebook/utils/{build_prompt.py → render_template.py}

@@ -7,36 +7,37 @@ and global variables, using strict undefined handling for better error detection
 
 from typing import Any, Dict, Optional
 
-from jinja2 import BaseLoader, Environment, StrictUndefined
+from jinja2 import BaseLoader, StrictUndefined
+from jinja2.sandbox import SandboxedEnvironment
 
 from scorebook.utils.jinja_helpers import default_jinja_filters, default_jinja_globals
 
 
-def build_prompt(
-    …
-    …
+def render_template(
+    template: str,
+    args: Dict[str, Any],
     filters: Optional[Dict[str, Any]] = None,
     globals_dict: Optional[Dict[str, Any]] = None,
 ) -> str:
     """
-    …
+    Render a Jinja2 template string with the provided arguments.
 
     Args:
-        …
-        …
+        template: Jinja2 template string
+        args: Dictionary of arguments to pass to the template
         filters: Dictionary of Jinja2 filters. Defaults to default_jinja_filters().
         globals_dict: Dictionary of global functions/variables. Defaults to default_jinja_globals().
 
     Returns:
-        str: Rendered …
+        str: Rendered template string
     """
 
     # Use defaults if not provided
     filters = filters or default_jinja_filters()
     globals_dict = globals_dict or default_jinja_globals()
 
-    # Create a Jinja2 environment with strict undefined handling
-    env = Environment(
+    # Create a sandboxed Jinja2 environment with strict undefined handling
+    env = SandboxedEnvironment(
         loader=BaseLoader(),
         undefined=StrictUndefined,
         trim_blocks=True,

@@ -48,5 +49,5 @@ def build_prompt(
     env.globals.update(globals_dict)
 
     # Render the template
-    …
-    return str(…)
+    jinja_template = env.from_string(template)
+    return str(jinja_template.render(**args))
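To make the change concrete, here is a minimal usage sketch of the renamed helper. It assumes only what the diff shows (the `render_template` signature, `StrictUndefined`, and the `SandboxedEnvironment`); the import path `scorebook.utils.render_template` is taken from the 0.0.12 RECORD listing below.

```python
# Minimal sketch of the new helper; only the signature and environment
# settings come from the diff above. Import path per the 0.0.12 RECORD.
from jinja2.exceptions import UndefinedError

from scorebook.utils.render_template import render_template

prompt = render_template(
    template="Q: {{ question }}\nA:",
    args={"question": "What is 2+2?"},
)
print(prompt)  # -> "Q: What is 2+2?\nA:"

# StrictUndefined means a missing variable raises instead of rendering "".
try:
    render_template(template="{{ missing }}", args={})
except UndefinedError:
    print("undefined template variables fail loudly")
```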
{scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.10
+Version: 0.0.12
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell

@@ -23,6 +23,7 @@ Requires-Dist: datasets (>=3.6.0)
 Requires-Dist: fsspec[gcs] ; extra == "vertex"
 Requires-Dist: google-cloud-storage ; extra == "vertex"
 Requires-Dist: google-genai ; extra == "vertex"
+Requires-Dist: ipywidgets (>=8.0.0)
 Requires-Dist: notebook (>=7.4.5,<8.0.0)
 Requires-Dist: notebook ; extra == "examples"
 Requires-Dist: openai ; extra == "openai"

@@ -36,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (…)
+Requires-Dist: trismik (==1.0.1)
 Description-Content-Type: text/markdown
 
 # Scorebook
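Not part of the diff itself, but a quick sanity check that an upgrade picked up the new metadata; `importlib.metadata` reads the Version field of the installed wheel's METADATA shown above.

```python
# Reads the Version field of the installed distribution's METADATA.
from importlib.metadata import version

print(version("scorebook"))  # expect "0.0.12" after upgrading
```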
scorebook-0.0.12.dist-info/RECORD
ADDED

@@ -0,0 +1,50 @@
+scorebook/__init__.py,sha256=dcaqd4-qxLHPCw6p-LS_0b8JumEpHDtEilgwP8qNKRY,868
+scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
+scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+scorebook/eval_datasets/__init__.py,sha256=9YPjxjdaMaOrBUzJwvsUlFPl-KdYMgUGTV3WNd7OCU0,128
+scorebook/eval_datasets/eval_dataset.py,sha256=6GgrAaWelU5dK6I-x9zXHCxVSfvo41yyYNPF0ue4zbo,27200
+scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
+scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_async/evaluate_async.py,sha256=qZ2y7-uQRT1b4saBoNaPO9fv4G2LhcP_ZyvkSsIEgHg,15629
+scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_sync/evaluate.py,sha256=xanhHu-CaY_WarNM1V64W2sHttkM18j42K0MKrdtrvE,15438
+scorebook/evaluate/evaluate_helpers.py,sha256=swbgB1LurWdufeiVIZZ7ildsYO-ptC7uF3x6AVgptkU,13809
+scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
+scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
+scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
+scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
+scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
+scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
+scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
+scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
+scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
+scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
+scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
+scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
+scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+scorebook/score/__init__.py,sha256=pwjSEb8Tc1edQpYDuu49wnupazISpRX3DQGD2cfiJek,208
+scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_async/score_async.py,sha256=GM84UcuFvW1x6ZIePEshG2cwVNB9GvwhhjouOduUwTA,6097
+scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/score/_sync/score.py,sha256=rbJhYEhu8auHG4AwpZIkmzw_0ZK1bzbDiIK7Q0ApxhY,6043
+scorebook/score/score_helpers.py,sha256=lq0t5UrOgxa_pDiwL3yHbBlT2BL5B-SkWw1nyaXVoZU,7074
+scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
+scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
+scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
+scorebook/trismik/upload_results.py,sha256=jgT9EVFpuv6OmrYgZVi032cbRrcCOyX4ulLDeWPFBWU,9743
+scorebook/types.py,sha256=x5bD2DU-Xafh7pXwmaQQ1i1zoZDsniHJjE-UEfXySAg,4827
+scorebook/utils/__init__.py,sha256=crefSaTUWkhFF-w4kotUzcz9_GGZukQDgRit4HxJRHY,805
+scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
+scorebook/utils/common_helpers.py,sha256=jewPdQH4JqTWcYT31wn1WNucOPLtGbrGdViwwlYRhD4,1216
+scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
+scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
+scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+scorebook/utils/progress_bars.py,sha256=gdT6dJ9LMLYzs7TospP3wQNY9htm_FhVLdX0ueluC6E,31890
+scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
+scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+scorebook-0.0.12.dist-info/METADATA,sha256=bMjbT1e0GYExB1HcBkAfesaUcXK2-Pck5ox2oCUBXpE,11508
+scorebook-0.0.12.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+scorebook-0.0.12.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+scorebook-0.0.12.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.12.dist-info/RECORD,,
scorebook/eval_dataset.py
DELETED

@@ -1,404 +0,0 @@
-"""Eval Dataset implementation for scorebook."""
-
-import csv
-import json
-import random
-from typing import Any, Dict, Iterator, List, Optional, Type, Union
-
-import yaml
-from datasets import Dataset as HuggingFaceDataset
-from datasets import DatasetDict as HuggingFaceDatasetDict
-from datasets import load_dataset
-
-from scorebook.metrics import MetricBase, MetricRegistry
-from scorebook.utils import validate_path
-
-
-class EvalDataset:
-    """Eval Dataset implementation for scorebook."""
-
-    def __init__(
-        self,
-        name: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        hf_dataset: HuggingFaceDataset,
-        prompt_template: Optional[str] = None,
-    ):
-        """
-        Create a new scorebook evaluation dataset instance.
-
-        :param name: The name of the evaluation dataset.
-        :param label: The label field of the dataset.
-        :param metrics: The specified metrics associated with the dataset.
-        :param hf_dataset: The dataset as a hugging face dataset object.
-        :param prompt_template: Optional prompt template for building prompts from dataset items.
-        """
-        self.name: str = name
-        self.label: str = label
-        self.metrics: List[MetricBase] = self._resolve_metrics(metrics)
-        self._hf_dataset: Optional[HuggingFaceDataset] = hf_dataset
-        self.prompt_template: Optional[str] = prompt_template
-
-    def __len__(self) -> int:
-        """Return the number of items in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return len(self._hf_dataset)
-
-    def __getitem__(self, key: Union[int, str]) -> Union[Dict[str, Any], List[Any]]:
-        """
-        Allow item access by index (int) or by column name (str).
-
-        - eval_dataset[i] returns the i-th example (dict).
-        - eval_dataset["feature"] returns a list of values for that feature.
-        """
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        if isinstance(key, int):
-            return dict(self._hf_dataset[key])  # Ensure we return a Dict[str, Any]
-        elif isinstance(key, str):
-            return list(self._hf_dataset[key])  # Ensure we return a List[Any]
-        else:
-            raise TypeError(f"Invalid key type: {type(key)}. Must be int or str.")
-
-    def __str__(self) -> str:
-        """Return a formatted string summary of the evaluation dataset."""
-        if self._hf_dataset is None:
-            return f"EvalDataset(name='{self.name}', status='uninitialized')"
-
-        num_rows = len(self._hf_dataset)
-        fields = ", ".join(self.column_names)
-        metrics = ", ".join([metric.name for metric in self.metrics])
-
-        return (
-            f"EvalDataset(\n"
-            f"  name='{self.name}',\n"
-            f"  rows={num_rows},\n"
-            f"  label='{self.label}',\n"
-            f"  fields=[{fields}],\n"
-            f"  metrics=[{metrics}]\n"
-            f")"
-        )
-
-    def __iter__(self) -> Iterator[Dict[str, Any]]:
-        """Return an iterator over all examples in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return iter(self._hf_dataset)
-
-    def shuffle(self) -> None:
-        """Randomly shuffle the dataset items."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        self._hf_dataset.shuffle()
-
-    @property
-    def items(self) -> List[Any]:
-        """Return a list of all examples in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return list(self._hf_dataset)
-
-    @property
-    def column_names(self) -> List[str]:
-        """Return a list of column/feature names available in the dataset."""
-        if self._hf_dataset is None:
-            raise ValueError("Dataset is not initialized")
-        return list(map(str, self._hf_dataset.column_names))
-
-    @classmethod
-    def from_list(
-        cls,
-        name: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        data: List[Dict[str, Any]],
-    ) -> "EvalDataset":
-        """Instantiate an EvalDataset from a list of dictionaries.
-
-        Args:
-            name: The name of the evaluation dataset.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            data: List of dictionaries containing the dataset examples.
-
-        Returns:
-            A scorebook EvalDataset wrapping a Hugging Face dataset.
-        """
-        return cls(
-            name=name, label=label, metrics=metrics, hf_dataset=HuggingFaceDataset.from_list(data)
-        )
-
-    @classmethod
-    def from_csv(
-        cls,
-        file_path: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        name: Optional[str] = None,
-        encoding: str = "utf-8",
-        newline: str = "",
-        **reader_kwargs: Any,
-    ) -> "EvalDataset":
-        """Instantiate a scorebook dataset from a CSV file.
-
-        Args:
-            file_path: Path to the CSV file.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            name: Optional name for the eval dataset, if not provided, the path is used
-            encoding: Encoding of the CSV file.
-            newline: Newline character of the CSV file.
-            reader_kwargs: Dict of kwargs passed to `csv.DictReader`.
-
-        Returns:
-            A scorebook EvalDataset.
-
-        Raises:
-            FileNotFoundError: If the file does not exist at the given path.
-            ValueError: If the CSV file cannot be parsed or is empty.
-        """
-        reader_kwargs = reader_kwargs or {}
-        path = validate_path(file_path, expected_suffix=".csv")
-
-        try:
-            with open(path, encoding=encoding, newline=newline) as csvfile:
-                reader = csv.DictReader(csvfile, **reader_kwargs)
-                data = [row for row in reader]
-        except csv.Error as e:
-            raise ValueError(f"Failed to parse CSV file {file_path}: {e}") from e
-
-        if not data:
-            raise ValueError(f"CSV file {file_path} is empty or contains only headers.")
-
-        name = name if name else path.stem
-        return cls(
-            name=name,
-            label=label,
-            metrics=metrics,
-            hf_dataset=HuggingFaceDataset.from_list(data),
-        )
-
-    @classmethod
-    def from_json(
-        cls,
-        file_path: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        name: Optional[str] = None,
-        split: Optional[str] = None,
-    ) -> "EvalDataset":
-        """Instantiate an EvalDataset from a JSON file.
-
-        The JSON file must follow one of two supported formats:
-
-        1. **Flat format** – a list of dictionaries:
-            [
-                {"input": "What is 2+2?", "label": "4"},
-                {"input": "Capital of France?", "label": "Paris"}
-            ]
-
-        2. **Split format** – a dictionary of named splits:
-            {
-                "train": [{"input": ..., "label": ...}],
-                "test": [{"input": ..., "label": ...}]
-            }
-
-        Args:
-            file_path: Path to the JSON file on disk.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            name: Optional name for the eval dataset, if not provided, the path is used
-            split: If the JSON uses a split structure, this is the split name to load.
-
-        Returns:
-            A scorebook EvalDataset wrapping a Hugging Face dataset.
-
-        Raises:
-            FileNotFoundError: If the file does not exist.
-            ValueError: If the JSON is invalid or the structure is unsupported.
-        """
-        path = validate_path(file_path, expected_suffix=".json")
-
-        try:
-            with path.open("r", encoding="utf-8") as f:
-                data = json.load(f)
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON in {file_path}: {e}") from e
-
-        if isinstance(data, dict):
-            if split is None:
-                raise ValueError(f"Split name must be provided for split-style JSON: {file_path}")
-            split_data = data.get(split)
-            if split_data is None:
-                raise ValueError(f"Split '{split}' not found in JSON file: {file_path}")
-            if not isinstance(split_data, list):
-                raise ValueError(f"Split '{split}' is not a list of examples in: {file_path}")
-            hf_dataset = HuggingFaceDataset.from_list(split_data)
-        elif isinstance(data, list):
-            hf_dataset = HuggingFaceDataset.from_list(data)
-        else:
-            raise ValueError(f"Unsupported JSON structure in {file_path}. Expected list or dict.")
-
-        name = name if name else path.stem
-        return cls(name=name, label=label, metrics=metrics, hf_dataset=hf_dataset)
-
-    @classmethod
-    def from_huggingface(
-        cls,
-        path: str,
-        label: str,
-        metrics: Union[str, Type[MetricBase], List[Union[str, Type[MetricBase]]]],
-        split: Optional[str] = None,
-        name: Optional[str] = None,
-    ) -> "EvalDataset":
-        """Instantiate an EvalDataset from a dataset available on Hugging Face Hub.
-
-        If a specific split is provided (e.g., "train" or "test"), it will be loaded directly.
-        If no split is specified, the method attempts to load the full dataset. If the dataset
-        is split into multiple subsets (i.e., a DatasetDict), it defaults to loading the "test"
-        split.
-
-        Args:
-            path: The path of the dataset on the Hugging Face Hub.
-            label: The field used as the evaluation label (ground truth).
-            metrics: The specified metrics associated with the dataset.
-            split: Optional name of the split to load.
-            name: Optional dataset configuration name.
-
-        Returns:
-            An EvalDataset wrapping the selected Hugging Face dataset.
-
-        Raises:
-            ValueError: If the dataset cannot be loaded, or the expected split is missing.
-        """
-        try:
-            kwargs = {}
-            if split is not None:
-                kwargs["split"] = split
-            if name is not None:
-                kwargs["name"] = name
-            ds = load_dataset(path, **kwargs)
-        except Exception as e:
-            raise ValueError(f"Failed to load dataset '{path}' from Hugging Face: {e}") from e
-
-        if isinstance(ds, HuggingFaceDataset):
-            hf_dataset = ds
-        elif isinstance(ds, HuggingFaceDatasetDict):
-            if "test" in ds:
-                hf_dataset = ds["test"]
-            else:
-                raise ValueError(
-                    f"Split not specified and no 'test' split found in dataset '{path}'."
-                )
-        else:
-            raise ValueError(f"Unexpected dataset type for '{path}': {type(ds)}")
-
-        return cls(name=path, label=label, metrics=metrics, hf_dataset=hf_dataset)
-
-    @classmethod
-    def from_yaml(cls, file_path: str) -> "EvalDataset":
-        """Instantiate an EvalDataset from a YAML file.
-
-        The YAML file should contain configuration for loading a dataset, including:
-        - name: Name of the dataset or Hugging Face dataset path
-        - label: The field used as the evaluation label
-        - metrics: List of metrics to evaluate
-        - split: Optional split name to load
-        - template: Optional prompt template
-
-        Returns:
-            An EvalDataset instance configured according to the YAML file.
-
-        Raises:
-            ValueError: If the YAML file is invalid or missing required fields.
-        """
-        path = validate_path(file_path, expected_suffix=".yaml")
-
-        try:
-            with path.open("r", encoding="utf-8") as f:
-                config = yaml.safe_load(f)
-        except yaml.YAMLError as e:
-            raise ValueError(f"Invalid YAML in {file_path}: {e}") from e
-
-        # Validate required fields
-        required_fields = ["name", "label", "metrics"]
-        missing_fields = [field for field in required_fields if field not in config]
-        if missing_fields:
-            raise ValueError(f"Missing required fields in YAML config: {', '.join(missing_fields)}")
-
-        # Load the dataset from Hugging Face
-        dataset = cls.from_huggingface(
-            path=config["name"],
-            label=config["label"],
-            metrics=config["metrics"],
-            split=config.get("split"),  # Optional field
-            name=config.get("config"),  # Optional field
-        )
-
-        # Add template if provided
-        if "template" in config:
-            dataset.prompt_template = config["template"]
-
-        return dataset
-
-    @staticmethod
-    def _resolve_metrics(
-        metrics: Union[
-            str, Type[MetricBase], MetricBase, List[Union[str, Type[MetricBase], MetricBase]]
-        ]
-    ) -> List[MetricBase]:
-        """
-        Convert metric names/classes into a list of MetricBase instances using MetricRegistry.
-
-        Used to normalize metrics to a metric type.
-        """
-        if not isinstance(metrics, list):
-            metrics = [metrics]
-
-        resolved: List[MetricBase] = []
-        for m in metrics:
-            if isinstance(m, MetricBase):
-                resolved.append(m)  # Already an instance
-            else:
-                resolved.append(MetricRegistry.get(m))  # Use registry for str or class
-
-        return resolved
-
-    def sample(self, sample_size: int) -> "EvalDataset":
-        """Create a new dataset with randomly sampled items from this dataset.
-
-        Args:
-            sample_size: The number of items to sample from the dataset
-
-        Returns:
-            A new EvalDataset with randomly sampled items
-
-        Raises:
-            ValueError: If sample_size is larger than the dataset size
-        """
-        dataset_size = len(self.items)
-
-        if sample_size > dataset_size:
-            raise ValueError(
-                f"Sample size {sample_size} is larger than dataset size {dataset_size} "
-                f"for dataset '{self.name}'"
-            )
-
-        # Create randomly sampled items
-        sampled_items = random.sample(self.items, sample_size)
-
-        # Create a new EvalDataset instance with sampled items using from_list
-        sampled_dataset = self.from_list(
-            name=self.name,
-            label=self.label,
-            metrics=self.metrics,
-            data=sampled_items,
-        )
-
-        # Preserve the prompt template if it exists
-        if self.prompt_template is not None:
-            sampled_dataset.prompt_template = self.prompt_template
-
-        return sampled_dataset
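For reference, a short sketch of how the removed class was used, grounded in the deleted source above. The import path matches this file's 0.0.10 location (in 0.0.12 the class moves to scorebook/eval_datasets/eval_dataset.py), and the "accuracy" registry key is an assumption based on the metrics/accuracy.py module in RECORD.

```python
# Sketch against the removed 0.0.10 module shown above. The string metric
# is resolved through MetricRegistry.get(); "accuracy" is assumed to be the
# registry key for scorebook/metrics/accuracy.py.
from scorebook.eval_dataset import EvalDataset

dataset = EvalDataset.from_list(
    name="arithmetic",
    label="label",
    metrics="accuracy",
    data=[
        {"input": "What is 2+2?", "label": "4"},
        {"input": "Capital of France?", "label": "Paris"},
    ],
)

print(len(dataset))         # 2
print(dataset["label"])     # ['4', 'Paris'] -- column access by name
subset = dataset.sample(1)  # new EvalDataset with one randomly sampled item
print(subset)               # formatted summary via __str__
```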
scorebook-0.0.10.dist-info/RECORD
DELETED

@@ -1,41 +0,0 @@
-scorebook/__init__.py,sha256=tAe8v8xyiNcl7P4SUIM5dPVMqU8GQ8dKzJ1pfF6B-Ms,629
-scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
-scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
-scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
-scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
-scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
-scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_async/evaluate_async.py,sha256=vn8rjjveCCF6ItZWngqAP3RhfScHV_LlIomqh-z5-UU,15509
-scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_sync/evaluate.py,sha256=4LVdXvCsPmSbkBxphJ9in5l17GL9Zqn66bZm9a8w9nc,15347
-scorebook/evaluate/evaluate_helpers.py,sha256=rAXUroMXfPkWqufMnA97bfscgPik38s3eeepe2RkchA,13026
-scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
-scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
-scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
-scorebook/inference/clients/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
-scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
-scorebook/inference/clients/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
-scorebook/inference/clients/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
-scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
-scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
-scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
-scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
-scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
-scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
-scorebook/settings.py,sha256=CgaumN98QpU7XKMugUG41UAO8oZVuWDco4uooSagFZY,596
-scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
-scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
-scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
-scorebook/utils/__init__.py,sha256=3xdIXJzYEp9k23z4_49VWZtasoZN8tJxVPieE_HOuww,519
-scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
-scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
-scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
-scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
-scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
-scorebook/utils/progress_bars.py,sha256=uLG_0s_QEHGgjZcVaDJ7wp14Rd3GY5dWu-F4FL8isJg,3783
-scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
-scorebook-0.0.10.dist-info/METADATA,sha256=wJXBm9ZzeNYIrhUOz4Uc4D_5_1J8arUnMiOtR5BNeOA,11479
-scorebook-0.0.10.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-scorebook-0.0.10.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
-scorebook-0.0.10.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
-scorebook-0.0.10.dist-info/RECORD,,
{scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/WHEEL
File without changes

{scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt
File without changes

{scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE
File without changes