scorebook 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +2 -1
- scorebook/evaluator.py +269 -118
- scorebook/exceptions.py +54 -0
- scorebook/inference/__init__.py +0 -4
- scorebook/inference/bedrock.py +305 -0
- scorebook/inference/openai.py +75 -37
- scorebook/inference/vertex.py +295 -0
- scorebook/types/__init__.py +2 -1
- scorebook/types/eval_dataset.py +56 -0
- scorebook/types/eval_result.py +7 -3
- scorebook/types/eval_run_spec.py +28 -0
- scorebook/types/inference_pipeline.py +5 -2
- scorebook/utils/__init__.py +2 -1
- scorebook/utils/build_prompt.py +52 -0
- scorebook/utils/jinja_helpers.py +146 -0
- scorebook/utils/logging_utils.py +1 -0
- scorebook/utils/progress_bars.py +91 -34
- {scorebook-0.0.1.dist-info → scorebook-0.0.3.dist-info}/METADATA +11 -1
- scorebook-0.0.3.dist-info/RECORD +31 -0
- scorebook-0.0.1.dist-info/RECORD +0 -24
- {scorebook-0.0.1.dist-info → scorebook-0.0.3.dist-info}/LICENSE +0 -0
- {scorebook-0.0.1.dist-info → scorebook-0.0.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Logging utilities for Scorebook evaluation framework."""
|
scorebook/utils/progress_bars.py
CHANGED
|
@@ -9,20 +9,33 @@ from tqdm import tqdm
|
|
|
9
9
|
class EvaluationProgressBars:
|
|
10
10
|
"""Manages nested progress bars for evaluation tracking."""
|
|
11
11
|
|
|
12
|
-
def __init__(
|
|
12
|
+
def __init__(
|
|
13
|
+
self, datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
|
|
14
|
+
) -> None:
|
|
13
15
|
"""Initialize progress bar manager.
|
|
14
16
|
|
|
15
17
|
Args:
|
|
16
18
|
datasets: List of datasets being evaluated
|
|
17
19
|
hyperparam_count: Number of hyperparameter configurations per dataset
|
|
20
|
+
parallel: Whether running in parallel mode
|
|
21
|
+
total_eval_runs: Total number of EvalRunSpecs (dataset_count * hyperparam_count)
|
|
18
22
|
"""
|
|
19
23
|
self.datasets = datasets
|
|
20
24
|
self.hyperparam_count = hyperparam_count
|
|
25
|
+
self.parallel = parallel
|
|
26
|
+
self.total_eval_runs = total_eval_runs
|
|
27
|
+
|
|
21
28
|
self.dataset_pbar: Optional[tqdm] = None
|
|
22
29
|
self.hyperparam_pbar: Optional[tqdm] = None
|
|
23
30
|
|
|
24
|
-
|
|
25
|
-
|
|
31
|
+
# Track progress per dataset
|
|
32
|
+
self.current_dataset_idx = 0
|
|
33
|
+
self.completed_hyperparams_per_dataset: dict[int, int] = {}
|
|
34
|
+
self.completed_eval_runs = 0
|
|
35
|
+
|
|
36
|
+
def start_progress_bars(self) -> None:
|
|
37
|
+
"""Start both progress bars."""
|
|
38
|
+
# Top level: Datasets
|
|
26
39
|
self.dataset_pbar = tqdm(
|
|
27
40
|
total=len(self.datasets),
|
|
28
41
|
desc="Datasets ",
|
|
@@ -33,57 +46,101 @@ class EvaluationProgressBars:
|
|
|
33
46
|
bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
|
|
34
47
|
)
|
|
35
48
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
self.
|
|
49
|
+
# Bottom level: Hyperparameters/Eval runs
|
|
50
|
+
if self.parallel:
|
|
51
|
+
# In parallel mode: show eval runs completed out of total
|
|
52
|
+
self.hyperparam_pbar = tqdm(
|
|
53
|
+
total=self.total_eval_runs,
|
|
54
|
+
desc="Eval Runs ",
|
|
55
|
+
unit="run",
|
|
56
|
+
position=1,
|
|
57
|
+
leave=False,
|
|
58
|
+
ncols=80,
|
|
59
|
+
bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
|
|
60
|
+
)
|
|
61
|
+
else:
|
|
62
|
+
# In sequential mode: show hyperparams per dataset
|
|
63
|
+
self.hyperparam_pbar = tqdm(
|
|
64
|
+
total=self.hyperparam_count,
|
|
65
|
+
desc="Hyperparams",
|
|
66
|
+
unit="config",
|
|
67
|
+
position=1,
|
|
68
|
+
leave=False,
|
|
69
|
+
ncols=80,
|
|
70
|
+
bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
|
|
71
|
+
)
|
|
40
72
|
|
|
41
|
-
def
|
|
42
|
-
"""
|
|
43
|
-
if self.
|
|
44
|
-
|
|
45
|
-
self.dataset_pbar = None
|
|
73
|
+
def on_eval_run_completed(self, dataset_idx: int) -> None:
|
|
74
|
+
"""Update progress when an eval run (EvalRunSpec) completes in parallel mode."""
|
|
75
|
+
if not self.parallel:
|
|
76
|
+
return
|
|
46
77
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
position=1,
|
|
55
|
-
leave=False,
|
|
56
|
-
ncols=80,
|
|
57
|
-
bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
|
|
78
|
+
self.completed_eval_runs += 1
|
|
79
|
+
if self.hyperparam_pbar:
|
|
80
|
+
self.hyperparam_pbar.update(1)
|
|
81
|
+
|
|
82
|
+
# Track how many runs completed for this dataset
|
|
83
|
+
self.completed_hyperparams_per_dataset[dataset_idx] = (
|
|
84
|
+
self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
|
|
58
85
|
)
|
|
59
|
-
try:
|
|
60
|
-
yield self.hyperparam_pbar
|
|
61
|
-
finally:
|
|
62
|
-
self.hyperparam_pbar.close()
|
|
63
|
-
self.hyperparam_pbar = None
|
|
64
86
|
|
|
65
|
-
|
|
66
|
-
|
|
87
|
+
# Check if this dataset is complete
|
|
88
|
+
if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
|
|
89
|
+
if self.dataset_pbar:
|
|
90
|
+
self.dataset_pbar.update(1)
|
|
91
|
+
|
|
92
|
+
def on_hyperparam_completed(self, dataset_idx: int) -> None:
|
|
93
|
+
"""Update progress when a hyperparameter config completes in sequential mode."""
|
|
94
|
+
if self.parallel:
|
|
95
|
+
return
|
|
96
|
+
|
|
67
97
|
if self.hyperparam_pbar:
|
|
68
98
|
self.hyperparam_pbar.update(1)
|
|
69
99
|
|
|
100
|
+
# Track completed hyperparams for this dataset
|
|
101
|
+
self.completed_hyperparams_per_dataset[dataset_idx] = (
|
|
102
|
+
self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Check if this dataset is complete
|
|
106
|
+
if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
|
|
107
|
+
# Update dataset progress
|
|
108
|
+
if self.dataset_pbar:
|
|
109
|
+
self.dataset_pbar.update(1)
|
|
110
|
+
|
|
111
|
+
# Reset hyperparameter progress for next dataset (if any)
|
|
112
|
+
if dataset_idx < len(self.datasets) - 1:
|
|
113
|
+
if self.hyperparam_pbar:
|
|
114
|
+
self.hyperparam_pbar.reset()
|
|
115
|
+
|
|
116
|
+
def close_progress_bars(self) -> None:
|
|
117
|
+
"""Close both progress bars."""
|
|
118
|
+
if self.hyperparam_pbar:
|
|
119
|
+
self.hyperparam_pbar.close()
|
|
120
|
+
self.hyperparam_pbar = None
|
|
121
|
+
if self.dataset_pbar:
|
|
122
|
+
self.dataset_pbar.close()
|
|
123
|
+
self.dataset_pbar = None
|
|
124
|
+
|
|
70
125
|
|
|
71
126
|
@contextmanager
|
|
72
127
|
def evaluation_progress(
|
|
73
|
-
datasets: List[Any], hyperparam_count: int
|
|
128
|
+
datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
|
|
74
129
|
) -> Generator[EvaluationProgressBars, None, None]:
|
|
75
130
|
"""Context manager for evaluation progress bars.
|
|
76
131
|
|
|
77
132
|
Args:
|
|
78
133
|
datasets: List of datasets being evaluated
|
|
79
134
|
hyperparam_count: Number of hyperparameter configurations per dataset
|
|
135
|
+
parallel: Whether running in parallel mode
|
|
136
|
+
total_eval_runs: Total number of EvalRunSpecs
|
|
80
137
|
|
|
81
138
|
Yields:
|
|
82
139
|
EvaluationProgressBars: Progress bar manager instance
|
|
83
140
|
"""
|
|
84
|
-
progress_bars = EvaluationProgressBars(datasets, hyperparam_count)
|
|
85
|
-
progress_bars.
|
|
141
|
+
progress_bars = EvaluationProgressBars(datasets, hyperparam_count, parallel, total_eval_runs)
|
|
142
|
+
progress_bars.start_progress_bars()
|
|
86
143
|
try:
|
|
87
144
|
yield progress_bars
|
|
88
145
|
finally:
|
|
89
|
-
progress_bars.
|
|
146
|
+
progress_bars.close_progress_bars()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: scorebook
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: A Python project for LLM evaluation.
|
|
5
5
|
Author: Euan Campbell
|
|
6
6
|
Author-email: euan@trismik.com
|
|
@@ -11,16 +11,26 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Provides-Extra: bedrock
|
|
14
15
|
Provides-Extra: examples
|
|
15
16
|
Provides-Extra: openai
|
|
16
17
|
Provides-Extra: portkey
|
|
18
|
+
Provides-Extra: vertex
|
|
17
19
|
Requires-Dist: accelerate ; extra == "examples"
|
|
20
|
+
Requires-Dist: boto3 (==1.40.0) ; extra == "bedrock"
|
|
18
21
|
Requires-Dist: datasets (>=3.6.0)
|
|
22
|
+
Requires-Dist: fsspec[gcs] ; extra == "vertex"
|
|
23
|
+
Requires-Dist: google-cloud-storage ; extra == "vertex"
|
|
24
|
+
Requires-Dist: google-genai ; extra == "vertex"
|
|
25
|
+
Requires-Dist: notebook (>=7.4.5,<8.0.0)
|
|
19
26
|
Requires-Dist: notebook ; extra == "examples"
|
|
20
27
|
Requires-Dist: openai ; extra == "openai"
|
|
28
|
+
Requires-Dist: pandas ; extra == "vertex"
|
|
21
29
|
Requires-Dist: portkey-ai ; extra == "portkey"
|
|
30
|
+
Requires-Dist: python-dotenv ; extra == "bedrock"
|
|
22
31
|
Requires-Dist: python-dotenv ; extra == "openai"
|
|
23
32
|
Requires-Dist: python-dotenv ; extra == "portkey"
|
|
33
|
+
Requires-Dist: python-dotenv ; extra == "vertex"
|
|
24
34
|
Requires-Dist: torch ; extra == "examples"
|
|
25
35
|
Requires-Dist: torchaudio ; extra == "examples"
|
|
26
36
|
Requires-Dist: torchvision ; extra == "examples"
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
scorebook/__init__.py,sha256=7ac3KpXU3kKFekq8mZ3cVbF7oQ6Q9E-uqX7ijyte1Q0,406
|
|
2
|
+
scorebook/evaluator.py,sha256=mS3G3PI26nHzqkYX4tqusQZJL5Q1xTxzqshAdwscl0s,14170
|
|
3
|
+
scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
|
|
4
|
+
scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
|
|
5
|
+
scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
|
|
6
|
+
scorebook/inference/openai.py,sha256=FqXua4v4PTYSHrdTm_9fM0Us8Mo2n2LSN94CwRipRw4,7658
|
|
7
|
+
scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
|
|
8
|
+
scorebook/inference/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
|
|
9
|
+
scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
|
|
10
|
+
scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
|
|
11
|
+
scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
|
|
12
|
+
scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
|
|
13
|
+
scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
|
|
14
|
+
scorebook/types/__init__.py,sha256=dXY3Y-GiMipVExzVu7H5pbdFfg4HBMEKxqSTfENywSs,427
|
|
15
|
+
scorebook/types/eval_dataset.py,sha256=dCqOHjGaEb7pGG1VF4aGFn6hngFvlxpxddqsDtM4nTs,13870
|
|
16
|
+
scorebook/types/eval_result.py,sha256=R2zuWrx8p9_4A2W3Gmlp-xGgmelPdg8QB5PoV1hiqRc,4728
|
|
17
|
+
scorebook/types/eval_run_spec.py,sha256=nf7LGa_dG60Qb385W6O6qiu7VlJ03-dpo2X1PgKGcRQ,845
|
|
18
|
+
scorebook/types/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
|
|
19
|
+
scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
|
|
20
|
+
scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
|
|
21
|
+
scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
|
|
22
|
+
scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
|
|
23
|
+
scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
|
|
24
|
+
scorebook/utils/logging_utils.py,sha256=M4BXt369mJo037WYpvuWDoe3oGWVdHWaGo4Vbl0WDL0,60
|
|
25
|
+
scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
|
|
26
|
+
scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
|
|
27
|
+
scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
|
|
28
|
+
scorebook-0.0.3.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
|
|
29
|
+
scorebook-0.0.3.dist-info/METADATA,sha256=i0tLm4SNSiPTNEP8QU0ZjsfOqizw4uu3GWPVqdxrcso,11409
|
|
30
|
+
scorebook-0.0.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
31
|
+
scorebook-0.0.3.dist-info/RECORD,,
|
scorebook-0.0.1.dist-info/RECORD
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
scorebook/__init__.py,sha256=cYv8bT3_7o2MTxPVKiv51DcpaPtH_A9qOH5yF_FULZo,336
|
|
2
|
-
scorebook/evaluator.py,sha256=Ce4KerLVPlaF63xng9RKH9M1l-ldo3mdrd3T2dBs_YE,8908
|
|
3
|
-
scorebook/inference/__init__.py,sha256=sU_ZSN9eO7ajZ-QklNpx8_gf3jCdDn69J-SfU0z07-E,333
|
|
4
|
-
scorebook/inference/openai.py,sha256=XD1dbPrEHQJVXOMtqCt9a0yQ-qR381N5XXhCrgz8jio,5826
|
|
5
|
-
scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
|
|
6
|
-
scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
|
|
7
|
-
scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
|
|
8
|
-
scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
|
|
9
|
-
scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
|
|
10
|
-
scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
|
|
11
|
-
scorebook/types/__init__.py,sha256=xQMOae_fIBbeyeuqoa7SbNwjxAiVinPBbckOcUzo57U,358
|
|
12
|
-
scorebook/types/eval_dataset.py,sha256=TeIeVHQ597NxedxaTEXohZO8gR5iAiDtJbCja_u69EI,11703
|
|
13
|
-
scorebook/types/eval_result.py,sha256=y0vLN6RMgiz1lyai5ltmzDibBHE25-k9bTrQ7U27RZ8,4552
|
|
14
|
-
scorebook/types/inference_pipeline.py,sha256=M3JgchpcVdhRJPzn3mh5ys6iivSt8eBmHIj4F5LcFYU,3167
|
|
15
|
-
scorebook/utils/__init__.py,sha256=DmhS61OZ2nNWkGxDfVrMBwwiH7dmLAbg3MHuNgaXhQg,382
|
|
16
|
-
scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
|
|
17
|
-
scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
|
|
18
|
-
scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
|
|
19
|
-
scorebook/utils/progress_bars.py,sha256=BlKqYlXDbik5eUn5nf5f7QnMvnTj8CU_CfXKxCWp3Ww,2909
|
|
20
|
-
scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
|
|
21
|
-
scorebook-0.0.1.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
|
|
22
|
-
scorebook-0.0.1.dist-info/METADATA,sha256=oiwYbuJkRVkoFZkIAQej09LdG5xBLxhKPy2ozWTV-_w,10976
|
|
23
|
-
scorebook-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
24
|
-
scorebook-0.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|