scorebook-0.0.8-py3-none-any.whl → scorebook-0.0.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -4
- scorebook/cli/auth.py +1 -1
- scorebook/evaluate/__init__.py +15 -0
- scorebook/evaluate/_async/__init__.py +0 -0
- scorebook/evaluate/_async/evaluate_async.py +413 -0
- scorebook/evaluate/_sync/__init__.py +0 -0
- scorebook/evaluate/_sync/evaluate.py +413 -0
- scorebook/evaluate/evaluate_helpers.py +365 -0
- scorebook/inference/__init__.py +4 -0
- scorebook/inference/clients/__init__.py +8 -0
- scorebook/inference/{openai.py → clients/openai.py} +35 -23
- scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
- scorebook/settings.py +18 -0
- scorebook/trismik/__init__.py +10 -0
- scorebook/utils/__init__.py +9 -2
- scorebook/utils/async_utils.py +20 -1
- scorebook/utils/progress_bars.py +22 -61
- {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/METADATA +3 -4
- scorebook-0.0.10.dist-info/RECORD +41 -0
- scorebook/evaluate.py +0 -623
- scorebook/trismik_services/__init__.py +0 -6
- scorebook/trismik_services/adaptive_testing_service.py +0 -141
- scorebook/trismik_services/upload_classic_eval_run.py +0 -102
- scorebook-0.0.8.dist-info/RECORD +0 -36
- /scorebook/inference/{bedrock.py → clients/bedrock.py} +0 -0
- /scorebook/inference/{portkey.py → clients/portkey.py} +0 -0
- /scorebook/inference/{vertex.py → clients/vertex.py} +0 -0
- /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
- {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/WHEEL +0 -0
- {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/licenses/LICENSE +0 -0
scorebook/utils/async_utils.py
CHANGED
@@ -1,7 +1,10 @@
 """Async utilities for handling callable objects and coroutines."""
 
 import asyncio
-from typing import Callable
+from contextlib import asynccontextmanager
+from typing import AsyncIterator, Callable, Optional, TypeVar
+
+T = TypeVar("T")
 
 
 def is_awaitable(obj: Callable) -> bool:
@@ -25,3 +28,19 @@ def is_awaitable(obj: Callable) -> bool:
         return True
 
     return False
+
+
+@asynccontextmanager
+async def async_nullcontext(value: Optional[T] = None) -> AsyncIterator[Optional[T]]:
+    """Async version of contextlib.nullcontext for Python 3.9 compatibility.
+
+    contextlib.nullcontext() is sync-only and cannot be used with async with on Python 3.9.
+    This provides an async equivalent that can be used with async context managers.
+
+    Args:
+        value: Optional value to yield from the context manager
+
+    Yields:
+        The provided value
+    """
+    yield value
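
Note: contextlib.nullcontext only became usable with "async with" in Python 3.10, and scorebook still supports 3.9, which is what the new async_nullcontext works around. A minimal usage sketch follows; the run_one/main functions and the semaphore-or-None pattern are illustrative, not part of the package.

    import asyncio
    from typing import Optional

    from scorebook.utils.async_utils import async_nullcontext


    async def run_one(item: str, semaphore: Optional[asyncio.Semaphore]) -> str:
        # Fall back to the null context when no concurrency limit is configured,
        # so the "async with" line below works uniformly on Python 3.9.
        ctx = semaphore if semaphore is not None else async_nullcontext()
        async with ctx:
            await asyncio.sleep(0)  # stand-in for real async work
            return item.upper()


    async def main() -> None:
        semaphore = asyncio.Semaphore(4)  # or None to run unbounded
        print(await asyncio.gather(*(run_one(x, semaphore) for x in "ab")))  # ['A', 'B']


    asyncio.run(main())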
scorebook/utils/progress_bars.py
CHANGED
@@ -1,7 +1,7 @@
 """Progress bar utilities for evaluation tracking."""
 
 from contextlib import contextmanager
-from typing import Any, Generator, List, Optional
+from typing import Generator, Optional
 
 from tqdm import tqdm
 
@@ -9,20 +9,16 @@ from tqdm import tqdm
 class EvaluationProgressBars:
     """Manages nested progress bars for evaluation tracking."""
 
-    def __init__(
-        self, datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
-    ) -> None:
+    def __init__(self, dataset_count: int, hyperparam_count: int, total_eval_runs: int) -> None:
         """Initialize progress bar manager.
 
         Args:
-            datasets: List of datasets being evaluated
+            dataset_count: Number of datasets being evaluated
             hyperparam_count: Number of hyperparameter configurations per dataset
-            parallel: Whether running in parallel mode
             total_eval_runs: Total number of EvalRunSpecs (dataset_count * hyperparam_count)
         """
-        self.datasets = datasets
+        self.dataset_count = dataset_count
         self.hyperparam_count = hyperparam_count
-        self.parallel = parallel
         self.total_eval_runs = total_eval_runs
 
         self.dataset_pbar: Optional[tqdm] = None
@@ -37,7 +33,7 @@ class EvaluationProgressBars:
         """Start both progress bars."""
         # Top level: Datasets
         self.dataset_pbar = tqdm(
-            total=len(self.datasets),
+            total=self.dataset_count,
             desc="Datasets ",
             unit="dataset",
             position=0,
@@ -46,35 +42,19 @@ class EvaluationProgressBars:
             bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
         )
 
-        # Bottom level:
-        if self.parallel:
-            # In parallel mode: show eval runs
-            self.hyperparam_pbar = tqdm(
-                total=self.total_eval_runs,
-                desc="Eval Runs ",
-                unit="run",
-                position=1,
-                leave=False,
-                ncols=80,
-                bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
-            )
-        else:
-            # In sequential mode: show hyperparams per dataset
-            self.hyperparam_pbar = tqdm(
-                total=self.hyperparam_count,
-                desc="Hyperparams",
-                unit="config",
-                position=1,
-                leave=False,
-                ncols=80,
-                bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
-            )
+        # Bottom level: Eval runs
+        self.hyperparam_pbar = tqdm(
+            total=self.total_eval_runs,
+            desc="Eval Runs ",
+            unit="run",
+            position=1,
+            leave=False,
+            ncols=80,
+            bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+        )
 
     def on_eval_run_completed(self, dataset_idx: int) -> None:
-        """Update progress when an eval run (EvalRunSpec) completes in parallel mode."""
-        if not self.parallel:
-            return
-
+        """Update progress when an eval run (EvalRunSpec) completes."""
         self.completed_eval_runs += 1
         if self.hyperparam_pbar:
             self.hyperparam_pbar.update(1)
@@ -84,24 +64,6 @@ class EvaluationProgressBars:
             self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
         )
 
-        # Check if this dataset is complete
-        if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
-            if self.dataset_pbar:
-                self.dataset_pbar.update(1)
-
-    def on_hyperparam_completed(self, dataset_idx: int) -> None:
-        """Update progress when a hyperparameter config completes in sequential mode."""
-        if self.parallel:
-            return
-
-        if self.hyperparam_pbar:
-            self.hyperparam_pbar.update(1)
-
-        # Track completed hyperparams for this dataset
-        self.completed_hyperparams_per_dataset[dataset_idx] = (
-            self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
-        )
-
         # Check if this dataset is complete
         if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
             # Update dataset progress
@@ -109,7 +71,7 @@
                 self.dataset_pbar.update(1)
 
             # Reset hyperparameter progress for next dataset (if any)
-            if dataset_idx < len(self.datasets) - 1:
+            if dataset_idx < self.dataset_count - 1:
                 if self.hyperparam_pbar:
                     self.hyperparam_pbar.reset()
 
@@ -125,20 +87,19 @@
 
 @contextmanager
 def evaluation_progress(
-    datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
+    dataset_count: int, hyperparameter_config_count: int, run_count: int
 ) -> Generator[EvaluationProgressBars, None, None]:
     """Context manager for evaluation progress bars.
 
     Args:
-        datasets: List of datasets being evaluated
-        hyperparam_count: Number of hyperparameter configurations per dataset
-        parallel: Whether running in parallel mode
-        total_eval_runs: Total number of EvalRunSpecs
+        dataset_count: Number of datasets being evaluated
+        hyperparameter_config_count: Number of hyperparameter configurations per dataset
+        run_count: Total number of EvalRunSpecs
 
     Yields:
         EvaluationProgressBars: Progress bar manager instance
     """
-    progress_bars = EvaluationProgressBars(datasets, hyperparam_count, parallel, total_eval_runs)
+    progress_bars = EvaluationProgressBars(dataset_count, hyperparameter_config_count, run_count)
     progress_bars.start_progress_bars()
     try:
         yield progress_bars
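
Note: with the parallel/sequential split removed, the bottom bar always counts eval runs and callers pass plain counts instead of dataset lists. A minimal sketch of driving the new API; the dataset names, configs, and the sleep stand-in are illustrative, not from the package.

    import time

    from scorebook.utils.progress_bars import evaluation_progress

    datasets = ["dataset_a", "dataset_b"]               # illustrative names
    configs = [{"temperature": t} for t in (0.0, 0.7)]  # illustrative configs
    total_runs = len(datasets) * len(configs)

    with evaluation_progress(len(datasets), len(configs), total_runs) as bars:
        for dataset_idx, _dataset in enumerate(datasets):
            for _config in configs:
                time.sleep(0.1)  # stand-in for one eval run
                # One callback per completed EvalRunSpec; the dataset bar
                # advances once all of a dataset's runs have finished.
                bars.on_eval_run_completed(dataset_idx)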
{scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/METADATA
CHANGED
@@ -1,18 +1,17 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.8
+Version: 0.0.10
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell
 Author-email: euan@trismik.com
-Requires-Python: >=3.9
+Requires-Python: >=3.9, <3.14
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Classifier: Programming Language :: Python :: 3.14
 Provides-Extra: bedrock
 Provides-Extra: examples
 Provides-Extra: openai
@@ -37,7 +36,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (>=0.
+Requires-Dist: trismik (>=1.0.1,<2.0.0)
 Description-Content-Type: text/markdown
 
 # Scorebook
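
Note: 0.0.10 caps both the interpreter (>=3.9, <3.14, with the 3.14 classifier dropped) and the trismik dependency (>=1.0.1,<2.0.0). Both values can be read back at runtime via the standard library; the snippet below assumes scorebook 0.0.10 is installed.

    from importlib.metadata import metadata, version

    print(version("scorebook"))                      # 0.0.10
    print(metadata("scorebook")["Requires-Python"])  # >=3.9, <3.14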
scorebook-0.0.10.dist-info/RECORD
ADDED
@@ -0,0 +1,41 @@
+scorebook/__init__.py,sha256=tAe8v8xyiNcl7P4SUIM5dPVMqU8GQ8dKzJ1pfF6B-Ms,629
+scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
+scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
+scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
+scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_async/evaluate_async.py,sha256=vn8rjjveCCF6ItZWngqAP3RhfScHV_LlIomqh-z5-UU,15509
+scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scorebook/evaluate/_sync/evaluate.py,sha256=4LVdXvCsPmSbkBxphJ9in5l17GL9Zqn66bZm9a8w9nc,15347
+scorebook/evaluate/evaluate_helpers.py,sha256=rAXUroMXfPkWqufMnA97bfscgPik38s3eeepe2RkchA,13026
+scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
+scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
+scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
+scorebook/inference/clients/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
+scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
+scorebook/inference/clients/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
+scorebook/inference/clients/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
+scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
+scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
+scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
+scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
+scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
+scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+scorebook/settings.py,sha256=CgaumN98QpU7XKMugUG41UAO8oZVuWDco4uooSagFZY,596
+scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
+scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
+scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
+scorebook/utils/__init__.py,sha256=3xdIXJzYEp9k23z4_49VWZtasoZN8tJxVPieE_HOuww,519
+scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
+scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
+scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
+scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
+scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+scorebook/utils/progress_bars.py,sha256=uLG_0s_QEHGgjZcVaDJ7wp14Rd3GY5dWu-F4FL8isJg,3783
+scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+scorebook-0.0.10.dist-info/METADATA,sha256=wJXBm9ZzeNYIrhUOz4Uc4D_5_1J8arUnMiOtR5BNeOA,11479
+scorebook-0.0.10.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+scorebook-0.0.10.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+scorebook-0.0.10.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.10.dist-info/RECORD,,