scorebook 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. scorebook/__init__.py +12 -4
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/evaluate/__init__.py +15 -0
  4. scorebook/evaluate/_async/__init__.py +0 -0
  5. scorebook/evaluate/_async/evaluate_async.py +413 -0
  6. scorebook/evaluate/_sync/__init__.py +0 -0
  7. scorebook/evaluate/_sync/evaluate.py +413 -0
  8. scorebook/evaluate/evaluate_helpers.py +365 -0
  9. scorebook/inference/__init__.py +4 -0
  10. scorebook/inference/clients/__init__.py +8 -0
  11. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  12. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  13. scorebook/settings.py +18 -0
  14. scorebook/trismik/__init__.py +10 -0
  15. scorebook/utils/__init__.py +9 -2
  16. scorebook/utils/async_utils.py +20 -1
  17. scorebook/utils/progress_bars.py +22 -61
  18. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/METADATA +3 -4
  19. scorebook-0.0.10.dist-info/RECORD +41 -0
  20. scorebook/evaluate.py +0 -623
  21. scorebook/trismik_services/__init__.py +0 -6
  22. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  23. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  24. scorebook-0.0.8.dist-info/RECORD +0 -36
  25. /scorebook/inference/{bedrock.py → clients/bedrock.py} +0 -0
  26. /scorebook/inference/{portkey.py → clients/portkey.py} +0 -0
  27. /scorebook/inference/{vertex.py → clients/vertex.py} +0 -0
  28. /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  29. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/WHEEL +0 -0
  30. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/entry_points.txt +0 -0
  31. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,10 @@
1
1
  """Async utilities for handling callable objects and coroutines."""
2
2
 
3
3
  import asyncio
4
- from typing import Callable
4
+ from contextlib import asynccontextmanager
5
+ from typing import AsyncIterator, Callable, Optional, TypeVar
6
+
7
+ T = TypeVar("T")
5
8
 
6
9
 
7
10
  def is_awaitable(obj: Callable) -> bool:
@@ -25,3 +28,19 @@ def is_awaitable(obj: Callable) -> bool:
25
28
  return True
26
29
 
27
30
  return False
31
+
32
+
33
+ @asynccontextmanager
34
+ async def async_nullcontext(value: Optional[T] = None) -> AsyncIterator[Optional[T]]:
35
+ """Async version of contextlib.nullcontext for Python 3.9 compatibility.
36
+
37
+ contextlib.nullcontext() is sync-only and cannot be used with async with on Python 3.9.
38
+ This provides an async equivalent that can be used with async context managers.
39
+
40
+ Args:
41
+ value: Optional value to yield from the context manager
42
+
43
+ Yields:
44
+ The provided value
45
+ """
46
+ yield value
@@ -1,7 +1,7 @@
1
1
  """Progress bar utilities for evaluation tracking."""
2
2
 
3
3
  from contextlib import contextmanager
4
- from typing import Any, Generator, List, Optional
4
+ from typing import Generator, Optional
5
5
 
6
6
  from tqdm import tqdm
7
7
 
@@ -9,20 +9,16 @@ from tqdm import tqdm
9
9
  class EvaluationProgressBars:
10
10
  """Manages nested progress bars for evaluation tracking."""
11
11
 
12
- def __init__(
13
- self, datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
14
- ) -> None:
12
+ def __init__(self, dataset_count: int, hyperparam_count: int, total_eval_runs: int) -> None:
15
13
  """Initialize progress bar manager.
16
14
 
17
15
  Args:
18
- datasets: List of datasets being evaluated
16
+ dataset_count: Number of datasets being evaluated
19
17
  hyperparam_count: Number of hyperparameter configurations per dataset
20
- parallel: Whether running in parallel mode
21
18
  total_eval_runs: Total number of EvalRunSpecs (dataset_count * hyperparam_count)
22
19
  """
23
- self.datasets = datasets
20
+ self.dataset_count = dataset_count
24
21
  self.hyperparam_count = hyperparam_count
25
- self.parallel = parallel
26
22
  self.total_eval_runs = total_eval_runs
27
23
 
28
24
  self.dataset_pbar: Optional[tqdm] = None
@@ -37,7 +33,7 @@ class EvaluationProgressBars:
37
33
  """Start both progress bars."""
38
34
  # Top level: Datasets
39
35
  self.dataset_pbar = tqdm(
40
- total=len(self.datasets),
36
+ total=self.dataset_count,
41
37
  desc="Datasets ",
42
38
  unit="dataset",
43
39
  position=0,
@@ -46,35 +42,19 @@ class EvaluationProgressBars:
46
42
  bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
47
43
  )
48
44
 
49
- # Bottom level: Hyperparameters/Eval runs
50
- if self.parallel:
51
- # In parallel mode: show eval runs completed out of total
52
- self.hyperparam_pbar = tqdm(
53
- total=self.total_eval_runs,
54
- desc="Eval Runs ",
55
- unit="run",
56
- position=1,
57
- leave=False,
58
- ncols=80,
59
- bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
60
- )
61
- else:
62
- # In sequential mode: show hyperparams per dataset
63
- self.hyperparam_pbar = tqdm(
64
- total=self.hyperparam_count,
65
- desc="Hyperparams",
66
- unit="config",
67
- position=1,
68
- leave=False,
69
- ncols=80,
70
- bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
71
- )
45
+ # Bottom level: Eval runs
46
+ self.hyperparam_pbar = tqdm(
47
+ total=self.total_eval_runs,
48
+ desc="Eval Runs ",
49
+ unit="run",
50
+ position=1,
51
+ leave=False,
52
+ ncols=80,
53
+ bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
54
+ )
72
55
 
73
56
  def on_eval_run_completed(self, dataset_idx: int) -> None:
74
- """Update progress when an eval run (EvalRunSpec) completes in parallel mode."""
75
- if not self.parallel:
76
- return
77
-
57
+ """Update progress when an eval run (EvalRunSpec) completes."""
78
58
  self.completed_eval_runs += 1
79
59
  if self.hyperparam_pbar:
80
60
  self.hyperparam_pbar.update(1)
@@ -84,24 +64,6 @@ class EvaluationProgressBars:
84
64
  self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
85
65
  )
86
66
 
87
- # Check if this dataset is complete
88
- if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
89
- if self.dataset_pbar:
90
- self.dataset_pbar.update(1)
91
-
92
- def on_hyperparam_completed(self, dataset_idx: int) -> None:
93
- """Update progress when a hyperparameter config completes in sequential mode."""
94
- if self.parallel:
95
- return
96
-
97
- if self.hyperparam_pbar:
98
- self.hyperparam_pbar.update(1)
99
-
100
- # Track completed hyperparams for this dataset
101
- self.completed_hyperparams_per_dataset[dataset_idx] = (
102
- self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
103
- )
104
-
105
67
  # Check if this dataset is complete
106
68
  if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
107
69
  # Update dataset progress
@@ -109,7 +71,7 @@ class EvaluationProgressBars:
109
71
  self.dataset_pbar.update(1)
110
72
 
111
73
  # Reset hyperparameter progress for next dataset (if any)
112
- if dataset_idx < len(self.datasets) - 1:
74
+ if dataset_idx < self.dataset_count - 1:
113
75
  if self.hyperparam_pbar:
114
76
  self.hyperparam_pbar.reset()
115
77
 
@@ -125,20 +87,19 @@ class EvaluationProgressBars:
125
87
 
126
88
  @contextmanager
127
89
  def evaluation_progress(
128
- datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
90
+ dataset_count: int, hyperparameter_config_count: int, run_count: int
129
91
  ) -> Generator[EvaluationProgressBars, None, None]:
130
92
  """Context manager for evaluation progress bars.
131
93
 
132
94
  Args:
133
- datasets: List of datasets being evaluated
134
- hyperparam_count: Number of hyperparameter configurations per dataset
135
- parallel: Whether running in parallel mode
136
- total_eval_runs: Total number of EvalRunSpecs
95
+ dataset_count: Number of datasets being evaluated
96
+ hyperparameter_config_count: Number of hyperparameter configurations per dataset
97
+ run_count: Total number of EvalRunSpecs
137
98
 
138
99
  Yields:
139
100
  EvaluationProgressBars: Progress bar manager instance
140
101
  """
141
- progress_bars = EvaluationProgressBars(datasets, hyperparam_count, parallel, total_eval_runs)
102
+ progress_bars = EvaluationProgressBars(dataset_count, hyperparameter_config_count, run_count)
142
103
  progress_bars.start_progress_bars()
143
104
  try:
144
105
  yield progress_bars
@@ -1,18 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scorebook
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: A Python project for LLM evaluation.
5
5
  License-File: LICENSE
6
6
  Author: Euan Campbell
7
7
  Author-email: euan@trismik.com
8
- Requires-Python: >=3.9
8
+ Requires-Python: >=3.9, <3.14
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.9
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
14
  Classifier: Programming Language :: Python :: 3.13
15
- Classifier: Programming Language :: Python :: 3.14
16
15
  Provides-Extra: bedrock
17
16
  Provides-Extra: examples
18
17
  Provides-Extra: openai
@@ -37,7 +36,7 @@ Requires-Dist: torch ; extra == "examples"
37
36
  Requires-Dist: torchaudio ; extra == "examples"
38
37
  Requires-Dist: torchvision ; extra == "examples"
39
38
  Requires-Dist: transformers ; extra == "examples"
40
- Requires-Dist: trismik (>=0.9.12)
39
+ Requires-Dist: trismik (>=1.0.1,<2.0.0)
41
40
  Description-Content-Type: text/markdown
42
41
 
43
42
  # Scorebook
@@ -0,0 +1,41 @@
1
+ scorebook/__init__.py,sha256=tAe8v8xyiNcl7P4SUIM5dPVMqU8GQ8dKzJ1pfF6B-Ms,629
2
+ scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
3
+ scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
4
+ scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
5
+ scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
6
+ scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
7
+ scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ scorebook/evaluate/_async/evaluate_async.py,sha256=vn8rjjveCCF6ItZWngqAP3RhfScHV_LlIomqh-z5-UU,15509
9
+ scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ scorebook/evaluate/_sync/evaluate.py,sha256=4LVdXvCsPmSbkBxphJ9in5l17GL9Zqn66bZm9a8w9nc,15347
11
+ scorebook/evaluate/evaluate_helpers.py,sha256=rAXUroMXfPkWqufMnA97bfscgPik38s3eeepe2RkchA,13026
12
+ scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
13
+ scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
14
+ scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
15
+ scorebook/inference/clients/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
16
+ scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
17
+ scorebook/inference/clients/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
18
+ scorebook/inference/clients/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
19
+ scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
20
+ scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
21
+ scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
22
+ scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
23
+ scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
24
+ scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
25
+ scorebook/settings.py,sha256=CgaumN98QpU7XKMugUG41UAO8oZVuWDco4uooSagFZY,596
26
+ scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
27
+ scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
28
+ scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
29
+ scorebook/utils/__init__.py,sha256=3xdIXJzYEp9k23z4_49VWZtasoZN8tJxVPieE_HOuww,519
30
+ scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
31
+ scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
32
+ scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
33
+ scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
34
+ scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
35
+ scorebook/utils/progress_bars.py,sha256=uLG_0s_QEHGgjZcVaDJ7wp14Rd3GY5dWu-F4FL8isJg,3783
36
+ scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
37
+ scorebook-0.0.10.dist-info/METADATA,sha256=wJXBm9ZzeNYIrhUOz4Uc4D_5_1J8arUnMiOtR5BNeOA,11479
38
+ scorebook-0.0.10.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
39
+ scorebook-0.0.10.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
40
+ scorebook-0.0.10.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
41
+ scorebook-0.0.10.dist-info/RECORD,,