scorebook 0.0.1.tar.gz → 0.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {scorebook-0.0.1 → scorebook-0.0.3}/PKG-INFO +11 -1
  2. {scorebook-0.0.1 → scorebook-0.0.3}/pyproject.toml +5 -1
  3. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/__init__.py +2 -1
  4. scorebook-0.0.3/src/scorebook/evaluator.py +379 -0
  5. scorebook-0.0.3/src/scorebook/exceptions.py +54 -0
  6. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/inference/__init__.py +0 -4
  7. scorebook-0.0.3/src/scorebook/inference/bedrock.py +305 -0
  8. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/inference/openai.py +75 -37
  9. scorebook-0.0.3/src/scorebook/inference/vertex.py +295 -0
  10. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/types/__init__.py +2 -1
  11. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/types/eval_dataset.py +56 -0
  12. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/types/eval_result.py +7 -3
  13. scorebook-0.0.3/src/scorebook/types/eval_run_spec.py +28 -0
  14. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/types/inference_pipeline.py +5 -2
  15. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/utils/__init__.py +2 -1
  16. scorebook-0.0.3/src/scorebook/utils/build_prompt.py +52 -0
  17. scorebook-0.0.3/src/scorebook/utils/jinja_helpers.py +146 -0
  18. scorebook-0.0.3/src/scorebook/utils/logging_utils.py +1 -0
  19. scorebook-0.0.3/src/scorebook/utils/progress_bars.py +146 -0
  20. scorebook-0.0.1/src/scorebook/evaluator.py +0 -228
  21. scorebook-0.0.1/src/scorebook/utils/progress_bars.py +0 -89
  22. {scorebook-0.0.1 → scorebook-0.0.3}/LICENSE +0 -0
  23. {scorebook-0.0.1 → scorebook-0.0.3}/README.md +0 -0
  24. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/inference/portkey.py +0 -0
  25. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/metrics/__init__.py +0 -0
  26. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/metrics/accuracy.py +0 -0
  27. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/metrics/metric_base.py +0 -0
  28. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/metrics/metric_registry.py +0 -0
  29. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/metrics/precision.py +0 -0
  30. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/utils/async_utils.py +0 -0
  31. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/utils/io_helpers.py +0 -0
  32. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/utils/mappers.py +0 -0
  33. {scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.1 → scorebook-0.0.3}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: scorebook
- Version: 0.0.1
+ Version: 0.0.3
  Summary: A Python project for LLM evaluation.
  Author: Euan Campbell
  Author-email: euan@trismik.com
@@ -11,16 +11,26 @@ Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Provides-Extra: bedrock
  Provides-Extra: examples
  Provides-Extra: openai
  Provides-Extra: portkey
+ Provides-Extra: vertex
  Requires-Dist: accelerate ; extra == "examples"
+ Requires-Dist: boto3 (==1.40.0) ; extra == "bedrock"
  Requires-Dist: datasets (>=3.6.0)
+ Requires-Dist: fsspec[gcs] ; extra == "vertex"
+ Requires-Dist: google-cloud-storage ; extra == "vertex"
+ Requires-Dist: google-genai ; extra == "vertex"
+ Requires-Dist: notebook (>=7.4.5,<8.0.0)
  Requires-Dist: notebook ; extra == "examples"
  Requires-Dist: openai ; extra == "openai"
+ Requires-Dist: pandas ; extra == "vertex"
  Requires-Dist: portkey-ai ; extra == "portkey"
+ Requires-Dist: python-dotenv ; extra == "bedrock"
  Requires-Dist: python-dotenv ; extra == "openai"
  Requires-Dist: python-dotenv ; extra == "portkey"
+ Requires-Dist: python-dotenv ; extra == "vertex"
  Requires-Dist: torch ; extra == "examples"
  Requires-Dist: torchaudio ; extra == "examples"
  Requires-Dist: torchvision ; extra == "examples"
{scorebook-0.0.1 → scorebook-0.0.3}/pyproject.toml
@@ -10,10 +10,11 @@ readme = "README.md"
  requires-python = ">=3.9"
  dependencies = [
      "datasets>=3.6.0",
+     "notebook (>=7.4.5,<8.0.0)",
  ]

  [tool.poetry]
- version = "0.0.1" # base version
+ version = "0.0.3" # base version
  packages = [{ include = "scorebook", from = "src" }]


@@ -28,10 +29,13 @@ flake8 = "^7.0.0"
  mypy = "^1.15.0"
  autoflake = "^2.3.1"
  toml = "^0.10.2"
+ types-pyyaml = "^6.0.12.20250822"

  [project.optional-dependencies]
  openai = ["openai", "python-dotenv"]
  portkey = ["portkey-ai", "python-dotenv"]
+ bedrock = ["boto3==1.40.0", "python-dotenv"]
+ vertex = ["google-genai", "pandas", "google-cloud-storage", "fsspec[gcs]", "python-dotenv"]
  examples = ["transformers", "torch", "torchvision", "torchaudio", "accelerate", "notebook"]

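The `bedrock` and `vertex` groups above pair with the new `scorebook/inference/bedrock.py` and `scorebook/inference/vertex.py` modules listed in the file summary. As a minimal sketch of how a caller might guard those optional backends; it assumes (not shown in this diff) that each module imports its provider SDK at import time and therefore raises `ImportError` when the matching extra is not installed:

```python
# Hypothetical guard for the optional provider backends.
try:
    from scorebook.inference import bedrock  # needs the `bedrock` extra (boto3, python-dotenv)
except ImportError:
    bedrock = None  # bedrock extra not installed

try:
    from scorebook.inference import vertex  # needs the `vertex` extra (google-genai, pandas, ...)
except ImportError:
    vertex = None  # vertex extra not installed
```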
{scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/__init__.py
@@ -11,5 +11,6 @@ __version__ = importlib.metadata.version(__package__ or __name__)

  from scorebook.evaluator import evaluate
  from scorebook.types.eval_dataset import EvalDataset
+ from scorebook.utils.build_prompt import build_prompt

- __all__ = ["EvalDataset", "evaluate"]
+ __all__ = ["EvalDataset", "evaluate", "build_prompt"]
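With `build_prompt` now re-exported from the package root, the 0.0.3 public surface can be imported in one line. A minimal sketch of the new import path (the signature of `build_prompt` is not shown in this diff, so no arguments are assumed):

```python
# New in 0.0.3: build_prompt is available at the top level alongside the
# existing EvalDataset and evaluate exports.
from scorebook import EvalDataset, build_prompt, evaluate
```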
scorebook-0.0.3/src/scorebook/evaluator.py
@@ -0,0 +1,379 @@
+ """
+ Model evaluation functionality for the Scorebook framework.
+
+ This module provides the core evaluation logic to assess model predictions
+ against ground truth labels using configurable metrics. It supports:
+
+ - Batch evaluation of models across multiple datasets
+ - Flexible metric computation and aggregation
+ - Optional parameter sweeping and experiment tracking
+ - Customizable inference functions
+
+ The main entry point is the `evaluate()` function which handles running
+ models on datasets and computing metric scores.
+ """
+
+ import asyncio
+ import logging
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+ from scorebook.exceptions import (
+     DataMismatchError,
+     MetricComputationError,
+     ParallelExecutionError,
+     ParameterValidationError,
+ )
+ from scorebook.types import EvalDataset, EvalResult, EvalRunSpec
+ from scorebook.utils import evaluation_progress, expand_dict, is_awaitable
+
+ logger = logging.getLogger(__name__)
+
+
+ def evaluate(
+     inference_callable: Callable,
+     eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     parallel: bool = False,
+     return_dict: bool = True,
+     return_aggregates: bool = True,
+     return_items: bool = False,
+     return_output: bool = False,
+     sample_size: Optional[int] = None,
+ ) -> Union[Dict, List]:
+     """
+     Evaluate model predictions using specified metrics on given datasets.
+
+     This function runs the provided inference callable on one or more evaluation datasets,
+     computes metric scores, and returns the evaluation results. It supports batch processing,
+     parameter sweeping, and different result formatting options.
+
+     Args:
+         inference_callable: A callable function or object that takes (items, hyperparameters)
+             and returns predictions. Can be a regular function, async function,
+             or callable instance (like a class with __call__ method).
+         eval_datasets: One or more evaluation datasets to run evaluation on. Can be:
+             - A single EvalDataset instance
+             - A list of EvalDataset instances
+             - A string identifier (for future dataset registry support)
+             - A list of string identifiers
+         hyperparameters: Optional dictionary containing hyperparameter sweep configuration.
+         experiment_id: Optional string identifier for tracking multiple evaluation runs.
+         return_dict: If True, returns eval results as a dict
+         return_aggregates: If True, returns aggregate scores for each dataset
+         return_items: If True, returns individual items for each dataset
+         return_output: If True, returns model outputs for each dataset item evaluated
+         sample_size: If set, only return a sample of the dataset items (for debugging)
+         parallel: If True, run inference functions in parallel (requires all functions to be async)
+
+     Returns:
+         Dictionary mapping dataset names to their evaluation results. For each dataset,
+         returns a dictionary containing:
+             - items: List of EvalResult objects with predictions and ground truth
+             - metrics: Dictionary mapping metric names to their computed scores
+
+     Example:
+
+         python
+         dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])
+         def inference_fn(items):
+             # Model inference logic here - process all items at once
+             return [prediction for item in items]
+
+         results = evaluate(inference_fn, dataset, item_limit=100)
+     """
+
+     logger.info(
+         "Starting evaluation: experiment_id=%s, project_id=%s, parallel=%s",
+         experiment_id,
+         project_id,
+         parallel,
+     )
+
+     return asyncio.run(
+         _evaluate_async(
+             inference_callable=inference_callable,
+             eval_datasets=eval_datasets,
+             hyperparameters=hyperparameters,
+             experiment_id=experiment_id,
+             project_id=project_id,
+             parallel=parallel,
+             return_dict=return_dict,
+             return_aggregates=return_aggregates,
+             return_items=return_items,
+             return_output=return_output,
+             sample_size=sample_size,
+         )
+     )
+
+
+ async def _evaluate_async(
+     inference_callable: Callable,
+     eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     return_dict: bool = True,
+     return_aggregates: bool = True,
+     return_items: bool = False,
+     return_output: bool = False,
+     parallel: bool = False,
+     sample_size: Optional[int] = None,
+ ) -> Union[Dict, List]:
+     _validate_parameters(locals())
+     datasets, adaptive_datasets = _prepare_datasets(eval_datasets, sample_size)
+     hyperparameters = _prepare_hyperparameters(hyperparameters)
+
+     logger.info(
+         "Prepared %d datasets and %d hyperparameter configurations",
+         len(datasets),
+         len(hyperparameters),
+     )
+
+     runs = _build_runs(datasets, hyperparameters)
+     runs.sort(key=lambda run: (run.dataset_idx, run.hp_idx))
+
+     logger.info("Created %d evaluation runs", len(runs))
+
+     with evaluation_progress(datasets, len(hyperparameters), parallel, len(runs)) as progress_bars:
+         if parallel:
+             eval_results = await _run_parallel(inference_callable, runs, progress_bars)
+         else:
+             eval_results = await _run_sequential(inference_callable, runs, progress_bars)
+
+     logger.info("Evaluation completed successfully")
+
+     return _format_results(
+         eval_results, return_dict, return_aggregates, return_items, return_output
+     )
+
+
+ # ===== ORCHESTRATION PATHS =====
+
+
+ async def _run_parallel(
+     inference_callable: Callable,
+     runs: List[EvalRunSpec],
+     progress_bars: Any,
+ ) -> List[EvalResult]:
+     logger.debug("Running inference in parallel")
+
+     async def worker(run: EvalRunSpec) -> Tuple[EvalRunSpec, EvalResult]:
+         er = await _execute_run(inference_callable, run)
+         progress_bars.on_eval_run_completed(run.dataset_idx)
+         return run, er
+
+     pairs = await asyncio.gather(*[worker(r) for r in runs])
+     # Return in canonical (dataset_idx, hp_idx) order for stability
+     pairs.sort(key=lambda p: (p[0].dataset_idx, p[0].hp_idx))
+     return [er for _, er in pairs]
+
+
+ async def _run_sequential(
+     inference_callable: Callable,
+     runs: List[EvalRunSpec],
+     progress_bars: Any,
+ ) -> List[EvalResult]:
+     logger.debug("Running inference sequentially")
+     results: List[EvalResult] = []
+     for run in runs:
+         er = await _execute_run(inference_callable, run)
+         results.append(er)
+         progress_bars.on_hyperparam_completed(run.dataset_idx)
+     return results
+
+
+ # ===== EVALUATION EXECUTIONS =====
+
+
+ async def _execute_run(inference_callable: Callable, run: EvalRunSpec) -> EvalResult:
+     logger.debug("Executing run for %s", run)
+
+     outputs = await _run_inference_callable(inference_callable, run.items, run.hyperparams)
+     logger.debug("Inference completed for run %s", run)
+
+     metric_scores = _score_metrics(run.eval_dataset, outputs, run.labels)
+     logger.debug("Metrics computed for run %s. - scores: %s", run, list(metric_scores.keys()))
+
+     return EvalResult(run.eval_dataset, outputs, metric_scores, run.hyperparams)
+
+
+ # ===== HELPER FUNCTIONS =====
+
+
+ def _validate_parameters(params: Dict[str, Any]) -> None:
+     """Validate all parameters for evaluation."""
+
+     if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
+         raise ParameterValidationError(
+             "When return_dict=True, at least one of return_aggregates or return_items must be True"
+         )
+
+     if params["parallel"] and not is_awaitable(params["inference_callable"]):
+         raise ParallelExecutionError(
+             "parallel=True requires the inference_callable to be async. "
+             "Please make your inference function async or set parallel=False."
+         )
+
+
+ def _prepare_datasets(
+     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     sample_size: Optional[int] = None,
+ ) -> Tuple[List[EvalDataset], List[str]]:
+     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
+
+     # Ensure datasets is always a list for consistent processing
+     if not isinstance(datasets, list):
+         datasets = [datasets]
+
+     # Extract classical datasets TODO: handle other types (string registry)
+     classic_eval_datasets = [dataset for dataset in datasets if isinstance(dataset, EvalDataset)]
+
+     # Reduce datasets to a random sample
+     if sample_size:
+         logger.info("Sampling datasets to %d items each", sample_size)
+         for dataset in classic_eval_datasets:
+             dataset.shuffle()
+             if len(dataset) > sample_size:
+                 original_size = len(dataset)
+                 dataset._hf_dataset = dataset._hf_dataset.select(range(sample_size))
+                 logger.debug(
+                     "Sampled dataset '%s' from %d to %d items",
+                     dataset.name,
+                     original_size,
+                     sample_size,
+                 )
+
+     # Extract adaptive dataset strings
+     adaptive_eval_datasets = [
+         dataset.replace(":adaptive", "")
+         for dataset in datasets
+         if isinstance(dataset, str) and dataset.endswith(":adaptive")
+     ]
+
+     logger.info("Evaluating on classic datasets: %s", [ds.name for ds in classic_eval_datasets])
+     logger.info("Evaluating on adaptive datasets: %s", adaptive_eval_datasets)
+
+     return classic_eval_datasets, adaptive_eval_datasets
+
+
+ def _prepare_hyperparameters(
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
+ ) -> List[Dict[str, Any]]:
+     """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
+     if hyperparameters is None:
+         return [{}]
+     if not isinstance(hyperparameters, list):
+         expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
+         return expanded
+
+     logger.info("Evaluating with hyperparameters: %s", hyperparameters)
+
+     return hyperparameters
+
+
+ async def _run_inference_callable(
+     inference_callable: Callable,
+     items: List[Dict[str, Any]],
+     hyperparams: Dict[str, Any],
+ ) -> Any:
+     if is_awaitable(inference_callable):
+         return await inference_callable(items, **hyperparams)
+     else:
+         return inference_callable(items, **hyperparams)
+
+
+ def _build_runs(
+     datasets: List[EvalDataset],
+     hyperparameters: List[Dict[str, Any]],
+ ) -> List[EvalRunSpec]:
+     """Build RunSpec objects for each dataset/hyperparameter combination."""
+     runs: List[EvalRunSpec] = []
+     for d_idx, ds in enumerate(datasets):
+         items = ds.items
+         labels = [item.get(ds.label) for item in items]
+         for hp_idx, hp in enumerate(hyperparameters):
+             run_spec = EvalRunSpec(d_idx, ds, items, labels, hp, hp_idx)
+             logger.debug("Built RunSpec: %s", run_spec)
+             runs.append(run_spec)
+     return runs
+
+
+ def _score_metrics(
+     eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
+ ) -> Dict[str, Dict[str, Any]]:
+     """Compute metric scores for a given dataset and inference outputs."""
+     metric_scores: Dict[str, Dict[str, Any]] = {}
+
+     if len(outputs) != len(labels):
+         raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
+
+     for metric in eval_dataset.metrics:
+         try:
+             aggregate_scores, item_scores = metric.score(outputs, labels)
+             metric_scores[metric.name] = {
+                 "aggregate_scores": aggregate_scores,
+                 "item_scores": item_scores,
+             }
+         except Exception as e:
+             logger.error(
+                 "Failed to compute metric '%s' for dataset '%s': %s",
+                 metric.name,
+                 eval_dataset.name,
+                 str(e),
+             )
+             raise MetricComputationError(metric.name, eval_dataset.name, e)
+
+     return metric_scores
+
+
+ def _format_results(
+     eval_results: List[EvalResult],
+     return_dict: bool,
+     return_aggregates: bool,
+     return_items: bool,
+     return_output: bool,
+ ) -> Union[Dict, List]:
+
+     # Return results as a dict
+     if return_dict:
+
+         # Include both aggregate and item scores in dict returned
+         if return_aggregates and return_items:
+             results: Dict[str, List[Dict[str, Any]]] = {"aggregate_results": [], "item_results": []}
+             for eval_result in eval_results:
+                 eval_result_dict = eval_result.to_dict()
+                 results["aggregate_results"].extend(eval_result_dict["aggregate_results"])
+                 if return_output:
+                     results["item_results"].extend(eval_result_dict["item_results"])
+                 else:
+                     results["item_results"].extend(
+                         [
+                             {k: v for k, v in item.items() if k != "inference_output"}
+                             for item in eval_result_dict["item_results"]
+                         ]
+                     )
+             return results
+
+         # Include only aggregate scores in dict returned
+         elif return_aggregates:
+             return [eval_result.aggregate_scores for eval_result in eval_results]
+
+         # Include only item scores in dict returned
+         else:
+             if return_output:
+                 return [item for eval_result in eval_results for item in eval_result.item_scores]
+             else:
+                 return [
+                     {k: v for k, v in item.items() if k != "inference_output"}
+                     for eval_result in eval_results
+                     for item in eval_result.item_scores
+                 ]
+
+     # Return results as an EvalResult object
+     else:
+         out: Dict[str, List[EvalResult]] = {}
+         for er in eval_results:
+             out.setdefault(er.eval_dataset.name, []).append(er)
+         return out
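The docstring's example above still references an `item_limit` argument, but the signature exposes `sample_size`, `hyperparameters`, and `parallel` instead. Below is a hedged usage sketch based only on the signature and docstring in this file; the dataset name is a placeholder, `Precision` is assumed to be importable from `scorebook.metrics` (as in 0.0.1), and the sweep behaviour of `expand_dict` is inferred from its name:

```python
from scorebook import EvalDataset, evaluate
from scorebook.metrics import Precision  # assumption: re-exported from scorebook.metrics

dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])

async def inference_fn(items, **hyperparams):
    # Placeholder inference: return a fixed answer for every item.
    return ["example answer" for _ in items]

results = evaluate(
    inference_fn,
    dataset,
    hyperparameters={"temperature": [0.0, 0.7]},  # presumably expanded into one run per value
    parallel=True,    # allowed because inference_fn is async
    sample_size=100,  # replaces the docstring's item_limit
    return_items=True,
)
```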
scorebook-0.0.3/src/scorebook/exceptions.py
@@ -0,0 +1,54 @@
+ """
+ Custom exceptions for the Scorebook framework.
+
+ This module defines specific exception types used throughout the Scorebook
+ evaluation framework to provide clear error handling and debugging information.
+ """
+
+
+ class ScoreBookError(Exception):
+     """Base exception class for all Scorebook-related errors."""
+
+
+ class EvaluationError(ScoreBookError):
+     """Raised when there are errors during model evaluation."""
+
+
+ class ParameterValidationError(ScoreBookError):
+     """Raised when invalid parameters are provided to evaluation functions."""
+
+
+ class InferenceError(EvaluationError):
+     """Raised when there are errors during model inference."""
+
+
+ class MetricComputationError(EvaluationError):
+     """Raised when metric computation fails."""
+
+     def __init__(self, metric_name: str, dataset_name: str, original_error: Exception):
+         """Initialize metric computation error."""
+         self.metric_name = metric_name
+         self.dataset_name = dataset_name
+         self.original_error = original_error
+         super().__init__(
+             f"Failed to compute metric '{metric_name}' for dataset "
+             f"'{dataset_name}': {original_error}"
+         )
+
+
+ class DataMismatchError(EvaluationError):
+     """Raised when there's a mismatch between outputs and expected labels."""
+
+     def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+         """Initialize data mismatch error."""
+         self.outputs_count = outputs_count
+         self.labels_count = labels_count
+         self.dataset_name = dataset_name
+         super().__init__(
+             f"Output count ({outputs_count}) doesn't match label count ({labels_count}) "
+             f"for dataset '{dataset_name}'"
+         )
+
+
+ class ParallelExecutionError(ScoreBookError):
+     """Raised when parallel execution requirements are not met."""
{scorebook-0.0.1 → scorebook-0.0.3}/src/scorebook/inference/__init__.py
@@ -5,7 +5,3 @@ This module provides functionality for running inference with various models
  and processing their responses. It includes utilities for both single and
  batch inference operations.
  """
-
- from scorebook.inference.openai import batch, responses
-
- __all__ = ["responses", "batch"]
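Because this re-export was removed, code that previously relied on `from scorebook.inference import batch, responses` now needs to import from the provider module directly. A small sketch (assuming `responses` and `batch` are still defined in `scorebook.inference.openai` after the 0.0.3 changes to that file, and that the `openai` extra is installed):

```python
# 0.0.1 allowed: from scorebook.inference import batch, responses
# 0.0.3: import from the provider module itself.
from scorebook.inference.openai import batch, responses
```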