scorebook 0.0.2-py3-none-any.whl → 0.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/evaluator.py +215 -107
- scorebook/exceptions.py +54 -0
- scorebook/inference/openai.py +75 -37
- scorebook/types/__init__.py +2 -1
- scorebook/types/eval_dataset.py +6 -0
- scorebook/types/eval_run_spec.py +28 -0
- scorebook/types/inference_pipeline.py +2 -2
- scorebook/utils/logging_utils.py +1 -0
- scorebook/utils/progress_bars.py +91 -34
- {scorebook-0.0.2.dist-info → scorebook-0.0.4.dist-info}/METADATA +1 -1
- {scorebook-0.0.2.dist-info → scorebook-0.0.4.dist-info}/RECORD +13 -10
- {scorebook-0.0.2.dist-info → scorebook-0.0.4.dist-info}/LICENSE +0 -0
- {scorebook-0.0.2.dist-info → scorebook-0.0.4.dist-info}/WHEEL +0 -0
scorebook/evaluator.py
CHANGED
@@ -14,84 +14,19 @@ models on datasets and computing metric scores.
 """

 import asyncio
-
-
-
-from scorebook.
+import logging
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from scorebook.exceptions import (
+    DataMismatchError,
+    MetricComputationError,
+    ParallelExecutionError,
+    ParameterValidationError,
+)
+from scorebook.types import EvalDataset, EvalResult, EvalRunSpec
 from scorebook.utils import evaluation_progress, expand_dict, is_awaitable

-
-async def _evaluate_async(
-    inference_callable: Callable,
-    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
-    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
-    experiment_id: Optional[str] = None,
-    return_dict: bool = True,
-    return_aggregates: bool = True,
-    return_items: bool = False,
-    return_output: bool = False,
-    sample_size: Optional[int] = None,
-) -> Union[Dict, List]:
-    """Run inference across datasets/hyperparams, compute metrics, and format results."""
-
-    # Validate parameters
-    if return_dict and not return_aggregates and not return_items:
-        raise ValueError(
-            "When return_dict=True, at least one of return_aggregates or return_items must be True"
-        )
-
-    normalized_datasets = _normalize_datasets(eval_datasets)
-
-    if hyperparameters is None:
-        hyperparam_grid: List[Dict[str, Any]] = [{}]
-    elif not isinstance(hyperparameters, list):
-        hyperparam_grid = _expand_hyperparams(hyperparameters)
-    else:
-        hyperparam_grid = hyperparameters
-
-    eval_results: List[EvalResult] = []
-
-    with evaluation_progress(normalized_datasets, len(hyperparam_grid)) as progress_bars:
-        # Loop through datasets, then hyperparameters for clear progress tracking
-        for dataset_idx, eval_dataset in enumerate(normalized_datasets):
-            with progress_bars.hyperparam_progress_context():
-                # Run inference for each hyperparameter configuration on this dataset
-                for hp_idx, hyperparam_config in enumerate(hyperparam_grid):
-
-                    if sample_size:
-                        items = _get_items_sample(eval_dataset.items, sample_size)
-                    else:
-                        items = eval_dataset.items
-
-                    labels = _get_labels_for_items(items, eval_dataset.label)
-
-                    # 1) Run inference
-                    outputs = await _run_inference_callable(
-                        inference_callable, items, hyperparam_config
-                    )
-
-                    # 2) Score metrics
-                    metric_scores = _score_metrics(eval_dataset, outputs, labels)
-
-                    # 3) Wrap into EvalResult
-                    eval_results.append(
-                        EvalResult(eval_dataset, outputs, metric_scores, hyperparam_config)
-                    )
-
-                    # Update inner progress bar
-                    progress_bars.update_hyperparam_progress()
-
-            # Update the outer progress bar
-            progress_bars.update_dataset_progress()
-
-    # TODO: experiment_id handling (left as passthrough to preserve behavior)
-    if experiment_id:
-        pass
-
-    # 4) Format as requested
-    return _format_results(
-        eval_results, return_dict, return_aggregates, return_items, return_output
-    )
+logger = logging.getLogger(__name__)


 def evaluate(
@@ -99,6 +34,8 @@ def evaluate(
     eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     experiment_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+    parallel: bool = False,
     return_dict: bool = True,
     return_aggregates: bool = True,
     return_items: bool = False,
@@ -128,6 +65,7 @@
         return_items: If True, returns individual items for each dataset
         return_output: If True, returns model outputs for each dataset item evaluated
        sample_size: If set, only return a sample of the dataset items (for debugging)
+        parallel: If True, run inference functions in parallel (requires all functions to be async)

     Returns:
         Dictionary mapping dataset names to their evaluation results. For each dataset,
@@ -145,12 +83,22 @@

         results = evaluate(inference_fn, dataset, item_limit=100)
     """
+
+    logger.info(
+        "Starting evaluation: experiment_id=%s, project_id=%s, parallel=%s",
+        experiment_id,
+        project_id,
+        parallel,
+    )
+
     return asyncio.run(
         _evaluate_async(
             inference_callable=inference_callable,
             eval_datasets=eval_datasets,
             hyperparameters=hyperparameters,
             experiment_id=experiment_id,
+            project_id=project_id,
+            parallel=parallel,
             return_dict=return_dict,
             return_aggregates=return_aggregates,
             return_items=return_items,
@@ -160,30 +108,169 @@
     )


-
+async def _evaluate_async(
+    inference_callable: Callable,
+    eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    experiment_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+    return_dict: bool = True,
+    return_aggregates: bool = True,
+    return_items: bool = False,
+    return_output: bool = False,
+    parallel: bool = False,
+    sample_size: Optional[int] = None,
+) -> Union[Dict, List]:
+    _validate_parameters(locals())
+    datasets, adaptive_datasets = _prepare_datasets(eval_datasets, sample_size)
+    hyperparameters = _prepare_hyperparameters(hyperparameters)
+
+    logger.info(
+        "Prepared %d datasets and %d hyperparameter configurations",
+        len(datasets),
+        len(hyperparameters),
+    )

+    runs = _build_runs(datasets, hyperparameters)
+    runs.sort(key=lambda run: (run.dataset_idx, run.hp_idx))

-
-
-)
-
-
-
-
+    logger.info("Created %d evaluation runs", len(runs))
+
+    with evaluation_progress(datasets, len(hyperparameters), parallel, len(runs)) as progress_bars:
+        if parallel:
+            eval_results = await _run_parallel(inference_callable, runs, progress_bars)
+        else:
+            eval_results = await _run_sequential(inference_callable, runs, progress_bars)
+
+    logger.info("Evaluation completed successfully")
+
+    return _format_results(
+        eval_results, return_dict, return_aggregates, return_items, return_output
+    )
+
+
+# ===== ORCHESTRATION PATHS =====
+
+
+async def _run_parallel(
+    inference_callable: Callable,
+    runs: List[EvalRunSpec],
+    progress_bars: Any,
+) -> List[EvalResult]:
+    logger.debug("Running inference in parallel")
+
+    async def worker(run: EvalRunSpec) -> Tuple[EvalRunSpec, EvalResult]:
+        er = await _execute_run(inference_callable, run)
+        progress_bars.on_eval_run_completed(run.dataset_idx)
+        return run, er
+
+    pairs = await asyncio.gather(*[worker(r) for r in runs])
+    # Return in canonical (dataset_idx, hp_idx) order for stability
+    pairs.sort(key=lambda p: (p[0].dataset_idx, p[0].hp_idx))
+    return [er for _, er in pairs]
+
+
+async def _run_sequential(
+    inference_callable: Callable,
+    runs: List[EvalRunSpec],
+    progress_bars: Any,
+) -> List[EvalResult]:
+    logger.debug("Running inference sequentially")
+    results: List[EvalResult] = []
+    for run in runs:
+        er = await _execute_run(inference_callable, run)
+        results.append(er)
+        progress_bars.on_hyperparam_completed(run.dataset_idx)
+    return results


-
-    return expand_dict(hyperparameters or {})
+# ===== EVALUATION EXECUTIONS =====


-def
-
+async def _execute_run(inference_callable: Callable, run: EvalRunSpec) -> EvalResult:
+    logger.debug("Executing run for %s", run)
+
+    outputs = await _run_inference_callable(inference_callable, run.items, run.hyperparams)
+    logger.debug("Inference completed for run %s", run)
+
+    metric_scores = _score_metrics(run.eval_dataset, outputs, run.labels)
+    logger.debug("Metrics computed for run %s. - scores: %s", run, list(metric_scores.keys()))
+
+    return EvalResult(run.eval_dataset, outputs, metric_scores, run.hyperparams)
+
+
+# ===== HELPER FUNCTIONS =====
+
+
+def _validate_parameters(params: Dict[str, Any]) -> None:
+    """Validate all parameters for evaluation."""
+
+    if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
+        raise ParameterValidationError(
+            "When return_dict=True, at least one of return_aggregates or return_items must be True"
+        )
+
+    if params["parallel"] and not is_awaitable(params["inference_callable"]):
+        raise ParallelExecutionError(
+            "parallel=True requires the inference_callable to be async. "
+            "Please make your inference function async or set parallel=False."
+        )
+
+
+def _prepare_datasets(
+    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    sample_size: Optional[int] = None,
+) -> Tuple[List[EvalDataset], List[str]]:
+    """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
+
+    # Ensure datasets is always a list for consistent processing
+    if not isinstance(datasets, list):
+        datasets = [datasets]
+
+    # Extract classical datasets TODO: handle other types (string registry)
+    classic_eval_datasets = [dataset for dataset in datasets if isinstance(dataset, EvalDataset)]
+
+    # Reduce datasets to a random sample
+    if sample_size:
+        logger.info("Sampling datasets to %d items each", sample_size)
+        for dataset in classic_eval_datasets:
+            dataset.shuffle()
+            if len(dataset) > sample_size:
+                original_size = len(dataset)
+                dataset._hf_dataset = dataset._hf_dataset.select(range(sample_size))
+                logger.debug(
+                    "Sampled dataset '%s' from %d to %d items",
+                    dataset.name,
+                    original_size,
+                    sample_size,
+                )
+
+    # Extract adaptive dataset strings
+    adaptive_eval_datasets = [
+        dataset.replace(":adaptive", "")
+        for dataset in datasets
+        if isinstance(dataset, str) and dataset.endswith(":adaptive")
+    ]
+
+    logger.info("Evaluating on classic datasets: %s", [ds.name for ds in classic_eval_datasets])
+    logger.info("Evaluating on adaptive datasets: %s", adaptive_eval_datasets)
+
+    return classic_eval_datasets, adaptive_eval_datasets
+
+
+def _prepare_hyperparameters(
+    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
 ) -> List[Dict[str, Any]]:
-
+    """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
+    if hyperparameters is None:
+        return [{}]
+    if not isinstance(hyperparameters, list):
+        expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
+        return expanded

+    logger.info("Evaluating with hyperparameters: %s", hyperparameters)

-
-    return [item.get(label_key) for item in items]
+    return hyperparameters


 async def _run_inference_callable(
@@ -197,29 +284,47 @@ async def _run_inference_callable(
         return inference_callable(items, **hyperparams)


-
-def _iter_dataset_jobs(
+def _build_runs(
     datasets: List[EvalDataset],
-
-
-
-
-
-
-
-
+    hyperparameters: List[Dict[str, Any]],
+) -> List[EvalRunSpec]:
+    """Build RunSpec objects for each dataset/hyperparameter combination."""
+    runs: List[EvalRunSpec] = []
+    for d_idx, ds in enumerate(datasets):
+        items = ds.items
+        labels = [item.get(ds.label) for item in items]
+        for hp_idx, hp in enumerate(hyperparameters):
+            run_spec = EvalRunSpec(d_idx, ds, items, labels, hp, hp_idx)
+            logger.debug("Built RunSpec: %s", run_spec)
+            runs.append(run_spec)
+    return runs


 def _score_metrics(
     eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
 ) -> Dict[str, Dict[str, Any]]:
+    """Compute metric scores for a given dataset and inference outputs."""
     metric_scores: Dict[str, Dict[str, Any]] = {}
+
+    if len(outputs) != len(labels):
+        raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
+
     for metric in eval_dataset.metrics:
-
-
-
-
-
+        try:
+            aggregate_scores, item_scores = metric.score(outputs, labels)
+            metric_scores[metric.name] = {
+                "aggregate_scores": aggregate_scores,
+                "item_scores": item_scores,
+            }
+        except Exception as e:
+            logger.error(
+                "Failed to compute metric '%s' for dataset '%s': %s",
+                metric.name,
+                eval_dataset.name,
+                str(e),
+            )
+            raise MetricComputationError(metric.name, eval_dataset.name, e)
+
     return metric_scores


@@ -268,4 +373,7 @@ def _format_results(

     # Return results as an EvalResult object
     else:
-
+        out: Dict[str, List[EvalResult]] = {}
+        for er in eval_results:
+            out.setdefault(er.eval_dataset.name, []).append(er)
+        return out
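Taken together, the evaluator changes rework _evaluate_async around EvalRunSpec objects and add project_id, parallel, and sample_size handling to evaluate(). A minimal usage sketch of the new parallel path, assuming evaluate is importable from the package root (otherwise import it from scorebook.evaluator) and leaving the EvalDataset construction, which is not part of this diff, to the caller:

import asyncio

from scorebook import evaluate  # assumed top-level export


async def my_inference(items, **hyperparameters):
    # parallel=True requires an async callable; otherwise _validate_parameters
    # raises ParallelExecutionError. Replace the echo below with a real model call.
    await asyncio.sleep(0)
    return [str(item) for item in items]


def run_eval(dataset):
    # A dict of hyperparameters is presumably expanded by expand_dict into a
    # list of configs; one EvalRunSpec is built per (dataset, config) pair and
    # all runs are dispatched with asyncio.gather.
    return evaluate(
        my_inference,
        dataset,  # an EvalDataset instance supplied by the caller
        hyperparameters={"temperature": [0.0, 0.7]},
        parallel=True,
        sample_size=50,  # shuffles each dataset and keeps at most 50 items
    )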
scorebook/exceptions.py
ADDED
@@ -0,0 +1,54 @@
+"""
+Custom exceptions for the Scorebook framework.
+
+This module defines specific exception types used throughout the Scorebook
+evaluation framework to provide clear error handling and debugging information.
+"""
+
+
+class ScoreBookError(Exception):
+    """Base exception class for all Scorebook-related errors."""
+
+
+class EvaluationError(ScoreBookError):
+    """Raised when there are errors during model evaluation."""
+
+
+class ParameterValidationError(ScoreBookError):
+    """Raised when invalid parameters are provided to evaluation functions."""
+
+
+class InferenceError(EvaluationError):
+    """Raised when there are errors during model inference."""
+
+
+class MetricComputationError(EvaluationError):
+    """Raised when metric computation fails."""
+
+    def __init__(self, metric_name: str, dataset_name: str, original_error: Exception):
+        """Initialize metric computation error."""
+        self.metric_name = metric_name
+        self.dataset_name = dataset_name
+        self.original_error = original_error
+        super().__init__(
+            f"Failed to compute metric '{metric_name}' for dataset "
+            f"'{dataset_name}': {original_error}"
+        )
+
+
+class DataMismatchError(EvaluationError):
+    """Raised when there's a mismatch between outputs and expected labels."""
+
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+        """Initialize data mismatch error."""
+        self.outputs_count = outputs_count
+        self.labels_count = labels_count
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Output count ({outputs_count}) doesn't match label count ({labels_count}) "
+            f"for dataset '{dataset_name}'"
+        )
+
+
+class ParallelExecutionError(ScoreBookError):
+    """Raised when parallel execution requirements are not met."""
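Because every new exception derives from ScoreBookError, callers can handle the whole family with a single except clause or inspect the structured fields on the specific types. A short illustrative sketch (the evaluate callable and its arguments are placeholders):

from scorebook.exceptions import (
    DataMismatchError,
    MetricComputationError,
    ScoreBookError,
)


def run_guarded(evaluate_fn, inference_fn, dataset):
    try:
        return evaluate_fn(inference_fn, dataset)
    except MetricComputationError as exc:
        # Structured fields set by the constructor shown above
        print(f"Metric {exc.metric_name!r} failed on {exc.dataset_name!r}: {exc.original_error}")
    except DataMismatchError as exc:
        print(f"{exc.outputs_count} outputs vs {exc.labels_count} labels for {exc.dataset_name!r}")
    except ScoreBookError as exc:
        # Catches ParameterValidationError, ParallelExecutionError, InferenceError, ...
        print(f"Evaluation failed: {exc}")
    return None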
scorebook/inference/openai.py
CHANGED
@@ -8,17 +8,19 @@ API communication, request formatting, and response processing.

 import asyncio
 import json
+import logging
 import tempfile
 from typing import Any, List

-from openai import
-
+from openai import AsyncOpenAI
+
+logger = logging.getLogger(__name__)


 async def responses(
     items: List[Any], model: str = "gpt-4.1-nano", client: Any = None, **hyperparameters: Any
 ) -> List[Any]:
-    """Process multiple inference requests using OpenAI's API.
+    """Process multiple inference requests using OpenAI's Async API.

     This asynchronous function handles multiple inference requests,
     manages the API communication, and processes the responses.
@@ -35,13 +37,67 @@ async def responses(
     Raises:
         NotImplementedError: Currently not implemented.
     """
-
-
+    logger.debug("OpenAI responses function called with %d items", len(items))
+    logger.debug("Using model: %s", model)
+    logger.debug("Hyperparameters: %s", hyperparameters)

-
-
-
-
+    if client is None:
+        logger.debug("Creating new AsyncOpenAI client")
+        client = AsyncOpenAI()
+
+    # Create all tasks concurrently for true parallelism
+    tasks = []
+    for i, item in enumerate(items):
+        logger.debug(
+            "Processing item %d: %s",
+            i,
+            str(item)[:100] + "..." if len(str(item)) > 100 else str(item),
+        )
+
+        # Handle string input from preprocessor - convert to proper messages format
+        if isinstance(item, str):
+            # Convert the string format to proper OpenAI messages array
+            messages = [{"role": "user", "content": item}]
+            logger.debug(
+                "Converted string to messages format: %s",
+                (
+                    messages[0]["content"][:100] + "..."
+                    if len(messages[0]["content"]) > 100
+                    else messages[0]["content"]
+                ),
+            )
+        elif isinstance(item, list):
+            # Already in proper messages format
+            messages = item
+            logger.debug("Item %d already in messages format", i)
+        else:
+            # Fallback: treat as user message
+            messages = [{"role": "user", "content": str(item)}]
+            logger.debug("Item %d converted to fallback format", i)
+
+        logger.debug("Creating OpenAI task %d with messages: %s", i, messages)
+        task = client.chat.completions.create(model=model, messages=messages, **hyperparameters)
+        tasks.append(task)
+
+    logger.debug("Created %d tasks, waiting for OpenAI responses...", len(tasks))
+    # Wait for all requests to complete in parallel
+    results = await asyncio.gather(*tasks)
+    logger.debug("Received %d responses from OpenAI", len(results))
+
+    for i, result in enumerate(results):
+        logger.debug("Response %d type: %s", i, type(result))
+        try:
+            if hasattr(result, "choices") and result.choices:
+                content = result.choices[0].message.content
+                logger.debug(
+                    "Response %d content: %s",
+                    i,
+                    content[:100] + "..." if content and len(content) > 100 else content,
+                )
+            else:
+                logger.debug("Response %d has no choices or unexpected format", i)
+        except Exception as e:
+            logger.error("Error logging response %d: %s", i, e)

     return results

@@ -70,40 +126,23 @@ async def batch(
         NotImplementedError: Currently not implemented.
     """
     if client is None:
-        client =
+        client = AsyncOpenAI()

-    file_id = _upload_batch(items, client)
-    batch_id = _start_batch(file_id, client)
-
-    # Initialize progress bar
-    pbar = tqdm(total=len(items), desc="Batch processing", unit="requests")
+    file_id = await _upload_batch(items, client)
+    batch_id = await _start_batch(file_id, client)

     awaiting_batch = True
     while awaiting_batch:
         batch_object = await _get_batch(batch_id, client)
         batch_status = batch_object.status

-        if hasattr(batch_object, "request_counts") and batch_object.request_counts:
-            completed = batch_object.request_counts.completed
-            total = batch_object.request_counts.total
-            pbar.n = completed
-            pbar.set_postfix(status=batch_status, completed=f"{completed}/{total}")
-        else:
-            pbar.set_postfix(status=batch_status)
-
-        pbar.refresh()
-
         if batch_status == "completed":
             awaiting_batch = False
-            pbar.n = pbar.total
-            pbar.set_postfix(status="completed")
         elif batch_status == "failed":
             raise Exception("Batch processing failed")
         else:
             await asyncio.sleep(60)

-    pbar.close()
-
     # Get the final batch object to access output_file_id
     final_batch_object = await _get_batch(batch_id, client)
     output_file_id = final_batch_object.output_file_id
@@ -112,7 +151,7 @@ async def batch(
     return batch_result


-def _upload_batch(items: List[Any], client: Any) -> str:
+async def _upload_batch(items: List[Any], client: Any) -> str:
     """Create a .jsonl file from preprocessed items and upload to OpenAI for batch processing.

     Args:
@@ -121,10 +160,9 @@ def _upload_batch(items: List[Any], client: Any) -> str:
     Returns:
         The file ID returned by OpenAI after uploading.
     """
-    print("Uploading batch...")
     # Instantiate OpenAI client
     if client is None:
-        client =
+        client = AsyncOpenAI()

     # Create temp .jsonl file
     with tempfile.NamedTemporaryFile(mode="w+", suffix=".jsonl", delete=False) as f:
@@ -141,13 +179,13 @@ def _upload_batch(items: List[Any], client: Any) -> str:

     # Upload file to OpenAI
     with open(file_path, "rb") as upload_file:
-        response = client.files.create(file=upload_file, purpose="batch")
+        response = await client.files.create(file=upload_file, purpose="batch")

     return str(response.id)


-def _start_batch(file_id: str, client: Any) -> str:
-    batch_response = client.batches.create(
+async def _start_batch(file_id: str, client: Any) -> str:
+    batch_response = await client.batches.create(
         input_file_id=file_id,
         endpoint="/v1/chat/completions",
         completion_window="24h",
@@ -156,13 +194,13 @@ def _start_batch(file_id: str, client: Any) -> str:


 async def _get_batch(batch_id: str, client: Any) -> Any:
-    batch_object = client.batches.retrieve(batch_id)
+    batch_object = await client.batches.retrieve(batch_id)
     return batch_object


 async def _get_results_file(output_file_id: str, client: Any) -> List[str]:
     """Download and parse the batch results file from OpenAI."""
-    response = client.files.content(output_file_id)
+    response = await client.files.content(output_file_id)

     # Parse the JSONL content
     content = response.content.decode("utf-8")
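For reference, a sketch of driving the updated responses() helper directly. It now creates its own AsyncOpenAI client when none is passed and accepts plain strings, message lists, or arbitrary items (stringified as a fallback); OPENAI_API_KEY must be set, and the model and prompts below are placeholders:

import asyncio

from scorebook.inference.openai import responses


async def main():
    prompts = [
        "Reply with the single word: red",
        [{"role": "user", "content": "Reply with the single word: blue"}],
    ]
    # Extra keyword arguments are forwarded to client.chat.completions.create
    completions = await responses(prompts, model="gpt-4.1-nano", temperature=0)
    for completion in completions:
        # Each element is the raw ChatCompletion object returned by the SDK
        print(completion.choices[0].message.content)


if __name__ == "__main__":
    asyncio.run(main())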
scorebook/types/__init__.py
CHANGED
@@ -7,5 +7,6 @@ and evaluation results.

 from scorebook.types.eval_dataset import EvalDataset
 from scorebook.types.eval_result import EvalResult
+from scorebook.types.eval_run_spec import EvalRunSpec

-__all__ = ["EvalDataset", "EvalResult"]
+__all__ = ["EvalDataset", "EvalResult", "EvalRunSpec"]
scorebook/types/eval_dataset.py
CHANGED
@@ -86,6 +86,12 @@ class EvalDataset:
             raise ValueError("Dataset is not initialized")
         return iter(self._hf_dataset)

+    def shuffle(self) -> None:
+        """Randomly shuffle the dataset items."""
+        if self._hf_dataset is None:
+            raise ValueError("Dataset is not initialized")
+        self._hf_dataset.shuffle()
+
     @property
     def items(self) -> List[Any]:
         """Return a list of all examples in the dataset."""
scorebook/types/eval_run_spec.py
ADDED
@@ -0,0 +1,28 @@
+"""Evaluation run specification types for Scorebook."""
+
+from typing import Any, Dict, List, NamedTuple
+
+from scorebook.types import EvalDataset
+
+
+class EvalRunSpec(NamedTuple):
+    """Represents a single evaluation run configuration."""
+
+    dataset_idx: int
+    eval_dataset: EvalDataset
+    items: List[Dict[str, Any]]
+    labels: List[Any]
+    hyperparams: Dict[str, Any]
+    hp_idx: int
+
+    def __str__(self) -> str:
+        """Return a formatted string summary of the evaluation run specification."""
+        hyperparams_str = ", ".join([f"{k}={v}" for k, v in self.hyperparams.items()])
+
+        return (
+            f"EvalRunSpec(dataset_idx={self.dataset_idx},"
+            f" hp_idx={self.hp_idx},"
+            f" dataset_name='{self.eval_dataset.name}',"
+            f" hyperparams=[{hyperparams_str}]"
+            f")"
+        )
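EvalRunSpec is a plain NamedTuple, so its fields are not validated at runtime; the sketch below uses a SimpleNamespace stand-in for the eval_dataset field purely to show the __str__ format:

from types import SimpleNamespace

from scorebook.types import EvalRunSpec

# Stand-in with the .name attribute that __str__ relies on; not a real EvalDataset.
toy_dataset = SimpleNamespace(name="toy-qa")

run = EvalRunSpec(
    dataset_idx=0,
    eval_dataset=toy_dataset,
    items=[{"question": "2 + 2?", "answer": "4"}],
    labels=["4"],
    hyperparams={"temperature": 0.0},
    hp_idx=1,
)
print(run)
# EvalRunSpec(dataset_idx=0, hp_idx=1, dataset_name='toy-qa', hyperparams=[temperature=0.0])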
scorebook/types/inference_pipeline.py
CHANGED
@@ -57,7 +57,7 @@ class InferencePipeline:
             List of processed outputs after running through the complete pipeline
         """
         if self.preprocessor:
-            input_items = [self.preprocessor(item, hyperparameters) for item in items]
+            input_items = [self.preprocessor(item, **hyperparameters) for item in items]
         else:
             input_items = items

@@ -68,7 +68,7 @@ class InferencePipeline:

         if self.postprocessor:
             return [
-                self.postprocessor(inference_output, hyperparameters)
+                self.postprocessor(inference_output, **hyperparameters)
                 for inference_output in inference_outputs
             ]
         else:
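With this change, pre- and postprocessors receive the hyperparameters unpacked as keyword arguments rather than as a single dict, so processor signatures need to accept keywords. The functions below are illustrative, not part of scorebook:

def preprocess(item, **hyperparameters):
    # Fold a hyperparameter into the prompt; unknown keys are simply ignored.
    style = hyperparameters.get("style", "concise")
    return f"Answer in a {style} style: {item['question']}"


def postprocess(inference_output, **hyperparameters):
    # Normalise the raw model output regardless of hyperparameters.
    return str(inference_output).strip()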
scorebook/utils/logging_utils.py
ADDED
@@ -0,0 +1 @@
+"""Logging utilities for Scorebook evaluation framework."""
scorebook/utils/progress_bars.py
CHANGED
@@ -9,20 +9,33 @@ from tqdm import tqdm
 class EvaluationProgressBars:
     """Manages nested progress bars for evaluation tracking."""

-    def __init__(
+    def __init__(
+        self, datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
+    ) -> None:
         """Initialize progress bar manager.

         Args:
             datasets: List of datasets being evaluated
             hyperparam_count: Number of hyperparameter configurations per dataset
+            parallel: Whether running in parallel mode
+            total_eval_runs: Total number of EvalRunSpecs (dataset_count * hyperparam_count)
         """
         self.datasets = datasets
         self.hyperparam_count = hyperparam_count
+        self.parallel = parallel
+        self.total_eval_runs = total_eval_runs
+
         self.dataset_pbar: Optional[tqdm] = None
         self.hyperparam_pbar: Optional[tqdm] = None

-
-
+        # Track progress per dataset
+        self.current_dataset_idx = 0
+        self.completed_hyperparams_per_dataset: dict[int, int] = {}
+        self.completed_eval_runs = 0
+
+    def start_progress_bars(self) -> None:
+        """Start both progress bars."""
+        # Top level: Datasets
         self.dataset_pbar = tqdm(
             total=len(self.datasets),
             desc="Datasets ",
@@ -33,57 +46,101 @@ class EvaluationProgressBars:
             bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
         )

-
-
-
-        self.
+        # Bottom level: Hyperparameters/Eval runs
+        if self.parallel:
+            # In parallel mode: show eval runs completed out of total
+            self.hyperparam_pbar = tqdm(
+                total=self.total_eval_runs,
+                desc="Eval Runs ",
+                unit="run",
+                position=1,
+                leave=False,
+                ncols=80,
+                bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+            )
+        else:
+            # In sequential mode: show hyperparams per dataset
+            self.hyperparam_pbar = tqdm(
+                total=self.hyperparam_count,
+                desc="Hyperparams",
+                unit="config",
+                position=1,
+                leave=False,
+                ncols=80,
+                bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+            )

-    def
-        """
-        if self.
-
-            self.dataset_pbar = None
+    def on_eval_run_completed(self, dataset_idx: int) -> None:
+        """Update progress when an eval run (EvalRunSpec) completes in parallel mode."""
+        if not self.parallel:
+            return

-
-
-
-
-
-
-
-            position=1,
-            leave=False,
-            ncols=80,
-            bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+        self.completed_eval_runs += 1
+        if self.hyperparam_pbar:
+            self.hyperparam_pbar.update(1)
+
+        # Track how many runs completed for this dataset
+        self.completed_hyperparams_per_dataset[dataset_idx] = (
+            self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
         )
-        try:
-            yield self.hyperparam_pbar
-        finally:
-            self.hyperparam_pbar.close()
-            self.hyperparam_pbar = None

-
-
+        # Check if this dataset is complete
+        if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
+            if self.dataset_pbar:
+                self.dataset_pbar.update(1)
+
+    def on_hyperparam_completed(self, dataset_idx: int) -> None:
+        """Update progress when a hyperparameter config completes in sequential mode."""
+        if self.parallel:
+            return
+
         if self.hyperparam_pbar:
             self.hyperparam_pbar.update(1)

+        # Track completed hyperparams for this dataset
+        self.completed_hyperparams_per_dataset[dataset_idx] = (
+            self.completed_hyperparams_per_dataset.get(dataset_idx, 0) + 1
+        )
+
+        # Check if this dataset is complete
+        if self.completed_hyperparams_per_dataset[dataset_idx] == self.hyperparam_count:
+            # Update dataset progress
+            if self.dataset_pbar:
+                self.dataset_pbar.update(1)
+
+            # Reset hyperparameter progress for next dataset (if any)
+            if dataset_idx < len(self.datasets) - 1:
+                if self.hyperparam_pbar:
+                    self.hyperparam_pbar.reset()
+
+    def close_progress_bars(self) -> None:
+        """Close both progress bars."""
+        if self.hyperparam_pbar:
+            self.hyperparam_pbar.close()
+            self.hyperparam_pbar = None
+        if self.dataset_pbar:
+            self.dataset_pbar.close()
+            self.dataset_pbar = None
+

 @contextmanager
 def evaluation_progress(
-    datasets: List[Any], hyperparam_count: int
+    datasets: List[Any], hyperparam_count: int, parallel: bool, total_eval_runs: int
 ) -> Generator[EvaluationProgressBars, None, None]:
     """Context manager for evaluation progress bars.

     Args:
         datasets: List of datasets being evaluated
         hyperparam_count: Number of hyperparameter configurations per dataset
+        parallel: Whether running in parallel mode
+        total_eval_runs: Total number of EvalRunSpecs

     Yields:
         EvaluationProgressBars: Progress bar manager instance
     """
-    progress_bars = EvaluationProgressBars(datasets, hyperparam_count)
-    progress_bars.
+    progress_bars = EvaluationProgressBars(datasets, hyperparam_count, parallel, total_eval_runs)
+    progress_bars.start_progress_bars()
     try:
         yield progress_bars
     finally:
-        progress_bars.
+        progress_bars.close_progress_bars()
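The progress-bar manager can be exercised on its own with the new four-argument signature. A small sketch in sequential mode, where the dataset placeholders only need to sit in a list that supports len():

import time

from scorebook.utils import evaluation_progress

datasets = ["dataset-a", "dataset-b"]  # placeholders; evaluate() passes EvalDatasets here
hyperparam_count = 3

with evaluation_progress(
    datasets, hyperparam_count, parallel=False, total_eval_runs=len(datasets) * hyperparam_count
) as bars:
    for dataset_idx in range(len(datasets)):
        for _ in range(hyperparam_count):
            time.sleep(0.1)  # stand-in for one inference + scoring run
            bars.on_hyperparam_completed(dataset_idx)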
{scorebook-0.0.2.dist-info → scorebook-0.0.4.dist-info}/RECORD
CHANGED
@@ -1,8 +1,9 @@
 scorebook/__init__.py,sha256=7ac3KpXU3kKFekq8mZ3cVbF7oQ6Q9E-uqX7ijyte1Q0,406
-scorebook/evaluator.py,sha256=
+scorebook/evaluator.py,sha256=mS3G3PI26nHzqkYX4tqusQZJL5Q1xTxzqshAdwscl0s,14170
+scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
 scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
 scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
-scorebook/inference/openai.py,sha256=
+scorebook/inference/openai.py,sha256=FqXua4v4PTYSHrdTm_9fM0Us8Mo2n2LSN94CwRipRw4,7658
 scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
 scorebook/inference/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
 scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
@@ -10,19 +11,21 @@ scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo
 scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
 scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
 scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
-scorebook/types/__init__.py,sha256=
-scorebook/types/eval_dataset.py,sha256=
+scorebook/types/__init__.py,sha256=dXY3Y-GiMipVExzVu7H5pbdFfg4HBMEKxqSTfENywSs,427
+scorebook/types/eval_dataset.py,sha256=dCqOHjGaEb7pGG1VF4aGFn6hngFvlxpxddqsDtM4nTs,13870
 scorebook/types/eval_result.py,sha256=R2zuWrx8p9_4A2W3Gmlp-xGgmelPdg8QB5PoV1hiqRc,4728
-scorebook/types/
+scorebook/types/eval_run_spec.py,sha256=nf7LGa_dG60Qb385W6O6qiu7VlJ03-dpo2X1PgKGcRQ,845
+scorebook/types/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
 scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
 scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
 scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
 scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
 scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
+scorebook/utils/logging_utils.py,sha256=M4BXt369mJo037WYpvuWDoe3oGWVdHWaGo4Vbl0WDL0,60
 scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
-scorebook/utils/progress_bars.py,sha256=
+scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
 scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
-scorebook-0.0.
-scorebook-0.0.
-scorebook-0.0.
-scorebook-0.0.
+scorebook-0.0.4.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.4.dist-info/METADATA,sha256=7odU7Q8SHfuHru2oBBk1XlZ2tXLi2WaSShbUhfmX60A,11409
+scorebook-0.0.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+scorebook-0.0.4.dist-info/RECORD,,
{scorebook-0.0.2.dist-info → scorebook-0.0.4.dist-info}/LICENSE
File without changes
{scorebook-0.0.2.dist-info → scorebook-0.0.4.dist-info}/WHEEL
File without changes