scorebook 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +3 -3
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/_async/evaluate_async.py +58 -28
- scorebook/evaluate/_sync/evaluate.py +58 -28
- scorebook/evaluate/evaluate_helpers.py +31 -8
- scorebook/exceptions.py +48 -0
- scorebook/inference/clients/bedrock.py +1 -1
- scorebook/inference/clients/portkey.py +1 -1
- scorebook/inference/clients/vertex.py +1 -1
- scorebook/settings.py +3 -0
- scorebook/types.py +8 -5
- scorebook/utils/__init__.py +4 -4
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +752 -70
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/METADATA +2 -1
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/RECORD +21 -20
- scorebook/eval_dataset.py +0 -404
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/WHEEL +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.11.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/_async/evaluate_async.py
CHANGED

```diff
@@ -11,7 +11,7 @@ from trismik.types import (
     TrismikRunMetadata,
 )
 
-from scorebook.
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -20,6 +20,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_show_progress,
     resolve_upload_results,
     score_metrics,
     validate_parameters,
@@ -32,7 +33,7 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import async_nullcontext,
+from scorebook.utils import async_nullcontext, evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +51,7 @@ async def evaluate_async(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +69,8 @@ async def evaluate_async(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +79,7 @@ async def evaluate_async(
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate_async)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -95,10 +100,17 @@
 
     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
-
+        # Calculate total items across all runs
+        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-
-
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = await execute_runs(
                 inference,
@@ -136,7 +148,10 @@ async def execute_runs(
         run_result = await execute_run(
             inference, run, experiment_id, project_id, metadata, trismik_client
         )
-
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            items_processed = len(run.dataset.items)
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         if (
             upload_results
@@ -146,10 +161,18 @@ async def execute_runs(
             and run_result.run_completed
             and trismik_client is not None
         ):
-
-
-
-
+            try:
+                run_id = await upload_classic_run_results(
+                    run_result, experiment_id, project_id, inference, metadata, trismik_client
+                )
+                run_result.run_id = run_id
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=True)
+            except Exception as e:
+                logger.warning(f"Failed to upload run results: {e}")
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=False)
+                # Continue evaluation even if upload fails
 
         return run_result
 
@@ -203,7 +226,7 @@ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> Cla
 
     try:
         inference_outputs = await run_inference_callable(
-            inference, run.
+            inference, run.inputs, run.hyperparameter_config
         )
         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
         logger.debug("Classic evaluation completed for run %s", run)
@@ -216,13 +239,13 @@ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> Cla
 
 async def run_inference_callable(
     inference: Callable,
-
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = await inference(
+        predictions = await inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +253,11 @@ async def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the
-            f"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,15 +280,20 @@ async def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-
-
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-
-
-
-
+        adaptive_eval_run_result = await run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
+        )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-
+        return adaptive_eval_run_result
+
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 async def upload_classic_run_results(
@@ -291,10 +319,12 @@ async def upload_classic_run_results(
     """
    model = get_model_name(inference_callable)
 
-    # Create eval items from run_spec
+    # Create eval items from run_spec inputs, outputs, and labels
     items: List[TrismikClassicEvalItem] = []
-
-
+    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+    for idx, (input_value, output) in enumerate(inputs_outputs):
+        labels = run_result.run_spec.labels
+        label = labels[idx] if idx < len(labels) else ""
 
         # Calculate item-level metrics for this item
         item_metrics: Dict[str, Any] = {}
@@ -309,7 +339,7 @@
 
         eval_item = TrismikClassicEvalItem(
             datasetItemId=str(idx),
-            modelInput=str(
+            modelInput=str(input_value),
             modelOutput=str(output),
             goldOutput=str(label),
             metrics=item_metrics,
@@ -410,4 +440,4 @@ async def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
```
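The reworked `run_inference_callable` makes the inference-callable contract explicit: the callable now receives the run's `inputs` list plus the hyperparameter config as keyword arguments, and must return a list of predictions with the same length as `inputs`, otherwise an `InferenceError` is raised. A minimal sketch of an async callable satisfying that contract; the model call itself is a placeholder, not part of the package:

```python
from typing import Any, List


# Hypothetical inference callable for evaluate_async. Per the diff it is
# invoked as `await inference(inputs, **hyperparameter_config)` and must
# return one prediction per input.
async def my_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    temperature = hyperparameters.get("temperature", 0.0)  # example hyperparameter
    predictions: List[str] = []
    for item in inputs:
        # Placeholder: call your model here; echoing the input keeps the sketch runnable.
        predictions.append(f"answer for {item!r} (temperature={temperature})")
    return predictions  # len(predictions) == len(inputs), as the new check requires
```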
scorebook/evaluate/_sync/evaluate.py
CHANGED

```diff
@@ -10,7 +10,7 @@ from trismik.types import (
     TrismikRunMetadata,
 )
 
-from scorebook.
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -19,6 +19,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_show_progress,
     resolve_upload_results,
     score_metrics,
     validate_parameters,
@@ -32,7 +33,7 @@ from scorebook.types import (
     EvalRunSpec,
 )
 from contextlib import nullcontext
-from scorebook.utils import
+from scorebook.utils import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +51,7 @@ def evaluate(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +69,8 @@ def evaluate(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +79,7 @@ def evaluate(
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -95,10 +100,17 @@ def evaluate(
 
     with trismik_client or nullcontext():
         # Execute evaluation runs
-
+        # Calculate total items across all runs
+        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-
-
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = execute_runs(
                 inference,
@@ -136,7 +148,10 @@ def execute_runs(
         run_result = execute_run(
             inference, run, experiment_id, project_id, metadata, trismik_client
         )
-
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            items_processed = len(run.dataset.items)
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         if (
             upload_results
@@ -146,10 +161,18 @@ def execute_runs(
             and run_result.run_completed
             and trismik_client is not None
         ):
-
-
-
-
+            try:
+                run_id = upload_classic_run_results(
+                    run_result, experiment_id, project_id, inference, metadata, trismik_client
+                )
+                run_result.run_id = run_id
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=True)
+            except Exception as e:
+                logger.warning(f"Failed to upload run results: {e}")
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=False)
+                # Continue evaluation even if upload fails
 
         return run_result
 
@@ -203,7 +226,7 @@ def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEv
 
     try:
         inference_outputs = run_inference_callable(
-            inference, run.
+            inference, run.inputs, run.hyperparameter_config
         )
         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
         logger.debug("Classic evaluation completed for run %s", run)
@@ -216,13 +239,13 @@ def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEv
 
 def run_inference_callable(
     inference: Callable,
-
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = inference(
+        predictions = inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +253,11 @@ def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the
-            f"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,15 +280,20 @@ def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-
-
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-
-
-
-
+        adaptive_eval_run_result = run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
+        )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-
+        return adaptive_eval_run_result
+
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 def upload_classic_run_results(
@@ -291,10 +319,12 @@ def upload_classic_run_results(
     """
     model = get_model_name(inference_callable)
 
-    # Create eval items from run_spec
+    # Create eval items from run_spec inputs, outputs, and labels
    items: List[TrismikClassicEvalItem] = []
-
-
+    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+    for idx, (input_value, output) in enumerate(inputs_outputs):
+        labels = run_result.run_spec.labels
+        label = labels[idx] if idx < len(labels) else ""
 
         # Calculate item-level metrics for this item
         item_metrics: Dict[str, Any] = {}
@@ -309,7 +339,7 @@
 
         eval_item = TrismikClassicEvalItem(
             datasetItemId=str(idx),
-            modelInput=str(
+            modelInput=str(input_value),
             modelOutput=str(output),
             goldOutput=str(label),
             metrics=item_metrics,
@@ -410,4 +440,4 @@ def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
```
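Both entry points gain the same optional `show_progress` flag. A hedged sketch of how the synchronous `evaluate` might be called with it; the positional argument order and the `EvalDataset` construction are assumptions, since only the keyword parameters and module paths appear in this diff:

```python
from typing import Any, List

from scorebook.eval_datasets import EvalDataset  # import path shown in this diff
from scorebook.evaluate._sync.evaluate import evaluate  # module path from the file list


def my_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    # Must return one prediction per input (see run_inference_callable above).
    return [str(item) for item in inputs]


def run_eval(dataset: EvalDataset) -> Any:
    # `dataset` is assumed to be an already-constructed EvalDataset; its
    # constructor lives in eval_datasets/eval_dataset.py and is not shown here.
    return evaluate(
        my_inference,
        dataset,
        upload_results=False,
        show_progress=False,  # None (default) falls back to settings.SHOW_PROGRESS_BARS
    )
```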
scorebook/evaluate/evaluate_helpers.py
CHANGED

```diff
@@ -35,6 +35,22 @@ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool
     return upload_results
 
 
+def resolve_show_progress(show_progress: Optional[bool]) -> bool:
+    """Resolve whether to show progress bars.
+
+    Args:
+        show_progress: Explicit setting (None uses default from settings)
+
+    Returns:
+        bool: Whether to show progress bars
+    """
+    if show_progress is None:
+        from scorebook.settings import SHOW_PROGRESS_BARS
+
+        return bool(SHOW_PROGRESS_BARS)
+    return show_progress
+
+
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
     """Validate all parameters for evaluation."""
 
@@ -172,15 +188,22 @@ def build_classic_eval_run_spec(
     hyperparameters: Dict[str, Any],
     hyperparameters_index: int,
 ) -> EvalRunSpec:
-    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
-
-
+    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
+
+    Extracts input and label values from the appropriate columns in the dataset.
+    The column names are determined by dataset.input and dataset.label,
+    which may be original field names (e.g., "question", "answer") or computed
+    column names (e.g., "*input", "*label") if templates were used.
+    """
+    # Extract inputs and labels using the dataset's column specifications
+    inputs = dataset[dataset.input]  # Returns List[Any]
+    labels = dataset[dataset.label]  # Returns List[Any]
     eval_run_spec = EvalRunSpec(
         dataset,
         dataset_index,
         hyperparameters,
         hyperparameters_index,
-
+        inputs,
         labels,
     )
     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
@@ -256,7 +279,7 @@ def create_trismik_sync_client() -> TrismikClient:
 def get_model_name(
     inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
 ) -> str:
-    """Determine a model's name with the fallback "
+    """Determine a model's name with the fallback "Model"."""
 
     # First priority: metadata.model
     if metadata and "model" in metadata:
@@ -266,8 +289,8 @@ def get_model_name(
     if inference_callable and hasattr(inference_callable, "model"):
         return str(inference_callable.model)
 
-    # Fallback: "
-    return "
+    # Fallback: "Model"
+    return "Model"
 
 
 def format_results(
@@ -292,7 +315,7 @@ def format_results(
     # Remove inference output if not requested
     if not return_output:
         for item in item_scores:
-            item.pop("
+            item.pop("output", None)
 
     results["item_results"] = item_scores
 
```
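The new `resolve_show_progress` helper is small enough to illustrate directly; this sketch only restates the tri-state behaviour shown in the diff:

```python
from scorebook.evaluate.evaluate_helpers import resolve_show_progress

resolve_show_progress(True)   # -> True, explicit per-call override
resolve_show_progress(False)  # -> False, explicit per-call override
resolve_show_progress(None)   # -> falls back to scorebook.settings.SHOW_PROGRESS_BARS
```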
scorebook/exceptions.py
CHANGED

```diff
@@ -10,6 +10,54 @@ class ScoreBookError(Exception):
     """Base exception class for all Scorebook-related errors."""
 
 
+class EvalDatasetError(ScoreBookError):
+    """Base exception class for all EvalDataset errors."""
+
+
+class DatasetConfigurationError(EvalDatasetError):
+    """Raised when dataset configuration is invalid (e.g., mutually exclusive parameters)."""
+
+
+class MissingFieldError(EvalDatasetError):
+    """Raised when required field is missing from dataset."""
+
+    def __init__(self, field_name: str, field_type: str, available_fields: list[str]):
+        """Initialize missing field error with structured context."""
+        self.field_name = field_name
+        self.field_type = field_type  # "input" or "label"
+        self.available_fields = available_fields
+        super().__init__(
+            f"{field_type.capitalize()} field '{field_name}' not found. "
+            f"Available fields: {', '.join(available_fields)}"
+        )
+
+
+class DatasetLoadError(EvalDatasetError):
+    """Raised when dataset fails to load from source (file or remote)."""
+
+
+class DatasetParseError(EvalDatasetError):
+    """Raised when dataset file cannot be parsed (CSV, JSON, YAML)."""
+
+
+class DatasetNotInitializedError(EvalDatasetError):
+    """Raised when operations are attempted on uninitialized dataset."""
+
+
+class DatasetSampleError(EvalDatasetError):
+    """Raised when sampling parameters are invalid."""
+
+    def __init__(self, sample_size: int, dataset_size: int, dataset_name: str):
+        """Initialize dataset sample error with structured context."""
+        self.sample_size = sample_size
+        self.dataset_size = dataset_size
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Sample size {sample_size} exceeds dataset size {dataset_size} "
+            f"for dataset '{dataset_name}'"
+        )
+
+
 class EvaluationError(ScoreBookError):
     """Raised when there are errors during model evaluation."""
 
```
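The new `EvalDataset` exception hierarchy derives from `ScoreBookError`, so existing broad handlers keep working while callers can catch dataset problems specifically. A short sketch using the structured exceptions defined above; the trigger is contrived, since the dataset code that raises them lives in eval_datasets/eval_dataset.py:

```python
from scorebook.exceptions import EvalDatasetError, MissingFieldError, ScoreBookError

try:
    # Contrived trigger: in practice the dataset layer raises MissingFieldError
    # when a configured input/label column is absent.
    raise MissingFieldError("answer", "label", ["question", "choices"])
except MissingFieldError as err:
    # Structured context survives on the exception instance.
    print(err.field_type, err.field_name, err.available_fields)
except EvalDatasetError:
    print("some other dataset problem")
except ScoreBookError:
    print("any other scorebook error")
```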
scorebook/settings.py
CHANGED

```diff
@@ -16,3 +16,6 @@ TRISMIK_ADAPTIVE_TESTING_URL = f"{TRISMIK_API_BASE_URL}/adaptive-testing"
 
 # Allow override via environment variable
 TRISMIK_SERVICE_URL = os.environ.get("TRISMIK_SERVICE_URL", TRISMIK_ADAPTIVE_TESTING_URL)
+
+# Progress bar configuration
+SHOW_PROGRESS_BARS = os.environ.get("SCOREBOOK_SHOW_PROGRESS_BARS", "true").lower() == "true"
```
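`SHOW_PROGRESS_BARS` is read from the `SCOREBOOK_SHOW_PROGRESS_BARS` environment variable at module import time, so an override has to be in place before `scorebook.settings` is imported. A small sketch:

```python
import os

# Must be set before scorebook.settings is imported, since the flag is
# evaluated once at import time.
os.environ["SCOREBOOK_SHOW_PROGRESS_BARS"] = "false"

from scorebook.settings import SHOW_PROGRESS_BARS

print(SHOW_PROGRESS_BARS)  # False; any value other than "true" (case-insensitive) disables bars
```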
scorebook/types.py
CHANGED

```diff
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union
 
-from scorebook.
+from scorebook.eval_datasets import EvalDataset
 
 
 @dataclass
@@ -21,7 +21,7 @@ class EvalRunSpec:
     dataset_index: int
     hyperparameter_config: Dict[str, Any]
     hyperparameters_index: int
-
+    inputs: List[Any]
     labels: List[Any]
 
     def __str__(self) -> str:
@@ -64,13 +64,15 @@ class ClassicEvalRunResult:
 
         if self.outputs:
             for idx, output in enumerate(self.outputs):
-                if idx >= len(self.run_spec.
+                if idx >= len(self.run_spec.inputs):
                     break
 
                 result = {
-                    "
+                    "id": idx,
                     "dataset_name": self.run_spec.dataset.name,
-                    "
+                    "input": self.run_spec.inputs[idx],
+                    "label": self.run_spec.labels[idx] if idx < len(self.run_spec.labels) else None,
+                    "output": output,
                     **self.run_spec.hyperparameter_config,
                 }
 
@@ -125,6 +127,7 @@ class AdaptiveEvalRunResult:
     """Results from executing an adaptive evaluation run."""
 
     run_spec: AdaptiveEvalRunSpec
+    run_completed: bool
     scores: Dict[str, Any]
 
     @property
```
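With `inputs` and `labels` now carried on `EvalRunSpec`, each item-level row built by `ClassicEvalRunResult` gains explicit `id`, `input`, `label`, and `output` keys alongside the hyperparameter config, and `format_results` strips `output` when `return_output` is False. An illustration of the row shape implied by the diff; the concrete values are made up:

```python
# Example of a single entry in results["item_results"] after this change
# (values are illustrative; hyperparameters echo the run's config).
item_result = {
    "id": 0,
    "dataset_name": "my_dataset",
    "input": "What is 2 + 2?",
    "label": "4",
    "output": "4",       # removed by format_results when return_output=False
    "temperature": 0.0,  # **hyperparameter_config is merged into the row
}
```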
scorebook/utils/__init__.py
CHANGED

```diff
@@ -1,9 +1,9 @@
 """Utility functions and common helpers for the Scorebook framework."""
 
 from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.build_prompt import build_prompt
 from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import
+from scorebook.utils.progress_bars import evaluation_progress_context
+from scorebook.utils.render_template import render_template
 from scorebook.utils.transform_helpers import expand_dict
 
 __all__ = [
@@ -11,6 +11,6 @@ __all__ = [
     "is_awaitable",
     "validate_path",
     "expand_dict",
-    "
-    "
+    "evaluation_progress_context",
+    "render_template",
 ]
```
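`build_prompt` is gone from `scorebook.utils` (see the `build_prompt.py → render_template.py` rename in the file list); `render_template` and `evaluation_progress_context` are the new exports. A hedged sketch of driving the progress context manager directly, using only the keyword arguments and callbacks that appear at the new call sites in `evaluate()`/`evaluate_async()`; calling it outside those functions is illustrative, not a documented API:

```python
from scorebook.utils import evaluation_progress_context

# Keyword arguments mirror the evaluate()/evaluate_async() call sites added in 0.0.11.
with evaluation_progress_context(
    total_eval_runs=4,
    total_items=400,
    dataset_count=2,
    hyperparam_count=2,
    model_display="Model",  # get_model_name() falls back to "Model"
    enabled=True,           # False (or show_progress=False upstream) disables the bars
) as progress_bars:
    if progress_bars is not None:
        progress_bars.on_run_completed(100, True)          # items processed, run succeeded
        progress_bars.on_upload_completed(succeeded=True)  # upload outcome for that run
```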