scorebook 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +11 -4
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/_async/evaluate_async.py +135 -130
- scorebook/evaluate/_sync/evaluate.py +135 -131
- scorebook/evaluate/evaluate_helpers.py +46 -23
- scorebook/exceptions.py +54 -2
- scorebook/inference/clients/bedrock.py +1 -1
- scorebook/inference/clients/portkey.py +1 -1
- scorebook/inference/clients/vertex.py +1 -1
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/settings.py +3 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +36 -54
- scorebook/utils/__init__.py +11 -4
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +819 -70
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/METADATA +3 -2
- scorebook-0.0.12.dist-info/RECORD +50 -0
- scorebook/eval_dataset.py +0 -404
- scorebook-0.0.10.dist-info/RECORD +0 -41
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0
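The most visible changes are the new `scorebook/score/` package, the move of `EvalDataset` into `scorebook/eval_datasets/`, and a new `show_progress` option on `evaluate()`. For orientation only, a call against the 0.0.12 surface might look like the sketch below; the import paths mirror modules touched in this diff, while the dataset path and inference function are placeholders, not part of the package.

```python
# Hedged sketch: module paths follow the files changed in this diff;
# "my_dataset.json" and my_inference are illustrative placeholders.
from scorebook.eval_datasets import EvalDataset          # moved from scorebook/eval_dataset.py
from scorebook.evaluate._sync.evaluate import evaluate   # sync entry point changed below

def my_inference(inputs, **hyperparameters):
    # Must return one prediction per input item (see run_inference_callable below).
    return ["placeholder answer" for _ in inputs]

results = evaluate(
    my_inference,
    datasets="my_dataset.json",       # or an EvalDataset instance
    hyperparameters={"temperature": 0.0},
    upload_results=False,             # skip the Trismik upload path
    show_progress=False,              # new in 0.0.12; None defers to settings
)
```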
scorebook/evaluate/_sync/evaluate.py
CHANGED
@@ -2,15 +2,10 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -19,11 +14,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -31,14 +26,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from 
-
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -50,6 +49,7 @@ def evaluate(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +67,8 @@
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +77,7 @@
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -85,7 +88,7 @@
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -95,10 +98,24 @@
 
     with trismik_client or nullcontext():
         # Execute evaluation runs
-
+        # Calculate total items across all runs
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-
-
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = execute_runs(
                 inference,
@@ -133,23 +150,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
        run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)
 
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
@@ -168,6 +194,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -176,7 +203,9 @@ def execute_run(
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -194,35 +223,90 @@ def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-def execute_classic_eval_run(
-
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-
+    scores = None
 
     try:
+        # 1. Run inference
         inference_outputs = run_inference_callable(
-            inference, run.
+            inference, run.inputs, run.hyperparameter_config
+        )
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
 
     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 def run_inference_callable(
     inference: Callable,
-
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = inference(
+        predictions = inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +314,11 @@ def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the 
-            f"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,100 +341,20 @@ def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-
-
-
-    adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
-
-    return adaptive_eval_run_result
-
-
-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-
-
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec items, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(item),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
+        adaptive_eval_run_result = run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
        )
-
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+        return adaptive_eval_run_result
 
-
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 def run_adaptive_evaluation(
@@ -410,4 +414,4 @@ def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
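In the rewritten `execute_classic_eval_run` above, per-item scoring and the Trismik upload both go through the new `score()` function instead of the removed `score_metrics()`/`upload_classic_run_results()` pair. Going by that call site, a standalone use might look like the sketch below; the keyword names are copied from the diff, while the string metric name and the exact return shape are assumptions on my part (the new code only shows that `aggregate_results[0]` may carry a `run_id`).

```python
# Hedged sketch of the new scoring entry point; "accuracy" is an assumed metric
# spec (the diff passes run.dataset.metrics), and the items are illustrative.
from scorebook.score._sync.score import score

items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

scores = score(
    items=items,
    metrics=["accuracy"],          # assumption: metric specs accepted here
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,          # with uploads on, run_id shows up in aggregate_results
    show_progress=False,
)
print(scores.get("aggregate_results"))
```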
scorebook/evaluate/evaluate_helpers.py
CHANGED
@@ -2,9 +2,8 @@
 
 import asyncio
 import dataclasses
-import inspect
 import logging
-from typing import Any, Callable, Dict, Iterable, List, 
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
 from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
@@ -25,14 +24,34 @@ from scorebook.utils import expand_dict, is_awaitable
 logger = logging.getLogger(__name__)
 
 
-
-
+# TODO: Remove this when backend supports boolean item metrics
+NORMALIZE_METRICS_FOR_UPLOAD = True
 
-    if upload_results == "auto":
-        upload_results = get_token() is not None
-        logger.debug("Auto upload results resolved to: %s", upload_results)
 
-
+def normalize_metric_value(value: Any) -> Any:
+    """Normalize metric values for API upload compatibility.
+
+    TEMPORARY WORKAROUND: The Trismik API currently rejects boolean metric values.
+    This function converts boolean values to floats (True -> 1.0, False -> 0.0)
+    to ensure upload compatibility.
+
+    Args:
+        value: The metric value to normalize
+
+    Returns:
+        Float if value is bool, otherwise unchanged
+
+    TODO: Remove this function when backend supports boolean metrics natively.
+    To revert: Set NORMALIZE_METRICS_FOR_UPLOAD = False
+    """
+    if not NORMALIZE_METRICS_FOR_UPLOAD:
+        return value
+
+    # Convert booleans to floats for API compatibility
+    if isinstance(value, bool):
+        return float(value)  # True -> 1.0, False -> 0.0
+
+    return value
 
 
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
@@ -93,7 +112,7 @@ def prepare_datasets(
 
     # Prepare adaptive datasets
     elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-        datasets_out.append(AdaptiveEvalDataset(dataset
+        datasets_out.append(AdaptiveEvalDataset(dataset))
 
     # TODO: dataset name string registry
     elif isinstance(dataset, str):
@@ -172,15 +191,22 @@ def build_classic_eval_run_spec(
     hyperparameters: Dict[str, Any],
     hyperparameters_index: int,
 ) -> EvalRunSpec:
-    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
-
-
+    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
+
+    Extracts input and label values from the appropriate columns in the dataset.
+    The column names are determined by dataset.input and dataset.label,
+    which may be original field names (e.g., "question", "answer") or computed
+    column names (e.g., "*input", "*label") if templates were used.
+    """
+    # Extract inputs and labels using the dataset's column specifications
+    inputs = dataset[dataset.input]  # Returns List[Any]
+    labels = dataset[dataset.label]  # Returns List[Any]
     eval_run_spec = EvalRunSpec(
         dataset,
         dataset_index,
         hyperparameters,
         hyperparameters_index,
-
+        inputs,
         labels,
     )
     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
@@ -197,9 +223,9 @@ def build_adaptive_eval_run_spec(
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
-    dataset
+    # Keep the full dataset name including ":adaptive" suffix for backend API
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
-
+        adaptive_dataset,
         dataset_index,
         hyperparameter_config,
         hyperparameter_config_index,
@@ -256,7 +282,7 @@ def create_trismik_sync_client() -> TrismikClient:
 def get_model_name(
     inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
 ) -> str:
-    """Determine a model's name with the fallback "
+    """Determine a model's name with the fallback "Model"."""
 
     # First priority: metadata.model
     if metadata and "model" in metadata:
@@ -266,8 +292,8 @@ def get_model_name(
     if inference_callable and hasattr(inference_callable, "model"):
         return str(inference_callable.model)
 
-    # Fallback: "
-    return "
+    # Fallback: "Model"
+    return "Model"
 
 
 def format_results(
@@ -292,7 +318,7 @@ def format_results(
     # Remove inference output if not requested
     if not return_output:
         for item in item_scores:
-            item.pop("
+            item.pop("output", None)
 
     results["item_results"] = item_scores
 
@@ -322,10 +348,7 @@ def make_trismik_inference(
     """
 
     # Check if the inference function is async
-    is_async = (
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
+    is_async = is_awaitable(inference_function)
 
     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
         # Single TrismikMultipleChoiceTextItem dataclass
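Two smaller helper changes above are easy to miss: metric values are normalized before upload (booleans become floats while the `NORMALIZE_METRICS_FOR_UPLOAD` workaround is active), and the model-name fallback is now the literal string "Model". A quick sketch of that behavior exactly as written in the hunks above, not an official example:

```python
# Behavior as written in the evaluate_helpers.py hunks above.
from scorebook.evaluate.evaluate_helpers import get_model_name, normalize_metric_value

assert normalize_metric_value(True) == 1.0    # bools coerced for the upload API
assert normalize_metric_value(0.87) == 0.87   # other values pass through unchanged
assert get_model_name() == "Model"            # fallback when no metadata or .model attribute
```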
scorebook/exceptions.py
CHANGED
@@ -10,6 +10,54 @@ class ScoreBookError(Exception):
     """Base exception class for all Scorebook-related errors."""
 
 
+class EvalDatasetError(ScoreBookError):
+    """Base exception class for all EvalDataset errors."""
+
+
+class DatasetConfigurationError(EvalDatasetError):
+    """Raised when dataset configuration is invalid (e.g., mutually exclusive parameters)."""
+
+
+class MissingFieldError(EvalDatasetError):
+    """Raised when required field is missing from dataset."""
+
+    def __init__(self, field_name: str, field_type: str, available_fields: list[str]):
+        """Initialize missing field error with structured context."""
+        self.field_name = field_name
+        self.field_type = field_type  # "input" or "label"
+        self.available_fields = available_fields
+        super().__init__(
+            f"{field_type.capitalize()} field '{field_name}' not found. "
+            f"Available fields: {', '.join(available_fields)}"
+        )
+
+
+class DatasetLoadError(EvalDatasetError):
+    """Raised when dataset fails to load from source (file or remote)."""
+
+
+class DatasetParseError(EvalDatasetError):
+    """Raised when dataset file cannot be parsed (CSV, JSON, YAML)."""
+
+
+class DatasetNotInitializedError(EvalDatasetError):
+    """Raised when operations are attempted on uninitialized dataset."""
+
+
+class DatasetSampleError(EvalDatasetError):
+    """Raised when sampling parameters are invalid."""
+
+    def __init__(self, sample_size: int, dataset_size: int, dataset_name: str):
+        """Initialize dataset sample error with structured context."""
+        self.sample_size = sample_size
+        self.dataset_size = dataset_size
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Sample size {sample_size} exceeds dataset size {dataset_size} "
+            f"for dataset '{dataset_name}'"
+        )
+
+
 class EvaluationError(ScoreBookError):
     """Raised when there are errors during model evaluation."""
 
@@ -36,10 +84,14 @@ class MetricComputationError(EvaluationError):
     )
 
 
-class 
+class ScoreError(ScoreBookError):
+    """Raised when there are errors during scoring."""
+
+
+class DataMismatchError(ScoreError):
     """Raised when there's a mismatch between outputs and expected labels."""
 
-    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str = "Dataset"):
         """Initialize data mismatch error."""
         self.outputs_count = outputs_count
         self.labels_count = labels_count
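The expanded exception hierarchy gives callers something narrower than `ScoreBookError` to catch for dataset problems. A small sketch of how the classes above compose; only the exception types and their attributes come from this diff, the raising function is a placeholder:

```python
# Sketch: the loader below is a stand-in for any EvalDataset loading/sampling call;
# the exception classes and inheritance are taken from scorebook/exceptions.py above.
from scorebook.exceptions import DatasetSampleError, EvalDatasetError, ScoreBookError

def load_my_dataset():
    # Placeholder that simulates an oversized sample request.
    raise DatasetSampleError(sample_size=500, dataset_size=100, dataset_name="demo")

try:
    load_my_dataset()
except DatasetSampleError as e:
    print(f"asked for {e.sample_size} items, dataset has {e.dataset_size}")  # structured context
except EvalDatasetError:
    print("other dataset error (configuration, load, parse, ...)")
except ScoreBookError:
    print("any other scorebook failure")
```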