scorebook 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +8 -1
- scorebook/eval_datasets/eval_dataset.py +18 -0
- scorebook/evaluate/_async/evaluate_async.py +116 -126
- scorebook/evaluate/_sync/evaluate.py +116 -127
- scorebook/evaluate/evaluate_helpers.py +98 -25
- scorebook/exceptions.py +6 -2
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +35 -54
- scorebook/utils/__init__.py +8 -1
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/progress_bars.py +67 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/METADATA +2 -2
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/RECORD +22 -14
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/WHEEL +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.13.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/_sync/evaluate.py CHANGED

```diff
@@ -2,13 +2,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.
-
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -19,12 +14,12 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-
-    resolve_upload_results,
-    score_metrics,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,15 +27,20 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from
-
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -59,6 +59,7 @@ def evaluate(
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
@@ -83,14 +84,14 @@
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +102,14 @@
     with trismik_client or nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)
 
         with evaluation_progress_context(
@@ -145,34 +153,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-            )
-            run_result.run_id = run_id
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=False)
-                # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
@@ -191,6 +197,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +206,9 @@ def execute_run(
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +226,79 @@ def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-def execute_classic_eval_run(
-
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-
+    scores = None
 
     try:
+        # 1. Run inference
         inference_outputs = run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-
-
-
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )
 
     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 def run_inference_callable(
@@ -296,93 +360,6 @@ def execute_adaptive_eval_run(
         return AdaptiveEvalRunResult(run, False, {})
 
 
-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
@@ -403,8 +380,20 @@ def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(
```
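Taken together, these changes mean classic runs no longer assemble Trismik request objects by hand: the inference outputs are handed to the new `score()` entry point, which scores the items, optionally uploads them, and reports the resulting `run_id` under `aggregate_results`. A rough sketch of that call shape follows; the keyword names mirror the `score(...)` call in the diff above, while the `from scorebook.score import score` path, the plain-string metric spec, and the example items are assumptions, not something this diff confirms.

```python
# Hedged sketch of the new score() entry point, not a confirmed API:
# keyword names are copied from execute_classic_eval_run() above; the
# import path and the "accuracy" metric spec are assumptions.
from scorebook.score import score

items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

scores = score(
    items=items,
    metrics=["accuracy"],   # evaluate() passes run.dataset.metrics here
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,   # skip the Trismik upload path entirely
    show_progress=False,
)

# When uploads are enabled, the diff shows the run_id surfacing here:
run_id = None
if scores.get("aggregate_results"):
    run_id = scores["aggregate_results"][0].get("run_id")
```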
scorebook/evaluate/evaluate_helpers.py CHANGED

```diff
@@ -2,9 +2,8 @@
 
 import asyncio
 import dataclasses
-import inspect
 import logging
-from typing import Any, Callable, Dict, Iterable, List,
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
 from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
@@ -25,30 +24,34 @@ from scorebook.utils import expand_dict, is_awaitable
 logger = logging.getLogger(__name__)
 
 
-
-
+# TODO: Remove this when backend supports boolean item metrics
+NORMALIZE_METRICS_FOR_UPLOAD = True
 
-    if upload_results == "auto":
-        upload_results = get_token() is not None
-        logger.debug("Auto upload results resolved to: %s", upload_results)
 
-
+def normalize_metric_value(value: Any) -> Any:
+    """Normalize metric values for API upload compatibility.
 
-
-
-
+    TEMPORARY WORKAROUND: The Trismik API currently rejects boolean metric values.
+    This function converts boolean values to floats (True -> 1.0, False -> 0.0)
+    to ensure upload compatibility.
 
     Args:
-
+        value: The metric value to normalize
 
     Returns:
-
+        Float if value is bool, otherwise unchanged
+
+    TODO: Remove this function when backend supports boolean metrics natively.
+    To revert: Set NORMALIZE_METRICS_FOR_UPLOAD = False
     """
-    if
-
+    if not NORMALIZE_METRICS_FOR_UPLOAD:
+        return value
+
+    # Convert booleans to floats for API compatibility
+    if isinstance(value, bool):
+        return float(value)  # True -> 1.0, False -> 0.0
 
-
-    return show_progress
+    return value
 
 
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
@@ -88,6 +91,7 @@ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
 
 def prepare_datasets(
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     sample_size: Optional[int] = None,
 ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
@@ -101,6 +105,12 @@
 
         # Prepare classic datasets
         if isinstance(dataset, EvalDataset):
+            # Warn if dataset split differs from provided split parameter
+            if split is not None and dataset.split is not None and dataset.split != split:
+                logger.warning(
+                    f"Dataset '{dataset.name}' has split '{dataset.split}' but evaluate split "
+                    f"parameter is '{split}'. The dataset split will be used."
+                )
 
             if sample_size is not None:
                 dataset = dataset.sample(sample_size)
@@ -108,8 +118,17 @@
             datasets_out.append(dataset)
 
         # Prepare adaptive datasets
-        elif isinstance(dataset, str) and
-
+        elif isinstance(dataset, str) and ":adaptive" in dataset:
+            # Parse adaptive dataset
+            parts = dataset.split(":")
+            if len(parts) != 2 or parts[1] != "adaptive":
+                raise ParameterValidationError(
+                    f"Invalid adaptive dataset format: '{dataset}'. "
+                    f"Use 'test_id:adaptive' format and specify split via the split parameter."
+                )
+
+            # Use the split parameter for all adaptive datasets
+            datasets_out.append(AdaptiveEvalDataset(name=dataset, split=split))
 
         # TODO: dataset name string registry
         elif isinstance(dataset, str):
@@ -171,6 +190,7 @@ def build_eval_run_specs(
                 hyperparameters_index,
                 experiment_id,
                 project_id,
+                dataset.split,
                 metadata,
             )
         )
@@ -217,17 +237,19 @@ def build_adaptive_eval_run_spec(
     hyperparameter_config_index: int,
     experiment_id: str,
     project_id: str,
+    split: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
-    dataset
+    # Keep the full dataset name including ":adaptive" suffix for backend API
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
-
+        adaptive_dataset,
         dataset_index,
         hyperparameter_config,
         hyperparameter_config_index,
         experiment_id,
         project_id,
+        split,
         metadata,
     )
     logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
@@ -345,10 +367,7 @@ def make_trismik_inference(
     """
 
     # Check if the inference function is async
-    is_async =
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
+    is_async = is_awaitable(inference_function)
 
     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
         # Single TrismikMultipleChoiceTextItem dataclass
@@ -386,3 +405,57 @@ def make_trismik_inference(
     )
 
     return sync_trismik_inference_function
+
+
+def resolve_adaptive_split(
+    test_id: str,
+    user_specified_split: Optional[str],
+    available_splits: List[str],
+) -> str:
+    """Resolve the dataset split to use for adaptive evaluation.
+
+    Resolution order:
+    1. If user specified a split, validate it exists and use it
+    2. If not specified and exactly one split is available, use it
+    3. If not specified and multiple splits are available, raise an error
+    4. If no splits are available, raise an error
+
+    Args:
+        test_id: The test dataset ID (without ":adaptive" suffix)
+        user_specified_split: Optional split name specified by the user
+        available_splits: List of available split names for this dataset
+
+    Returns:
+        The resolved split name to use
+
+    Raises:
+        ScoreBookError: If the specified split doesn't exist, multiple splits exist without
+            user specification, or no splits are available
+    """
+    logger.debug(f"Available splits for {test_id}: {available_splits}")
+
+    # If user specified a split, validate and use it
+    if user_specified_split is not None:
+        if user_specified_split in available_splits:
+            logger.info(f"Using user-specified split '{user_specified_split}' for {test_id}")
+            return user_specified_split
+        else:
+            raise ScoreBookError(
+                f"Specified split '{user_specified_split}' not found for dataset '{test_id}'. "
+                f"Available splits: {available_splits}"
+            )
+
+    # No split specified - check available splits
+    if len(available_splits) == 0:
+        raise ScoreBookError(f"No splits available for dataset '{test_id}'. ")
+    elif len(available_splits) == 1:
+        # Exactly one split - auto-select it
+        selected_split = available_splits[0]
+        logger.info(f"Auto-selecting only available split '{selected_split}' for {test_id}")
+        return selected_split
+    else:
+        # Multiple splits available - user must specify
+        raise ScoreBookError(
+            f"Multiple splits available for dataset '{test_id}': {available_splits}. "
+            f"Please specify which split to use via evaluate's 'split' parameter."
+        )
```
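The new `resolve_adaptive_split()` helper pins down how adaptive runs choose a split: a user-supplied split must be one of the available splits, a single available split is auto-selected, and every other case raises `ScoreBookError`. A small illustration of those three outcomes, using made-up test IDs and split names; the import paths follow the modules shown in this diff:

```python
from scorebook.evaluate.evaluate_helpers import resolve_adaptive_split
from scorebook.exceptions import ScoreBookError

# Only one split exists and none was requested -> it is auto-selected.
assert resolve_adaptive_split("my-test", None, ["validation"]) == "validation"

# A requested split is accepted only if it is actually available.
assert resolve_adaptive_split("my-test", "test", ["validation", "test"]) == "test"

# Several splits with none requested, or an unknown split, raise ScoreBookError.
for requested, available in ((None, ["validation", "test"]), ("train", ["validation", "test"])):
    try:
        resolve_adaptive_split("my-test", requested, available)
    except ScoreBookError as exc:
        print(exc)
```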
scorebook/exceptions.py CHANGED

```diff
@@ -84,10 +84,14 @@ class MetricComputationError(EvaluationError):
         )
 
 
-class
+class ScoreError(ScoreBookError):
+    """Raised when there are errors during scoring."""
+
+
+class DataMismatchError(ScoreError):
     """Raised when there's a mismatch between outputs and expected labels."""
 
-    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str = "Dataset"):
         """Initialize data mismatch error."""
         self.outputs_count = outputs_count
         self.labels_count = labels_count
```
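The exceptions change re-parents `DataMismatchError` under a new `ScoreError` base (itself a `ScoreBookError`) and gives `dataset_name` a default of `"Dataset"`. A quick illustration with made-up counts:

```python
from scorebook.exceptions import DataMismatchError, ScoreBookError, ScoreError

try:
    raise DataMismatchError(outputs_count=10, labels_count=8)  # dataset_name now defaults to "Dataset"
except ScoreError as exc:
    # Still a ScoreBookError, so existing broad handlers keep working.
    assert isinstance(exc, ScoreBookError)
    print(exc.outputs_count, exc.labels_count)
```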