scorebook 0.0.11-py3-none-any.whl → 0.0.12-py3-none-any.whl
This diff compares publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as published.
- scorebook/__init__.py +8 -1
- scorebook/evaluate/_async/evaluate_async.py +100 -125
- scorebook/evaluate/_sync/evaluate.py +100 -126
- scorebook/evaluate/evaluate_helpers.py +24 -24
- scorebook/exceptions.py +6 -2
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +33 -54
- scorebook/utils/__init__.py +8 -1
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/progress_bars.py +67 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/METADATA +2 -2
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/RECORD +21 -13
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.11.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py
CHANGED
@@ -12,15 +12,22 @@ __version__ = importlib.metadata.version(__package__ or __name__)
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate import evaluate, evaluate_async
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.
+from scorebook.score import score, score_async
+from scorebook.trismik.credentials import login, logout, whoami
+from scorebook.trismik.upload_results import upload_result, upload_result_async
 from scorebook.utils.render_template import render_template

 __all__ = [
     "EvalDataset",
     "evaluate",
     "evaluate_async",
+    "score",
+    "score_async",
     "render_template",
     "login",
+    "logout",
     "whoami",
     "InferencePipeline",
+    "upload_result",
+    "upload_result_async",
 ]
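For orientation, here is a minimal sketch of the API surface exported above. It is illustrative only: the keyword arguments mirror the ones the evaluate internals pass to score()/score_async() later in this diff, the items and metric value are made up, and exact signatures are not documented here.

    import scorebook

    # Illustrative items: model outputs paired with reference labels.
    items = [
        {"input": "2 + 2 =", "output": "4", "label": "4"},
        {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
    ]

    # score() is the new synchronous entry point; score_async() is its async twin.
    results = scorebook.score(
        items=items,
        metrics=["accuracy"],   # assumed metric spec, for illustration only
        output_column="output",
        label_column="label",
        input_column="input",
        upload_results=False,   # True uploads to the Trismik platform (requires scorebook.login)
    )
    print(results.get("aggregate_results"))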
scorebook/evaluate/_async/evaluate_async.py
CHANGED

@@ -3,13 +3,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.
-
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata

 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -20,12 +15,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_show_progress,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._async.score_async import score_async
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -33,13 +27,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import
+from scorebook.utils import (
+    async_nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)

 logger = logging.getLogger(__name__)


 async def evaluate_async(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -90,7 +89,7 @@ async def evaluate_async(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )

-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +100,14 @@ async def evaluate_async(
     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)

         with evaluation_progress_context(
@@ -145,34 +151,32 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-                )
-                run_result.run_id = run_id
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=True)
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=
-            # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)

         return run_result

@@ -191,6 +195,7 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +204,9 @@ async def execute_run(
     """Execute a single evaluation run."""

     if isinstance(run, EvalRunSpec):
-        return await execute_classic_eval_run(
+        return await execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )

     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +224,79 @@ async def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


-async def execute_classic_eval_run(
-
+async def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)

     inference_outputs = None
-
+    scores = None

     try:
+        # 1. Run inference
         inference_outputs = await run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-
-
-
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = await score_async(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )

     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )


 async def run_inference_callable(
@@ -296,93 +358,6 @@ async def execute_adaptive_eval_run(
     return AdaptiveEvalRunResult(run, False, {})


-async def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 async def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
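In both evaluate implementations, the classic-eval path is now a thin wrapper around the new score module: run inference, pack the outputs into item dicts, and let score_async()/score() handle scoring plus the optional upload. Below is a condensed sketch of that flow; the helper name run_classic_eval and the inference callable are hypothetical, while the score_async call mirrors the one added in the diff above.

    from scorebook import score_async

    async def run_classic_eval(inference, inputs, labels, metrics, upload_results=False):
        # 1. Run inference (any async callable returning one output per input).
        outputs = [await inference(x) for x in inputs]

        # 2. Build the item dicts expected by score_async.
        items = [
            {
                "input": inputs[i] if i < len(inputs) else None,
                "output": outputs[i],
                "label": labels[i] if i < len(labels) else "",
            }
            for i in range(len(outputs))
        ]

        # 3. Score (and optionally upload) in a single call.
        scores = await score_async(
            items=items,
            metrics=metrics,
            output_column="output",
            label_column="label",
            input_column="input",
            upload_results=upload_results,
            show_progress=False,
        )

        # 4. When an upload happened, the platform run id surfaces in the
        #    aggregate results, exactly as execute_classic_eval_run checks above.
        run_id = None
        if scores.get("aggregate_results"):
            run_id = scores["aggregate_results"][0].get("run_id")
        return scores, run_id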
scorebook/evaluate/_sync/evaluate.py
CHANGED

@@ -2,13 +2,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.
-
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata

 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -19,12 +14,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_show_progress,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,14 +26,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from
-
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)

 logger = logging.getLogger(__name__)


 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -90,7 +88,7 @@ def evaluate(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )

-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +99,14 @@ def evaluate(
     with trismik_client or nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)

         with evaluation_progress_context(
@@ -145,34 +150,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-                )
-                run_result.run_id = run_id
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=True)
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=
-            # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)

         return run_result

@@ -191,6 +194,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +203,9 @@ def execute_run(
     """Execute a single evaluation run."""

     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )

     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +223,79 @@ def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


-def execute_classic_eval_run(
-
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)

     inference_outputs = None
-
+    scores = None

     try:
+        # 1. Run inference
         inference_outputs = run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-
-
-
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )

     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )


 def run_inference_callable(
@@ -296,93 +357,6 @@ def execute_adaptive_eval_run(
     return AdaptiveEvalRunResult(run, False, {})


-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
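One smaller change shared by both evaluate files: progress totals now come from the dataset size for classic runs and from the Trismik max_iterations setting for adaptive runs, via the new trismik.settings import. A sketch of that calculation in isolation (estimated_total_items is a hypothetical helper; EvalRunSpec and evaluation_settings are the names used in the diff):

    from trismik.settings import evaluation_settings

    from scorebook.types import EvalRunSpec

    def estimated_total_items(run_specs):
        # Classic runs contribute one unit of progress per dataset item;
        # adaptive runs are budgeted at the configured iteration cap.
        return sum(
            len(run.dataset.items)
            if isinstance(run, EvalRunSpec)
            else evaluation_settings["max_iterations"]
            for run in run_specs
        )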