scorebook 0.0.10-py3-none-any.whl → 0.0.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +11 -4
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/_async/evaluate_async.py +135 -130
- scorebook/evaluate/_sync/evaluate.py +135 -131
- scorebook/evaluate/evaluate_helpers.py +46 -23
- scorebook/exceptions.py +54 -2
- scorebook/inference/clients/bedrock.py +1 -1
- scorebook/inference/clients/portkey.py +1 -1
- scorebook/inference/clients/vertex.py +1 -1
- scorebook/score/__init__.py +6 -0
- scorebook/score/_async/__init__.py +0 -0
- scorebook/score/_async/score_async.py +145 -0
- scorebook/score/_sync/__init__.py +0 -0
- scorebook/score/_sync/score.py +145 -0
- scorebook/score/score_helpers.py +207 -0
- scorebook/settings.py +3 -0
- scorebook/trismik/upload_results.py +254 -0
- scorebook/types.py +36 -54
- scorebook/utils/__init__.py +11 -4
- scorebook/utils/common_helpers.py +41 -0
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +819 -70
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/METADATA +3 -2
- scorebook-0.0.12.dist-info/RECORD +50 -0
- scorebook/eval_dataset.py +0 -404
- scorebook-0.0.10.dist-info/RECORD +0 -41
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/WHEEL +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.10.dist-info → scorebook-0.0.12.dist-info}/licenses/LICENSE +0 -0
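For downstream users, the file list above amounts to a small API reshuffle: the single module scorebook/eval_dataset.py is replaced by an eval_datasets package, and a new scorebook/score package appears alongside evaluate. A minimal sketch of the import-side effect, based only on the import changes shown in the diff below (the 0.0.10 path is inferred from the removed import, not shown verbatim):

```python
# scorebook 0.0.10 (old path, inferred from the removed import in the diff below)
# from scorebook.eval_dataset import EvalDataset

# scorebook 0.0.12 (paths taken from the new imports in this diff)
from scorebook.eval_datasets import EvalDataset
from scorebook.score._async.score_async import score_async
```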
scorebook/evaluate/_async/evaluate_async.py

@@ -3,15 +3,10 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata

-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -20,11 +15,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._async.score_async import score_async
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,13 +27,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import
+from scorebook.utils import (
+    async_nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)

 logger = logging.getLogger(__name__)


 async def evaluate_async(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -50,6 +50,7 @@ async def evaluate_async(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +68,8 @@ async def evaluate_async(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.

     Returns:
         The evaluation results in the format specified by return parameters:
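The docstring addition above pins down the new show_progress switch: None defers to the SHOW_PROGRESS_BARS setting, while an explicit boolean overrides it per call. A minimal call sketch under stated assumptions (the inference function, dataset identifier, and hyperparameter values are hypothetical; only the parameter names come from the signature in this diff, and evaluate_async is imported here via its module path, which the package may also re-export at the top level):

```python
import asyncio

from scorebook.evaluate._async.evaluate_async import evaluate_async


async def my_inference(inputs, **hyperparameters):
    # Hypothetical inference callable: one prediction per input item.
    return ["placeholder answer" for _ in inputs]


async def main():
    result = await evaluate_async(
        my_inference,
        datasets="my_eval_dataset",  # hypothetical identifier; str is accepted per the signature
        hyperparameters={"temperature": 0.0},
        show_progress=False,   # new in 0.0.12; None falls back to settings
        upload_results=False,  # skip the Trismik upload path
    )
    print(result)


asyncio.run(main())
```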
@@ -75,6 +78,7 @@ async def evaluate_async(
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate_async)

     # Prepare datasets, hyperparameters, and eval run specs
@@ -85,7 +89,7 @@ async def evaluate_async(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )

-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -95,10 +99,24 @@ async def evaluate_async(

     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
-
+        # Calculate total items across all runs
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-
-
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = await execute_runs(
                 inference,
@@ -133,23 +151,32 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)

+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)

         return run_result

@@ -168,6 +195,7 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -176,7 +204,9 @@ async def execute_run(
     """Execute a single evaluation run."""

     if isinstance(run, EvalRunSpec):
-        return await execute_classic_eval_run(
+        return await execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )

     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -194,35 +224,90 @@ async def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


-async def execute_classic_eval_run(
-
+async def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)

     inference_outputs = None
-
+    scores = None

     try:
+        # 1. Run inference
         inference_outputs = await run_inference_callable(
-            inference, run.
+            inference, run.inputs, run.hyperparameter_config
+        )
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = await score_async(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)

     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )


 async def run_inference_callable(
     inference: Callable,
-
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""

     try:
-        predictions = await inference(
+        predictions = await inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
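The rewritten execute_classic_eval_run above now delegates scoring (and, when enabled, uploading) to the new score_async, feeding it a list of dicts keyed by input/output/label plus explicit column names. A standalone sketch of that call shape under stated assumptions: the item values and the metrics=["accuracy"] argument are illustrative only, and whether score_async accepts metric names in that form is not shown in this diff; the keyword names mirror the call in this hunk.

```python
import asyncio

from scorebook.score._async.score_async import score_async


async def main():
    items = [
        {"input": "2 + 2 = ?", "output": "4", "label": "4"},
        {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
    ]
    scores = await score_async(
        items=items,
        metrics=["accuracy"],  # assumption: metric identifiers are accepted here
        output_column="output",
        label_column="label",
        input_column="input",
        upload_results=False,  # skip the Trismik upload path
        show_progress=False,
    )
    # Per the run_id extraction above, aggregate scores appear under "aggregate_results".
    print(scores.get("aggregate_results"))


asyncio.run(main())
```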
@@ -230,11 +315,11 @@ async def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e

-    if not isinstance(predictions, list) or len(predictions) != len(
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the
-            f"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )

     if all(prediction == "" for prediction in predictions):
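The length check above, together with await inference(inputs, **hyperparameter_config) in the previous hunk, fixes the contract for inference callables on the async path: awaitable, called with the input list plus hyperparameters as keyword arguments, and returning a list with one prediction per input. A toy conforming callable (the name and echo behaviour are illustrative only):

```python
from typing import Any, List


async def echo_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    """Toy callable satisfying the contract enforced above: it is awaited,
    receives the hyperparameter config as keyword arguments, and returns a
    list the same length as ``inputs``."""
    temperature = hyperparameters.get("temperature", 0.0)
    return [f"echo({item!r}, temperature={temperature})" for item in inputs]
```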
@@ -257,100 +342,20 @@ async def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)

-
-
-
-    adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
-
-    return adaptive_eval_run_result
-
-
-async def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")

-
-
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec items, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(item),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
+        adaptive_eval_run_result = await run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
         )
-
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

-
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+        return adaptive_eval_run_result

-
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})


 async def run_adaptive_evaluation(
@@ -410,4 +415,4 @@ async def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)

-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)