scorebook 0.0.11-py3-none-any.whl → 0.0.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scorebook/__init__.py CHANGED
@@ -12,15 +12,22 @@ __version__ = importlib.metadata.version(__package__ or __name__)
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate import evaluate, evaluate_async
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.trismik.credentials import login, whoami
+from scorebook.score import score, score_async
+from scorebook.trismik.credentials import login, logout, whoami
+from scorebook.trismik.upload_results import upload_result, upload_result_async
 from scorebook.utils.render_template import render_template

 __all__ = [
     "EvalDataset",
     "evaluate",
     "evaluate_async",
+    "score",
+    "score_async",
     "render_template",
     "login",
+    "logout",
     "whoami",
     "InferencePipeline",
+    "upload_result",
+    "upload_result_async",
 ]
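
Version 0.0.12 re-exports `score`, `score_async`, `logout`, `upload_result`, and `upload_result_async` at the package root. A minimal sketch of the new top-level scoring call follows; the keyword arguments mirror the internal `score()`/`score_async()` call shown later in this diff, while the "accuracy" metric name, the example items, and the printed return key are assumptions rather than documented API.

# Hypothetical usage of the API surface exported above (names taken from __all__).
from scorebook import score

items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics="accuracy",        # assumed metric identifier
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,      # keep the run local; no Trismik credentials needed
)
print(results.get("aggregate_results"))
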
@@ -3,13 +3,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata

 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -20,12 +15,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_show_progress,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._async.score_async import score_async
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -33,13 +27,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import async_nullcontext, evaluation_progress_context
+from scorebook.utils import (
+    async_nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)

 logger = logging.getLogger(__name__)


 async def evaluate_async(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -90,7 +89,7 @@ async def evaluate_async(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )

-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +100,14 @@ async def evaluate_async(
     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)

         with evaluation_progress_context(
@@ -145,34 +151,32 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            items_processed = len(run.dataset.items)
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-            try:
-                run_id = await upload_classic_run_results(
-                    run_result, experiment_id, project_id, inference, metadata, trismik_client
-                )
-                run_result.run_id = run_id
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=True)
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=False)
-            # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)

         return run_result

@@ -191,6 +195,7 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +204,9 @@ async def execute_run(
     """Execute a single evaluation run."""

     if isinstance(run, EvalRunSpec):
-        return await execute_classic_eval_run(inference, run)
+        return await execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )

     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +224,79 @@ async def execute_run(
         raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


-async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
-    """Execute a classic evaluation run."""
+async def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)

     inference_outputs = None
-    metric_scores = None
+    scores = None

     try:
+        # 1. Run inference
         inference_outputs = await run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = await score_async(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )

     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )


 async def run_inference_callable(
@@ -296,93 +358,6 @@ async def execute_adaptive_eval_run(
         return AdaptiveEvalRunResult(run, False, {})


-async def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 async def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
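
The removed `upload_classic_run_results` helper is superseded by `score_async(..., upload_results=True)`, which now performs the Trismik upload itself; callers detect success by looking for a `run_id` in the first `aggregate_results` entry, exactly as step 5 above does. A minimal sketch of that flow follows; the experiment/project ids and the "accuracy" metric name are placeholders, not values taken from this package.

# Sketch of the new upload path: scoring and uploading happen inside score_async(),
# and the run id is read back from the returned dict (see step 5 in the hunk above).
import asyncio

from scorebook import score_async

async def main() -> None:
    scores = await score_async(
        items=[{"input": "q1", "output": "a1", "label": "a1"}],
        metrics="accuracy",             # assumed metric identifier
        output_column="output",
        label_column="label",
        input_column="input",
        experiment_id="my-experiment",  # placeholder
        project_id="my-project",        # placeholder
        upload_results=True,
        show_progress=False,
    )
    aggregate = scores.get("aggregate_results") or []
    run_id = aggregate[0].get("run_id") if aggregate else None
    print("uploaded" if run_id is not None else "upload skipped or failed")

asyncio.run(main())
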
@@ -2,13 +2,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata

 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -19,12 +14,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_show_progress,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,14 +26,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from contextlib import nullcontext
-from scorebook.utils import evaluation_progress_context
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)

 logger = logging.getLogger(__name__)


 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -90,7 +88,7 @@ def evaluate(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )

-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +99,14 @@ def evaluate(
     with trismik_client or nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)

         with evaluation_progress_context(
@@ -145,34 +150,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
        run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
        )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            items_processed = len(run.dataset.items)
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)

+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-            try:
-                run_id = upload_classic_run_results(
-                    run_result, experiment_id, project_id, inference, metadata, trismik_client
-                )
-                run_result.run_id = run_id
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=True)
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=False)
-            # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)

         return run_result

@@ -191,6 +194,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +203,9 @@ def execute_run(
     """Execute a single evaluation run."""

     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(inference, run)
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )

     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +223,79 @@ def execute_run(
         raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


-def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
-    """Execute a classic evaluation run."""
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)

     inference_outputs = None
-    metric_scores = None
+    scores = None

     try:
+        # 1. Run inference
         inference_outputs = run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )

     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )


 def run_inference_callable(
@@ -296,93 +357,6 @@ def execute_adaptive_eval_run(
         return AdaptiveEvalRunResult(run, False, {})


-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
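
The synchronous `evaluate` mirrors the async changes above, including the widened `inference: Union[Callable, InferencePipeline]` signature. A sketch of a plain-callable invocation follows, using only parameters visible in this diff; the inference callable's exact expected signature, the dataset name, and the return shape are illustrative assumptions.

# Illustrative only: a plain callable is still accepted for `inference`.
from scorebook import evaluate

def my_inference(inputs, **hyperparameters):
    # Hypothetical inference: echo each input as the model output.
    return [str(item) for item in inputs]

results = evaluate(
    my_inference,
    datasets="my_dataset",                 # placeholder; str | EvalDataset | list accepted
    hyperparameters={"temperature": 0.0},
    metadata={"notes": "local smoke test"},
)
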