scorebook 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
@@ -3,15 +3,10 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata

-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -20,11 +15,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._async.score_async import score_async
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,13 +27,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import async_nullcontext, evaluation_progress
+from scorebook.utils import (
+    async_nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)

 logger = logging.getLogger(__name__)


 async def evaluate_async(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -50,6 +50,7 @@ async def evaluate_async(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +68,8 @@ async def evaluate_async(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.

     Returns:
         The evaluation results in the format specified by return parameters:
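
For orientation, here is a minimal usage sketch (not part of the diff) of the new show_progress parameter introduced in 0.0.12. The top-level import path, the dataset identifier "my_dataset", and the hyperparameter values are assumptions for illustration; the inference callable follows the contract enforced by run_inference_callable further down in this diff (a list of inputs in, a list of predictions of equal length out).

```python
import asyncio
from typing import Any, List

# Import path assumed; the diffed module lives under scorebook.evaluate.
from scorebook import evaluate_async


async def my_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    # Stand-in for a real model call: one non-empty prediction per input.
    return [f"prediction for {item!r}" for item in inputs]


results = asyncio.run(
    evaluate_async(
        my_inference,
        datasets="my_dataset",                 # placeholder dataset identifier
        hyperparameters={"temperature": 0.0},  # placeholder hyperparameters
        upload_results=False,
        show_progress=True,  # new in 0.0.12: overrides the SHOW_PROGRESS_BARS setting
    )
)
```
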
@@ -75,6 +78,7 @@
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate_async)

     # Prepare datasets, hyperparameters, and eval run specs
@@ -85,7 +89,7 @@ async def evaluate_async(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )

-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -95,10 +99,24 @@

     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
-        with evaluation_progress(
+        # Calculate total items across all runs
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-            hyperparameter_config_count=len(hyperparameter_configs),
-            run_count=len(eval_run_specs),
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = await execute_runs(
                 inference,
@@ -133,23 +151,32 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)

+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-            run_id = await upload_classic_run_results(
-                run_result, experiment_id, project_id, inference, metadata, trismik_client
-            )
-            run_result.run_id = run_id
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)

         return run_result

@@ -168,6 +195,7 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -176,7 +204,9 @@
     """Execute a single evaluation run."""

     if isinstance(run, EvalRunSpec):
-        return await execute_classic_eval_run(inference, run)
+        return await execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )

     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -194,35 +224,90 @@ async def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


-async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
-    """Execute a classic evaluation run."""
+async def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)

     inference_outputs = None
-    metric_scores = None
+    scores = None

     try:
+        # 1. Run inference
         inference_outputs = await run_inference_callable(
-            inference, run.dataset.items, run.hyperparameter_config
+            inference, run.inputs, run.hyperparameter_config
+        )
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = await score_async(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)

     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )


 async def run_inference_callable(
     inference: Callable,
-    items: List[Dict[str, Any]],
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""

     try:
-        predictions = await inference(items, **hyperparameter_config)
+        predictions = await inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +315,11 @@ async def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e

-    if not isinstance(predictions, list) or len(predictions) != len(items):
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the input items. "
-            f"Items length: {len(items)}, predictions length: {len(predictions)}"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )

     if all(prediction == "" for prediction in predictions):
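
The hunks above pin down the contract for a plain inference callable: it is awaited with the list of inputs plus the hyperparameter config expanded as keyword arguments, it must return a list with one prediction per input, and a result consisting entirely of empty strings triggers an additional check. A hedged sketch of a conforming callable; the echo behaviour and the "temperature" hyperparameter are illustrative only:

```python
from typing import Any, List


async def echo_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    """Toy inference callable satisfying run_inference_callable's checks."""
    temperature = hyperparameters.get("temperature", 0.0)  # hypothetical hyperparameter
    # Return exactly one non-empty prediction per input so the length check
    # and the empty-output check above both pass.
    return [f"echo({item!r}, temperature={temperature})" for item in inputs]
```
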
@@ -257,100 +342,20 @@ async def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)

-    if trismik_client is None:
-        raise ScoreBookError("Trismik client is required for adaptive evaluation")
-
-    adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
-
-    return adaptive_eval_run_result
-
-
-async def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")

-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec items, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(item),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
+        adaptive_eval_run_result = await run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
         )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)

-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+        return adaptive_eval_run_result

-    return run_id
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})


 async def run_adaptive_evaluation(
@@ -410,4 +415,4 @@ async def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)

-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
+ return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)