scorebook-0.0.9-py3-none-any.whl → scorebook-0.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. scorebook/__init__.py +14 -6
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/eval_datasets/__init__.py +5 -0
  4. scorebook/eval_datasets/eval_dataset.py +719 -0
  5. scorebook/evaluate/__init__.py +15 -0
  6. scorebook/evaluate/_async/__init__.py +0 -0
  7. scorebook/evaluate/_async/evaluate_async.py +443 -0
  8. scorebook/evaluate/_sync/__init__.py +0 -0
  9. scorebook/evaluate/_sync/evaluate.py +443 -0
  10. scorebook/evaluate/evaluate_helpers.py +388 -0
  11. scorebook/exceptions.py +48 -0
  12. scorebook/inference/__init__.py +4 -0
  13. scorebook/inference/clients/__init__.py +8 -0
  14. scorebook/inference/{bedrock.py → clients/bedrock.py} +1 -1
  15. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  16. scorebook/inference/{portkey.py → clients/portkey.py} +1 -1
  17. scorebook/inference/{vertex.py → clients/vertex.py} +1 -1
  18. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  19. scorebook/settings.py +21 -0
  20. scorebook/trismik/__init__.py +10 -0
  21. scorebook/types.py +8 -5
  22. scorebook/utils/__init__.py +11 -4
  23. scorebook/utils/async_utils.py +20 -1
  24. scorebook/utils/io_helpers.py +18 -5
  25. scorebook/utils/progress_bars.py +739 -96
  26. scorebook/utils/{build_prompt.py → render_template.py} +13 -12
  27. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/METADATA +4 -4
  28. scorebook-0.0.11.dist-info/RECORD +42 -0
  29. scorebook/eval_dataset.py +0 -404
  30. scorebook/evaluate.py +0 -623
  31. scorebook/trismik_services/__init__.py +0 -6
  32. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  33. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  34. scorebook-0.0.9.dist-info/RECORD +0 -36
  35. /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  36. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/WHEEL +0 -0
  37. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/entry_points.txt +0 -0
  38. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/_sync/evaluate.py
@@ -0,0 +1,443 @@
+ import logging
+ from contextlib import nullcontext
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
+
+ from trismik import TrismikAsyncClient, TrismikClient
+ from trismik.types import (
+     TrismikClassicEvalItem,
+     TrismikClassicEvalMetric,
+     TrismikClassicEvalRequest,
+     TrismikClassicEvalResponse,
+     TrismikRunMetadata,
+ )
+
+ from scorebook.eval_datasets import EvalDataset
+ from scorebook.evaluate.evaluate_helpers import (
+     build_eval_run_specs,
+     create_trismik_sync_client,
+     format_results,
+     get_model_name,
+     make_trismik_inference,
+     prepare_datasets,
+     prepare_hyperparameter_configs,
+     resolve_show_progress,
+     resolve_upload_results,
+     score_metrics,
+     validate_parameters,
+ )
+ from scorebook.exceptions import InferenceError, ScoreBookError
+ from scorebook.types import (
+     AdaptiveEvalRunResult,
+     AdaptiveEvalRunSpec,
+     ClassicEvalRunResult,
+     EvalResult,
+     EvalRunSpec,
+ )
+ from scorebook.utils import evaluation_progress_context
+
+ logger = logging.getLogger(__name__)
+
+
+ def evaluate(
+     inference: Callable,
+     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     return_dict: bool = True,
+     return_aggregates: bool = True,
+     return_items: bool = False,
+     return_output: bool = False,
+     upload_results: Union[Literal["auto"], bool] = "auto",
+     sample_size: Optional[int] = None,
+     show_progress: Optional[bool] = None,
+ ) -> Union[Dict, List, EvalResult]:
+     """
+     Evaluate a model across a collection of hyperparameters and datasets.
+
+     Args:
+         inference: The inference callable to evaluate
+         datasets: Dataset(s) to evaluate on
+         hyperparameters: Hyperparameter configuration(s) to evaluate with
+         metadata: Optional metadata to attach to the evaluation
+         experiment_id: Optional experiment identifier
+         project_id: Optional project identifier
+         return_dict: If True, returns eval results as a dict
+         return_aggregates: If True, returns aggregate scores for each dataset
+         return_items: If True, returns individual items for each dataset
+         return_output: If True, returns model outputs for each dataset item
+         upload_results: If True, uploads results to Trismik's dashboard; "auto" resolves this automatically
+         sample_size: Optional number of items to sample from each dataset
+         show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+             If True/False, explicitly enables/disables progress bars for this evaluation.
+
+     Returns:
+         The evaluation results in the format specified by return parameters:
+         - return_dict=True: Returns the evaluation results as a dict
+         - return_dict=False: Returns an EvalResult object containing all run results
+     """
+     # Resolve and validate parameters
+     upload_results = cast(bool, resolve_upload_results(upload_results))
+     show_progress_bars = resolve_show_progress(show_progress)
+     validate_parameters(locals(), evaluate)
+
+     # Prepare datasets, hyperparameters, and eval run specs
+     datasets = prepare_datasets(datasets, sample_size)
+     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
+     eval_run_specs = sorted(
+         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
+         key=lambda run: (run.dataset_index, run.hyperparameters_index),
+     )
+
+     # Create a Trismik client if needed (for adaptive evals or uploads)
+     needs_client = upload_results or any(
+         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
+     )
+
+     # Use context manager for automatic cleanup, or None if not needed
+     trismik_client = create_trismik_sync_client() if needs_client else None
+
+     with trismik_client or nullcontext():
+         # Calculate total items across all runs, then execute the evaluation runs
+         total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+         model_display = get_model_name(inference)
+
+         with evaluation_progress_context(
+             total_eval_runs=len(eval_run_specs),
+             total_items=total_items,
+             dataset_count=len(datasets),
+             hyperparam_count=len(hyperparameter_configs),
+             model_display=model_display,
+             enabled=show_progress_bars,
+         ) as progress_bars:
+             eval_result = execute_runs(
+                 inference,
+                 eval_run_specs,
+                 progress_bars,
+                 experiment_id,
+                 project_id,
+                 metadata,
+                 upload_results,
+                 trismik_client,
+             )
+             logger.info("Synchronous evaluation complete")
+
+     return format_results(
+         eval_result, return_dict, return_aggregates, return_items, return_output
+     )
+
+
+ def execute_runs(
+     inference: Callable,
+     runs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]],
+     progress_bars: Any,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     upload_results: bool = False,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> EvalResult:
+     """Run evaluation sequentially."""
+
+     # Worker function to execute individual runs and handle uploads
+     def worker(
+         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
+     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         run_result = execute_run(
+             inference, run, experiment_id, project_id, metadata, trismik_client
+         )
+         # Update progress bars with items processed and success status
+         if progress_bars is not None:
+             items_processed = len(run.dataset.items)
+             progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+         if (
+             upload_results
+             and isinstance(run_result, ClassicEvalRunResult)
+             and experiment_id
+             and project_id
+             and run_result.run_completed
+             and trismik_client is not None
+         ):
+             try:
+                 run_id = upload_classic_run_results(
+                     run_result, experiment_id, project_id, inference, metadata, trismik_client
+                 )
+                 run_result.run_id = run_id
+                 if progress_bars is not None:
+                     progress_bars.on_upload_completed(succeeded=True)
+             except Exception as e:
+                 # Continue the evaluation even if the upload fails
+                 logger.warning(f"Failed to upload run results: {e}")
+                 if progress_bars is not None:
+                     progress_bars.on_upload_completed(succeeded=False)
+
+         return run_result
+
+     # Execute all runs sequentially
+     run_results = [worker(run) for run in runs]
+
+     # Return in canonical (dataset_idx, hp_idx) order for stability
+     run_results.sort(
+         key=lambda result: (result.run_spec.dataset_index, result.run_spec.hyperparameters_index)
+     )
+
+     return EvalResult(run_results)
+
+
+ def execute_run(
+     inference: Callable,
+     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+     """Execute a single evaluation run."""
+
+     if isinstance(run, EvalRunSpec):
+         return execute_classic_eval_run(inference, run)
+
+     elif isinstance(run, AdaptiveEvalRunSpec):
+         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
+         resolved_project_id = project_id if project_id is not None else run.project_id
+         return execute_adaptive_eval_run(
+             inference,
+             run,
+             resolved_experiment_id,
+             resolved_project_id,
+             metadata,
+             trismik_client,
+         )
+
+     else:
+         raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
+
+
+ def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
+     """Execute a classic evaluation run."""
+     logger.debug("Executing classic eval run for %s", run)
+
+     inference_outputs = None
+     metric_scores = None
+
+     try:
+         inference_outputs = run_inference_callable(
+             inference, run.inputs, run.hyperparameter_config
+         )
+         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
+         logger.debug("Classic evaluation completed for run %s", run)
+         return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+     except Exception as e:
+         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
+         return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+
+
+ def run_inference_callable(
+     inference: Callable,
+     inputs: List[Any],
+     hyperparameter_config: Dict[str, Any],
+ ) -> Any:
+     """Run inference on the given inputs with a hyperparameter configuration."""
+
+     try:
+         predictions = inference(inputs, **hyperparameter_config)
+     except Exception as e:
+         logger.error(
+             "Inference callable raised an exception: %s",
+             str(e),
+         )
+         raise InferenceError(f"Inference failed: {str(e)}") from e
+
+     if not isinstance(predictions, list) or len(predictions) != len(inputs):
+         raise InferenceError(
+             "Inference callable must return a list of predictions "
+             "of the same length as the inputs. "
+             f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
+         )
+
+     if all(prediction == "" for prediction in predictions):
+         logger.warning("Inference callable returned an empty string for every item")
+
+     if all(prediction is None for prediction in predictions):
+         raise InferenceError("Inference callable returned None for every item")
+
+     return predictions
+
+
+ def execute_adaptive_eval_run(
+     inference: Callable,
+     run: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Optional[Dict[str, Any]] = None,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> AdaptiveEvalRunResult:
+     """Execute an adaptive evaluation run."""
+     logger.debug("Executing adaptive run for %s", run)
+
+     try:
+         if trismik_client is None:
+             raise ScoreBookError("Trismik client is required for adaptive evaluation")
+
+         adaptive_eval_run_result = run_adaptive_evaluation(
+             inference, run, experiment_id, project_id, metadata, trismik_client
+         )
+         logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
+
+         return adaptive_eval_run_result
+
+     except Exception as e:
+         logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+         return AdaptiveEvalRunResult(run, False, {})
+
+
+ def upload_classic_run_results(
+     run_result: ClassicEvalRunResult,
+     experiment_id: str,
+     project_id: str,
+     inference_callable: Optional[Callable],
+     metadata: Optional[Dict[str, Any]],
+     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+ ) -> str:
+     """Upload a classic evaluation run result to the Trismik platform.
+
+     Args:
+         run_result: The evaluation run result to upload
+         experiment_id: Trismik experiment identifier
+         project_id: Trismik project identifier
+         inference_callable: Inference callable used to derive the model name
+         metadata: Optional metadata dictionary
+         trismik_client: Trismik client instance
+
+     Returns:
+         Run id
+     """
+     model = get_model_name(inference_callable)
+
+     # Create eval items from run_spec inputs, outputs, and labels
+     items: List[TrismikClassicEvalItem] = []
+     inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+     for idx, (input_value, output) in enumerate(inputs_outputs):
+         labels = run_result.run_spec.labels
+         label = labels[idx] if idx < len(labels) else ""
+
+         # Calculate item-level metrics for this item
+         item_metrics: Dict[str, Any] = {}
+         if run_result.scores:
+             for metric_name, metric_data in run_result.scores.items():
+                 if isinstance(metric_data, dict) and "item_scores" in metric_data:
+                     if idx < len(metric_data["item_scores"]):
+                         item_metrics[metric_name] = metric_data["item_scores"][idx]
+                 else:
+                     # If the score is a single value, use it for all items
+                     item_metrics[metric_name] = metric_data
+
+         eval_item = TrismikClassicEvalItem(
+             datasetItemId=str(idx),
+             modelInput=str(input_value),
+             modelOutput=str(output),
+             goldOutput=str(label),
+             metrics=item_metrics,
+         )
+         items.append(eval_item)
+
+     # Create eval metrics from run aggregate scores
+     metrics: List[TrismikClassicEvalMetric] = []
+     if run_result.scores:
+         for metric_name, metric_data in run_result.scores.items():
+             if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
+                 # Handle structured metric data with aggregate scores
+                 for agg_name, agg_value in metric_data["aggregate_scores"].items():
+                     metric_id = (
+                         f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
+                     )
+                     metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
+                     metrics.append(metric)
+             else:
+                 # Handle simple metric data (single value)
+                 metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
+                 metrics.append(metric)
+
+     classic_eval_request = TrismikClassicEvalRequest(
+         project_id,
+         experiment_id,
+         run_result.run_spec.dataset.name,
+         model,
+         run_result.run_spec.hyperparameter_config,
+         items,
+         metrics,
+     )
+
+     response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
+         classic_eval_request
+     )
+
+     run_id: str = response.id
+     logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+
+     return run_id
+
+
+ def run_adaptive_evaluation(
+     inference: Callable,
+     adaptive_run_spec: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Any,
+     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+ ) -> AdaptiveEvalRunResult:
+     """Run an adaptive evaluation using the Trismik API.
+
+     Args:
+         inference: Function to run inference
+         adaptive_run_spec: Specification for the adaptive evaluation
+         experiment_id: Experiment identifier
+         project_id: Trismik project ID
+         metadata: Additional metadata
+         trismik_client: Trismik client instance
+
+     Returns:
+         Results from the adaptive evaluation
+     """
+     trismik_results = trismik_client.run(
+         test_id=adaptive_run_spec.dataset,
+         project_id=project_id,
+         experiment=experiment_id,
+         run_metadata=TrismikRunMetadata(
+             model_metadata=TrismikRunMetadata.ModelMetadata(name="unknown"),
+             test_configuration={},
+             inference_setup={},
+         ),
+         item_processor=make_trismik_inference(inference),
+         return_dict=False,
+     )
+
+     # Convert TrismikRunResults to an AdaptiveEvalRunResult by extracting its scores
+     scores = {}
+     if hasattr(trismik_results, "scores") and trismik_results.scores:
+         scores = trismik_results.scores
+     elif hasattr(trismik_results, "__dict__"):
+         # If scores aren't directly available, include all public attributes as scores
+         scores = {k: v for k, v in trismik_results.__dict__.items() if not k.startswith("_")}
+
+     # Convert AdaptiveTestScore objects to JSON-serializable dictionaries
+     def make_json_serializable(obj: Any) -> Any:
+         if hasattr(obj, "theta") and hasattr(obj, "std_error"):
+             # This is likely an AdaptiveTestScore object
+             return {"theta": obj.theta, "std_error": obj.std_error}
+         elif isinstance(obj, dict):
+             return {k: make_json_serializable(v) for k, v in obj.items()}
+         elif isinstance(obj, (list, tuple)):
+             return [make_json_serializable(item) for item in obj]
+         else:
+             return obj
+
+     # Make scores JSON serializable
+     scores = make_json_serializable(scores)
+
+     return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
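For orientation, a minimal usage sketch of the new sync entry point shown in this hunk follows. The inference callable, dataset name, and hyperparameter values are hypothetical; the sketch assumes evaluate is re-exported from the package root (as the scorebook/__init__.py change suggests), otherwise it can be imported from scorebook.evaluate.

from scorebook import evaluate  # assumed re-export; otherwise: from scorebook.evaluate import evaluate

def my_inference(inputs, **hyperparameters):
    # Must return one prediction per input item (see run_inference_callable above).
    return ["placeholder answer" for _ in inputs]

results = evaluate(
    inference=my_inference,
    datasets="my_dataset",                 # hypothetical dataset name; an EvalDataset instance also works
    hyperparameters={"temperature": 0.0},  # illustrative config, passed through to my_inference
    upload_results=False,                  # skip the Trismik upload path
    show_progress=True,
)
print(results)  # the evaluation results as a dict (return_dict=True by default)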