scorebook-0.0.9-py3-none-any.whl → scorebook-0.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. scorebook/__init__.py +14 -6
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/eval_datasets/__init__.py +5 -0
  4. scorebook/eval_datasets/eval_dataset.py +719 -0
  5. scorebook/evaluate/__init__.py +15 -0
  6. scorebook/evaluate/_async/__init__.py +0 -0
  7. scorebook/evaluate/_async/evaluate_async.py +443 -0
  8. scorebook/evaluate/_sync/__init__.py +0 -0
  9. scorebook/evaluate/_sync/evaluate.py +443 -0
  10. scorebook/evaluate/evaluate_helpers.py +388 -0
  11. scorebook/exceptions.py +48 -0
  12. scorebook/inference/__init__.py +4 -0
  13. scorebook/inference/clients/__init__.py +8 -0
  14. scorebook/inference/{bedrock.py → clients/bedrock.py} +1 -1
  15. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  16. scorebook/inference/{portkey.py → clients/portkey.py} +1 -1
  17. scorebook/inference/{vertex.py → clients/vertex.py} +1 -1
  18. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  19. scorebook/settings.py +21 -0
  20. scorebook/trismik/__init__.py +10 -0
  21. scorebook/types.py +8 -5
  22. scorebook/utils/__init__.py +11 -4
  23. scorebook/utils/async_utils.py +20 -1
  24. scorebook/utils/io_helpers.py +18 -5
  25. scorebook/utils/progress_bars.py +739 -96
  26. scorebook/utils/{build_prompt.py → render_template.py} +13 -12
  27. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/METADATA +4 -4
  28. scorebook-0.0.11.dist-info/RECORD +42 -0
  29. scorebook/eval_dataset.py +0 -404
  30. scorebook/evaluate.py +0 -623
  31. scorebook/trismik_services/__init__.py +0 -6
  32. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  33. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  34. scorebook-0.0.9.dist-info/RECORD +0 -36
  35. /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  36. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/WHEEL +0 -0
  37. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/entry_points.txt +0 -0
  38. {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/__init__.py
@@ -0,0 +1,15 @@
+ """
+ Evaluation module for Scorebook.
+
+ This module provides both synchronous and asynchronous evaluation functions.
+ The async version serves as the source of truth, with the sync version
+ automatically generated using unasync.
+ """
+
+ # Import from async module
+ from ._async.evaluate_async import evaluate_async
+
+ # Import from generated sync module
+ from ._sync.evaluate import evaluate
+
+ __all__ = ["evaluate", "evaluate_async"]

scorebook/evaluate/_async/__init__.py (file without changes)
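Per the module docstring above, evaluate_async is the source of truth and evaluate is generated from it with unasync, so both names are importable from scorebook.evaluate. A minimal usage sketch of the two exports (the inference callables and dataset name below are hypothetical, not taken from this package):

    import asyncio
    from scorebook.evaluate import evaluate, evaluate_async

    def sync_inference(inputs, **hyperparameters):
        # Hypothetical sync callable: one prediction per input item.
        return [str(item) for item in inputs]

    async def async_inference(inputs, **hyperparameters):
        # Hypothetical async callable with the same one-prediction-per-input contract.
        return [str(item) for item in inputs]

    # results = evaluate(sync_inference, datasets="example_dataset")
    # results = asyncio.run(evaluate_async(async_inference, datasets="example_dataset"))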

scorebook/evaluate/_async/evaluate_async.py
@@ -0,0 +1,443 @@
+ import asyncio
+ import logging
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
+
+ from trismik import TrismikAsyncClient, TrismikClient
+ from trismik.types import (
+     TrismikClassicEvalItem,
+     TrismikClassicEvalMetric,
+     TrismikClassicEvalRequest,
+     TrismikClassicEvalResponse,
+     TrismikRunMetadata,
+ )
+
+ from scorebook.eval_datasets import EvalDataset
+ from scorebook.evaluate.evaluate_helpers import (
+     build_eval_run_specs,
+     create_trismik_async_client,
+     format_results,
+     get_model_name,
+     make_trismik_inference,
+     prepare_datasets,
+     prepare_hyperparameter_configs,
+     resolve_show_progress,
+     resolve_upload_results,
+     score_metrics,
+     validate_parameters,
+ )
+ from scorebook.exceptions import InferenceError, ScoreBookError
+ from scorebook.types import (
+     AdaptiveEvalRunResult,
+     AdaptiveEvalRunSpec,
+     ClassicEvalRunResult,
+     EvalResult,
+     EvalRunSpec,
+ )
+ from scorebook.utils import async_nullcontext, evaluation_progress_context
+
+ logger = logging.getLogger(__name__)
+
+
+ async def evaluate_async(
+     inference: Callable,
+     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     return_dict: bool = True,
+     return_aggregates: bool = True,
+     return_items: bool = False,
+     return_output: bool = False,
+     upload_results: Union[Literal["auto"], bool] = "auto",
+     sample_size: Optional[int] = None,
+     show_progress: Optional[bool] = None,
+ ) -> Union[Dict, List, EvalResult]:
+     """
+     Evaluate a model across a collection of hyperparameters and datasets.
+
+     Args:
+         inference: The inference callable to evaluate
+         datasets: Dataset(s) to evaluate on
+         hyperparameters: Hyperparameter configuration(s) to evaluate with
+         metadata: Optional metadata to attach to the evaluation
+         experiment_id: Optional experiment identifier
+         project_id: Optional project identifier
+         return_dict: If True, returns eval results as a dict
+         return_aggregates: If True, returns aggregate scores for each dataset
+         return_items: If True, returns individual items for each dataset
+         return_output: If True, returns model outputs for each dataset item
+         upload_results: If True, uploads results to Trismik's dashboard
+         sample_size: Optional number of items to sample from each dataset
+         show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+             If True/False, explicitly enables/disables progress bars for this evaluation.
+
+     Returns:
+         The evaluation results in the format specified by return parameters:
+         - return_dict=True: Returns the evaluation results as a dict
+         - return_dict=False: Returns an EvalResult object containing all run results
+     """
+     # Resolve and validate parameters
+     upload_results = cast(bool, resolve_upload_results(upload_results))
+     show_progress_bars = resolve_show_progress(show_progress)
+     validate_parameters(locals(), evaluate_async)
+
+     # Prepare datasets, hyperparameters, and eval run specs
+     datasets = prepare_datasets(datasets, sample_size)
+     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
+     eval_run_specs = sorted(
+         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
+         key=lambda run: (run.dataset_index, run.hyperparameters_index),
+     )
+
+     # Create Trismik client if needed (for adaptive evals or uploads)
+     needs_client = upload_results or any(
+         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
+     )
+
+     # Use context manager for automatic cleanup, or None if not needed
+     trismik_client = create_trismik_async_client() if needs_client else None
+
+     async with trismik_client or async_nullcontext():
+         # Execute evaluation runs
+         # Calculate total items across all runs
+         total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+         model_display = get_model_name(inference)
+
+         with evaluation_progress_context(
+             total_eval_runs=len(eval_run_specs),
+             total_items=total_items,
+             dataset_count=len(datasets),
+             hyperparam_count=len(hyperparameter_configs),
+             model_display=model_display,
+             enabled=show_progress_bars,
+         ) as progress_bars:
+             eval_result = await execute_runs(
+                 inference,
+                 eval_run_specs,
+                 progress_bars,
+                 experiment_id,
+                 project_id,
+                 metadata,
+                 upload_results,
+                 trismik_client,
+             )
+             logger.info("Asynchronous evaluation complete")
+
+     return format_results(
+         eval_result, return_dict, return_aggregates, return_items, return_output
+     )
+
+
+ async def execute_runs(
+     inference: Callable,
+     runs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]],
+     progress_bars: Any,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     upload_results: bool = False,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> EvalResult:
+     """Run evaluation in parallel."""
+
+     # Worker function to execute individual runs and handle uploads
+     async def worker(
+         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
+     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         run_result = await execute_run(
+             inference, run, experiment_id, project_id, metadata, trismik_client
+         )
+         # Update progress bars with items processed and success status
+         if progress_bars is not None:
+             items_processed = len(run.dataset.items)
+             progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+         if (
+             upload_results
+             and isinstance(run_result, ClassicEvalRunResult)
+             and experiment_id
+             and project_id
+             and run_result.run_completed
+             and trismik_client is not None
+         ):
+             try:
+                 run_id = await upload_classic_run_results(
+                     run_result, experiment_id, project_id, inference, metadata, trismik_client
+                 )
+                 run_result.run_id = run_id
+                 if progress_bars is not None:
+                     progress_bars.on_upload_completed(succeeded=True)
+             except Exception as e:
+                 logger.warning(f"Failed to upload run results: {e}")
+                 if progress_bars is not None:
+                     progress_bars.on_upload_completed(succeeded=False)
+                 # Continue evaluation even if upload fails
+
+         return run_result
+
+     # Execute all runs concurrently
+     run_results = await asyncio.gather(*[worker(run) for run in runs])
+
+     # Return in canonical (dataset_idx, hp_idx) order for stability
+     run_results.sort(
+         key=lambda result: (result.run_spec.dataset_index, result.run_spec.hyperparameters_index)
+     )
+
+     # Return EvalResult
+     return EvalResult(run_results)
+
+
+ async def execute_run(
+     inference: Callable,
+     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+     """Execute a single evaluation run."""
+
+     if isinstance(run, EvalRunSpec):
+         return await execute_classic_eval_run(inference, run)
+
+     elif isinstance(run, AdaptiveEvalRunSpec):
+         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
+         resolved_project_id = project_id if project_id is not None else run.project_id
+         return await execute_adaptive_eval_run(
+             inference,
+             run,
+             resolved_experiment_id,
+             resolved_project_id,
+             metadata,
+             trismik_client,
+         )
+
+     else:
+         raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
+
+
+ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
+     """Execute a classic evaluation run."""
+     logger.debug("Executing classic eval run for %s", run)
+
+     inference_outputs = None
+     metric_scores = None
+
+     try:
+         inference_outputs = await run_inference_callable(
+             inference, run.inputs, run.hyperparameter_config
+         )
+         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
+         logger.debug("Classic evaluation completed for run %s", run)
+         return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+     except Exception as e:
+         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
+         return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+
+
+ async def run_inference_callable(
+     inference: Callable,
+     inputs: List[Any],
+     hyperparameter_config: Dict[str, Any],
+ ) -> Any:
+     """Run inference on a given dataset and hyperparameter configuration."""
+
+     try:
+         predictions = await inference(inputs, **hyperparameter_config)
+     except Exception as e:
+         logger.error(
+             "Inference callable raised an exception: %s",
+             str(e),
+         )
+         raise InferenceError(f"Inference failed: {str(e)}") from e
+
+     if not isinstance(predictions, list) or len(predictions) != len(inputs):
+         raise InferenceError(
+             "Inference callable must return a list of predictions "
+             "of the same length as the inputs. "
+             f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
+         )
+
+     if all(prediction == "" for prediction in predictions):
+         logger.warning("Inference callable returned empty strings for all items")
+
+     if all(prediction is None for prediction in predictions):
+         raise InferenceError("Inference callable returned None for all items")
+
+     return predictions
+
+
+ async def execute_adaptive_eval_run(
+     inference: Callable,
+     run: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Optional[Dict[str, Any]] = None,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> AdaptiveEvalRunResult:
+     """Execute an adaptive evaluation run."""
+     logger.debug("Executing adaptive run for %s", run)
+
+     try:
+         if trismik_client is None:
+             raise ScoreBookError("Trismik client is required for adaptive evaluation")
+
+         adaptive_eval_run_result = await run_adaptive_evaluation(
+             inference, run, experiment_id, project_id, metadata, trismik_client
+         )
+         logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
+
+         return adaptive_eval_run_result
+
+     except Exception as e:
+         logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+         return AdaptiveEvalRunResult(run, False, {})
+
+
+ async def upload_classic_run_results(
+     run_result: ClassicEvalRunResult,
+     experiment_id: str,
+     project_id: str,
+     inference_callable: Optional[Callable],
+     metadata: Optional[Dict[str, Any]],
+     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+ ) -> str:
+     """Upload a classic evaluation run result to the Trismik platform.
+
+     Args:
+         run_result: The evaluation run result to upload
+         experiment_id: Trismik experiment identifier
+         project_id: Trismik project identifier
+         inference_callable: Inference callable used to resolve the model name
+         metadata: Optional metadata dictionary
+         trismik_client: Trismik client instance
+
+     Returns:
+         Run id
+     """
+     model = get_model_name(inference_callable)
+
+     # Create eval items from run_spec inputs, outputs, and labels
+     items: List[TrismikClassicEvalItem] = []
+     inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+     for idx, (input_value, output) in enumerate(inputs_outputs):
+         labels = run_result.run_spec.labels
+         label = labels[idx] if idx < len(labels) else ""
+
+         # Calculate item-level metrics for this item
+         item_metrics: Dict[str, Any] = {}
+         if run_result.scores:
+             for metric_name, metric_data in run_result.scores.items():
+                 if isinstance(metric_data, dict) and "item_scores" in metric_data:
+                     if idx < len(metric_data["item_scores"]):
+                         item_metrics[metric_name] = metric_data["item_scores"][idx]
+                 else:
+                     # If scores is just a single value, use it for all items
+                     item_metrics[metric_name] = metric_data
+
+         eval_item = TrismikClassicEvalItem(
+             datasetItemId=str(idx),
+             modelInput=str(input_value),
+             modelOutput=str(output),
+             goldOutput=str(label),
+             metrics=item_metrics,
+         )
+         items.append(eval_item)
+
+     # Create eval metrics from run aggregate scores
+     metrics: List[TrismikClassicEvalMetric] = []
+     if run_result.scores:
+         for metric_name, metric_data in run_result.scores.items():
+             if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
+                 # Handle structured metric data with aggregate scores
+                 for agg_name, agg_value in metric_data["aggregate_scores"].items():
+                     metric_id = (
+                         f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
+                     )
+                     metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
+                     metrics.append(metric)
+             else:
+                 # Handle simple metric data (single value)
+                 metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
+                 metrics.append(metric)
+
+     classic_eval_request = TrismikClassicEvalRequest(
+         project_id,
+         experiment_id,
+         run_result.run_spec.dataset.name,
+         model,
+         run_result.run_spec.hyperparameter_config,
+         items,
+         metrics,
+     )
+
+     response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
+         classic_eval_request
+     )
+
+     run_id: str = response.id
+     logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+
+     return run_id
+
+
+ async def run_adaptive_evaluation(
+     inference: Callable,
+     adaptive_run_spec: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Any,
+     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+ ) -> AdaptiveEvalRunResult:
+     """Run an adaptive evaluation using the Trismik API.
+
+     Args:
+         inference: Function to run inference
+         adaptive_run_spec: Specification for the adaptive evaluation
+         experiment_id: Experiment identifier
+         project_id: Trismik project ID
+         metadata: Additional metadata
+         trismik_client: Trismik client instance
+     Returns:
+         Results from the adaptive evaluation
+     """
+     trismik_results = await trismik_client.run(
+         test_id=adaptive_run_spec.dataset,
+         project_id=project_id,
+         experiment=experiment_id,
+         run_metadata=TrismikRunMetadata(
+             model_metadata=TrismikRunMetadata.ModelMetadata(name="unknown"),
+             test_configuration={},
+             inference_setup={},
+         ),
+         item_processor=make_trismik_inference(inference),
+         return_dict=False,
+     )
+
+     # Convert TrismikRunResults to AdaptiveEvalRunResult
+     # Extract scores from the Trismik results
+     scores = {}
+     if hasattr(trismik_results, "scores") and trismik_results.scores:
+         scores = trismik_results.scores
+     elif hasattr(trismik_results, "__dict__"):
+         # If scores aren't directly available, include all attributes as scores
+         scores = {k: v for k, v in trismik_results.__dict__.items() if not k.startswith("_")}
+
+     # Convert AdaptiveTestScore objects to JSON-serializable dictionaries
+     def make_json_serializable(obj: Any) -> Any:
+         if hasattr(obj, "theta") and hasattr(obj, "std_error"):
+             # This is likely an AdaptiveTestScore object
+             return {"theta": obj.theta, "std_error": obj.std_error}
+         elif isinstance(obj, dict):
+             return {k: make_json_serializable(v) for k, v in obj.items()}
+         elif isinstance(obj, (list, tuple)):
+             return [make_json_serializable(item) for item in obj]
+         else:
+             return obj
+
+     # Make scores JSON serializable
+     scores = make_json_serializable(scores)
+
+     return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
scorebook/evaluate/_sync/__init__.py (file without changes)
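For reference, run_inference_callable in the diff above awaits the inference callable with the item inputs plus the hyperparameter config and requires a list with one prediction per input (an all-None result raises InferenceError; all-empty strings only logs a warning). A hedged end-to-end sketch against that contract (the dataset name and hyperparameter values are hypothetical; the flags shown are the documented parameters of evaluate_async):

    import asyncio
    from scorebook.evaluate import evaluate_async

    async def my_inference(inputs, **hyperparameters):
        # Must return a list of the same length as `inputs`.
        return ["prediction" for _ in inputs]

    results = asyncio.run(
        evaluate_async(
            my_inference,
            datasets="example_dataset",          # str, EvalDataset, or a list of either
            hyperparameters={"temperature": 0.0},
            return_items=True,                   # include per-item results, not just aggregates
            upload_results=False,                # per needs_client above, a Trismik client is only created for uploads or adaptive runs
        )
    )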