scorebook 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. scorebook/__init__.py +12 -4
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/evaluate/__init__.py +15 -0
  4. scorebook/evaluate/_async/__init__.py +0 -0
  5. scorebook/evaluate/_async/evaluate_async.py +413 -0
  6. scorebook/evaluate/_sync/__init__.py +0 -0
  7. scorebook/evaluate/_sync/evaluate.py +413 -0
  8. scorebook/evaluate/evaluate_helpers.py +365 -0
  9. scorebook/inference/__init__.py +4 -0
  10. scorebook/inference/clients/__init__.py +8 -0
  11. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  12. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  13. scorebook/settings.py +18 -0
  14. scorebook/trismik/__init__.py +10 -0
  15. scorebook/utils/__init__.py +9 -2
  16. scorebook/utils/async_utils.py +20 -1
  17. scorebook/utils/progress_bars.py +22 -61
  18. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/METADATA +3 -4
  19. scorebook-0.0.10.dist-info/RECORD +41 -0
  20. scorebook/evaluate.py +0 -623
  21. scorebook/trismik_services/__init__.py +0 -6
  22. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  23. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  24. scorebook-0.0.8.dist-info/RECORD +0 -36
  25. /scorebook/inference/{bedrock.py → clients/bedrock.py} +0 -0
  26. /scorebook/inference/{portkey.py → clients/portkey.py} +0 -0
  27. /scorebook/inference/{vertex.py → clients/vertex.py} +0 -0
  28. /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  29. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/WHEEL +0 -0
  30. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/entry_points.txt +0 -0
  31. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py CHANGED
@@ -10,9 +10,17 @@ import importlib.metadata
  __version__ = importlib.metadata.version(__package__ or __name__)

  from scorebook.eval_dataset import EvalDataset
- from scorebook.evaluate import evaluate
- from scorebook.inference_pipeline import InferencePipeline
- from scorebook.trismik_services.login import login, whoami
+ from scorebook.evaluate import evaluate, evaluate_async
+ from scorebook.inference.inference_pipeline import InferencePipeline
+ from scorebook.trismik.credentials import login, whoami
  from scorebook.utils.build_prompt import build_prompt

- __all__ = ["EvalDataset", "evaluate", "build_prompt", "login", "whoami", "InferencePipeline"]
+ __all__ = [
+     "EvalDataset",
+     "evaluate",
+     "evaluate_async",
+     "build_prompt",
+     "login",
+     "whoami",
+     "InferencePipeline",
+ ]
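
Note: the 0.0.10 release moves several modules (inference clients under scorebook/inference/clients/, trismik_services → trismik), so direct imports from the old paths need updating. A minimal before/after sketch, based only on the import lines shown in this hunk; code that imports from the top-level scorebook package is unaffected, since __all__ still re-exports these names:

    # scorebook 0.0.8
    from scorebook.inference_pipeline import InferencePipeline
    from scorebook.trismik_services.login import login, whoami

    # scorebook 0.0.10
    from scorebook.inference.inference_pipeline import InferencePipeline
    from scorebook.trismik.credentials import login, whoami

    # or rely on the unchanged top-level re-exports
    from scorebook import InferencePipeline, evaluate, evaluate_async, login, whoami
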
scorebook/cli/auth.py CHANGED
@@ -4,7 +4,7 @@ import argparse
  import getpass
  import sys

- from scorebook.trismik.login import get_stored_token, get_token_path, login, logout, whoami
+ from scorebook.trismik.credentials import get_stored_token, get_token_path, login, logout, whoami


  def auth_command(args: argparse.Namespace) -> int:
scorebook/evaluate/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """
+ Evaluation module for Scorebook.
+
+ This module provides both synchronous and asynchronous evaluation functions.
+ The async version serves as the source of truth, with the sync version
+ automatically generated using unasync.
+ """
+
+ # Import from async module
+ from ._async.evaluate_async import evaluate_async
+
+ # Import from generated sync module
+ from ._sync.evaluate import evaluate
+
+ __all__ = ["evaluate", "evaluate_async"]
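
Note: the docstring above says the sync evaluate is generated from the async source with unasync. The actual generation hook is not part of this wheel diff; the sketch below only illustrates how such an _async/_sync layout is typically produced with the unasync library, and the paths, token replacements, and rename step are assumptions rather than Scorebook's real build configuration:

    # generate_sync.py -- illustrative sketch, not Scorebook's actual build hook
    import os
    import unasync

    rules = [
        unasync.Rule(
            fromdir="/scorebook/evaluate/_async/",
            todir="/scorebook/evaluate/_sync/",
            additional_replacements={
                # assumed substitutions, inferred from names in this diff
                "evaluate_async": "evaluate",
                "TrismikAsyncClient": "TrismikClient",
            },
        )
    ]

    unasync.unasync_files(["scorebook/evaluate/_async/evaluate_async.py"], rules)

    # unasync keeps the source file name, so rename the output to match the
    # _sync/evaluate.py module seen in this diff (assumed step)
    os.replace(
        "scorebook/evaluate/_sync/evaluate_async.py",
        "scorebook/evaluate/_sync/evaluate.py",
    )
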
scorebook/evaluate/_async/__init__.py ADDED
File without changes
scorebook/evaluate/_async/evaluate_async.py ADDED
@@ -0,0 +1,413 @@
+ import asyncio
+ import logging
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
+
+ from trismik import TrismikAsyncClient, TrismikClient
+ from trismik.types import (
+     TrismikClassicEvalItem,
+     TrismikClassicEvalMetric,
+     TrismikClassicEvalRequest,
+     TrismikClassicEvalResponse,
+     TrismikRunMetadata,
+ )
+
+ from scorebook.eval_dataset import EvalDataset
+ from scorebook.evaluate.evaluate_helpers import (
+     build_eval_run_specs,
+     create_trismik_async_client,
+     format_results,
+     get_model_name,
+     make_trismik_inference,
+     prepare_datasets,
+     prepare_hyperparameter_configs,
+     resolve_upload_results,
+     score_metrics,
+     validate_parameters,
+ )
+ from scorebook.exceptions import InferenceError, ScoreBookError
+ from scorebook.types import (
+     AdaptiveEvalRunResult,
+     AdaptiveEvalRunSpec,
+     ClassicEvalRunResult,
+     EvalResult,
+     EvalRunSpec,
+ )
+ from scorebook.utils import async_nullcontext, evaluation_progress
+
+ logger = logging.getLogger(__name__)
+
+
+ async def evaluate_async(
+     inference: Callable,
+     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     return_dict: bool = True,
+     return_aggregates: bool = True,
+     return_items: bool = False,
+     return_output: bool = False,
+     upload_results: Union[Literal["auto"], bool] = "auto",
+     sample_size: Optional[int] = None,
+ ) -> Union[Dict, List, EvalResult]:
+     """
+     Evaluate a model across a collection of hyperparameters and datasets.
+
+     Args:
+         inference: The inference callable to evaluate
+         datasets: Dataset(s) to evaluate on
+         hyperparameters: Hyperparameter configuration(s) to evaluate with
+         metadata: Optional metadata to attach to the evaluation
+         experiment_id: Optional experiment identifier
+         project_id: Optional project identifier
+         return_dict: If True, returns eval results as a dict
+         return_aggregates: If True, returns aggregate scores for each dataset
+         return_items: If True, returns individual items for each dataset
+         return_output: If True, returns model outputs for each dataset item
+         upload_results: If True, uploads results to Trismik's dashboard
+         sample_size: Optional number of items to sample from each dataset
+
+     Returns:
+         The evaluation results in the format specified by return parameters:
+         - return_dict=True: Returns the evaluation results as a dict
+         - return_dict=False: Returns an EvalResult object containing all run results
+     """
+     # Resolve and validate parameters
+     upload_results = cast(bool, resolve_upload_results(upload_results))
+     validate_parameters(locals(), evaluate_async)
+
+     # Prepare datasets, hyperparameters, and eval run specs
+     datasets = prepare_datasets(datasets, sample_size)
+     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
+     eval_run_specs = sorted(
+         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
+         key=lambda run: (run.dataset_index, run.hyperparameters_index),
+     )
+
+     # Create Trismik client if needed (for adaptive evals or uploads)
+     needs_client = upload_results or any(
+         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
+     )
+
+     # Use context manager for automatic cleanup, or None if not needed
+     trismik_client = create_trismik_async_client() if needs_client else None
+
+     async with trismik_client or async_nullcontext():
+         # Execute evaluation runs
+         with evaluation_progress(
+             dataset_count=len(datasets),
+             hyperparameter_config_count=len(hyperparameter_configs),
+             run_count=len(eval_run_specs),
+         ) as progress_bars:
+             eval_result = await execute_runs(
+                 inference,
+                 eval_run_specs,
+                 progress_bars,
+                 experiment_id,
+                 project_id,
+                 metadata,
+                 upload_results,
+                 trismik_client,
+             )
+     logger.info("Asynchronous evaluation complete")
+
+     return format_results(
+         eval_result, return_dict, return_aggregates, return_items, return_output
+     )
+
+
+ async def execute_runs(
+     inference: Callable,
+     runs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]],
+     progress_bars: Any,
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     upload_results: bool = False,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> EvalResult:
+     """Run evaluation in parallel."""
+
+     # Worker function to execute individual runs and handle uploads
+     async def worker(
+         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
+     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+         run_result = await execute_run(
+             inference, run, experiment_id, project_id, metadata, trismik_client
+         )
+         progress_bars.on_eval_run_completed(run.dataset_index)
+
+         if (
+             upload_results
+             and isinstance(run_result, ClassicEvalRunResult)
+             and experiment_id
+             and project_id
+             and run_result.run_completed
+             and trismik_client is not None
+         ):
+             run_id = await upload_classic_run_results(
+                 run_result, experiment_id, project_id, inference, metadata, trismik_client
+             )
+             run_result.run_id = run_id
+
+         return run_result
+
+     # Execute all runs concurrently
+     run_results = await asyncio.gather(*[worker(run) for run in runs])
+
+     # Return in canonical (dataset_idx, hp_idx) order for stability
+     run_results.sort(
+         key=lambda result: (result.run_spec.dataset_index, result.run_spec.hyperparameters_index)
+     )
+
+     # Return EvalResult
+     return EvalResult(run_results)
+
+
+ async def execute_run(
+     inference: Callable,
+     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+     experiment_id: Optional[str] = None,
+     project_id: Optional[str] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+     """Execute a single evaluation run."""
+
+     if isinstance(run, EvalRunSpec):
+         return await execute_classic_eval_run(inference, run)
+
+     elif isinstance(run, AdaptiveEvalRunSpec):
+         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
+         resolved_project_id = project_id if project_id is not None else run.project_id
+         return await execute_adaptive_eval_run(
+             inference,
+             run,
+             resolved_experiment_id,
+             resolved_project_id,
+             metadata,
+             trismik_client,
+         )
+
+     else:
+         raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
+
+
+ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
+     """Execute a classic evaluation run."""
+     logger.debug("Executing classic eval run for %s", run)
+
+     inference_outputs = None
+     metric_scores = None
+
+     try:
+         inference_outputs = await run_inference_callable(
+             inference, run.dataset.items, run.hyperparameter_config
+         )
+         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
+         logger.debug("Classic evaluation completed for run %s", run)
+         return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+     except Exception as e:
+         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
+         return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+
+
+ async def run_inference_callable(
+     inference: Callable,
+     items: List[Dict[str, Any]],
+     hyperparameter_config: Dict[str, Any],
+ ) -> Any:
+     """Run inference on a given dataset and hyperparameter configuration."""
+
+     try:
+         predictions = await inference(items, **hyperparameter_config)
+     except Exception as e:
+         logger.error(
+             "Inference callable raised an exception: %s",
+             str(e),
+         )
+         raise InferenceError(f"Inference failed: {str(e)}") from e
+
+     if not isinstance(predictions, list) or len(predictions) != len(items):
+         raise InferenceError(
+             "Inference callable must return a list of predictions "
+             "of shared length as the input items. "
+             f"Items length: {len(items)}, predictions length: {len(predictions)}"
+         )
+
+     if all(prediction == "" for prediction in predictions):
+         logger.warning("Inference callable returned all empty strings for all items")
+
+     if all(prediction is None for prediction in predictions):
+         raise InferenceError("Inference callable returned all None for all items")
+
+     return predictions
+
+
+ async def execute_adaptive_eval_run(
+     inference: Callable,
+     run: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Optional[Dict[str, Any]] = None,
+     trismik_client: Optional[Union[TrismikClient, TrismikAsyncClient]] = None,
+ ) -> AdaptiveEvalRunResult:
+     """Execute an adaptive evaluation run."""
+     logger.debug("Executing adaptive run for %s", run)
+
+     if trismik_client is None:
+         raise ScoreBookError("Trismik client is required for adaptive evaluation")
+
+     adaptive_eval_run_result = await run_adaptive_evaluation(
+         inference, run, experiment_id, project_id, metadata, trismik_client
+     )
+     logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
+
+     return adaptive_eval_run_result
+
+
+ async def upload_classic_run_results(
+     run_result: ClassicEvalRunResult,
+     experiment_id: str,
+     project_id: str,
+     inference_callable: Optional[Callable],
+     metadata: Optional[Dict[str, Any]],
+     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+ ) -> str:
+     """Upload a classic evaluation run result to Trismik platform.
+
+     Args:
+         run: The evaluation run result to upload
+         experiment_id: Trismik experiment identifier
+         project_id: Trismik project identifier
+         model: Model name used for evaluation
+         metadata: Optional metadata dictionary
+         trismik_client: Trismik client instance
+
+     Returns:
+         Run id
+     """
+     model = get_model_name(inference_callable)
+
+     # Create eval items from run_spec items, outputs, and labels
+     items: List[TrismikClassicEvalItem] = []
+     for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
+         label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
+
+         # Calculate item-level metrics for this item
+         item_metrics: Dict[str, Any] = {}
+         if run_result.scores:
+             for metric_name, metric_data in run_result.scores.items():
+                 if isinstance(metric_data, dict) and "item_scores" in metric_data:
+                     if idx < len(metric_data["item_scores"]):
+                         item_metrics[metric_name] = metric_data["item_scores"][idx]
+                 else:
+                     # If scores is just a single value, use it for all items
+                     item_metrics[metric_name] = metric_data
+
+         eval_item = TrismikClassicEvalItem(
+             datasetItemId=str(idx),
+             modelInput=str(item),
+             modelOutput=str(output),
+             goldOutput=str(label),
+             metrics=item_metrics,
+         )
+         items.append(eval_item)
+
+     # Create eval metrics from run aggregate scores
+     metrics: List[TrismikClassicEvalMetric] = []
+     if run_result.scores:
+         for metric_name, metric_data in run_result.scores.items():
+             if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
+                 # Handle structured metric data with aggregate scores
+                 for agg_name, agg_value in metric_data["aggregate_scores"].items():
+                     metric_id = (
+                         f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
+                     )
+                     metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
+                     metrics.append(metric)
+             else:
+                 # Handle simple metric data (single value)
+                 metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
+                 metrics.append(metric)
+
+     classic_eval_request = TrismikClassicEvalRequest(
+         project_id,
+         experiment_id,
+         run_result.run_spec.dataset.name,
+         model,
+         run_result.run_spec.hyperparameter_config,
+         items,
+         metrics,
+     )
+
+     response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
+         classic_eval_request
+     )
+
+     run_id: str = response.id
+     logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+
+     return run_id
+
+
+ async def run_adaptive_evaluation(
+     inference: Callable,
+     adaptive_run_spec: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Any,
+     trismik_client: Union[TrismikClient, TrismikAsyncClient],
+ ) -> AdaptiveEvalRunResult:
+     """Run an adaptive evaluation using the Trismik API.
+
+     Args:
+         inference: Function to run inference
+         adaptive_run_spec: Specification for the adaptive evaluation
+         experiment_id: Experiment identifier
+         project_id: Trismik project ID
+         metadata: Additional metadata
+         trismik_client: Trismik client instance
+     Returns:
+         Results from the adaptive evaluation
+     """
+     trismik_results = await trismik_client.run(
+         test_id=adaptive_run_spec.dataset,
+         project_id=project_id,
+         experiment=experiment_id,
+         run_metadata=TrismikRunMetadata(
+             model_metadata=TrismikRunMetadata.ModelMetadata(name="unknown"),
+             test_configuration={},
+             inference_setup={},
+         ),
+         item_processor=make_trismik_inference(inference),
+         return_dict=False,
+     )
+
+     # Convert TrismikRunResults to AdaptiveEvalRunResult
+     # Extract scores from the Trismik results
+     scores = {}
+     if hasattr(trismik_results, "scores") and trismik_results.scores:
+         scores = trismik_results.scores
+     elif hasattr(trismik_results, "__dict__"):
+         # If scores aren't directly available, include all attributes as scores
+         scores = {k: v for k, v in trismik_results.__dict__.items() if not k.startswith("_")}
+
+     # Convert AdaptiveTestScore objects to JSON-serializable dictionaries
+     def make_json_serializable(obj: Any) -> Any:
+         if hasattr(obj, "theta") and hasattr(obj, "std_error"):
+             # This is likely an AdaptiveTestScore object
+             return {"theta": obj.theta, "std_error": obj.std_error}
+         elif isinstance(obj, dict):
+             return {k: make_json_serializable(v) for k, v in obj.items()}
+         elif isinstance(obj, (list, tuple)):
+             return [make_json_serializable(item) for item in obj]
+         else:
+             return obj
+
+     # Make scores JSON serializable
+     scores = make_json_serializable(scores)
+
+     return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
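
Note: given the signature of evaluate_async above and the contract enforced by run_inference_callable (a list of items in, a list of predictions of the same length out), a minimal call might look like the sketch below. The dataset argument and hyperparameter values are placeholders; how a string dataset name is resolved depends on prepare_datasets, which lives in evaluate_helpers.py and is not shown in this diff:

    # Illustrative usage sketch only; dataset name and hyperparameters are placeholders.
    import asyncio

    from scorebook import evaluate_async


    async def my_inference(items, **hyperparameters):
        # Must return one prediction per input item (see run_inference_callable).
        return ["A" for _ in items]


    async def main():
        results = await evaluate_async(
            my_inference,
            datasets="example-dataset",  # hypothetical dataset identifier
            hyperparameters={"temperature": 0.0},
            return_items=True,
            upload_results=False,
        )
        print(results)


    asyncio.run(main())
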
scorebook/evaluate/_sync/__init__.py ADDED
File without changes