scorebook 0.0.5-py3-none-any.whl → 0.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scorebook/__init__.py CHANGED
@@ -12,7 +12,7 @@ __version__ = importlib.metadata.version(__package__ or __name__)
12
12
  from scorebook.eval_dataset import EvalDataset
13
13
  from scorebook.evaluate import evaluate
14
14
  from scorebook.inference_pipeline import InferencePipeline
15
- from scorebook.trismik.login import login, whoami
15
+ from scorebook.trismik_services.login import login, whoami
16
16
  from scorebook.utils.build_prompt import build_prompt
17
17
 
18
18
  __all__ = ["EvalDataset", "evaluate", "build_prompt", "login", "whoami", "InferencePipeline"]
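The internal move from scorebook.trismik to scorebook.trismik_services does not change the package-level surface; the names in __all__ import exactly as before. A minimal sketch, using only what __all__ re-exports:

```python
# Public imports are unchanged between 0.0.5 and 0.0.7 despite the internal
# trismik -> trismik_services rename; these names all come from __all__ above.
from scorebook import EvalDataset, InferencePipeline, build_prompt, evaluate, login, whoami
```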
scorebook/evaluate.py CHANGED
@@ -15,7 +15,7 @@ models on datasets and computing metric scores.
15
15
 
16
16
  import asyncio
17
17
  import logging
18
- from typing import Any, Callable, Dict, List, Optional, Union
18
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
19
19
 
20
20
  from scorebook.eval_dataset import EvalDataset
21
21
  from scorebook.exceptions import (
@@ -23,8 +23,11 @@ from scorebook.exceptions import (
23
23
  MetricComputationError,
24
24
  ParallelExecutionError,
25
25
  ParameterValidationError,
26
+ ScoreBookError,
26
27
  )
27
- from scorebook.trismik import run_adaptive_evaluation
28
+ from scorebook.trismik_services import run_adaptive_evaluation
29
+ from scorebook.trismik_services.login import get_token
30
+ from scorebook.trismik_services.upload_classic_eval_run import upload_classic_eval_run
28
31
  from scorebook.types import (
29
32
  AdaptiveEvalDataset,
30
33
  AdaptiveEvalRunResult,
@@ -39,60 +42,43 @@ logger = logging.getLogger(__name__)
39
42
 
40
43
 
41
44
  def evaluate(
42
- inference_callable: Callable,
43
- eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
45
+ inference: Callable,
46
+ datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
44
47
  hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
45
- metadata: Optional[Dict[str, Any]] = None,
46
48
  experiment_id: Optional[str] = None,
47
49
  project_id: Optional[str] = None,
50
+ metadata: Optional[Dict[str, Any]] = None,
51
+ upload_results: Union[Literal["auto"], bool] = "auto",
52
+ sample_size: Optional[int] = None,
48
53
  parallel: bool = False,
49
54
  return_dict: bool = True,
50
55
  return_aggregates: bool = True,
51
56
  return_items: bool = False,
52
57
  return_output: bool = False,
53
- sample_size: Optional[int] = None,
54
58
  ) -> Union[Dict, List]:
55
59
  """
56
- Evaluate model predictions using specified metrics on given datasets.
57
-
58
- This function runs the provided inference callable on one or more evaluation datasets,
59
- computes metric scores, and returns the evaluation results. It supports batch processing,
60
- parameter sweeping, and different result formatting options.
60
+ Evaluate a model over one or more datasets, optionally sweeping hyperparameter configurations, scoring with each dataset's metrics.
61
61
 
62
62
  Args:
63
- inference_callable: A callable function or object that takes (items, hyperparameters)
64
- and returns predictions. Can be a regular function, async function,
65
- or callable instance (like a class with __call__ method).
66
- eval_datasets: One or more evaluation datasets to run evaluation on. Can be:
67
- - A single EvalDataset instance
68
- - A list of EvalDataset instances
69
- - A string identifier (for future dataset registry support)
70
- - A list of string identifiers
71
- hyperparameters: Optional dictionary containing hyperparameter sweep configuration.
72
- metadata: Optional dictionary containing evaluation metadata.
73
- experiment_id: Optional string identifier for tracking multiple evaluation runs.
63
+ inference: A callable that runs model inference over a list of evaluation items.
64
+ datasets: One or more evaluation datasets to run evaluation on.
65
+ hyperparameters: Optional list of hyperparameter configurations or grid to evaluate.
66
+ experiment_id: Optional ID of the experiment to upload results to on Trismik's dashboard.
67
+ project_id: Optional ID of the project to upload results to on Trismik's dashboard.
68
+ metadata: Optional metadata to attach to the evaluation.
69
+ upload_results: If True, uploads results to Trismik's dashboard; "auto" (the default) uploads only when a Trismik login token is found.
70
+ sample_size: Optional number of items to sample from each dataset.
71
+ parallel: If True, runs evaluation in parallel. Requires the inference callable to be async.
74
72
  return_dict: If True, returns eval results as a dict
75
73
  return_aggregates: If True, returns aggregate scores for each dataset
76
74
  return_items: If True, returns individual items for each dataset
77
75
  return_output: If True, returns model outputs for each dataset item evaluated
78
- sample_size: If set, only return a sample of the dataset items (for debugging)
79
- parallel: If True, run inference functions in parallel (requires all functions to be async)
80
76
 
81
77
  Returns:
82
- Dictionary mapping dataset names to their evaluation results. For each dataset,
83
- returns a dictionary containing:
84
- - items: List of EvalResult objects with predictions and ground truth
85
- - metrics: Dictionary mapping metric names to their computed scores
86
-
87
- Example:
88
-
89
- python
90
- dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])
91
- def inference_fn(items):
92
- # Model inference logic here - process all items at once
93
- return [prediction for item in items]
94
-
95
- results = evaluate(inference_fn, dataset, item_limit=100)
78
+ Union[Dict, List, EvalResult]:
79
+ The evaluation results in the format specified by return parameters:
80
+ - If return_dict=False: Returns an EvalResult object containing all run results
81
+ - If return_dict=True: Returns the evaluation results as a dict
96
82
  """
97
83
 
98
84
  logger.info(
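The 0.0.5 docstring example was dropped in the rewrite above; a usage sketch against the renamed signature may help readers tracking the change. The dataset name, IDs, and hyperparameter value are placeholders, and the Precision import path is inferred from this wheel's RECORD rather than shown in the diff:

```python
# Sketch only: placeholder dataset name, IDs, and hyperparameters; the Precision
# import path is an assumption, not confirmed by this diff.
from scorebook import EvalDataset, evaluate
from scorebook.metrics.precision import Precision  # assumed module path

dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])

def inference_fn(items, **hyperparameters):
    # Model inference logic here - process all items at once.
    return ["prediction" for _ in items]

results = evaluate(
    inference_fn,                   # 0.0.5 called this parameter inference_callable
    dataset,                        # 0.0.5 called this parameter eval_datasets
    hyperparameters={"temperature": 0.7},
    experiment_id="my-experiment",  # required whenever results are uploaded
    project_id="my-project",
    upload_results="auto",          # resolves to True only when a Trismik token is found
    sample_size=100,
)
```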
@@ -104,8 +90,8 @@ def evaluate(
104
90
 
105
91
  return asyncio.run(
106
92
  _evaluate_async(
107
- inference_callable=inference_callable,
108
- eval_datasets=eval_datasets,
93
+ inference=inference,
94
+ datasets=datasets,
109
95
  hyperparameters=hyperparameters,
110
96
  metadata=metadata,
111
97
  experiment_id=experiment_id,
@@ -115,14 +101,15 @@ def evaluate(
115
101
  return_aggregates=return_aggregates,
116
102
  return_items=return_items,
117
103
  return_output=return_output,
104
+ upload_results=upload_results,
118
105
  sample_size=sample_size,
119
106
  )
120
107
  )
121
108
 
122
109
 
123
110
  async def _evaluate_async(
124
- inference_callable: Callable,
125
- eval_datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
111
+ inference: Callable,
112
+ datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
126
113
  hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
127
114
  metadata: Optional[Dict[str, Any]] = None,
128
115
  experiment_id: Optional[str] = None,
@@ -132,10 +119,15 @@ async def _evaluate_async(
132
119
  return_items: bool = False,
133
120
  return_output: bool = False,
134
121
  parallel: bool = False,
122
+ upload_results: Union[Literal["auto"], bool] = "auto",
135
123
  sample_size: Optional[int] = None,
136
124
  ) -> Union[Dict, List]:
125
+ """Run evaluation asynchronously."""
126
+
127
+ upload_results = _resolve_upload_results(upload_results)
128
+
137
129
  _validate_parameters(locals())
138
- datasets = _prepare_datasets(eval_datasets, sample_size)
130
+ datasets = _prepare_datasets(datasets, sample_size)
139
131
  hyperparameter_configs = _prepare_hyperparameter_configs(hyperparameters)
140
132
 
141
133
  logger.info(
@@ -155,29 +147,29 @@ async def _evaluate_async(
155
147
  datasets, len(hyperparameter_configs), parallel, len(eval_run_specs)
156
148
  ) as progress_bars:
157
149
  if parallel:
158
- eval_results = await _run_parallel(
159
- inference_callable,
150
+ eval_result = await _run_parallel(
151
+ inference,
160
152
  eval_run_specs,
161
153
  progress_bars,
162
154
  experiment_id,
163
155
  project_id,
164
156
  metadata,
157
+ upload_results,
165
158
  )
166
159
  else:
167
- eval_results = await _run_sequential(
168
- inference_callable,
160
+ eval_result = await _run_sequential(
161
+ inference,
169
162
  eval_run_specs,
170
163
  progress_bars,
171
164
  experiment_id,
172
165
  project_id,
173
166
  metadata,
167
+ upload_results,
174
168
  )
175
169
 
176
170
  logger.info("Evaluation completed successfully")
177
171
 
178
- return _format_results(
179
- eval_results, return_dict, return_aggregates, return_items, return_output
180
- )
172
+ return _format_results(eval_result, return_dict, return_aggregates, return_items, return_output)
181
173
 
182
174
 
183
175
  # ===== ORCHESTRATION PATHS =====
@@ -190,16 +182,35 @@ async def _run_parallel(
190
182
  experiment_id: Optional[str] = None,
191
183
  project_id: Optional[str] = None,
192
184
  metadata: Optional[Dict[str, Any]] = None,
185
+ upload_results: bool = False,
193
186
  ) -> EvalResult:
187
+ """Run evaluation in parallel."""
188
+
194
189
  logger.debug("Running inference in parallel")
195
190
 
191
+ # Worker function to execute individual runs and handle uploads
196
192
  async def worker(
197
193
  run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
198
194
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
199
195
  run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
200
196
  progress_bars.on_eval_run_completed(run.dataset_index)
197
+
198
+ if (
199
+ upload_results
200
+ and isinstance(run_result, ClassicEvalRunResult)
201
+ and experiment_id
202
+ and project_id
203
+ ):
204
+ # Only upload runs that completed successfully
205
+ if run_result.run_completed:
206
+ run_id = await _upload_classic_run(
207
+ run_result, experiment_id, project_id, inference, metadata
208
+ )
209
+ run_result.run_id = run_id
210
+
201
211
  return run_result
202
212
 
213
+ # Execute all runs concurrently
203
214
  run_results = await asyncio.gather(*[worker(run) for run in runs])
204
215
  # Return in canonical (dataset_idx, hp_idx) order for stability
205
216
  run_results.sort(
@@ -215,13 +226,32 @@ async def _run_sequential(
215
226
  experiment_id: Optional[str] = None,
216
227
  project_id: Optional[str] = None,
217
228
  metadata: Optional[Dict[str, Any]] = None,
229
+ upload_results: bool = False,
218
230
  ) -> EvalResult:
231
+ """Run evaluation sequentially."""
232
+
219
233
  logger.debug("Running inference sequentially")
234
+
220
235
  run_results: List[Union[ClassicEvalRunResult, AdaptiveEvalRunResult]] = []
221
236
  for run in runs:
222
237
  run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
223
238
  run_results.append(run_result)
224
239
  progress_bars.on_hyperparam_completed(run_result.run_spec.dataset_index)
240
+
241
+ # Upload a classic eval run result immediately if upload_results is enabled
242
+ if (
243
+ upload_results
244
+ and isinstance(run_result, ClassicEvalRunResult)
245
+ and experiment_id
246
+ and project_id
247
+ ):
248
+ # Only upload runs that completed successfully
249
+ if run_result.run_completed:
250
+ run_id = await _upload_classic_run(
251
+ run_result, experiment_id, project_id, inference, metadata
252
+ )
253
+ run_result.run_id = run_id
254
+
225
255
  return EvalResult(run_results)
226
256
 
227
257
 
@@ -236,29 +266,39 @@ async def _execute_run(
236
266
  metadata: Optional[Dict[str, Any]] = None,
237
267
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
238
268
  """Execute a single evaluation run."""
269
+
239
270
  if isinstance(run, EvalRunSpec):
240
271
  return await _execute_classic_eval_run(inference, run)
272
+
241
273
  elif isinstance(run, AdaptiveEvalRunSpec):
242
- if experiment_id is None or project_id is None:
243
- raise ParameterValidationError(
244
- "experiment_id and project_id are required for adaptive evaluation runs"
274
+ if not experiment_id or not project_id:
275
+ raise ScoreBookError(
276
+ "experiment_id and project_id are required for adaptive evaluations"
245
277
  )
246
278
  return await _execute_adaptive_eval_run(inference, run, experiment_id, project_id, metadata)
279
+
247
280
  else:
248
- raise ParameterValidationError(f"Unrecognized run type: {type(run)}")
281
+ raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
249
282
 
250
283
 
251
284
  async def _execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
252
285
  """Execute a classic evaluation run."""
253
286
  logger.debug("Executing classic eval run for %s", run)
254
287
 
255
- inference_outputs = await _run_inference_callable(
256
- inference, run.dataset.items, run.hyperparameter_config
257
- )
258
- metric_scores = _score_metrics(run.dataset, inference_outputs, run.labels)
288
+ inference_outputs = None
289
+ metric_scores = None
290
+
291
+ try:
292
+ inference_outputs = await _run_inference_callable(
293
+ inference, run.dataset.items, run.hyperparameter_config
294
+ )
295
+ metric_scores = _score_metrics(run.dataset, inference_outputs, run.labels)
296
+ logger.debug("Classic evaluation completed for run %s", run)
297
+ return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
259
298
 
260
- logger.debug("Classic evaluation completed for run %s", run)
261
- return ClassicEvalRunResult(run, inference_outputs, metric_scores)
299
+ except Exception as e:
300
+ logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
301
+ return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
262
302
 
263
303
 
264
304
  async def _execute_adaptive_eval_run(
@@ -282,20 +322,41 @@ async def _execute_adaptive_eval_run(
282
322
  # ===== HELPER FUNCTIONS =====
283
323
 
284
324
 
325
+ def _resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
326
+ """Resolve the upload_results parameter based on trismik login status."""
327
+
328
+ if upload_results == "auto":
329
+ upload_results = get_token() is not None
330
+ logger.debug("Auto upload results resolved to: %s", upload_results)
331
+
332
+ return upload_results
333
+
334
+
285
335
  def _validate_parameters(params: Dict[str, Any]) -> None:
286
336
  """Validate all parameters for evaluation."""
287
337
 
338
+ # If returning a dict, it must contain items and/or aggregates
288
339
  if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
289
340
  raise ParameterValidationError(
290
341
  "When return_dict=True, at least one of return_aggregates or return_items must be True"
291
342
  )
292
343
 
293
- if params["parallel"] and not is_awaitable(params["inference_callable"]):
344
+ # Parallel runs require an asynchronous inference callable
345
+ if params["parallel"] and not is_awaitable(params["inference"]):
294
346
  raise ParallelExecutionError(
295
347
  "parallel=True requires the inference_callable to be async. "
296
348
  "Please make your inference function async or set parallel=False."
297
349
  )
298
350
 
351
+ # If uploading results, experiment_id and project_id must be specified
352
+ if params["upload_results"]:
353
+ if params["experiment_id"] is None or params["project_id"] is None:
354
+ raise ParameterValidationError(
355
+ "experiment_id and project_id are required for upload_results=True"
356
+ )
357
+
358
+ logger.debug("Parameter validation successful")
359
+
299
360
 
300
361
  def _prepare_datasets(
301
362
  datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
@@ -354,7 +415,8 @@ def _build_eval_run_specs(
354
415
  project_id: Optional[str],
355
416
  metadata: Optional[Dict[str, Any]] = None,
356
417
  ) -> List[Union[EvalRunSpec, AdaptiveEvalRunSpec]]:
357
- """Build RunSpec objects for each dataset/hyperparameter combination."""
418
+ """Build All RunSpec objects for each dataset/hyperparameter combination."""
419
+
358
420
  eval_run_specs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]] = []
359
421
  for dataset_index, dataset in enumerate(datasets):
360
422
  for hyperparameters_index, hyperparameter_config in enumerate(hyperparameters):
@@ -369,9 +431,9 @@ def _build_eval_run_specs(
369
431
 
370
432
  # Create adaptive eval run spec from string
371
433
  elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
372
- if experiment_id is None or project_id is None:
373
- raise ParameterValidationError(
374
- "experiment_id and project_id are required for adaptive evaluation"
434
+ if not experiment_id or not project_id:
435
+ raise ScoreBookError(
436
+ "experiment_id and project_id are required for adaptive evaluations"
375
437
  )
376
438
  eval_run_specs.append(
377
439
  _build_adaptive_eval_run_spec(
@@ -385,24 +447,6 @@ def _build_eval_run_specs(
385
447
  )
386
448
  )
387
449
 
388
- # Create adaptive eval run spec from AdaptiveEvalDataset
389
- elif isinstance(dataset, AdaptiveEvalDataset):
390
- if experiment_id is None or project_id is None:
391
- raise ParameterValidationError(
392
- "experiment_id and project_id are required for adaptive evaluation"
393
- )
394
- eval_run_specs.append(
395
- _build_adaptive_eval_run_spec(
396
- dataset.name,
397
- dataset_index,
398
- hyperparameter_config,
399
- hyperparameters_index,
400
- experiment_id,
401
- project_id,
402
- metadata,
403
- )
404
- )
405
-
406
450
  # Log warning - should never happen
407
451
  else:
408
452
  logger.warning("Unrecognized dataset type: %s", dataset)
@@ -416,7 +460,7 @@ def _build_classic_eval_run_spec(
416
460
  hyperparameters: Dict[str, Any],
417
461
  hyperparameters_index: int,
418
462
  ) -> EvalRunSpec:
419
- """Build RunSpec objects for each dataset/hyperparameter combination."""
463
+ """Build EvalRunSpec objects for a classic dataset and hyperparameter combination."""
420
464
  items = dataset.items
421
465
  labels = [item.get(dataset.label) for item in items]
422
466
  eval_run_spec = EvalRunSpec(
@@ -440,6 +484,7 @@ def _build_adaptive_eval_run_spec(
440
484
  project_id: str,
441
485
  metadata: Optional[Dict[str, Any]] = None,
442
486
  ) -> AdaptiveEvalRunSpec:
487
+ """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
443
488
  dataset = adaptive_dataset.replace(":adaptive", "")
444
489
  adaptive_eval_run_spec = AdaptiveEvalRunSpec(
445
490
  dataset,
@@ -459,6 +504,7 @@ async def _run_inference_callable(
459
504
  items: List[Dict[str, Any]],
460
505
  hyperparameter_config: Dict[str, Any],
461
506
  ) -> Any:
507
+ """Run inference on a given dataset and hyperparameter configuration."""
462
508
  if is_awaitable(inference):
463
509
  return await inference(items, **hyperparameter_config)
464
510
  else:
@@ -493,6 +539,50 @@ def _score_metrics(
493
539
  return metric_scores
494
540
 
495
541
 
542
+ async def _upload_classic_run(
543
+ run_result: ClassicEvalRunResult,
544
+ experiment_id: str,
545
+ project_id: str,
546
+ inference_callable: Optional[Callable] = None,
547
+ metadata: Optional[Dict[str, Any]] = None,
548
+ ) -> Optional[str]:
549
+ """Upload a ClassicEvalRunResult to Trismik."""
550
+
551
+ logger.debug("Uploading classic eval run: %s", run_result.run_spec)
552
+ try:
553
+ model_name = _get_model_name(inference_callable, metadata)
554
+ response = await upload_classic_eval_run(
555
+ run=run_result,
556
+ experiment_id=experiment_id,
557
+ project_id=project_id,
558
+ model=model_name,
559
+ metadata=metadata,
560
+ )
561
+ logger.info("Successfully uploaded classic eval run: %s", response.id)
562
+ return str(response.id)
563
+
564
+ except Exception as e:
565
+ logger.error("Failed to upload classic eval run: %s", str(e))
566
+ return None
567
+
568
+
569
+ def _get_model_name(
570
+ inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
571
+ ) -> str:
572
+ """Determine a model's name with the fallback "unspecified"."""
573
+
574
+ # First priority: metadata.model
575
+ if metadata and "model" in metadata:
576
+ return str(metadata["model"])
577
+
578
+ # Second priority: inference_pipeline.model (if callable is an InferencePipeline)
579
+ if inference_callable and hasattr(inference_callable, "model"):
580
+ return str(inference_callable.model)
581
+
582
+ # Fallback: "unspecified"
583
+ return "unspecified"
584
+
585
+
496
586
  def _format_results(
497
587
  eval_result: EvalResult,
498
588
  return_dict: bool,
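The precedence implemented by _get_model_name is worth spelling out; an illustrative check, assuming it runs inside this module (PipelineStub is a stand-in for anything exposing a .model attribute, such as an InferencePipeline):

```python
class PipelineStub:
    model = "my-org/my-model"

    def __call__(self, items, **hyperparameters):
        return [None for _ in items]

assert _get_model_name(PipelineStub(), {"model": "gpt-x"}) == "gpt-x"  # metadata["model"] wins
assert _get_model_name(PipelineStub(), None) == "my-org/my-model"      # then the callable's .model
assert _get_model_name(None, None) == "unspecified"                    # final fallback
```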
@@ -510,10 +600,12 @@ def _format_results(
510
600
 
511
601
  if return_items:
512
602
  item_scores = eval_result.item_scores
603
+
513
604
  # Remove inference output if not requested
514
605
  if not return_output:
515
606
  for item in item_scores:
516
607
  item.pop("inference_output", None)
608
+
517
609
  results["item_results"] = item_scores
518
610
 
519
611
  # If both are requested, return the combined structure
scorebook/trismik_services/upload_classic_eval_run.py ADDED
@@ -0,0 +1,102 @@
1
+ """Upload classic evaluation run results to Trismik platform."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from trismik.adaptive_test import AdaptiveTest
7
+ from trismik.client_async import TrismikAsyncClient
8
+ from trismik.types import (
9
+ TrismikClassicEvalItem,
10
+ TrismikClassicEvalMetric,
11
+ TrismikClassicEvalRequest,
12
+ TrismikClassicEvalResponse,
13
+ )
14
+
15
+ from scorebook.trismik_services.login import get_token
16
+ from scorebook.types import ClassicEvalRunResult
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ async def upload_classic_eval_run(
22
+ run: ClassicEvalRunResult,
23
+ experiment_id: str,
24
+ project_id: str,
25
+ model: str,
26
+ metadata: Optional[Dict[str, Any]],
27
+ ) -> TrismikClassicEvalResponse:
28
+ """Upload a classic evaluation run result to Trismik platform.
29
+
30
+ Args:
31
+ run: The evaluation run result to upload
32
+ experiment_id: Trismik experiment identifier
33
+ project_id: Trismik project identifier
34
+ model: Model name used for evaluation
35
+ metadata: Optional metadata dictionary
36
+
37
+ Returns:
38
+ Response from Trismik API containing the upload result
39
+ """
40
+ runner = AdaptiveTest(
41
+ lambda x: None,
42
+ client=TrismikAsyncClient(
43
+ service_url="https://api-stage.trismik.com/adaptive-testing", api_key=get_token()
44
+ ),
45
+ )
46
+
47
+ # Create eval items from run_spec items, outputs, and labels
48
+ items: List[TrismikClassicEvalItem] = []
49
+ for idx, (item, output) in enumerate(zip(run.run_spec.items, run.outputs)):
50
+ label = run.run_spec.labels[idx] if idx < len(run.run_spec.labels) else ""
51
+
52
+ # Calculate item-level metrics for this item
53
+ item_metrics: Dict[str, Any] = {}
54
+ for metric_name, metric_data in run.scores.items():
55
+ if isinstance(metric_data, dict) and "item_scores" in metric_data:
56
+ if idx < len(metric_data["item_scores"]):
57
+ item_metrics[metric_name] = metric_data["item_scores"][idx]
58
+ else:
59
+ # If scores is just a single value, use it for all items
60
+ item_metrics[metric_name] = metric_data
61
+
62
+ eval_item = TrismikClassicEvalItem(
63
+ datasetItemId=str(idx),
64
+ modelInput=str(item),
65
+ modelOutput=str(output),
66
+ goldOutput=str(label),
67
+ metrics=item_metrics,
68
+ )
69
+ items.append(eval_item)
70
+
71
+ # Create eval metrics from run aggregate scores
72
+ metrics: List[TrismikClassicEvalMetric] = []
73
+ for metric_name, metric_data in run.scores.items():
74
+ if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
75
+ # Handle structured metric data with aggregate scores
76
+ for agg_name, agg_value in metric_data["aggregate_scores"].items():
77
+ metric_id = f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
78
+ metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
79
+ metrics.append(metric)
80
+ else:
81
+ # Handle simple metric data (single value)
82
+ metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
83
+ metrics.append(metric)
84
+
85
+ classic_eval_request = TrismikClassicEvalRequest(
86
+ project_id,
87
+ experiment_id,
88
+ run.run_spec.dataset.name,
89
+ model,
90
+ run.run_spec.hyperparameter_config,
91
+ items,
92
+ metrics,
93
+ )
94
+
95
+ response: TrismikClassicEvalResponse = await runner.submit_classic_eval_async(
96
+ classic_eval_request
97
+ )
98
+
99
+ run_id: str = response.id
100
+ logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
101
+
102
+ return response
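The aggregate-score flattening above fixes the metricId naming Trismik receives; a standalone sketch of just that rule, with illustrative scores in the dict shape the surrounding code expects:

```python
# Illustrative scores dict; keys and values are made up for the example.
scores = {
    "accuracy": {"aggregate_scores": {"accuracy": 0.92, "f1": 0.88}, "item_scores": [1, 0, 1]},
    "latency_ms": 143.0,  # simple single-value metric
}

flattened = {}
for metric_name, metric_data in scores.items():
    if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
        for agg_name, agg_value in metric_data["aggregate_scores"].items():
            metric_id = f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
            flattened[metric_id] = agg_value
    else:
        flattened[metric_name] = metric_data

print(flattened)  # {'accuracy': 0.92, 'accuracy_f1': 0.88, 'latency_ms': 143.0}
```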
scorebook/types.py CHANGED
@@ -52,35 +52,43 @@ class ClassicEvalRunResult:
52
52
  """Results from executing a classic evaluation run."""
53
53
 
54
54
  run_spec: EvalRunSpec
55
- outputs: List[Any]
56
- scores: Dict[str, Any]
55
+ run_completed: bool
56
+ outputs: Optional[List[Any]]
57
+ scores: Optional[Dict[str, Any]]
58
+ run_id: Optional[str] = None
57
59
 
58
60
  @property
59
61
  def item_scores(self) -> List[Dict[str, Any]]:
60
62
  """Return a list of dictionaries containing scores for each evaluated item."""
61
63
  results = []
62
64
 
63
- for idx, output in enumerate(self.outputs):
64
- if idx >= len(self.run_spec.items):
65
- break
66
-
67
- result = {
68
- "item_id": idx,
69
- "dataset_name": self.run_spec.dataset.name,
70
- "inference_output": output,
71
- **self.run_spec.hyperparameter_config,
72
- }
73
-
74
- # Add individual item scores if available
75
- for metric_name, metric_data in self.scores.items():
76
- if isinstance(metric_data, dict) and "item_scores" in metric_data:
77
- if idx < len(metric_data["item_scores"]):
78
- result[metric_name] = metric_data["item_scores"][idx]
79
- else:
80
- # If scores is just a single value, replicate it for each item
81
- result[metric_name] = metric_data
82
-
83
- results.append(result)
65
+ if self.outputs:
66
+ for idx, output in enumerate(self.outputs):
67
+ if idx >= len(self.run_spec.items):
68
+ break
69
+
70
+ result = {
71
+ "item_id": idx,
72
+ "dataset_name": self.run_spec.dataset.name,
73
+ "inference_output": output,
74
+ **self.run_spec.hyperparameter_config,
75
+ }
76
+
77
+ # Add run_id if available
78
+ if self.run_id is not None:
79
+ result["run_id"] = self.run_id
80
+
81
+ # Add individual item scores if available
82
+ if self.scores is not None:
83
+ for metric_name, metric_data in self.scores.items():
84
+ if isinstance(metric_data, dict) and "item_scores" in metric_data:
85
+ if idx < len(metric_data["item_scores"]):
86
+ result[metric_name] = metric_data["item_scores"][idx]
87
+ else:
88
+ # If scores is just a single value, replicate it for each item
89
+ result[metric_name] = metric_data
90
+
91
+ results.append(result)
84
92
 
85
93
  return results
86
94
 
@@ -89,19 +97,25 @@ class ClassicEvalRunResult:
89
97
  """Return the aggregated scores for this run."""
90
98
  result = {
91
99
  "dataset": self.run_spec.dataset.name,
100
+ "run_completed": self.run_completed,
92
101
  **self.run_spec.hyperparameter_config,
93
102
  }
94
103
 
104
+ # Add run_id if available
105
+ if self.run_id is not None:
106
+ result["run_id"] = self.run_id
107
+
95
108
  # Add aggregate scores from metrics
96
- for metric_name, metric_data in self.scores.items():
97
- if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
98
- # Flatten the aggregate scores from each metric
99
- for key, value in metric_data["aggregate_scores"].items():
100
- score_key = key if key == metric_name else f"{metric_name}_{key}"
101
- result[score_key] = value
102
- else:
103
- # If scores is just a single value, use it as is
104
- result[metric_name] = metric_data
109
+ if self.scores is not None:
110
+ for metric_name, metric_data in self.scores.items():
111
+ if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
112
+ # Flatten the aggregate scores from each metric
113
+ for key, value in metric_data["aggregate_scores"].items():
114
+ score_key = key if key == metric_name else f"{metric_name}_{key}"
115
+ result[score_key] = value
116
+ else:
117
+ # If scores is just a single value, use it as is
118
+ result[metric_name] = metric_data
105
119
 
106
120
  return result
107
121
 
@@ -149,7 +163,7 @@ class EvalResult:
149
163
  results = []
150
164
 
151
165
  for run_result in self.run_results:
152
- if isinstance(run_result, ClassicEvalRunResult):
166
+ if isinstance(run_result, ClassicEvalRunResult) and run_result.run_completed:
153
167
  results.extend(run_result.item_scores)
154
168
 
155
169
  return results
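With run_completed, run_id, and the now-Optional outputs/scores fields, failed runs are recorded instead of raising; a sketch of reading them back, assuming result holds the EvalResult returned by evaluate(..., return_dict=False):

```python
from scorebook.types import ClassicEvalRunResult

# `result` is assumed to be the EvalResult from evaluate(..., return_dict=False).
for run in result.run_results:
    if not isinstance(run, ClassicEvalRunResult):
        continue  # adaptive runs use their own result type
    if run.run_completed:
        print(run.run_spec.dataset.name, run.run_id, run.scores)
    else:
        print("run failed:", run.run_spec.dataset.name)  # outputs/scores may be None
```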
scorebook-0.0.5.dist-info/METADATA → scorebook-0.0.7.dist-info/METADATA CHANGED
@@ -1,7 +1,8 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: scorebook
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Summary: A Python project for LLM evaluation.
5
+ License-File: LICENSE
5
6
  Author: Euan Campbell
6
7
  Author-email: euan@trismik.com
7
8
  Requires-Python: >=3.9
@@ -11,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.10
11
12
  Classifier: Programming Language :: Python :: 3.11
12
13
  Classifier: Programming Language :: Python :: 3.12
13
14
  Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
14
16
  Provides-Extra: bedrock
15
17
  Provides-Extra: examples
16
18
  Provides-Extra: openai
@@ -35,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
35
37
  Requires-Dist: torchaudio ; extra == "examples"
36
38
  Requires-Dist: torchvision ; extra == "examples"
37
39
  Requires-Dist: transformers ; extra == "examples"
38
- Requires-Dist: trismik
40
+ Requires-Dist: trismik (>=0.9.4)
39
41
  Description-Content-Type: text/markdown
40
42
 
41
43
  # Scorebook
scorebook-0.0.5.dist-info/RECORD → scorebook-0.0.7.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- scorebook/__init__.py,sha256=yHhNIHeLeRwjdyfSg3jtCz-NbQXMlN9fLhHJ1QzPQGQ,548
1
+ scorebook/__init__.py,sha256=30kyXG8sVbThtCt6cjPtkx7hiaUEukRQC-RsgunlkL4,557
2
2
  scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
3
3
  scorebook/cli/auth.py,sha256=bv3imsgmY_t52wFoMJt9iu-cKPwvKYkVqZ7nE8EVc6E,2931
4
4
  scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
5
5
  scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
6
- scorebook/evaluate.py,sha256=0Begs5Py9rpapoMixpqjlS2ofigQaGofbjqucABRfuM,19088
6
+ scorebook/evaluate.py,sha256=7tj1qFogcQJtCTAKK3oFnjV1xCMpxaHVnK0EGa9M4Hg,21912
7
7
  scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
8
8
  scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
9
9
  scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
@@ -16,10 +16,11 @@ scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo
16
16
  scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
17
17
  scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
18
18
  scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
19
- scorebook/trismik/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
20
- scorebook/trismik/adaptive_testing_service.py,sha256=S1yAVnrzqtVWprsiNS_l3q5FibQkMuAs7I7YaSFNtKM,5109
21
- scorebook/trismik/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
22
- scorebook/types.py,sha256=wQNFewn9Ji7nQJhXwRS-hVAL4XV6ePcLVdVQiMfWYzg,5149
19
+ scorebook/trismik_services/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
20
+ scorebook/trismik_services/adaptive_testing_service.py,sha256=S1yAVnrzqtVWprsiNS_l3q5FibQkMuAs7I7YaSFNtKM,5109
21
+ scorebook/trismik_services/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
22
+ scorebook/trismik_services/upload_classic_eval_run.py,sha256=oHxELymEPxikVbtC6EQ06LfuNKEFV4Ijnth5v78bAmQ,3664
23
+ scorebook/types.py,sha256=zt8sGfbRjXatx1WtttWZDVIoiS-yhh_1lP0K4VHYvAM,5797
23
24
  scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
24
25
  scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
25
26
  scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
@@ -28,8 +29,8 @@ scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcK
28
29
  scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
29
30
  scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
30
31
  scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
31
- scorebook-0.0.5.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
32
- scorebook-0.0.5.dist-info/METADATA,sha256=t8ADsF_Eul8RVxZ1HS6VF1omw5RMWeWdSQdB0G8Czhg,11432
33
- scorebook-0.0.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
34
- scorebook-0.0.5.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
35
- scorebook-0.0.5.dist-info/RECORD,,
32
+ scorebook-0.0.7.dist-info/METADATA,sha256=rVXvTh-2fP1H6xMoRvzphrVdy14MirVjnA844Sgb1zA,11515
33
+ scorebook-0.0.7.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
34
+ scorebook-0.0.7.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
35
+ scorebook-0.0.7.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
36
+ scorebook-0.0.7.dist-info/RECORD,,
scorebook-0.0.5.dist-info/WHEEL → scorebook-0.0.7.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.3
2
+ Generator: poetry-core 2.2.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
scorebook-0.0.5.dist-info/entry_points.txt → scorebook-0.0.7.dist-info/entry_points.txt RENAMED
File without changes
scorebook-0.0.5.dist-info/LICENSE → scorebook-0.0.7.dist-info/licenses/LICENSE RENAMED
File without changes