scorebook 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
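The user-facing changes in 0.0.13 center on dataset splits and scoring: evaluate() gains a split parameter, adaptive datasets keep the "test_id:adaptive" form with the split resolved against the splits reported by the Trismik backend, and classic runs are now scored (and optionally uploaded) through the new scorebook.score package rather than a dedicated upload helper. A minimal usage sketch of the new parameter follows; the inference callable, the test id, and the import path of evaluate are assumptions for illustration only:

    from scorebook.evaluate import evaluate  # assumed import path

    def my_inference(inputs, **hyperparameters):  # hypothetical inference callable
        return ["some answer" for _ in inputs]

    results = evaluate(
        my_inference,
        datasets="my_test_id:adaptive",  # hypothetical adaptive test id ("test_id:adaptive" form)
        split="validation",              # new in 0.0.13
        upload_results=False,
    )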
@@ -2,13 +2,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -19,12 +14,12 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_show_progress,
-    resolve_upload_results,
-    score_metrics,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -32,15 +27,20 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from contextlib import nullcontext
-from scorebook.utils import evaluation_progress_context
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
    experiment_id: Optional[str] = None,
@@ -59,6 +59,7 @@ def evaluate(
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
@@ -83,14 +84,14 @@ def evaluate(
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +102,14 @@ def evaluate(
     with trismik_client or nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)
 
         with evaluation_progress_context(
@@ -145,34 +153,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-            items_processed = len(run.dataset.items)
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
+        # Update upload progress for classic evals
         if (
             upload_results
            and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
            and run_result.run_completed
-            and trismik_client is not None
        ):
-            try:
-                run_id = upload_classic_run_results(
-                    run_result, experiment_id, project_id, inference, metadata, trismik_client
-                )
-                run_result.run_id = run_id
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=True)
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=False)
-                # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
@@ -191,6 +197,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +206,9 @@ def execute_run(
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(inference, run)
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +226,79 @@ def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
-    """Execute a classic evaluation run."""
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-    metric_scores = None
+    scores = None
 
     try:
+        # 1. Run inference
        inference_outputs = run_inference_callable(
            inference, run.inputs, run.hyperparameter_config
        )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )
 
     except Exception as e:
        logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 def run_inference_callable(
@@ -296,93 +360,6 @@ def execute_adaptive_eval_run(
     return AdaptiveEvalRunResult(run, False, {})
 
 
-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
@@ -403,8 +380,20 @@ def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(
@@ -2,9 +2,8 @@
 
 import asyncio
 import dataclasses
-import inspect
 import logging
-from typing import Any, Callable, Dict, Iterable, List, Literal, Mapping, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
 from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
@@ -25,30 +24,34 @@ from scorebook.utils import expand_dict, is_awaitable
 logger = logging.getLogger(__name__)
 
 
-def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
-    """Resolve the upload_results parameter based on trismik login status."""
+# TODO: Remove this when backend supports boolean item metrics
+NORMALIZE_METRICS_FOR_UPLOAD = True
 
-    if upload_results == "auto":
-        upload_results = get_token() is not None
-        logger.debug("Auto upload results resolved to: %s", upload_results)
 
-    return upload_results
+def normalize_metric_value(value: Any) -> Any:
+    """Normalize metric values for API upload compatibility.
 
-
-def resolve_show_progress(show_progress: Optional[bool]) -> bool:
-    """Resolve whether to show progress bars.
+    TEMPORARY WORKAROUND: The Trismik API currently rejects boolean metric values.
+    This function converts boolean values to floats (True -> 1.0, False -> 0.0)
+    to ensure upload compatibility.
 
     Args:
-        show_progress: Explicit setting (None uses default from settings)
+        value: The metric value to normalize
 
     Returns:
-        bool: Whether to show progress bars
+        Float if value is bool, otherwise unchanged
+
+    TODO: Remove this function when backend supports boolean metrics natively.
+    To revert: Set NORMALIZE_METRICS_FOR_UPLOAD = False
     """
-    if show_progress is None:
-        from scorebook.settings import SHOW_PROGRESS_BARS
+    if not NORMALIZE_METRICS_FOR_UPLOAD:
+        return value
+
+    # Convert booleans to floats for API compatibility
+    if isinstance(value, bool):
+        return float(value)  # True -> 1.0, False -> 0.0
 
-        return bool(SHOW_PROGRESS_BARS)
-    return show_progress
+    return value
 
 
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
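The boolean-normalization workaround above is self-contained; a short sketch of its intended behavior follows (the import path is an assumption, matching the scorebook.evaluate.evaluate_helpers imports earlier in this diff):

    from scorebook.evaluate.evaluate_helpers import normalize_metric_value  # assumed path

    # Booleans become floats for upload; everything else passes through unchanged
    assert normalize_metric_value(True) == 1.0
    assert normalize_metric_value(False) == 0.0
    assert normalize_metric_value(0.87) == 0.87
    assert normalize_metric_value("exact_match") == "exact_match"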
@@ -88,6 +91,7 @@ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
 
 def prepare_datasets(
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     sample_size: Optional[int] = None,
 ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
@@ -101,6 +105,12 @@ def prepare_datasets(
 
         # Prepare classic datasets
         if isinstance(dataset, EvalDataset):
+            # Warn if dataset split differs from provided split parameter
+            if split is not None and dataset.split is not None and dataset.split != split:
+                logger.warning(
+                    f"Dataset '{dataset.name}' has split '{dataset.split}' but evaluate split "
+                    f"parameter is '{split}'. The dataset split will be used."
+                )
 
             if sample_size is not None:
                 dataset = dataset.sample(sample_size)
@@ -108,8 +118,17 @@ def prepare_datasets(
             datasets_out.append(dataset)
 
         # Prepare adaptive datasets
-        elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-            datasets_out.append(AdaptiveEvalDataset(dataset.replace(":adaptive", "")))
+        elif isinstance(dataset, str) and ":adaptive" in dataset:
+            # Parse adaptive dataset
+            parts = dataset.split(":")
+            if len(parts) != 2 or parts[1] != "adaptive":
+                raise ParameterValidationError(
+                    f"Invalid adaptive dataset format: '{dataset}'. "
+                    f"Use 'test_id:adaptive' format and specify split via the split parameter."
+                )
+
+            # Use the split parameter for all adaptive datasets
+            datasets_out.append(AdaptiveEvalDataset(name=dataset, split=split))
 
         # TODO: dataset name string registry
        elif isinstance(dataset, str):
@@ -171,6 +190,7 @@ def build_eval_run_specs(
                    hyperparameters_index,
                    experiment_id,
                    project_id,
+                    dataset.split,
                    metadata,
                )
            )
@@ -217,17 +237,19 @@ def build_adaptive_eval_run_spec(
     hyperparameter_config_index: int,
     experiment_id: str,
     project_id: str,
+    split: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
-    dataset = adaptive_dataset.replace(":adaptive", "")
+    # Keep the full dataset name including ":adaptive" suffix for backend API
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
-        dataset,
+        adaptive_dataset,
        dataset_index,
        hyperparameter_config,
        hyperparameter_config_index,
        experiment_id,
        project_id,
+        split,
        metadata,
    )
    logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
@@ -345,10 +367,7 @@ def make_trismik_inference(
     """
 
     # Check if the inference function is async
-    is_async = inspect.iscoroutinefunction(inference_function) or (
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
+    is_async = is_awaitable(inference_function)
 
     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
         # Single TrismikMultipleChoiceTextItem dataclass
@@ -386,3 +405,57 @@
         )
 
     return sync_trismik_inference_function
+
+
+def resolve_adaptive_split(
+    test_id: str,
+    user_specified_split: Optional[str],
+    available_splits: List[str],
+) -> str:
+    """Resolve the dataset split to use for adaptive evaluation.
+
+    Resolution order:
+    1. If user specified a split, validate it exists and use it
+    2. If not specified and exactly one split is available, use it
+    3. If not specified and multiple splits are available, raise an error
+    4. If no splits are available, raise an error
+
+    Args:
+        test_id: The test dataset ID (without ":adaptive" suffix)
+        user_specified_split: Optional split name specified by the user
+        available_splits: List of available split names for this dataset
+
+    Returns:
+        The resolved split name to use
+
+    Raises:
+        ScoreBookError: If the specified split doesn't exist, multiple splits exist without
+            user specification, or no splits are available
+    """
+    logger.debug(f"Available splits for {test_id}: {available_splits}")
+
+    # If user specified a split, validate and use it
+    if user_specified_split is not None:
+        if user_specified_split in available_splits:
+            logger.info(f"Using user-specified split '{user_specified_split}' for {test_id}")
+            return user_specified_split
+        else:
+            raise ScoreBookError(
+                f"Specified split '{user_specified_split}' not found for dataset '{test_id}'. "
+                f"Available splits: {available_splits}"
+            )
+
+    # No split specified - check available splits
+    if len(available_splits) == 0:
+        raise ScoreBookError(f"No splits available for dataset '{test_id}'. ")
+    elif len(available_splits) == 1:
+        # Exactly one split - auto-select it
+        selected_split = available_splits[0]
+        logger.info(f"Auto-selecting only available split '{selected_split}' for {test_id}")
+        return selected_split
+    else:
+        # Multiple splits available - user must specify
+        raise ScoreBookError(
+            f"Multiple splits available for dataset '{test_id}': {available_splits}. "
+            f"Please specify which split to use via evaluate's 'split' parameter."
+        )
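A short illustration of the resolution rules documented above, with hypothetical test ids and split lists:

    resolve_adaptive_split("my_test", "validation", ["train", "validation"])  # -> "validation"
    resolve_adaptive_split("my_test", None, ["validation"])                   # -> "validation" (only split, auto-selected)
    resolve_adaptive_split("my_test", None, ["validation", "test"])           # raises ScoreBookError (ambiguous)
    resolve_adaptive_split("my_test", "dev", ["validation", "test"])          # raises ScoreBookError (unknown split)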
scorebook/exceptions.py CHANGED
@@ -84,10 +84,14 @@ class MetricComputationError(EvaluationError):
         )
 
 
-class DataMismatchError(EvaluationError):
+class ScoreError(ScoreBookError):
+    """Raised when there are errors during scoring."""
+
+
+class DataMismatchError(ScoreError):
     """Raised when there's a mismatch between outputs and expected labels."""
 
-    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str = "Dataset"):
        """Initialize data mismatch error."""
        self.outputs_count = outputs_count
        self.labels_count = labels_count
@@ -0,0 +1,6 @@
+"""Score module for computing metrics on pre-computed outputs."""
+
+from scorebook.score._async.score_async import score_async
+from scorebook.score._sync.score import score
+
+__all__ = ["score", "score_async"]
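This new scorebook.score package re-exports score() and score_async() for scoring pre-computed outputs, and execute_classic_eval_run() above now routes classic runs through score(). A rough sketch of direct use, based only on the keyword arguments visible in this diff; my_metrics is a placeholder, since the exact form of the metric objects is not shown here:

    from scorebook.score import score

    items = [
        {"input": "2 + 2", "output": "4", "label": "4"},
        {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
    ]

    results = score(
        items=items,
        metrics=my_metrics,     # placeholder: whatever metric(s) scorebook accepts (cf. run.dataset.metrics above)
        output_column="output",
        label_column="label",
        input_column="input",
        upload_results=False,   # keep the run local; experiment_id/project_id omitted
        show_progress=False,
    )
    print(results.get("aggregate_results"))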