scorebook 0.0.10-py3-none-any.whl → 0.0.12-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -2,15 +2,10 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.types import (
-    TrismikClassicEvalItem,
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -19,11 +14,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._sync.score import score
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -31,14 +26,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from contextlib import nullcontext
-from scorebook.utils import evaluation_progress
+from scorebook.utils import (
+    nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 def evaluate(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -50,6 +49,7 @@ def evaluate(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +67,8 @@
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +77,7 @@
     """
     # Resolve and validate parameters
    upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -85,7 +88,7 @@
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -95,10 +98,24 @@
 
     with trismik_client or nullcontext():
         # Execute evaluation runs
-        with evaluation_progress(
+        # Calculate total items across all runs
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-            hyperparameter_config_count=len(hyperparameter_configs),
-            run_count=len(eval_run_specs),
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = execute_runs(
                 inference,
@@ -133,23 +150,32 @@ def execute_runs(
     def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
        run_result = execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)
 
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
+
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-            run_id = upload_classic_run_results(
-                run_result, experiment_id, project_id, inference, metadata, trismik_client
-            )
-            run_result.run_id = run_id
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
@@ -168,6 +194,7 @@ def execute_runs(
 def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -176,7 +203,9 @@
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return execute_classic_eval_run(inference, run)
+        return execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -194,35 +223,90 @@ def execute_run(
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
-    """Execute a classic evaluation run."""
+def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-    metric_scores = None
+    scores = None
 
     try:
+        # 1. Run inference
        inference_outputs = run_inference_callable(
-            inference, run.dataset.items, run.hyperparameter_config
+            inference, run.inputs, run.hyperparameter_config
+        )
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = score(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
         )
-        metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
-        logger.debug("Classic evaluation completed for run %s", run)
-        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
 
     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 def run_inference_callable(
     inference: Callable,
-    items: List[Dict[str, Any]],
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = inference(items, **hyperparameter_config)
+        predictions = inference(inputs, **hyperparameter_config)
     except Exception as e:
        logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +314,11 @@ def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(items):
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the input items. "
-            f"Items length: {len(items)}, predictions length: {len(predictions)}"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,100 +341,20 @@ def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-    if trismik_client is None:
-        raise ScoreBookError("Trismik client is required for adaptive evaluation")
-
-    adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
-
-    return adaptive_eval_run_result
-
-
-def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec items, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(item),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
+        adaptive_eval_run_result = run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
         )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+        return adaptive_eval_run_result
 
-    return run_id
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 def run_adaptive_evaluation(
@@ -410,4 +414,4 @@ def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
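As an aside on the contract that `run_inference_callable` enforces above: the callable now receives a plain list of inputs (rather than item dictionaries) and must return a list of predictions of the same length, otherwise an `InferenceError` is raised. A minimal, self-contained sketch of a conforming callable; the `prefix` hyperparameter is invented for illustration and is not a scorebook name:

```python
from typing import Any, List


def toy_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
    """Toy inference callable: one prediction per input, in input order."""
    prefix = hyperparameters.get("prefix", "echo")  # illustrative knob, not a scorebook parameter
    return [f"{prefix}: {item}" for item in inputs]


predictions = toy_inference(["2 + 2 = ?", "Capital of France?"], prefix="answer")
assert len(predictions) == 2  # a length mismatch would trigger InferenceError in scorebook
```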
@@ -2,9 +2,8 @@
 
 import asyncio
 import dataclasses
-import inspect
 import logging
-from typing import Any, Callable, Dict, Iterable, List, Literal, Mapping, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union
 
 from trismik._async.client import TrismikAsyncClient
 from trismik._sync.client import TrismikClient
@@ -25,14 +24,34 @@ from scorebook.utils import expand_dict, is_awaitable
 logger = logging.getLogger(__name__)
 
 
-def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
-    """Resolve the upload_results parameter based on trismik login status."""
+# TODO: Remove this when backend supports boolean item metrics
+NORMALIZE_METRICS_FOR_UPLOAD = True
 
-    if upload_results == "auto":
-        upload_results = get_token() is not None
-        logger.debug("Auto upload results resolved to: %s", upload_results)
 
-    return upload_results
+def normalize_metric_value(value: Any) -> Any:
+    """Normalize metric values for API upload compatibility.
+
+    TEMPORARY WORKAROUND: The Trismik API currently rejects boolean metric values.
+    This function converts boolean values to floats (True -> 1.0, False -> 0.0)
+    to ensure upload compatibility.
+
+    Args:
+        value: The metric value to normalize
+
+    Returns:
+        Float if value is bool, otherwise unchanged
+
+    TODO: Remove this function when backend supports boolean metrics natively.
+    To revert: Set NORMALIZE_METRICS_FOR_UPLOAD = False
+    """
+    if not NORMALIZE_METRICS_FOR_UPLOAD:
+        return value
+
+    # Convert booleans to floats for API compatibility
+    if isinstance(value, bool):
+        return float(value)  # True -> 1.0, False -> 0.0
+
+    return value
 
 
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
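To make the boolean-normalization workaround above concrete, here is a standalone copy of the same logic with a few example values; the function body mirrors the diff, while the sample metric values are illustrative:

```python
from typing import Any

NORMALIZE_METRICS_FOR_UPLOAD = True  # mirrors the module-level flag added above


def normalize_metric_value(value: Any) -> Any:
    """Convert booleans to floats for upload compatibility; leave other values unchanged."""
    if not NORMALIZE_METRICS_FOR_UPLOAD:
        return value
    if isinstance(value, bool):
        return float(value)  # True -> 1.0, False -> 0.0
    return value


assert normalize_metric_value(True) == 1.0
assert normalize_metric_value(False) == 0.0
assert normalize_metric_value(0.87) == 0.87  # non-boolean metric values pass through
```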
@@ -93,7 +112,7 @@ def prepare_datasets(
 
         # Prepare adaptive datasets
         elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-            datasets_out.append(AdaptiveEvalDataset(dataset.replace(":adaptive", "")))
+            datasets_out.append(AdaptiveEvalDataset(dataset))
 
         # TODO: dataset name string registry
         elif isinstance(dataset, str):
@@ -172,15 +191,22 @@ def build_classic_eval_run_spec(
     hyperparameters: Dict[str, Any],
     hyperparameters_index: int,
 ) -> EvalRunSpec:
-    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination."""
-    items = dataset.items
-    labels = [item.get(dataset.label) for item in items]
+    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
+
+    Extracts input and label values from the appropriate columns in the dataset.
+    The column names are determined by dataset.input and dataset.label,
+    which may be original field names (e.g., "question", "answer") or computed
+    column names (e.g., "*input", "*label") if templates were used.
+    """
+    # Extract inputs and labels using the dataset's column specifications
+    inputs = dataset[dataset.input]  # Returns List[Any]
+    labels = dataset[dataset.label]  # Returns List[Any]
     eval_run_spec = EvalRunSpec(
         dataset,
         dataset_index,
         hyperparameters,
         hyperparameters_index,
-        items,
+        inputs,
         labels,
     )
     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
@@ -197,9 +223,9 @@ def build_adaptive_eval_run_spec(
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
-    dataset = adaptive_dataset.replace(":adaptive", "")
+    # Keep the full dataset name including ":adaptive" suffix for backend API
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
-        dataset,
+        adaptive_dataset,
         dataset_index,
         hyperparameter_config,
         hyperparameter_config_index,
@@ -256,7 +282,7 @@ def create_trismik_sync_client() -> TrismikClient:
 def get_model_name(
     inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
 ) -> str:
-    """Determine a model's name with the fallback "unspecified"."""
+    """Determine a model's name with the fallback "Model"."""
 
     # First priority: metadata.model
     if metadata and "model" in metadata:
@@ -266,8 +292,8 @@
     if inference_callable and hasattr(inference_callable, "model"):
         return str(inference_callable.model)
 
-    # Fallback: "unspecified"
-    return "unspecified"
+    # Fallback: "Model"
+    return "Model"
 
 
 def format_results(
@@ -292,7 +318,7 @@ def format_results(
     # Remove inference output if not requested
     if not return_output:
         for item in item_scores:
-            item.pop("inference_output", None)
+            item.pop("output", None)
 
     results["item_results"] = item_scores
 
@@ -322,10 +348,7 @@ def make_trismik_inference(
     """
 
     # Check if the inference function is async
-    is_async = inspect.iscoroutinefunction(inference_function) or (
-        hasattr(inference_function, "__call__")
-        and inspect.iscoroutinefunction(inference_function.__call__)
-    )
+    is_async = is_awaitable(inference_function)
 
     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
         # Single TrismikMultipleChoiceTextItem dataclass
scorebook/exceptions.py CHANGED
@@ -10,6 +10,54 @@ class ScoreBookError(Exception):
     """Base exception class for all Scorebook-related errors."""
 
 
+class EvalDatasetError(ScoreBookError):
+    """Base exception class for all EvalDataset errors."""
+
+
+class DatasetConfigurationError(EvalDatasetError):
+    """Raised when dataset configuration is invalid (e.g., mutually exclusive parameters)."""
+
+
+class MissingFieldError(EvalDatasetError):
+    """Raised when required field is missing from dataset."""
+
+    def __init__(self, field_name: str, field_type: str, available_fields: list[str]):
+        """Initialize missing field error with structured context."""
+        self.field_name = field_name
+        self.field_type = field_type  # "input" or "label"
+        self.available_fields = available_fields
+        super().__init__(
+            f"{field_type.capitalize()} field '{field_name}' not found. "
+            f"Available fields: {', '.join(available_fields)}"
+        )
+
+
+class DatasetLoadError(EvalDatasetError):
+    """Raised when dataset fails to load from source (file or remote)."""
+
+
+class DatasetParseError(EvalDatasetError):
+    """Raised when dataset file cannot be parsed (CSV, JSON, YAML)."""
+
+
+class DatasetNotInitializedError(EvalDatasetError):
+    """Raised when operations are attempted on uninitialized dataset."""
+
+
+class DatasetSampleError(EvalDatasetError):
+    """Raised when sampling parameters are invalid."""
+
+    def __init__(self, sample_size: int, dataset_size: int, dataset_name: str):
+        """Initialize dataset sample error with structured context."""
+        self.sample_size = sample_size
+        self.dataset_size = dataset_size
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Sample size {sample_size} exceeds dataset size {dataset_size} "
+            f"for dataset '{dataset_name}'"
+        )
+
+
 class EvaluationError(ScoreBookError):
     """Raised when there are errors during model evaluation."""
 
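For reference, a short sketch of how the new dataset exception hierarchy above can be used from calling code; the field and column names are invented for illustration:

```python
from scorebook.exceptions import EvalDatasetError, MissingFieldError

try:
    # Illustrative: a dataset loader would raise this when the configured input column is absent.
    raise MissingFieldError("question", "input", ["prompt", "answer"])
except MissingFieldError as err:
    # Structured context is carried on the exception instance.
    print(err.field_name, err.field_type, err.available_fields)
except EvalDatasetError:
    # Load, parse, configuration, and sampling errors all share this base class.
    pass
```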
@@ -36,10 +84,14 @@ class MetricComputationError(EvaluationError):
         )
 
 
-class DataMismatchError(EvaluationError):
+class ScoreError(ScoreBookError):
+    """Raised when there are errors during scoring."""
+
+
+class DataMismatchError(ScoreError):
     """Raised when there's a mismatch between outputs and expected labels."""
 
-    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str):
+    def __init__(self, outputs_count: int, labels_count: int, dataset_name: str = "Dataset"):
         """Initialize data mismatch error."""
         self.outputs_count = outputs_count
         self.labels_count = labels_count
@@ -17,7 +17,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import boto3
 from botocore.config import Config
 from botocore.exceptions import ClientError
-from tqdm.asyncio import tqdm
+from tqdm.auto import tqdm
 
 
 async def batch(
@@ -13,7 +13,7 @@ import tempfile
 from typing import Any, List, Optional
 
 from portkey_ai import AsyncPortkey
-from tqdm.asyncio import tqdm
+from tqdm.auto import tqdm
 
 
 async def responses(
@@ -18,7 +18,7 @@ import pandas as pd
 from google import genai
 from google.cloud import storage
 from google.genai import types
-from tqdm.asyncio import tqdm
+from tqdm.auto import tqdm
 
 
 async def responses(
@@ -0,0 +1,6 @@
+"""Score module for computing metrics on pre-computed outputs."""
+
+from scorebook.score._async.score_async import score_async
+from scorebook.score._sync.score import score
+
+__all__ = ["score", "score_async"]
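The new score package exposed here computes metrics over pre-computed outputs without running inference, and is what `execute_classic_eval_run` now delegates to. A hedged usage sketch mirroring the keyword arguments seen in that call; whether metrics can be passed by name and the exact shape of the returned dictionary are assumptions not shown in this diff:

```python
from scorebook.score import score  # new in 0.0.12

items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Lyon", "label": "Paris"},
]

results = score(
    items=items,
    metrics="accuracy",      # assumption: the diff only shows run.dataset.metrics being passed
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,    # skip the Trismik upload path
    show_progress=False,
)
print(results.get("aggregate_results"))  # run_id appears here when an upload succeeds
```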