scorebook-0.0.10-py3-none-any.whl → scorebook-0.0.11-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
@@ -11,7 +11,7 @@ from trismik.types import (
     TrismikRunMetadata,
 )
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_async_client,
@@ -20,6 +20,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_show_progress,
     resolve_upload_results,
     score_metrics,
     validate_parameters,
@@ -32,7 +33,7 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import async_nullcontext, evaluation_progress
+from scorebook.utils import async_nullcontext, evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +51,7 @@ async def evaluate_async(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +69,8 @@ async def evaluate_async(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
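For callers, the precedence implied by the new parameter (and by the `resolve_show_progress` helper added later in this diff) is: an explicit `show_progress=True/False` wins, and the default `None` defers to `SHOW_PROGRESS_BARS` in `scorebook.settings`. A hedged sketch of the resolution behaviour; the helper's import path comes from this diff, while the commented `evaluate_async` call uses placeholder argument names:

    from scorebook.evaluate.evaluate_helpers import resolve_show_progress

    # None defers to scorebook.settings.SHOW_PROGRESS_BARS; explicit booleans pass through.
    print(resolve_show_progress(None))   # settings default (True unless overridden via env var)
    print(resolve_show_progress(False))  # False
    # Inside evaluate_async() the same resolution happens internally, e.g. (placeholders):
    # result = await evaluate_async(my_inference, my_datasets, show_progress=False)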
@@ -75,6 +79,7 @@ async def evaluate_async(
     """
     # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate_async)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -95,10 +100,17 @@
 
     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
-        with evaluation_progress(
+        # Calculate total items across all runs
+        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-            hyperparameter_config_count=len(hyperparameter_configs),
-            run_count=len(eval_run_specs),
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = await execute_runs(
                 inference,
@@ -136,7 +148,10 @@ async def execute_runs(
         run_result = await execute_run(
             inference, run, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            items_processed = len(run.dataset.items)
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         if (
             upload_results
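Taken together with the next hunk, this implies the object yielded by `evaluation_progress_context` only needs two callbacks, `on_run_completed(items_processed, run_completed)` used here and `on_upload_completed(succeeded=...)` used below, and may be `None` when progress bars are disabled. A hypothetical stand-in that satisfies that implied interface (not the packaged implementation):

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class ProgressBarsStub:
        """Illustrative stand-in mirroring the callbacks invoked by execute_runs."""

        items_done: int = 0
        failed_runs: int = 0
        uploads: List[bool] = field(default_factory=list)

        def on_run_completed(self, items_processed: int, run_completed: bool) -> None:
            # Called once per eval run with its item count and success flag.
            self.items_done += items_processed
            if not run_completed:
                self.failed_runs += 1

        def on_upload_completed(self, succeeded: bool) -> None:
            # Called after each attempted upload of classic run results.
            self.uploads.append(succeeded)

    bars = ProgressBarsStub()
    bars.on_run_completed(100, True)
    bars.on_upload_completed(succeeded=False)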
@@ -146,10 +161,18 @@ async def execute_runs(
             and run_result.run_completed
             and trismik_client is not None
         ):
-            run_id = await upload_classic_run_results(
-                run_result, experiment_id, project_id, inference, metadata, trismik_client
-            )
-            run_result.run_id = run_id
+            try:
+                run_id = await upload_classic_run_results(
+                    run_result, experiment_id, project_id, inference, metadata, trismik_client
+                )
+                run_result.run_id = run_id
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=True)
+            except Exception as e:
+                logger.warning(f"Failed to upload run results: {e}")
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=False)
+                # Continue evaluation even if upload fails
 
         return run_result
 
@@ -203,7 +226,7 @@ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> Cla
 
     try:
         inference_outputs = await run_inference_callable(
-            inference, run.dataset.items, run.hyperparameter_config
+            inference, run.inputs, run.hyperparameter_config
         )
         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
         logger.debug("Classic evaluation completed for run %s", run)
@@ -216,13 +239,13 @@ async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> Cla
 
 async def run_inference_callable(
     inference: Callable,
-    items: List[Dict[str, Any]],
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = await inference(items, **hyperparameter_config)
+        predictions = await inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +253,11 @@ async def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(items):
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the input items. "
-            f"Items length: {len(items)}, predictions length: {len(predictions)}"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
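The validation above fixes the inference contract after this change: the callable receives a list of input values (no longer full item dicts) plus the hyperparameter config as keyword arguments, and must return a list of predictions of the same length. A minimal conforming async callable, purely illustrative:

    import asyncio
    from typing import Any, List

    async def dummy_inference(inputs: List[Any], **hyperparameters: Any) -> List[str]:
        # One prediction per input value, in order, so the length check passes.
        temperature = hyperparameters.get("temperature", 0.0)
        return [f"prediction for {value!r} at temperature={temperature}" for value in inputs]

    predictions = asyncio.run(dummy_inference(["2 + 2 = ?", "Capital of France?"], temperature=0.2))
    assert len(predictions) == 2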
@@ -257,15 +280,20 @@ async def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-    if trismik_client is None:
-        raise ScoreBookError("Trismik client is required for adaptive evaluation")
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-    adaptive_eval_run_result = await run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
+        adaptive_eval_run_result = await run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
+        )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-    return adaptive_eval_run_result
+        return adaptive_eval_run_result
+
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 async def upload_classic_run_results(
@@ -291,10 +319,12 @@ async def upload_classic_run_results(
     """
     model = get_model_name(inference_callable)
 
-    # Create eval items from run_spec items, outputs, and labels
+    # Create eval items from run_spec inputs, outputs, and labels
     items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
+    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+    for idx, (input_value, output) in enumerate(inputs_outputs):
+        labels = run_result.run_spec.labels
+        label = labels[idx] if idx < len(labels) else ""
 
         # Calculate item-level metrics for this item
         item_metrics: Dict[str, Any] = {}
@@ -309,7 +339,7 @@
 
         eval_item = TrismikClassicEvalItem(
             datasetItemId=str(idx),
-            modelInput=str(item),
+            modelInput=str(input_value),
             modelOutput=str(output),
             goldOutput=str(label),
             metrics=item_metrics,
@@ -410,4 +440,4 @@ async def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
@@ -10,7 +10,7 @@ from trismik.types import (
     TrismikRunMetadata,
 )
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
     build_eval_run_specs,
     create_trismik_sync_client,
@@ -19,6 +19,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_show_progress,
     resolve_upload_results,
     score_metrics,
     validate_parameters,
@@ -32,7 +33,7 @@ from scorebook.types import (
     EvalRunSpec,
 )
 from contextlib import nullcontext
-from scorebook.utils import evaluation_progress
+from scorebook.utils import evaluation_progress_context
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +51,7 @@ def evaluate(
     return_output: bool = False,
     upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
+    show_progress: Optional[bool] = None,
 ) -> Union[Dict, List, EvalResult]:
     """
     Evaluate a model across a collection of hyperparameters and datasets.
@@ -67,6 +69,8 @@ def evaluate(
         return_output: If True, returns model outputs for each dataset item
         upload_results: If True, uploads results to Trismik's dashboard
         sample_size: Optional number of items to sample from each dataset
+        show_progress: If None, uses SHOW_PROGRESS_BARS from settings.
+            If True/False, explicitly enables/disables progress bars for this evaluation.
 
     Returns:
         The evaluation results in the format specified by return parameters:
@@ -75,6 +79,7 @@ def evaluate(
     """
    # Resolve and validate parameters
     upload_results = cast(bool, resolve_upload_results(upload_results))
+    show_progress_bars = resolve_show_progress(show_progress)
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
@@ -95,10 +100,17 @@ def evaluate(
 
     with trismik_client or nullcontext():
         # Execute evaluation runs
-        with evaluation_progress(
+        # Calculate total items across all runs
+        total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+        model_display = get_model_name(inference)
+
+        with evaluation_progress_context(
+            total_eval_runs=len(eval_run_specs),
+            total_items=total_items,
             dataset_count=len(datasets),
-            hyperparameter_config_count=len(hyperparameter_configs),
-            run_count=len(eval_run_specs),
+            hyperparam_count=len(hyperparameter_configs),
+            model_display=model_display,
+            enabled=show_progress_bars,
         ) as progress_bars:
             eval_result = execute_runs(
                 inference,
@@ -136,7 +148,10 @@ def execute_runs(
         run_result = execute_run(
             inference, run, experiment_id, project_id, metadata, trismik_client
         )
-        progress_bars.on_eval_run_completed(run.dataset_index)
+        # Update progress bars with items processed and success status
+        if progress_bars is not None:
+            items_processed = len(run.dataset.items)
+            progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
         if (
             upload_results
@@ -146,10 +161,18 @@ def execute_runs(
             and run_result.run_completed
             and trismik_client is not None
         ):
-            run_id = upload_classic_run_results(
-                run_result, experiment_id, project_id, inference, metadata, trismik_client
-            )
-            run_result.run_id = run_id
+            try:
+                run_id = upload_classic_run_results(
+                    run_result, experiment_id, project_id, inference, metadata, trismik_client
+                )
+                run_result.run_id = run_id
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=True)
+            except Exception as e:
+                logger.warning(f"Failed to upload run results: {e}")
+                if progress_bars is not None:
+                    progress_bars.on_upload_completed(succeeded=False)
+                # Continue evaluation even if upload fails
 
         return run_result
 
@@ -203,7 +226,7 @@ def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEv
 
     try:
         inference_outputs = run_inference_callable(
-            inference, run.dataset.items, run.hyperparameter_config
+            inference, run.inputs, run.hyperparameter_config
        )
         metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
         logger.debug("Classic evaluation completed for run %s", run)
@@ -216,13 +239,13 @@ def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEv
 
 def run_inference_callable(
     inference: Callable,
-    items: List[Dict[str, Any]],
+    inputs: List[Any],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
     """Run inference on a given dataset and hyperparameter configuration."""
 
     try:
-        predictions = inference(items, **hyperparameter_config)
+        predictions = inference(inputs, **hyperparameter_config)
     except Exception as e:
         logger.error(
             "Inference callable raised an exception: %s",
@@ -230,11 +253,11 @@ def run_inference_callable(
         )
         raise InferenceError(f"Inference failed: {str(e)}") from e
 
-    if not isinstance(predictions, list) or len(predictions) != len(items):
+    if not isinstance(predictions, list) or len(predictions) != len(inputs):
         raise InferenceError(
             "Inference callable must return a list of predictions "
-            "of shared length as the input items. "
-            f"Items length: {len(items)}, predictions length: {len(predictions)}"
+            "of shared length as the inputs. "
+            f"Inputs length: {len(inputs)}, predictions length: {len(predictions)}"
         )
 
     if all(prediction == "" for prediction in predictions):
@@ -257,15 +280,20 @@ def execute_adaptive_eval_run(
     """Execute an adaptive evaluation run."""
     logger.debug("Executing adaptive run for %s", run)
 
-    if trismik_client is None:
-        raise ScoreBookError("Trismik client is required for adaptive evaluation")
+    try:
+        if trismik_client is None:
+            raise ScoreBookError("Trismik client is required for adaptive evaluation")
 
-    adaptive_eval_run_result = run_adaptive_evaluation(
-        inference, run, experiment_id, project_id, metadata, trismik_client
-    )
-    logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
+        adaptive_eval_run_result = run_adaptive_evaluation(
+            inference, run, experiment_id, project_id, metadata, trismik_client
+        )
+        logger.debug("Adaptive evaluation completed for run %s", adaptive_eval_run_result)
 
-    return adaptive_eval_run_result
+        return adaptive_eval_run_result
+
+    except Exception as e:
+        logger.warning("Failed to complete adaptive eval run for %s: %s", run, str(e))
+        return AdaptiveEvalRunResult(run, False, {})
 
 
 def upload_classic_run_results(
@@ -291,10 +319,12 @@ def upload_classic_run_results(
     """
     model = get_model_name(inference_callable)
 
-    # Create eval items from run_spec items, outputs, and labels
+    # Create eval items from run_spec inputs, outputs, and labels
     items: List[TrismikClassicEvalItem] = []
-    for idx, (item, output) in enumerate(zip(run_result.run_spec.items, run_result.outputs)):
-        label = run_result.run_spec.labels[idx] if idx < len(run_result.run_spec.labels) else ""
+    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
+    for idx, (input_value, output) in enumerate(inputs_outputs):
+        labels = run_result.run_spec.labels
+        label = labels[idx] if idx < len(labels) else ""
 
         # Calculate item-level metrics for this item
         item_metrics: Dict[str, Any] = {}
@@ -309,7 +339,7 @@ def upload_classic_run_results(
 
         eval_item = TrismikClassicEvalItem(
             datasetItemId=str(idx),
-            modelInput=str(item),
+            modelInput=str(input_value),
             modelOutput=str(output),
             goldOutput=str(label),
             metrics=item_metrics,
@@ -410,4 +440,4 @@ def run_adaptive_evaluation(
     # Make scores JSON serializable
     scores = make_json_serializable(scores)
 
-    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+    return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, run_completed=True, scores=scores)
@@ -35,6 +35,22 @@ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool
     return upload_results
 
 
+def resolve_show_progress(show_progress: Optional[bool]) -> bool:
+    """Resolve whether to show progress bars.
+
+    Args:
+        show_progress: Explicit setting (None uses default from settings)
+
+    Returns:
+        bool: Whether to show progress bars
+    """
+    if show_progress is None:
+        from scorebook.settings import SHOW_PROGRESS_BARS
+
+        return bool(SHOW_PROGRESS_BARS)
+    return show_progress
+
+
 def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
     """Validate all parameters for evaluation."""
 
@@ -172,15 +188,22 @@ def build_classic_eval_run_spec(
     hyperparameters: Dict[str, Any],
     hyperparameters_index: int,
 ) -> EvalRunSpec:
-    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination."""
-    items = dataset.items
-    labels = [item.get(dataset.label) for item in items]
+    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination.
+
+    Extracts input and label values from the appropriate columns in the dataset.
+    The column names are determined by dataset.input and dataset.label,
+    which may be original field names (e.g., "question", "answer") or computed
+    column names (e.g., "*input", "*label") if templates were used.
+    """
+    # Extract inputs and labels using the dataset's column specifications
+    inputs = dataset[dataset.input]  # Returns List[Any]
+    labels = dataset[dataset.label]  # Returns List[Any]
     eval_run_spec = EvalRunSpec(
         dataset,
         dataset_index,
         hyperparameters,
         hyperparameters_index,
-        items,
+        inputs,
         labels,
     )
     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
@@ -256,7 +279,7 @@ def create_trismik_sync_client() -> TrismikClient:
 def get_model_name(
     inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
 ) -> str:
-    """Determine a model's name with the fallback "unspecified"."""
+    """Determine a model's name with the fallback "Model"."""
 
     # First priority: metadata.model
     if metadata and "model" in metadata:
@@ -266,8 +289,8 @@ def get_model_name(
     if inference_callable and hasattr(inference_callable, "model"):
         return str(inference_callable.model)
 
-    # Fallback: "unspecified"
-    return "unspecified"
+    # Fallback: "Model"
+    return "Model"
 
 
 def format_results(
@@ -292,7 +315,7 @@ def format_results(
     # Remove inference output if not requested
     if not return_output:
         for item in item_scores:
-            item.pop("inference_output", None)
+            item.pop("output", None)
 
     results["item_results"] = item_scores
 
scorebook/exceptions.py CHANGED
@@ -10,6 +10,54 @@ class ScoreBookError(Exception):
     """Base exception class for all Scorebook-related errors."""
 
 
+class EvalDatasetError(ScoreBookError):
+    """Base exception class for all EvalDataset errors."""
+
+
+class DatasetConfigurationError(EvalDatasetError):
+    """Raised when dataset configuration is invalid (e.g., mutually exclusive parameters)."""
+
+
+class MissingFieldError(EvalDatasetError):
+    """Raised when required field is missing from dataset."""
+
+    def __init__(self, field_name: str, field_type: str, available_fields: list[str]):
+        """Initialize missing field error with structured context."""
+        self.field_name = field_name
+        self.field_type = field_type  # "input" or "label"
+        self.available_fields = available_fields
+        super().__init__(
+            f"{field_type.capitalize()} field '{field_name}' not found. "
+            f"Available fields: {', '.join(available_fields)}"
+        )
+
+
+class DatasetLoadError(EvalDatasetError):
+    """Raised when dataset fails to load from source (file or remote)."""
+
+
+class DatasetParseError(EvalDatasetError):
+    """Raised when dataset file cannot be parsed (CSV, JSON, YAML)."""
+
+
+class DatasetNotInitializedError(EvalDatasetError):
+    """Raised when operations are attempted on uninitialized dataset."""
+
+
+class DatasetSampleError(EvalDatasetError):
+    """Raised when sampling parameters are invalid."""
+
+    def __init__(self, sample_size: int, dataset_size: int, dataset_name: str):
+        """Initialize dataset sample error with structured context."""
+        self.sample_size = sample_size
+        self.dataset_size = dataset_size
+        self.dataset_name = dataset_name
+        super().__init__(
+            f"Sample size {sample_size} exceeds dataset size {dataset_size} "
+            f"for dataset '{dataset_name}'"
+        )
+
+
 class EvaluationError(ScoreBookError):
     """Raised when there are errors during model evaluation."""
 
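The two new exceptions with custom constructors keep their arguments as attributes, so callers can branch on the structured context instead of parsing messages. A short sketch against the classes added above (the code paths that raise them are not shown in this diff):

    from scorebook.exceptions import DatasetSampleError, EvalDatasetError, MissingFieldError

    def describe(error: EvalDatasetError) -> str:
        if isinstance(error, MissingFieldError):
            return f"missing {error.field_type} field {error.field_name!r}; available: {error.available_fields}"
        if isinstance(error, DatasetSampleError):
            return f"sample size {error.sample_size} exceeds {error.dataset_size} items in {error.dataset_name!r}"
        return str(error)

    print(describe(MissingFieldError("question", "input", ["prompt", "answer"])))
    print(describe(DatasetSampleError(500, 100, "my_dataset")))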
@@ -17,7 +17,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import boto3
 from botocore.config import Config
 from botocore.exceptions import ClientError
-from tqdm.asyncio import tqdm
+from tqdm.auto import tqdm
 
 
 async def batch(
@@ -13,7 +13,7 @@ import tempfile
 from typing import Any, List, Optional
 
 from portkey_ai import AsyncPortkey
-from tqdm.asyncio import tqdm
+from tqdm.auto import tqdm
 
 
 async def responses(
@@ -18,7 +18,7 @@ import pandas as pd
 from google import genai
 from google.cloud import storage
 from google.genai import types
-from tqdm.asyncio import tqdm
+from tqdm.auto import tqdm
 
 
 async def responses(
scorebook/settings.py CHANGED
@@ -16,3 +16,6 @@ TRISMIK_ADAPTIVE_TESTING_URL = f"{TRISMIK_API_BASE_URL}/adaptive-testing"
 
 # Allow override via environment variable
 TRISMIK_SERVICE_URL = os.environ.get("TRISMIK_SERVICE_URL", TRISMIK_ADAPTIVE_TESTING_URL)
+
+# Progress bar configuration
+SHOW_PROGRESS_BARS = os.environ.get("SCOREBOOK_SHOW_PROGRESS_BARS", "true").lower() == "true"
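The parsing is strict: only the literal string "true" (any case) enables the bars, so values like "1" or "yes" disable them. Because the value is read at import time, an override must be set before `scorebook.settings` is first imported; for example:

    import os

    # Disable progress bars process-wide; must run before scorebook.settings is imported.
    os.environ["SCOREBOOK_SHOW_PROGRESS_BARS"] = "false"

    from scorebook.settings import SHOW_PROGRESS_BARS
    print(SHOW_PROGRESS_BARS)  # False; unset or "true"/"TRUE" would give True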
scorebook/types.py CHANGED
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
 
 
 @dataclass
@@ -21,7 +21,7 @@ class EvalRunSpec:
     dataset_index: int
     hyperparameter_config: Dict[str, Any]
     hyperparameters_index: int
-    items: List[Dict[str, Any]]
+    inputs: List[Any]
     labels: List[Any]
 
     def __str__(self) -> str:
@@ -64,13 +64,15 @@ class ClassicEvalRunResult:
 
         if self.outputs:
             for idx, output in enumerate(self.outputs):
-                if idx >= len(self.run_spec.items):
+                if idx >= len(self.run_spec.inputs):
                     break
 
                 result = {
-                    "item_id": idx,
+                    "id": idx,
                     "dataset_name": self.run_spec.dataset.name,
-                    "inference_output": output,
+                    "input": self.run_spec.inputs[idx],
+                    "label": self.run_spec.labels[idx] if idx < len(self.run_spec.labels) else None,
+                    "output": output,
                     **self.run_spec.hyperparameter_config,
                 }
 
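For downstream consumers, the per-item result rows change shape: `item_id` becomes `id`, `inference_output` becomes `output`, and `input`/`label` are now included alongside the merged hyperparameter config (with `format_results` popping `output` when outputs are not requested). Illustrative rows with placeholder values:

    row_0_0_10 = {"item_id": 0, "dataset_name": "my_dataset", "inference_output": "4", "temperature": 0.2}
    row_0_0_11 = {"id": 0, "dataset_name": "my_dataset", "input": "2 + 2 = ?", "label": "4", "output": "4", "temperature": 0.2}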
@@ -125,6 +127,7 @@ class AdaptiveEvalRunResult:
     """Results from executing an adaptive evaluation run."""
 
     run_spec: AdaptiveEvalRunSpec
+    run_completed: bool
     scores: Dict[str, Any]
 
     @property
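With the new field, adaptive results can be filtered on completion the same way classic ones are: the failure path added earlier in this diff returns `AdaptiveEvalRunResult(run, False, {})`, while the success path sets `run_completed=True` and carries the scores. A hedged sketch (`run_spec=None` and the score key are placeholders):

    from scorebook.types import AdaptiveEvalRunResult

    ok = AdaptiveEvalRunResult(run_spec=None, run_completed=True, scores={"score": 0.87})
    failed = AdaptiveEvalRunResult(run_spec=None, run_completed=False, scores={})

    completed_runs = [r for r in (ok, failed) if r.run_completed]  # keeps only `ok`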
@@ -1,9 +1,9 @@
 """Utility functions and common helpers for the Scorebook framework."""
 
 from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.build_prompt import build_prompt
 from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress
+from scorebook.utils.progress_bars import evaluation_progress_context
+from scorebook.utils.render_template import render_template
 from scorebook.utils.transform_helpers import expand_dict
 
 __all__ = [
@@ -11,6 +11,6 @@ __all__ = [
     "is_awaitable",
     "validate_path",
     "expand_dict",
-    "evaluation_progress",
-    "build_prompt",
+    "evaluation_progress_context",
+    "render_template",
 ]