scorebook 0.0.12.tar.gz → 0.0.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {scorebook-0.0.12 → scorebook-0.0.13}/PKG-INFO +2 -2
  2. {scorebook-0.0.12 → scorebook-0.0.13}/pyproject.toml +3 -3
  3. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py +18 -0
  4. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py +16 -1
  5. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/evaluate.py +16 -1
  6. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/evaluate_helpers.py +75 -2
  7. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/types.py +2 -0
  8. {scorebook-0.0.12 → scorebook-0.0.13}/LICENSE +0 -0
  9. {scorebook-0.0.12 → scorebook-0.0.13}/README.md +0 -0
  10. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/__init__.py +0 -0
  11. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/cli/__init__.py +0 -0
  12. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/cli/auth.py +0 -0
  13. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/cli/main.py +0 -0
  14. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/eval_datasets/__init__.py +0 -0
  15. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/__init__.py +0 -0
  16. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_async/__init__.py +0 -0
  17. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/__init__.py +0 -0
  18. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/exceptions.py +0 -0
  19. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/__init__.py +0 -0
  20. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/__init__.py +0 -0
  21. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/bedrock.py +0 -0
  22. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/openai.py +0 -0
  23. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/portkey.py +0 -0
  24. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/clients/vertex.py +0 -0
  25. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/inference/inference_pipeline.py +0 -0
  26. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/__init__.py +0 -0
  27. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/accuracy.py +0 -0
  28. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/metric_base.py +0 -0
  29. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/metric_registry.py +0 -0
  30. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/metrics/precision.py +0 -0
  31. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/__init__.py +0 -0
  32. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_async/__init__.py +0 -0
  33. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_async/score_async.py +0 -0
  34. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_sync/__init__.py +0 -0
  35. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/_sync/score.py +0 -0
  36. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/score/score_helpers.py +0 -0
  37. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/settings.py +0 -0
  38. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/trismik/__init__.py +0 -0
  39. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/trismik/credentials.py +0 -0
  40. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/trismik/upload_results.py +0 -0
  41. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/__init__.py +0 -0
  42. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/async_utils.py +0 -0
  43. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/common_helpers.py +0 -0
  44. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/io_helpers.py +0 -0
  45. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/jinja_helpers.py +0 -0
  46. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/mappers.py +0 -0
  47. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/progress_bars.py +0 -0
  48. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/render_template.py +0 -0
  49. {scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.12 → scorebook-0.0.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.12
+Version: 0.0.13
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell
@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (==1.0.1)
+Requires-Dist: trismik (==1.0.2)
 Description-Content-Type: text/markdown
 
 # Scorebook
{scorebook-0.0.12 → scorebook-0.0.13}/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.9, <3.14"
 dependencies = [
     "datasets>=3.6.0",
     "notebook (>=7.4.5,<8.0.0)",
-    "trismik==1.0.1",
+    "trismik==1.0.2",
     "ipywidgets>=8.0.0",
 ]
 
@@ -19,14 +19,14 @@ dependencies = [
 scorebook = "scorebook.cli.main:main"
 
 [tool.poetry]
-version = "0.0.12" # base version
+version = "0.0.13" # base version
 packages = [{ include = "scorebook", from = "src" }]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.14"
 datasets = ">=3.6.0"
 notebook = ">=7.4.5,<8.0.0"
-trismik = "1.0.1"
+trismik = "1.0.2"
 ipywidgets = ">=8.0.0"
 
 # Optional dependencies
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py
@@ -137,6 +137,24 @@ class EvalDataset:
             raise DatasetNotInitializedError("Dataset is not initialized")
         return list(map(str, self._hf_dataset.column_names))
 
+    @property
+    def split(self) -> Optional[str]:
+        """Return the split name of the underlying HuggingFace dataset, if available.
+
+        Returns:
+            The split name (e.g., "train", "test", "validation") if the dataset was loaded
+            from HuggingFace with a specific split. Returns None if the dataset was created
+            from a list, CSV, JSON, or loaded without a split specification.
+
+        Raises:
+            DatasetNotInitializedError: If the dataset is not initialized.
+        """
+        if self._hf_dataset is None:
+            raise DatasetNotInitializedError("Dataset is not initialized")
+
+        split = self._hf_dataset.split
+        return str(split) if split is not None else None
+
     def shuffle(self) -> None:
         """Randomly shuffle the dataset items."""
         if self._hf_dataset is None:
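The new EvalDataset.split property simply surfaces the split attribute of the wrapped HuggingFace dataset. A minimal sketch of where that value comes from, using the datasets library directly rather than Scorebook's own loader (which is not part of this diff); the dataset name is only an example:

from datasets import load_dataset

# A HuggingFace dataset loaded with a named split carries that split on the object.
hf = load_dataset("glue", "mrpc", split="validation")
print(str(hf.split))  # "validation" -- the value EvalDataset.split now exposes
# Datasets built from a list, CSV, or JSON have no HF split, so the property returns None.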
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py
@@ -15,6 +15,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
@@ -40,6 +41,7 @@ logger = logging.getLogger(__name__)
 async def evaluate_async(
     inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -58,6 +60,7 @@
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
@@ -82,7 +85,7 @@
     validate_parameters(locals(), evaluate_async)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
@@ -378,8 +381,20 @@ async def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = await trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = await trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(
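For callers, the only visible change is the new optional split argument. A hedged sketch of the async entry point, imported by the module path shown in this diff; the inference callable and test id are illustrative, since their exact contracts are not part of this diff:

import asyncio

from scorebook.evaluate._async.evaluate_async import evaluate_async

def my_inference(items, **hyperparameters):  # hypothetical inference callable
    return ["<prediction>" for _ in items]

results = asyncio.run(
    evaluate_async(
        my_inference,
        datasets="my-trismik-test:adaptive",  # hypothetical adaptive test id
        split="validation",                   # new in 0.0.13
    )
)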
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/evaluate.py
@@ -14,6 +14,7 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
@@ -39,6 +40,7 @@ logger = logging.getLogger(__name__)
 def evaluate(
     inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -57,6 +59,7 @@
     Args:
         inference: The inference callable to evaluate
         datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
@@ -81,7 +84,7 @@
     validate_parameters(locals(), evaluate)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
@@ -377,8 +380,20 @@ def run_adaptive_evaluation(
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(
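The synchronous evaluate gains the identical parameter. A minimal sketch under the same assumptions as the async example above (illustrative names throughout):

from scorebook.evaluate._sync.evaluate import evaluate

def my_inference(items, **hyperparameters):  # hypothetical inference callable
    return ["<prediction>" for _ in items]

# For "<test_id>:adaptive" datasets the split is checked against the splits Trismik
# reports for that test; for EvalDataset objects the dataset's own split wins.
results = evaluate(
    my_inference,
    datasets="my-trismik-test:adaptive",  # hypothetical adaptive test id
    split="validation",
)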
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/evaluate/evaluate_helpers.py
@@ -91,6 +91,7 @@ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> N
 
 def prepare_datasets(
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     sample_size: Optional[int] = None,
 ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
@@ -104,6 +105,12 @@
 
         # Prepare classic datasets
         if isinstance(dataset, EvalDataset):
+            # Warn if dataset split differs from provided split parameter
+            if split is not None and dataset.split is not None and dataset.split != split:
+                logger.warning(
+                    f"Dataset '{dataset.name}' has split '{dataset.split}' but evaluate split "
+                    f"parameter is '{split}'. The dataset split will be used."
+                )
 
             if sample_size is not None:
                 dataset = dataset.sample(sample_size)
@@ -111,8 +118,17 @@
             datasets_out.append(dataset)
 
         # Prepare adaptive datasets
-        elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-            datasets_out.append(AdaptiveEvalDataset(dataset))
+        elif isinstance(dataset, str) and ":adaptive" in dataset:
+            # Parse adaptive dataset
+            parts = dataset.split(":")
+            if len(parts) != 2 or parts[1] != "adaptive":
+                raise ParameterValidationError(
+                    f"Invalid adaptive dataset format: '{dataset}'. "
+                    f"Use 'test_id:adaptive' format and specify split via the split parameter."
+                )
+
+            # Use the split parameter for all adaptive datasets
+            datasets_out.append(AdaptiveEvalDataset(name=dataset, split=split))
 
         # TODO: dataset name string registry
         elif isinstance(dataset, str):
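Two behaviours fall out of this hunk: a mismatch between an EvalDataset's own split and the split argument only logs a warning (the dataset's split is used), and adaptive dataset strings must be exactly '<test_id>:adaptive'. An illustrative sketch of the accepted and rejected forms (identifiers are hypothetical):

# Accepted: two-part form; the split comes from evaluate's `split` parameter.
datasets = "my-trismik-test:adaptive"

# Rejected with ParameterValidationError: any other string containing ":adaptive",
# e.g. an attempt to embed the split in the identifier itself.
# datasets = "my-trismik-test:validation:adaptive"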
@@ -174,6 +190,7 @@ def build_eval_run_specs(
                 hyperparameters_index,
                 experiment_id,
                 project_id,
+                dataset.split,
                 metadata,
             )
         )
@@ -220,6 +237,7 @@ def build_adaptive_eval_run_spec(
     hyperparameter_config_index: int,
     experiment_id: str,
     project_id: str,
+    split: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
@@ -231,6 +249,7 @@
         hyperparameter_config_index,
         experiment_id,
         project_id,
+        split,
         metadata,
     )
     logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
@@ -386,3 +405,57 @@ def make_trismik_inference(
     )
 
     return sync_trismik_inference_function
+
+
+def resolve_adaptive_split(
+    test_id: str,
+    user_specified_split: Optional[str],
+    available_splits: List[str],
+) -> str:
+    """Resolve the dataset split to use for adaptive evaluation.
+
+    Resolution order:
+    1. If user specified a split, validate it exists and use it
+    2. If not specified and exactly one split is available, use it
+    3. If not specified and multiple splits are available, raise an error
+    4. If no splits are available, raise an error
+
+    Args:
+        test_id: The test dataset ID (without ":adaptive" suffix)
+        user_specified_split: Optional split name specified by the user
+        available_splits: List of available split names for this dataset
+
+    Returns:
+        The resolved split name to use
+
+    Raises:
+        ScoreBookError: If the specified split doesn't exist, multiple splits exist without
+            user specification, or no splits are available
+    """
+    logger.debug(f"Available splits for {test_id}: {available_splits}")
+
+    # If user specified a split, validate and use it
+    if user_specified_split is not None:
+        if user_specified_split in available_splits:
+            logger.info(f"Using user-specified split '{user_specified_split}' for {test_id}")
+            return user_specified_split
+        else:
+            raise ScoreBookError(
+                f"Specified split '{user_specified_split}' not found for dataset '{test_id}'. "
+                f"Available splits: {available_splits}"
+            )
+
+    # No split specified - check available splits
+    if len(available_splits) == 0:
+        raise ScoreBookError(f"No splits available for dataset '{test_id}'. ")
+    elif len(available_splits) == 1:
+        # Exactly one split - auto-select it
+        selected_split = available_splits[0]
+        logger.info(f"Auto-selecting only available split '{selected_split}' for {test_id}")
+        return selected_split
+    else:
+        # Multiple splits available - user must specify
+        raise ScoreBookError(
+            f"Multiple splits available for dataset '{test_id}': {available_splits}. "
+            f"Please specify which split to use via evaluate's 'split' parameter."
+        )
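Note that, despite the "user-specified -> validation -> test" comment at the call sites above, this helper never falls back silently when several splits exist; it raises instead. A behaviour sketch consistent with the code (test id and split names are illustrative):

from scorebook.evaluate.evaluate_helpers import resolve_adaptive_split

splits = ["validation", "test"]

resolve_adaptive_split("my-test", "validation", splits)  # -> "validation"
resolve_adaptive_split("my-test", "dev", splits)         # raises ScoreBookError (unknown split)
resolve_adaptive_split("my-test", None, ["test"])        # -> "test" (only split, auto-selected)
resolve_adaptive_split("my-test", None, splits)          # raises ScoreBookError (must pass split=)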
{scorebook-0.0.12 → scorebook-0.0.13}/src/scorebook/types.py
@@ -17,6 +17,7 @@ class AdaptiveEvalDataset:
     """Represents a dataset configured for adaptive evaluation."""
 
     name: str
+    split: Optional[str] = None
 
 
 @dataclass
@@ -50,6 +51,7 @@ class AdaptiveEvalRunSpec:
     hyperparameters_index: int
     experiment_id: str
     project_id: str
+    split: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
 
 
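These dataclass fields carry the resolved split from evaluate down to the Trismik run spec. A minimal construction sketch (values are illustrative; the keyword form matches the call in prepare_datasets above):

from scorebook.types import AdaptiveEvalDataset

ds = AdaptiveEvalDataset(name="my-trismik-test:adaptive", split="validation")
print(ds.split)  # "validation"; the field defaults to None when not supplied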