scorebook 0.0.12-py3-none-any.whl → 0.0.14-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. scorebook/__init__.py +10 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +24 -9
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +22 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +25 -9
  11. scorebook/evaluate/_sync/evaluate.py +25 -9
  12. scorebook/evaluate/evaluate_helpers.py +79 -5
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/__init__.py +1 -18
  17. scorebook/metrics/metric_registry.py +2 -0
  18. scorebook/score/__init__.py +0 -5
  19. scorebook/score/_async/score_async.py +3 -2
  20. scorebook/score/_sync/score.py +3 -2
  21. scorebook/score/score_helpers.py +1 -1
  22. scorebook/types.py +3 -1
  23. scorebook/utils/__init__.py +0 -22
  24. scorebook/utils/common_helpers.py +1 -1
  25. scorebook/utils/mock_llm/__init__.py +41 -0
  26. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  27. scorebook-0.0.14.dist-info/METADATA +292 -0
  28. scorebook-0.0.14.dist-info/RECORD +53 -0
  29. scorebook/trismik/__init__.py +0 -10
  30. scorebook-0.0.12.dist-info/METADATA +0 -389
  31. scorebook-0.0.12.dist-info/RECORD +0 -50
  32. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/WHEEL +0 -0
  33. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/entry_points.txt +0 -0
  34. {scorebook-0.0.12.dist-info → scorebook-0.0.14.dist-info}/licenses/LICENSE +0 -0
scorebook/__init__.py CHANGED
@@ -9,12 +9,15 @@ import importlib.metadata
  # get version from pyproject.toml
  __version__ = importlib.metadata.version(__package__ or __name__)
 
- from scorebook.eval_datasets import EvalDataset
- from scorebook.evaluate import evaluate, evaluate_async
+ from scorebook.dashboard.create_project import create_project, create_project_async
+ from scorebook.dashboard.credentials import login, logout, whoami
+ from scorebook.dashboard.upload_results import upload_result, upload_result_async
+ from scorebook.eval_datasets.eval_dataset import EvalDataset
+ from scorebook.evaluate._async.evaluate_async import evaluate_async
+ from scorebook.evaluate._sync.evaluate import evaluate
  from scorebook.inference.inference_pipeline import InferencePipeline
- from scorebook.score import score, score_async
- from scorebook.trismik.credentials import login, logout, whoami
- from scorebook.trismik.upload_results import upload_result, upload_result_async
+ from scorebook.score._async.score_async import score_async
+ from scorebook.score._sync.score import score
  from scorebook.utils.render_template import render_template
 
  __all__ = [
@@ -28,6 +31,8 @@ __all__ = [
      "logout",
      "whoami",
      "InferencePipeline",
+     "create_project",
+     "create_project_async",
      "upload_result",
      "upload_result_async",
  ]
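The net effect is that the public API stays importable from the package root while the implementations move into submodules, and two new names (create_project, create_project_async) join the exports. A minimal usage sketch of that surface; the API key and project name are placeholders and a valid Trismik account is assumed:

    import scorebook

    scorebook.login("tsk-placeholder-key")   # placeholder token; stored for later calls
    print(scorebook.whoami())                # inspect the authenticated identity

    project = scorebook.create_project(name="demo-project")
    print(project.id)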
scorebook/cli/auth.py CHANGED
@@ -4,7 +4,7 @@ import argparse
  import getpass
  import sys
 
- from scorebook.trismik.credentials import get_stored_token, get_token_path, login, logout, whoami
+ from scorebook.dashboard.credentials import get_stored_token, get_token_path, login, logout, whoami
 
 
  def auth_command(args: argparse.Namespace) -> int:
scorebook/dashboard/__init__.py ADDED
@@ -0,0 +1 @@
+ """Trismik authentication and API integration."""
scorebook/dashboard/create_project.py ADDED
@@ -0,0 +1,91 @@
+ """Create projects in Trismik's experimentation platform."""
+
+ import logging
+ from typing import Optional
+
+ from trismik.types import TrismikProject
+
+ from scorebook.evaluate.evaluate_helpers import (
+     create_trismik_async_client,
+     create_trismik_sync_client,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def create_project(
+     name: str,
+     team_id: Optional[str] = None,
+     description: Optional[str] = None,
+ ) -> TrismikProject:
+     """Create a new project in Trismik's experimentation platform (synchronous).
+
+     This function creates a new project that can be used to organize experiments
+     and evaluation runs in the Trismik platform.
+
+     Args:
+         name: Name of the project
+         team_id: Optional ID of the team to create the project in. If not provided,
+             the project will be created in the user's default team.
+         description: Optional description of the project
+
+     Returns:
+         TrismikProject: Created project object containing project details including
+             id, name, description, accountId, createdAt, and updatedAt fields
+
+     Raises:
+         TrismikValidationError: If the request fails validation
+         TrismikApiError: If the API request fails
+     """
+     # Create Trismik client
+     trismik_client = create_trismik_sync_client()
+
+     # Create project via Trismik API
+     project = trismik_client.create_project(
+         name=name,
+         team_id=team_id,
+         description=description,
+     )
+
+     logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+     return project
+
+
+ async def create_project_async(
+     name: str,
+     team_id: Optional[str] = None,
+     description: Optional[str] = None,
+ ) -> TrismikProject:
+     """Create a new project in Trismik's experimentation platform (asynchronous).
+
+     This function creates a new project that can be used to organize experiments
+     and evaluation runs in the Trismik platform.
+
+     Args:
+         name: Name of the project
+         team_id: Optional ID of the team to create the project in. If not provided,
+             the project will be created in the user's default team.
+         description: Optional description of the project
+
+     Returns:
+         TrismikProject: Created project object containing project details including
+             id, name, description, accountId, createdAt, and updatedAt fields
+
+     Raises:
+         TrismikValidationError: If the request fails validation
+         TrismikApiError: If the API request fails
+     """
+     # Create Trismik async client
+     trismik_client = create_trismik_async_client()
+
+     # Create project via Trismik API (async)
+     project = await trismik_client.create_project(
+         name=name,
+         team_id=team_id,
+         description=description,
+     )
+
+     logger.info(f"Project '{name}' created successfully with ID: {project.id}")
+
+     return project
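A short usage sketch for the new helpers, assuming credentials have already been configured via scorebook.login; the project names and description are placeholders:

    import asyncio

    from scorebook import create_project, create_project_async

    # Synchronous variant
    project = create_project(name="qa-evals", description="Placeholder description")
    print(project.id, project.name)

    # Asynchronous variant
    async def main() -> None:
        project = await create_project_async(name="qa-evals-async")
        print(project.id)

    asyncio.run(main())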
scorebook/{trismik → dashboard}/credentials.py RENAMED
@@ -5,6 +5,10 @@ import os
  import pathlib
  from typing import Optional
 
+ from trismik import TrismikClient
+
+ from scorebook.settings import TRISMIK_SERVICE_URL
+
  logger = logging.getLogger(__name__)
 
 
@@ -66,15 +70,26 @@ def get_token() -> Optional[str]:
 
 
  def validate_token(token: str) -> bool:
-     """Validate the token by making a test API call to trismik."""
-     # TODO: Implement actual API validation once you have an endpoint
-     # This would typically make a request to something like:
-     # response = requests.get("https://api.trismik.com/whoami",
-     #     headers={"Authorization": f"Bearer {token}"})
-     # return response.status_code == 200
-
-     # For now, just check it's not empty
-     return bool(token and token.strip())
+     """Validate the token by making a test API call to trismik.
+
+     Args:
+         token: The API token to validate.
+
+     Returns:
+         bool: True if the token is valid, False otherwise.
+     """
+     if not token or not token.strip():
+         return False
+
+     try:
+         # Create a client with the token and verify it works
+         client = TrismikClient(service_url=TRISMIK_SERVICE_URL, api_key=token)
+         client.me()
+         client.close()
+         return True
+     except Exception as e:
+         logger.debug(f"Token validation failed: {e}")
+         return False
 
 
  def login(trismik_api_key: str) -> None:
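The rewritten validate_token replaces the placeholder emptiness check with a real round trip through TrismikClient. A small sketch of calling it directly; the token value is a placeholder:

    from scorebook.dashboard.credentials import validate_token

    # Returns True only if the Trismik API accepts the key; any exception is
    # logged at debug level and reported as False.
    if validate_token("tsk-placeholder-key"):
        print("token accepted")
    else:
        print("token rejected or API unreachable")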
scorebook/{trismik → dashboard}/upload_results.py RENAMED
@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
 
  # Known fields that are not metrics or hyperparameters
  KNOWN_AGGREGATE_FIELDS = {"dataset", "run_id", "run_completed"}
- KNOWN_ITEM_FIELDS = {"id", "dataset_name", "input", "output", "label", "run_id"}
+ KNOWN_ITEM_FIELDS = {"id", "dataset", "input", "output", "label", "run_id"}
 
 
  def upload_result(
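The functional change here is the rename of the per-item field from dataset_name to dataset, matching the aggregate-level field. A sketch of an item payload under the new field set (values are placeholders); per the comment above, keys outside KNOWN_ITEM_FIELDS are not treated as known fields:

    item_result = {
        "id": "item-0001",
        "dataset": "demo_dataset",   # previously "dataset_name"
        "input": "What is 2 + 2?",
        "output": "4",
        "label": "4",
        "run_id": "run-123",
        "accuracy": 1.0,             # not in KNOWN_ITEM_FIELDS, so treated as a metric/hyperparameter
    }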
scorebook/eval_datasets/__init__.py CHANGED
@@ -1,5 +1 @@
  """Dataset utilities for scorebook."""
-
- from scorebook.eval_datasets.eval_dataset import EvalDataset
-
- __all__ = ["EvalDataset"]
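With the re-export gone, EvalDataset is imported either from its defining module or from the package root, which still exposes it (see the scorebook/__init__.py diff above); the same pattern applies to the evaluate, inference, metrics, and score packages below:

    # Both continue to work in 0.0.14:
    from scorebook import EvalDataset
    from scorebook.eval_datasets.eval_dataset import EvalDataset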
scorebook/eval_datasets/eval_dataset.py CHANGED
@@ -18,8 +18,10 @@ from scorebook.exceptions import (
      DatasetSampleError,
      MissingFieldError,
  )
- from scorebook.metrics import MetricBase, MetricRegistry
- from scorebook.utils import render_template, validate_path
+ from scorebook.metrics.metric_base import MetricBase
+ from scorebook.metrics.metric_registry import MetricRegistry
+ from scorebook.utils.io_helpers import validate_path
+ from scorebook.utils.render_template import render_template
 
 
  class EvalDataset:
@@ -137,6 +139,24 @@ class EvalDataset:
              raise DatasetNotInitializedError("Dataset is not initialized")
          return list(map(str, self._hf_dataset.column_names))
 
+     @property
+     def split(self) -> Optional[str]:
+         """Return the split name of the underlying HuggingFace dataset, if available.
+
+         Returns:
+             The split name (e.g., "train", "test", "validation") if the dataset was loaded
+             from HuggingFace with a specific split. Returns None if the dataset was created
+             from a list, CSV, JSON, or loaded without a split specification.
+
+         Raises:
+             DatasetNotInitializedError: If the dataset is not initialized.
+         """
+         if self._hf_dataset is None:
+             raise DatasetNotInitializedError("Dataset is not initialized")
+
+         split = self._hf_dataset.split
+         return str(split) if split is not None else None
+
      def shuffle(self) -> None:
          """Randomly shuffle the dataset items."""
          if self._hf_dataset is None:
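A small sketch of the new split property in use. Constructing an EvalDataset is outside the scope of this diff, so the helper below only inspects an already-built instance:

    from typing import Optional

    from scorebook.eval_datasets.eval_dataset import EvalDataset

    def describe_split(dataset: EvalDataset) -> Optional[str]:
        """Report which HuggingFace split, if any, this dataset carries."""
        split = dataset.split  # None for list/CSV/JSON-backed datasets
        print(f"split: {split!r}")
        return split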
scorebook/evaluate/__init__.py CHANGED
@@ -1,15 +1 @@
- """
- Evaluation module for Scorebook.
-
- This module provides both synchronous and asynchronous evaluation functions.
- The async version serves as the source of truth, with the sync version
- automatically generated using unasync.
- """
-
- # Import from async module
- from ._async.evaluate_async import evaluate_async
-
- # Import from generated sync module
- from ._sync.evaluate import evaluate
-
- __all__ = ["evaluate", "evaluate_async"]
+ """Evaluation module for Scorebook."""
scorebook/evaluate/_async/evaluate_async.py CHANGED
@@ -6,7 +6,7 @@ from trismik import TrismikAsyncClient, TrismikClient
  from trismik.settings import evaluation_settings
  from trismik.types import TrismikRunMetadata
 
- from scorebook.eval_datasets import EvalDataset
+ from scorebook.eval_datasets.eval_dataset import EvalDataset
  from scorebook.evaluate.evaluate_helpers import (
      build_eval_run_specs,
      create_trismik_async_client,
@@ -15,6 +15,7 @@ from scorebook.evaluate.evaluate_helpers import (
      make_trismik_inference,
      prepare_datasets,
      prepare_hyperparameter_configs,
+     resolve_adaptive_split,
      validate_parameters,
  )
  from scorebook.exceptions import InferenceError, ScoreBookError
@@ -27,12 +28,9 @@ from scorebook.types import (
      EvalResult,
      EvalRunSpec,
  )
- from scorebook.utils import (
-     async_nullcontext,
-     evaluation_progress_context,
-     resolve_show_progress,
-     resolve_upload_results,
- )
+ from scorebook.utils.async_utils import async_nullcontext
+ from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+ from scorebook.utils.progress_bars import evaluation_progress_context
 
  logger = logging.getLogger(__name__)
 
@@ -40,6 +38,7 @@ logger = logging.getLogger(__name__)
  async def evaluate_async(
      inference: Union[Callable, InferencePipeline],
      datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     split: Optional[str] = None,
      hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
      metadata: Optional[Dict[str, Any]] = None,
      experiment_id: Optional[str] = None,
@@ -58,6 +57,7 @@
      Args:
          inference: The inference callable to evaluate
          datasets: Dataset(s) to evaluate on
+         split: Split to use for evaluation (default: "validation")
          hyperparameters: Hyperparameter configuration(s) to evaluate with
          metadata: Optional metadata to attach to the evaluation
          experiment_id: Optional experiment identifier
@@ -82,7 +82,7 @@
      validate_parameters(locals(), evaluate_async)
 
      # Prepare datasets, hyperparameters, and eval run specs
-     datasets = prepare_datasets(datasets, sample_size)
+     datasets = prepare_datasets(datasets, split, sample_size)
      hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
      eval_run_specs = sorted(
          build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
@@ -378,8 +378,24 @@ async def run_adaptive_evaluation(
      Returns:
          Results from the adaptive evaluation
      """
+     # Fetch available splits from Trismik
+     dataset_info = await trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+     available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+     # Resolve the split to use (with fallback: user-specified -> validation -> test)
+     resolved_split = resolve_adaptive_split(
+         test_id=adaptive_run_spec.dataset,
+         user_specified_split=adaptive_run_spec.split,
+         available_splits=available_splits,
+     )
+
+     # Create inference function with bound hyperparameters
+     async def inference_with_hyperparams(items: Any) -> Any:
+         return await inference(items, **adaptive_run_spec.hyperparameter_config)
+
      trismik_results = await trismik_client.run(
          test_id=adaptive_run_spec.dataset,
+         split=resolved_split,
          project_id=project_id,
          experiment=experiment_id,
          run_metadata=TrismikRunMetadata(
@@ -387,7 +403,7 @@
              test_configuration={},
              inference_setup={},
          ),
-         item_processor=make_trismik_inference(inference),
+         item_processor=make_trismik_inference(inference_with_hyperparams),
          return_dict=False,
      )
 
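A sketch of the async entry point with the new split parameter; the inference function and test id are placeholders, and the ':adaptive' suffix follows the convention parsed by prepare_datasets in evaluate_helpers.py below:

    import asyncio
    from typing import Any, List

    from scorebook import evaluate_async

    async def my_inference(items: List[Any], **hyperparameters: Any) -> List[str]:
        # Placeholder model call; hyperparameters arrive bound per run,
        # as in inference_with_hyperparams above.
        return ["answer" for _ in items]

    async def main() -> None:
        results = await evaluate_async(
            my_inference,
            datasets="placeholder-test:adaptive",
            split="validation",              # forwarded to resolve_adaptive_split
            hyperparameters={"temperature": 0.0},
        )
        print(results)

    asyncio.run(main())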
scorebook/evaluate/_sync/evaluate.py CHANGED
@@ -5,7 +5,7 @@ from trismik import TrismikAsyncClient, TrismikClient
  from trismik.settings import evaluation_settings
  from trismik.types import TrismikRunMetadata
 
- from scorebook.eval_datasets import EvalDataset
+ from scorebook.eval_datasets.eval_dataset import EvalDataset
  from scorebook.evaluate.evaluate_helpers import (
      build_eval_run_specs,
      create_trismik_sync_client,
@@ -14,6 +14,7 @@ from scorebook.evaluate.evaluate_helpers import (
      make_trismik_inference,
      prepare_datasets,
      prepare_hyperparameter_configs,
+     resolve_adaptive_split,
      validate_parameters,
  )
  from scorebook.exceptions import InferenceError, ScoreBookError
@@ -26,12 +27,9 @@ from scorebook.types import (
      EvalResult,
      EvalRunSpec,
  )
- from scorebook.utils import (
-     nullcontext,
-     evaluation_progress_context,
-     resolve_show_progress,
-     resolve_upload_results,
- )
+ from contextlib import nullcontext
+ from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+ from scorebook.utils.progress_bars import evaluation_progress_context
 
  logger = logging.getLogger(__name__)
 
@@ -39,6 +37,7 @@ logger = logging.getLogger(__name__)
  def evaluate(
      inference: Union[Callable, InferencePipeline],
      datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     split: Optional[str] = None,
      hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
      metadata: Optional[Dict[str, Any]] = None,
      experiment_id: Optional[str] = None,
@@ -57,6 +56,7 @@
      Args:
          inference: The inference callable to evaluate
          datasets: Dataset(s) to evaluate on
+         split: Split to use for evaluation (default: "validation")
          hyperparameters: Hyperparameter configuration(s) to evaluate with
          metadata: Optional metadata to attach to the evaluation
          experiment_id: Optional experiment identifier
@@ -81,7 +81,7 @@
      validate_parameters(locals(), evaluate)
 
      # Prepare datasets, hyperparameters, and eval run specs
-     datasets = prepare_datasets(datasets, sample_size)
+     datasets = prepare_datasets(datasets, split, sample_size)
      hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
      eval_run_specs = sorted(
          build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
@@ -377,8 +377,24 @@ def run_adaptive_evaluation(
      Returns:
          Results from the adaptive evaluation
      """
+     # Fetch available splits from Trismik
+     dataset_info = trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+     available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+     # Resolve the split to use (with fallback: user-specified -> validation -> test)
+     resolved_split = resolve_adaptive_split(
+         test_id=adaptive_run_spec.dataset,
+         user_specified_split=adaptive_run_spec.split,
+         available_splits=available_splits,
+     )
+
+     # Create inference function with bound hyperparameters
+     def inference_with_hyperparams(items: Any) -> Any:
+         return inference(items, **adaptive_run_spec.hyperparameter_config)
+
      trismik_results = trismik_client.run(
          test_id=adaptive_run_spec.dataset,
+         split=resolved_split,
          project_id=project_id,
          experiment=experiment_id,
          run_metadata=TrismikRunMetadata(
@@ -386,7 +402,7 @@
              test_configuration={},
              inference_setup={},
          ),
-         item_processor=make_trismik_inference(inference),
+         item_processor=make_trismik_inference(inference_with_hyperparams),
          return_dict=False,
      )
 
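The sync version mirrors the async one (it is generated with unasync, per the old evaluate/__init__.py docstring), so the same split argument applies. A sketch with a hyperparameter sweep; names are placeholders:

    from typing import Any, List

    from scorebook import evaluate

    def my_inference(items: List[Any], **hyperparameters: Any) -> List[str]:
        # Placeholder model call.
        return ["answer" for _ in items]

    results = evaluate(
        my_inference,
        datasets="placeholder-test:adaptive",
        split="test",
        hyperparameters=[{"temperature": 0.0}, {"temperature": 0.7}],
    )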
scorebook/evaluate/evaluate_helpers.py CHANGED
@@ -9,7 +9,8 @@ from trismik._async.client import TrismikAsyncClient
  from trismik._sync.client import TrismikClient
  from trismik.types import TrismikMultipleChoiceTextItem
 
- from scorebook import EvalDataset
+ from scorebook.dashboard.credentials import get_token
+ from scorebook.eval_datasets.eval_dataset import EvalDataset
  from scorebook.exceptions import (
      DataMismatchError,
      MetricComputationError,
@@ -17,9 +18,9 @@ from scorebook.exceptions import (
      ScoreBookError,
  )
  from scorebook.settings import TRISMIK_SERVICE_URL
- from scorebook.trismik.credentials import get_token
  from scorebook.types import AdaptiveEvalDataset, AdaptiveEvalRunSpec, EvalResult, EvalRunSpec
- from scorebook.utils import expand_dict, is_awaitable
+ from scorebook.utils.async_utils import is_awaitable
+ from scorebook.utils.transform_helpers import expand_dict
 
  logger = logging.getLogger(__name__)
 
@@ -91,6 +92,7 @@ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> N
 
  def prepare_datasets(
      datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     split: Optional[str] = None,
      sample_size: Optional[int] = None,
  ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
      """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
@@ -104,6 +106,12 @@
 
          # Prepare classic datasets
          if isinstance(dataset, EvalDataset):
+             # Warn if dataset split differs from provided split parameter
+             if split is not None and dataset.split is not None and dataset.split != split:
+                 logger.warning(
+                     f"Dataset '{dataset.name}' has split '{dataset.split}' but evaluate split "
+                     f"parameter is '{split}'. The dataset split will be used."
+                 )
 
              if sample_size is not None:
                  dataset = dataset.sample(sample_size)
@@ -111,8 +119,17 @@
              datasets_out.append(dataset)
 
          # Prepare adaptive datasets
-         elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-             datasets_out.append(AdaptiveEvalDataset(dataset))
+         elif isinstance(dataset, str) and ":adaptive" in dataset:
+             # Parse adaptive dataset
+             parts = dataset.split(":")
+             if len(parts) != 2 or parts[1] != "adaptive":
+                 raise ParameterValidationError(
+                     f"Invalid adaptive dataset format: '{dataset}'. "
+                     f"Use 'test_id:adaptive' format and specify split via the split parameter."
+                 )
+
+             # Use the split parameter for all adaptive datasets
+             datasets_out.append(AdaptiveEvalDataset(name=dataset, split=split))
 
          # TODO: dataset name string registry
          elif isinstance(dataset, str):
@@ -174,6 +191,7 @@ def build_eval_run_specs(
                  hyperparameters_index,
                  experiment_id,
                  project_id,
+                 dataset.split,
                  metadata,
              )
          )
@@ -220,6 +238,7 @@ def build_adaptive_eval_run_spec(
      hyperparameter_config_index: int,
      experiment_id: str,
      project_id: str,
+     split: Optional[str] = None,
      metadata: Optional[Dict[str, Any]] = None,
  ) -> AdaptiveEvalRunSpec:
      """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
@@ -231,6 +250,7 @@
          hyperparameter_config_index,
          experiment_id,
          project_id,
+         split,
          metadata,
      )
      logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
@@ -386,3 +406,57 @@ def make_trismik_inference(
      )
 
      return sync_trismik_inference_function
+
+
+ def resolve_adaptive_split(
+     test_id: str,
+     user_specified_split: Optional[str],
+     available_splits: List[str],
+ ) -> str:
+     """Resolve the dataset split to use for adaptive evaluation.
+
+     Resolution order:
+     1. If user specified a split, validate it exists and use it
+     2. If not specified and exactly one split is available, use it
+     3. If not specified and multiple splits are available, raise an error
+     4. If no splits are available, raise an error
+
+     Args:
+         test_id: The test dataset ID (without ":adaptive" suffix)
+         user_specified_split: Optional split name specified by the user
+         available_splits: List of available split names for this dataset
+
+     Returns:
+         The resolved split name to use
+
+     Raises:
+         ScoreBookError: If the specified split doesn't exist, multiple splits exist without
+             user specification, or no splits are available
+     """
+     logger.debug(f"Available splits for {test_id}: {available_splits}")
+
+     # If user specified a split, validate and use it
+     if user_specified_split is not None:
+         if user_specified_split in available_splits:
+             logger.info(f"Using user-specified split '{user_specified_split}' for {test_id}")
+             return user_specified_split
+         else:
+             raise ScoreBookError(
+                 f"Specified split '{user_specified_split}' not found for dataset '{test_id}'. "
+                 f"Available splits: {available_splits}"
+             )
+
+     # No split specified - check available splits
+     if len(available_splits) == 0:
+         raise ScoreBookError(f"No splits available for dataset '{test_id}'. ")
+     elif len(available_splits) == 1:
+         # Exactly one split - auto-select it
+         selected_split = available_splits[0]
+         logger.info(f"Auto-selecting only available split '{selected_split}' for {test_id}")
+         return selected_split
+     else:
+         # Multiple splits available - user must specify
+         raise ScoreBookError(
+             f"Multiple splits available for dataset '{test_id}': {available_splits}. "
+             f"Please specify which split to use via evaluate's 'split' parameter."
+         )
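resolve_adaptive_split is pure logic over the split names, so its behaviour is easy to exercise directly; the test id and split names below are placeholders:

    from scorebook.evaluate.evaluate_helpers import resolve_adaptive_split

    # Exactly one split available: it is auto-selected.
    assert resolve_adaptive_split("demo-test", None, ["validation"]) == "validation"

    # A user-specified split is honoured when it exists.
    assert resolve_adaptive_split("demo-test", "test", ["validation", "test"]) == "test"

    # ScoreBookError is raised when the requested split is missing, when no splits
    # exist, or when several splits exist and none was specified.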
scorebook/inference/__init__.py CHANGED
@@ -1,11 +1 @@
- """
- Inference module for model execution and predictions.
-
- This module provides functionality for running inference with various models
- and processing their responses. It includes utilities for both single and
- batch inference operations.
- """
-
- from scorebook.inference.inference_pipeline import InferencePipeline
-
- __all__ = ["InferencePipeline"]
+ """Inference module for model execution and predictions."""
scorebook/inference/clients/__init__.py CHANGED
@@ -1,8 +1 @@
- """
- Inference clients for various LLM providers.
-
- This module provides client implementations for different LLM providers including
- OpenAI, AWS Bedrock, Google Vertex AI, and Portkey.
- """
-
- __all__ = ["bedrock", "openai", "portkey", "vertex"]
+ """Inference clients for various LLM providers."""
scorebook/inference/inference_pipeline.py CHANGED
@@ -9,7 +9,7 @@ configurable way.
  import asyncio
  from typing import Any, Callable, Dict, List, Optional, cast
 
- from scorebook.utils import is_awaitable
+ from scorebook.utils.async_utils import is_awaitable
 
 
  class InferencePipeline:
scorebook/metrics/__init__.py CHANGED
@@ -1,18 +1 @@
- """
- Metrics for evaluating model predictions.
-
- This module provides a collection of evaluation metrics for comparing model outputs
- against ground truth labels. Available metrics include standard classification and
- generation metrics like accuracy, precision, recall, F1-score, etc.
-
- Metrics can be accessed by name through the `get_metrics()` function or used
- directly by instantiating specific metric classes. All metrics implement a
- common interface for scoring predictions against references.
- """
-
- from scorebook.metrics.accuracy import Accuracy
- from scorebook.metrics.metric_base import MetricBase
- from scorebook.metrics.metric_registry import MetricRegistry
- from scorebook.metrics.precision import Precision
-
- __all__ = ["MetricBase", "Precision", "Accuracy", "MetricRegistry"]
+ """Metrics for evaluating model predictions."""
scorebook/metrics/metric_registry.py CHANGED
@@ -85,8 +85,10 @@ class MetricRegistry:
          # If input is a string, look up the class in the registry
          if isinstance(name_or_class, str):
              key = name_or_class.lower()
+
              if key not in cls._registry:
                  raise ValueError(f"Metric '{name_or_class}' not registered.")
+
              return cls._registry[key](**kwargs)
 
          raise ValueError(
1
1
  """Score module for computing metrics on pre-computed outputs."""
2
-
3
- from scorebook.score._async.score_async import score_async
4
- from scorebook.score._sync.score import score
5
-
6
- __all__ = ["score", "score_async"]
scorebook/score/_async/score_async.py CHANGED
@@ -1,6 +1,7 @@
  import logging
  from typing import Any, Dict, List, Literal, Optional, Union, cast
 
+ from scorebook.dashboard.upload_results import upload_result_async
  from scorebook.exceptions import DataMismatchError, ParameterValidationError
  from scorebook.score.score_helpers import (
      calculate_metric_scores_async,
@@ -8,9 +9,9 @@ from scorebook.score.score_helpers import (
      resolve_metrics,
      validate_items,
  )
- from scorebook.trismik.upload_results import upload_result_async
  from scorebook.types import Metrics
- from scorebook.utils import resolve_show_progress, resolve_upload_results, scoring_progress_context
+ from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
+ from scorebook.utils.progress_bars import scoring_progress_context
 
  logger = logging.getLogger(__name__)