scorebook 0.0.11.tar.gz → 0.0.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {scorebook-0.0.11 → scorebook-0.0.13}/PKG-INFO +2 -2
  2. {scorebook-0.0.11 → scorebook-0.0.13}/pyproject.toml +18 -12
  3. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/__init__.py +8 -1
  4. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py +18 -0
  5. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py +116 -126
  6. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/evaluate.py +116 -127
  7. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/evaluate_helpers.py +98 -25
  8. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/exceptions.py +6 -2
  9. scorebook-0.0.13/src/scorebook/score/__init__.py +6 -0
  10. scorebook-0.0.13/src/scorebook/score/_async/__init__.py +0 -0
  11. scorebook-0.0.13/src/scorebook/score/_async/score_async.py +145 -0
  12. scorebook-0.0.13/src/scorebook/score/_sync/__init__.py +0 -0
  13. scorebook-0.0.13/src/scorebook/score/_sync/score.py +145 -0
  14. scorebook-0.0.13/src/scorebook/score/score_helpers.py +207 -0
  15. scorebook-0.0.13/src/scorebook/trismik/upload_results.py +254 -0
  16. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/types.py +35 -54
  17. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/__init__.py +8 -1
  18. scorebook-0.0.13/src/scorebook/utils/common_helpers.py +41 -0
  19. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/progress_bars.py +67 -0
  20. {scorebook-0.0.11 → scorebook-0.0.13}/LICENSE +0 -0
  21. {scorebook-0.0.11 → scorebook-0.0.13}/README.md +0 -0
  22. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/__init__.py +0 -0
  23. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/auth.py +0 -0
  24. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/main.py +0 -0
  25. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/__init__.py +0 -0
  26. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/__init__.py +0 -0
  27. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/__init__.py +0 -0
  28. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/__init__.py +0 -0
  29. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/__init__.py +0 -0
  30. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/__init__.py +0 -0
  31. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/bedrock.py +0 -0
  32. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/openai.py +0 -0
  33. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/portkey.py +0 -0
  34. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/vertex.py +0 -0
  35. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/inference_pipeline.py +0 -0
  36. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/__init__.py +0 -0
  37. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/accuracy.py +0 -0
  38. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/metric_base.py +0 -0
  39. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/metric_registry.py +0 -0
  40. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/precision.py +0 -0
  41. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/settings.py +0 -0
  42. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/trismik/__init__.py +0 -0
  43. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/trismik/credentials.py +0 -0
  44. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/async_utils.py +0 -0
  45. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/io_helpers.py +0 -0
  46. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/jinja_helpers.py +0 -0
  47. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/mappers.py +0 -0
  48. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/render_template.py +0 -0
  49. {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.11 → scorebook-0.0.13}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: scorebook
- Version: 0.0.11
+ Version: 0.0.13
  Summary: A Python project for LLM evaluation.
  License-File: LICENSE
  Author: Euan Campbell
@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
  Requires-Dist: torchaudio ; extra == "examples"
  Requires-Dist: torchvision ; extra == "examples"
  Requires-Dist: transformers ; extra == "examples"
- Requires-Dist: trismik (>=1.0.1,<2.0.0)
+ Requires-Dist: trismik (==1.0.2)
  Description-Content-Type: text/markdown

  # Scorebook
{scorebook-0.0.11 → scorebook-0.0.13}/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.9, <3.14"
  dependencies = [
  "datasets>=3.6.0",
  "notebook (>=7.4.5,<8.0.0)",
- "trismik (>=1.0.1, <2.0.0)",
+ "trismik==1.0.2",
  "ipywidgets>=8.0.0",
  ]

@@ -19,14 +19,14 @@ dependencies = [
  scorebook = "scorebook.cli.main:main"

  [tool.poetry]
- version = "0.0.11" # base version
+ version = "0.0.13" # base version
  packages = [{ include = "scorebook", from = "src" }]

  [tool.poetry.dependencies]
  python = ">=3.9,<3.14"
  datasets = ">=3.6.0"
  notebook = ">=7.4.5,<8.0.0"
- trismik = ">=1.0.1,<2.0.0"
+ trismik = "1.0.2"
  ipywidgets = ">=8.0.0"

  # Optional dependencies
@@ -119,16 +119,22 @@ install_types = true
  [tool.flake8] # note that this depends on Flake8-pyproject
  ignore = ["D202", "W503", "W504"]

- [tool.unasync]
  [[tool.unasync.rules]]
  fromdir = "src/scorebook/evaluate/_async/"
  todir = "src/scorebook/evaluate/_sync/"
+ replacements."scorebook.score._async.score_async" = "scorebook.score._sync.score"
+ replacements."scorebook.score._async" = "scorebook.score._sync"
+ replacements.evaluate_async = "evaluate"
+ replacements."Asynchronous evaluation complete" = "Synchronous evaluation complete"
+ replacements." run_results = asyncio.gather(*[worker(run) for run in runs])" = " run_results = [worker(run) for run in runs]"
+ replacements.async_nullcontext = "nullcontext"
+ replacements.create_trismik_async_client = "create_trismik_sync_client"
+ replacements.score_async = "score"

-
- # Custom replacements beyond default async/await transformations
- [tool.unasync.rules.replacements]
- "evaluate_async" = "evaluate"
- "Asynchronous evaluation complete" = "Synchronous evaluation complete"
- " run_results = asyncio.gather(*[worker(run) for run in runs])" = " run_results = [worker(run) for run in runs]"
- "async_nullcontext" = "nullcontext"
- "create_trismik_async_client" = "create_trismik_sync_client"
+ [[tool.unasync.rules]]
+ fromdir = "src/scorebook/score/_async/"
+ todir = "src/scorebook/score/_sync/"
+ replacements.score_async = "score"
+ replacements."Async scoring complete" = "Scoring complete"
+ replacements.calculate_metric_scores_async = "calculate_metric_scores"
+ replacements.upload_result_async = "upload_result"
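Note: a [[tool.unasync.rules]] table like the one added above is typically consumed by the unasync code generator at build time. A minimal sketch of an equivalent programmatic invocation, assuming a hypothetical helper script (the script and its file list are illustrations, not part of this package):

# generate_sync.py (hypothetical helper, not shipped with scorebook):
# mirrors the [[tool.unasync.rules]] entries above via unasync's Rule API.
import unasync

rules = [
    unasync.Rule(
        fromdir="src/scorebook/evaluate/_async/",
        todir="src/scorebook/evaluate/_sync/",
        additional_replacements={
            "evaluate_async": "evaluate",
            "async_nullcontext": "nullcontext",
            "create_trismik_async_client": "create_trismik_sync_client",
            "score_async": "score",
        },
    ),
    unasync.Rule(
        fromdir="src/scorebook/score/_async/",
        todir="src/scorebook/score/_sync/",
        additional_replacements={
            "score_async": "score",
            "calculate_metric_scores_async": "calculate_metric_scores",
            "upload_result_async": "upload_result",
        },
    ),
]

# Regenerate the _sync packages from their _async sources.
unasync.unasync_files(
    [
        "src/scorebook/evaluate/_async/evaluate_async.py",
        "src/scorebook/score/_async/score_async.py",
    ],
    rules,
)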
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/__init__.py
@@ -12,15 +12,22 @@ __version__ = importlib.metadata.version(__package__ or __name__)
  from scorebook.eval_datasets import EvalDataset
  from scorebook.evaluate import evaluate, evaluate_async
  from scorebook.inference.inference_pipeline import InferencePipeline
- from scorebook.trismik.credentials import login, whoami
+ from scorebook.score import score, score_async
+ from scorebook.trismik.credentials import login, logout, whoami
+ from scorebook.trismik.upload_results import upload_result, upload_result_async
  from scorebook.utils.render_template import render_template

  __all__ = [
  "EvalDataset",
  "evaluate",
  "evaluate_async",
+ "score",
+ "score_async",
  "render_template",
  "login",
+ "logout",
  "whoami",
  "InferencePipeline",
+ "upload_result",
+ "upload_result_async",
  ]
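The new top-level score / score_async exports mirror the keyword arguments used at the score_async call site further down in this diff. A hedged usage sketch (argument defaults, accepted metric formats, and the exact return shape are assumptions, not confirmed by this diff):

from scorebook import score

# Plain dict items; the column names match the output_column / label_column /
# input_column parameters that evaluate's internals pass explicitly.
items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics=["accuracy"],  # assumed: a metric name registered in scorebook.metrics
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,  # True (plus experiment_id/project_id) would push to Trismik
)

# Based on the call site below, the result appears to carry per-item and
# aggregate entries, e.g. results["aggregate_results"][0] including a
# "run_id" key when an upload happened.
print(results)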
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py
@@ -137,6 +137,24 @@ class EvalDataset:
  raise DatasetNotInitializedError("Dataset is not initialized")
  return list(map(str, self._hf_dataset.column_names))

+ @property
+ def split(self) -> Optional[str]:
+ """Return the split name of the underlying HuggingFace dataset, if available.
+
+ Returns:
+ The split name (e.g., "train", "test", "validation") if the dataset was loaded
+ from HuggingFace with a specific split. Returns None if the dataset was created
+ from a list, CSV, JSON, or loaded without a split specification.
+
+ Raises:
+ DatasetNotInitializedError: If the dataset is not initialized.
+ """
+ if self._hf_dataset is None:
+ raise DatasetNotInitializedError("Dataset is not initialized")
+
+ split = self._hf_dataset.split
+ return str(split) if split is not None else None
+
  def shuffle(self) -> None:
  """Randomly shuffle the dataset items."""
  if self._hf_dataset is None:
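The new EvalDataset.split property simply stringifies the underlying Hugging Face Dataset.split attribute. A quick illustration of that underlying behaviour with the datasets library (the dataset name is only an example):

from datasets import Dataset, load_dataset

# Loaded with an explicit split: .split is a NamedSplit, so the new
# property would return "validation".
hf_ds = load_dataset("squad", split="validation")
print(str(hf_ds.split))  # "validation"

# Built in memory (the list/CSV/JSON path mentioned in the docstring):
# .split is None, so EvalDataset.split would return None.
in_memory = Dataset.from_list([{"question": "2 + 2?", "label": "4"}])
print(in_memory.split)  # None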
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py
@@ -3,13 +3,8 @@ import logging
  from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

  from trismik import TrismikAsyncClient, TrismikClient
- from trismik.types import (
- TrismikClassicEvalItem,
- TrismikClassicEvalMetric,
- TrismikClassicEvalRequest,
- TrismikClassicEvalResponse,
- TrismikRunMetadata,
- )
+ from trismik.settings import evaluation_settings
+ from trismik.types import TrismikRunMetadata

  from scorebook.eval_datasets import EvalDataset
  from scorebook.evaluate.evaluate_helpers import (
@@ -20,12 +15,12 @@ from scorebook.evaluate.evaluate_helpers import (
  make_trismik_inference,
  prepare_datasets,
  prepare_hyperparameter_configs,
- resolve_show_progress,
- resolve_upload_results,
- score_metrics,
+ resolve_adaptive_split,
  validate_parameters,
  )
  from scorebook.exceptions import InferenceError, ScoreBookError
+ from scorebook.inference.inference_pipeline import InferencePipeline
+ from scorebook.score._async.score_async import score_async
  from scorebook.types import (
  AdaptiveEvalRunResult,
  AdaptiveEvalRunSpec,
@@ -33,14 +28,20 @@
  EvalResult,
  EvalRunSpec,
  )
- from scorebook.utils import async_nullcontext, evaluation_progress_context
+ from scorebook.utils import (
+ async_nullcontext,
+ evaluation_progress_context,
+ resolve_show_progress,
+ resolve_upload_results,
+ )

  logger = logging.getLogger(__name__)


  async def evaluate_async(
- inference: Callable,
+ inference: Union[Callable, InferencePipeline],
  datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+ split: Optional[str] = None,
  hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
  metadata: Optional[Dict[str, Any]] = None,
  experiment_id: Optional[str] = None,
@@ -59,6 +60,7 @@
  Args:
  inference: The inference callable to evaluate
  datasets: Dataset(s) to evaluate on
+ split: Split to use for evaluation (default: "validation")
  hyperparameters: Hyperparameter configuration(s) to evaluate with
  metadata: Optional metadata to attach to the evaluation
  experiment_id: Optional experiment identifier
@@ -83,14 +85,14 @@
  validate_parameters(locals(), evaluate_async)

  # Prepare datasets, hyperparameters, and eval run specs
- datasets = prepare_datasets(datasets, sample_size)
+ datasets = prepare_datasets(datasets, split, sample_size)
  hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
  eval_run_specs = sorted(
  build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
  key=lambda run: (run.dataset_index, run.hyperparameters_index),
  )

- # Create Trismik client if needed (for adaptive evals or uploads)
+ # Create a Trismik client if needed (for adaptive evals or uploads)
  needs_client = upload_results or any(
  isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
  )
@@ -101,7 +103,14 @@
  async with trismik_client or async_nullcontext():
  # Execute evaluation runs
  # Calculate total items across all runs
- total_items = sum(len(run.dataset.items) for run in eval_run_specs)
+ total_items = sum(
+ (
+ len(run.dataset.items)
+ if isinstance(run, EvalRunSpec)
+ else evaluation_settings["max_iterations"]
+ ) # Adaptive evals use max_iterations
+ for run in eval_run_specs
+ )
  model_display = get_model_name(inference)

  with evaluation_progress_context(
@@ -145,34 +154,32 @@ async def execute_runs(
  async def worker(
  run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
  ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+ # Execute run (score_async handles upload internally for classic evals)
  run_result = await execute_run(
- inference, run, experiment_id, project_id, metadata, trismik_client
+ inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
  )
+
  # Update progress bars with items processed and success status
  if progress_bars is not None:
- items_processed = len(run.dataset.items)
+ # Classic evals have .items; adaptive evals use max_iterations
+ items_processed = (
+ len(run.dataset.items)
+ if isinstance(run, EvalRunSpec)
+ else evaluation_settings["max_iterations"]
+ )
  progress_bars.on_run_completed(items_processed, run_result.run_completed)

+ # Update upload progress for classic evals
  if (
  upload_results
  and isinstance(run_result, ClassicEvalRunResult)
- and experiment_id
- and project_id
  and run_result.run_completed
- and trismik_client is not None
  ):
- try:
- run_id = await upload_classic_run_results(
- run_result, experiment_id, project_id, inference, metadata, trismik_client
- )
- run_result.run_id = run_id
+ # Check if upload succeeded by checking for run_id
+ if experiment_id and project_id:
+ upload_succeeded = run_result.run_id is not None
  if progress_bars is not None:
- progress_bars.on_upload_completed(succeeded=True)
- except Exception as e:
- logger.warning(f"Failed to upload run results: {e}")
- if progress_bars is not None:
- progress_bars.on_upload_completed(succeeded=False)
- # Continue evaluation even if upload fails
+ progress_bars.on_upload_completed(succeeded=upload_succeeded)

  return run_result

@@ -191,6 +198,7 @@ async def execute_runs(
  async def execute_run(
  inference: Callable,
  run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+ upload_results: bool, # NEW PARAMETER
  experiment_id: Optional[str] = None,
  project_id: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +207,9 @@
  """Execute a single evaluation run."""

  if isinstance(run, EvalRunSpec):
- return await execute_classic_eval_run(inference, run)
+ return await execute_classic_eval_run(
+ inference, run, upload_results, experiment_id, project_id, metadata
+ )

  elif isinstance(run, AdaptiveEvalRunSpec):
  resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +227,79 @@
  raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


- async def execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
- """Execute a classic evaluation run."""
+ async def execute_classic_eval_run(
+ inference: Callable,
+ run: EvalRunSpec,
+ upload_results: bool,
+ experiment_id: Optional[str],
+ project_id: Optional[str],
+ metadata: Optional[Dict[str, Any]],
+ ) -> ClassicEvalRunResult:
+ """Execute a classic evaluation run using score_async() for scoring and uploading."""
  logger.debug("Executing classic eval run for %s", run)

  inference_outputs = None
- metric_scores = None
+ scores = None

  try:
+ # 1. Run inference
  inference_outputs = await run_inference_callable(
  inference, run.inputs, run.hyperparameter_config
  )
- metric_scores = score_metrics(run.dataset, inference_outputs, run.labels)
- logger.debug("Classic evaluation completed for run %s", run)
- return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)
+
+ # 2. Build items for score_async
+ items = [
+ {
+ "input": run.inputs[i] if i < len(run.inputs) else None,
+ "output": inference_outputs[i],
+ "label": run.labels[i] if i < len(run.labels) else "",
+ }
+ for i in range(len(inference_outputs))
+ ]
+
+ # 3. Get the model name for upload
+ model_name = get_model_name(inference, metadata)
+
+ # 4. Call score_async
+ scores = await score_async(
+ items=items,
+ metrics=run.dataset.metrics,
+ output_column="output", # Explicit parameter
+ label_column="label", # Explicit parameter
+ input_column="input", # Explicit parameter
+ hyperparameters=run.hyperparameter_config,
+ dataset_name=run.dataset.name,
+ model_name=model_name,
+ metadata=metadata,
+ experiment_id=experiment_id,
+ project_id=project_id,
+ upload_results=upload_results,
+ show_progress=False,
+ )
+
+ # 5. Extract run_id if upload succeeded
+ run_id = None
+ if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+ run_id = scores["aggregate_results"][0].get("run_id")
+
+ logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+ return ClassicEvalRunResult(
+ run_spec=run,
+ run_completed=True,
+ outputs=inference_outputs,
+ scores=scores,
+ run_id=run_id,
+ )

  except Exception as e:
  logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
- return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)
+ return ClassicEvalRunResult(
+ run_spec=run,
+ run_completed=False,
+ outputs=inference_outputs,
+ scores=scores,
+ run_id=None,
+ )


  async def run_inference_callable(
@@ -296,93 +361,6 @@ async def execute_adaptive_eval_run(
  return AdaptiveEvalRunResult(run, False, {})


- async def upload_classic_run_results(
- run_result: ClassicEvalRunResult,
- experiment_id: str,
- project_id: str,
- inference_callable: Optional[Callable],
- metadata: Optional[Dict[str, Any]],
- trismik_client: Union[TrismikClient, TrismikAsyncClient],
- ) -> str:
- """Upload a classic evaluation run result to Trismik platform.
-
- Args:
- run: The evaluation run result to upload
- experiment_id: Trismik experiment identifier
- project_id: Trismik project identifier
- model: Model name used for evaluation
- metadata: Optional metadata dictionary
- trismik_client: Trismik client instance
-
- Returns:
- Run id
- """
- model = get_model_name(inference_callable)
-
- # Create eval items from run_spec inputs, outputs, and labels
- items: List[TrismikClassicEvalItem] = []
- inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
- for idx, (input_value, output) in enumerate(inputs_outputs):
- labels = run_result.run_spec.labels
- label = labels[idx] if idx < len(labels) else ""
-
- # Calculate item-level metrics for this item
- item_metrics: Dict[str, Any] = {}
- if run_result.scores:
- for metric_name, metric_data in run_result.scores.items():
- if isinstance(metric_data, dict) and "item_scores" in metric_data:
- if idx < len(metric_data["item_scores"]):
- item_metrics[metric_name] = metric_data["item_scores"][idx]
- else:
- # If scores is just a single value, use it for all items
- item_metrics[metric_name] = metric_data
-
- eval_item = TrismikClassicEvalItem(
- datasetItemId=str(idx),
- modelInput=str(input_value),
- modelOutput=str(output),
- goldOutput=str(label),
- metrics=item_metrics,
- )
- items.append(eval_item)
-
- # Create eval metrics from run aggregate scores
- metrics: List[TrismikClassicEvalMetric] = []
- if run_result.scores:
- for metric_name, metric_data in run_result.scores.items():
- if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
- # Handle structured metric data with aggregate scores
- for agg_name, agg_value in metric_data["aggregate_scores"].items():
- metric_id = (
- f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
- )
- metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
- metrics.append(metric)
- else:
- # Handle simple metric data (single value)
- metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
- metrics.append(metric)
-
- classic_eval_request = TrismikClassicEvalRequest(
- project_id,
- experiment_id,
- run_result.run_spec.dataset.name,
- model,
- run_result.run_spec.hyperparameter_config,
- items,
- metrics,
- )
-
- response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
- classic_eval_request
- )
-
- run_id: str = response.id
- logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
- return run_id
-
-
  async def run_adaptive_evaluation(
  inference: Callable,
  adaptive_run_spec: AdaptiveEvalRunSpec,
@@ -403,8 +381,20 @@
  Returns:
  Results from the adaptive evaluation
  """
+ # Fetch available splits from Trismik
+ dataset_info = await trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+ available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+ # Resolve the split to use (with fallback: user-specified -> validation -> test)
+ resolved_split = resolve_adaptive_split(
+ test_id=adaptive_run_spec.dataset,
+ user_specified_split=adaptive_run_spec.split,
+ available_splits=available_splits,
+ )
+
  trismik_results = await trismik_client.run(
  test_id=adaptive_run_spec.dataset,
+ split=resolved_split,
  project_id=project_id,
  experiment=experiment_id,
  run_metadata=TrismikRunMetadata(