scorebook 0.0.11__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scorebook-0.0.11 → scorebook-0.0.13}/PKG-INFO +2 -2
- {scorebook-0.0.11 → scorebook-0.0.13}/pyproject.toml +18 -12
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/__init__.py +8 -1
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py +18 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py +116 -126
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/evaluate.py +116 -127
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/evaluate_helpers.py +98 -25
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/exceptions.py +6 -2
- scorebook-0.0.13/src/scorebook/score/__init__.py +6 -0
- scorebook-0.0.13/src/scorebook/score/_async/__init__.py +0 -0
- scorebook-0.0.13/src/scorebook/score/_async/score_async.py +145 -0
- scorebook-0.0.13/src/scorebook/score/_sync/__init__.py +0 -0
- scorebook-0.0.13/src/scorebook/score/_sync/score.py +145 -0
- scorebook-0.0.13/src/scorebook/score/score_helpers.py +207 -0
- scorebook-0.0.13/src/scorebook/trismik/upload_results.py +254 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/types.py +35 -54
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/__init__.py +8 -1
- scorebook-0.0.13/src/scorebook/utils/common_helpers.py +41 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/progress_bars.py +67 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/LICENSE +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/README.md +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/auth.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/cli/main.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_sync/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/bedrock.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/openai.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/portkey.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/clients/vertex.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/inference/inference_pipeline.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/accuracy.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/metric_base.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/metric_registry.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/metrics/precision.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/settings.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/trismik/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/trismik/credentials.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/async_utils.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/io_helpers.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/jinja_helpers.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/mappers.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/render_template.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.11 → scorebook-0.0.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.11
+Version: 0.0.13
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell
@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (
+Requires-Dist: trismik (==1.0.2)
 Description-Content-Type: text/markdown
 
 # Scorebook
{scorebook-0.0.11 → scorebook-0.0.13}/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.9, <3.14"
 dependencies = [
     "datasets>=3.6.0",
     "notebook (>=7.4.5,<8.0.0)",
-    "trismik
+    "trismik==1.0.2",
     "ipywidgets>=8.0.0",
 ]
 
@@ -19,14 +19,14 @@ dependencies = [
 scorebook = "scorebook.cli.main:main"
 
 [tool.poetry]
-version = "0.0.11"
+version = "0.0.13" # base version
 packages = [{ include = "scorebook", from = "src" }]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.14"
 datasets = ">=3.6.0"
 notebook = ">=7.4.5,<8.0.0"
-trismik = "
+trismik = "1.0.2"
 ipywidgets = ">=8.0.0"
 
 # Optional dependencies
@@ -119,16 +119,22 @@ install_types = true
 [tool.flake8] # note that this depends on Flake8-pyproject
 ignore = ["D202", "W503", "W504"]
 
-[tool.unasync]
 [[tool.unasync.rules]]
 fromdir = "src/scorebook/evaluate/_async/"
 todir = "src/scorebook/evaluate/_sync/"
+replacements."scorebook.score._async.score_async" = "scorebook.score._sync.score"
+replacements."scorebook.score._async" = "scorebook.score._sync"
+replacements.evaluate_async = "evaluate"
+replacements."Asynchronous evaluation complete" = "Synchronous evaluation complete"
+replacements." run_results = asyncio.gather(*[worker(run) for run in runs])" = " run_results = [worker(run) for run in runs]"
+replacements.async_nullcontext = "nullcontext"
+replacements.create_trismik_async_client = "create_trismik_sync_client"
+replacements.score_async = "score"
 
-
-
-
-
-"
-
-
-"create_trismik_async_client" = "create_trismik_sync_client"
+[[tool.unasync.rules]]
+fromdir = "src/scorebook/score/_async/"
+todir = "src/scorebook/score/_sync/"
+replacements.score_async = "score"
+replacements."Async scoring complete" = "Scoring complete"
+replacements.calculate_metric_scores_async = "calculate_metric_scores"
+replacements.upload_result_async = "upload_result"
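The `[tool.unasync]` rules above drive generation of the synchronous modules (`src/scorebook/evaluate/_sync/` and the new `src/scorebook/score/_sync/`) from their `_async` counterparts by token replacement. As an illustration of the mechanism only (not the project's actual build hook, which is not part of this diff), the second rule amounts to a textual rewrite like this:

```python
# Illustration of what an unasync-style rule does. The mapping is copied from the
# second [[tool.unasync.rules]] entry above; apply_replacements is a simplified
# stand-in, since the real tool also rewrites async/await syntax tokens.
REPLACEMENTS = {
    "score_async": "score",
    "Async scoring complete": "Scoring complete",
    "calculate_metric_scores_async": "calculate_metric_scores",
    "upload_result_async": "upload_result",
}


def apply_replacements(async_source: str) -> str:
    """Rewrite async source text into its sync counterpart, token by token."""
    for old, new in REPLACEMENTS.items():
        async_source = async_source.replace(old, new)
    return async_source


print(apply_replacements('logger.info("Async scoring complete")'))
# logger.info("Scoring complete")
```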
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/__init__.py
@@ -12,15 +12,22 @@ __version__ = importlib.metadata.version(__package__ or __name__)
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate import evaluate, evaluate_async
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.
+from scorebook.score import score, score_async
+from scorebook.trismik.credentials import login, logout, whoami
+from scorebook.trismik.upload_results import upload_result, upload_result_async
 from scorebook.utils.render_template import render_template
 
 __all__ = [
     "EvalDataset",
     "evaluate",
     "evaluate_async",
+    "score",
+    "score_async",
     "render_template",
     "login",
+    "logout",
     "whoami",
     "InferencePipeline",
+    "upload_result",
+    "upload_result_async",
 ]
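The package now re-exports the new scoring and upload entry points at the top level. A rough usage sketch of the sync `score` function follows; the keyword names are taken from the `score_async` call that appears later in this diff (in `execute_classic_eval_run`), while the items and the `"accuracy"` metric name are illustrative and assume string metric names resolve through the metric registry.

```python
from scorebook import score

# Pre-computed model outputs in the input/output/label shape that evaluate's
# internals also use (see the score_async call further down in this diff).
items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

# Assumed call shape: only keyword arguments visible elsewhere in this diff are used.
results = score(
    items=items,
    metrics=["accuracy"],
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,
    show_progress=False,
)
print(results)
```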
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/eval_datasets/eval_dataset.py
@@ -137,6 +137,24 @@ class EvalDataset:
             raise DatasetNotInitializedError("Dataset is not initialized")
         return list(map(str, self._hf_dataset.column_names))
 
+    @property
+    def split(self) -> Optional[str]:
+        """Return the split name of the underlying HuggingFace dataset, if available.
+
+        Returns:
+            The split name (e.g., "train", "test", "validation") if the dataset was loaded
+            from HuggingFace with a specific split. Returns None if the dataset was created
+            from a list, CSV, JSON, or loaded without a split specification.
+
+        Raises:
+            DatasetNotInitializedError: If the dataset is not initialized.
+        """
+        if self._hf_dataset is None:
+            raise DatasetNotInitializedError("Dataset is not initialized")
+
+        split = self._hf_dataset.split
+        return str(split) if split is not None else None
+
     def shuffle(self) -> None:
         """Randomly shuffle the dataset items."""
         if self._hf_dataset is None:
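The new `split` property just stringifies the `split` attribute that `datasets` already tracks on the wrapped `Dataset`. A small sketch of that underlying behaviour (the `EvalDataset` constructors themselves are outside this diff):

```python
from datasets import Dataset, NamedSplit

# A Dataset built with an explicit split carries it; one built without does not.
with_split = Dataset.from_dict({"q": ["2+2?"], "a": ["4"]}, split=NamedSplit("validation"))
without_split = Dataset.from_dict({"q": ["2+2?"], "a": ["4"]})

print(with_split.split)     # validation  -> EvalDataset.split would return "validation"
print(without_split.split)  # None        -> EvalDataset.split would return None
```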
{scorebook-0.0.11 → scorebook-0.0.13}/src/scorebook/evaluate/_async/evaluate_async.py
@@ -3,13 +3,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.
-
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
@@ -20,12 +15,12 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-
-    resolve_upload_results,
-    score_metrics,
+    resolve_adaptive_split,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._async.score_async import score_async
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
@@ -33,14 +28,20 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import
+from scorebook.utils import (
+    async_nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 async def evaluate_async(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+    split: Optional[str] = None,
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -59,6 +60,7 @@ async def evaluate_async(
     Args:
         inference: The inference callable to evaluate
        datasets: Dataset(s) to evaluate on
+        split: Split to use for evaluation (default: "validation")
         hyperparameters: Hyperparameter configuration(s) to evaluate with
         metadata: Optional metadata to attach to the evaluation
         experiment_id: Optional experiment identifier
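A minimal usage sketch of the updated async signature. Only parameters visible in this diff are used; the inference stub and the dataset identifier are placeholders, and the exact calling convention for the inference callable is handled by `run_inference_callable`, which is not changed here.

```python
import asyncio

from scorebook import evaluate_async


def fake_inference(inputs, **hyperparameters):
    # Placeholder: a real callable would run a model over the inputs.
    return ["4" for _ in inputs]


async def main() -> None:
    results = await evaluate_async(
        fake_inference,
        datasets="my_dataset",              # placeholder dataset name or EvalDataset
        split="validation",                 # new in 0.0.13
        hyperparameters={"temperature": 0.0},
    )
    print(results)


asyncio.run(main())
```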
@@ -83,14 +85,14 @@
     validate_parameters(locals(), evaluate_async)
 
     # Prepare datasets, hyperparameters, and eval run specs
-    datasets = prepare_datasets(datasets, sample_size)
+    datasets = prepare_datasets(datasets, split, sample_size)
     hyperparameter_configs = prepare_hyperparameter_configs(hyperparameters)
     eval_run_specs = sorted(
         build_eval_run_specs(datasets, hyperparameter_configs, experiment_id, project_id, metadata),
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
@@ -101,7 +103,14 @@
     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            ) # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)
 
         with evaluation_progress_context(
@@ -145,34 +154,32 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-                )
-                run_result.run_id = run_id
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=False)
-                # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
@@ -191,6 +198,7 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool, # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
@@ -199,7 +207,9 @@
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return await execute_classic_eval_run(
+        return await execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
@@ -217,24 +227,79 @@
     raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-async def execute_classic_eval_run(
-
+async def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-
+    scores = None
 
     try:
+        # 1. Run inference
        inference_outputs = await run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-
-
-
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = await score_async(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output", # Explicit parameter
+            label_column="label", # Explicit parameter
+            input_column="input", # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )
 
     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 async def run_inference_callable(
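The rewritten `execute_classic_eval_run` reads the upload `run_id` back out of the dictionary returned by `score_async`. A small sketch of that access pattern in isolation; the result payload below is hypothetical, and only the `aggregate_results`/`run_id` keys are taken from the code above.

```python
from typing import Any, Dict, Optional


def extract_run_id(scores: Dict[str, Any]) -> Optional[str]:
    """Mirror the run_id lookup used in execute_classic_eval_run above."""
    aggregate = scores.get("aggregate_results") or []
    return aggregate[0].get("run_id") if aggregate else None


print(extract_run_id({"aggregate_results": [{"run_id": "run-123"}]}))  # run-123
print(extract_run_id({"aggregate_results": []}))                       # None
```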
@@ -296,93 +361,6 @@ async def execute_adaptive_eval_run(
         return AdaptiveEvalRunResult(run, False, {})
 
 
-async def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 async def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
@@ -403,8 +381,20 @@
     Returns:
         Results from the adaptive evaluation
     """
+    # Fetch available splits from Trismik
+    dataset_info = await trismik_client.get_dataset_info(adaptive_run_spec.dataset)
+    available_splits = dataset_info.splits if hasattr(dataset_info, "splits") else []
+
+    # Resolve the split to use (with fallback: user-specified -> validation -> test)
+    resolved_split = resolve_adaptive_split(
+        test_id=adaptive_run_spec.dataset,
+        user_specified_split=adaptive_run_spec.split,
+        available_splits=available_splits,
+    )
+
     trismik_results = await trismik_client.run(
         test_id=adaptive_run_spec.dataset,
+        split=resolved_split,
         project_id=project_id,
         experiment=experiment_id,
         run_metadata=TrismikRunMetadata(