scorebook 0.0.5.tar.gz → 0.0.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scorebook-0.0.5 → scorebook-0.0.7}/PKG-INFO +5 -3
- {scorebook-0.0.5 → scorebook-0.0.7}/pyproject.toml +2 -2
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/__init__.py +1 -1
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/evaluate.py +176 -84
- scorebook-0.0.7/src/scorebook/trismik_services/upload_classic_eval_run.py +102 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/types.py +47 -33
- {scorebook-0.0.5 → scorebook-0.0.7}/LICENSE +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/README.md +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/cli/__init__.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/cli/auth.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/cli/main.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/eval_dataset.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/exceptions.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/inference/__init__.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/inference/bedrock.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/inference/openai.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/inference/portkey.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/inference/vertex.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/inference_pipeline.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/metrics/__init__.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/metrics/accuracy.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/metrics/metric_base.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/metrics/metric_registry.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/metrics/precision.py +0 -0
- {scorebook-0.0.5/src/scorebook/trismik → scorebook-0.0.7/src/scorebook/trismik_services}/__init__.py +0 -0
- {scorebook-0.0.5/src/scorebook/trismik → scorebook-0.0.7/src/scorebook/trismik_services}/adaptive_testing_service.py +0 -0
- {scorebook-0.0.5/src/scorebook/trismik → scorebook-0.0.7/src/scorebook/trismik_services}/login.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/__init__.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/async_utils.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/build_prompt.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/io_helpers.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/jinja_helpers.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/mappers.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/progress_bars.py +0 -0
- {scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.5 → scorebook-0.0.7}/PKG-INFO
@@ -1,7 +1,8 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.
+Version: 0.0.7
 Summary: A Python project for LLM evaluation.
+License-File: LICENSE
 Author: Euan Campbell
 Author-email: euan@trismik.com
 Requires-Python: >=3.9
@@ -11,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Provides-Extra: bedrock
 Provides-Extra: examples
 Provides-Extra: openai
@@ -35,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik
+Requires-Dist: trismik (>=0.9.4)
 Description-Content-Type: text/markdown

 # Scorebook
{scorebook-0.0.5 → scorebook-0.0.7}/pyproject.toml
@@ -11,14 +11,14 @@ requires-python = ">=3.9"
 dependencies = [
     "datasets>=3.6.0",
     "notebook (>=7.4.5,<8.0.0)",
-    "trismik",
+    "trismik>=0.9.4",
 ]

 [project.scripts]
 scorebook = "scorebook.cli.main:main"

 [tool.poetry]
-version = "0.0.
+version = "0.0.7" # base version
 packages = [{ include = "scorebook", from = "src" }]

 [[tool.poetry.source]]
{scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/__init__.py
@@ -12,7 +12,7 @@ __version__ = importlib.metadata.version(__package__ or __name__)
 from scorebook.eval_dataset import EvalDataset
 from scorebook.evaluate import evaluate
 from scorebook.inference_pipeline import InferencePipeline
-from scorebook.
+from scorebook.trismik_services.login import login, whoami
 from scorebook.utils.build_prompt import build_prompt

 __all__ = ["EvalDataset", "evaluate", "build_prompt", "login", "whoami", "InferencePipeline"]
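The public import surface is unaffected by the `trismik` → `trismik_services` rename; only the internal module path changes. A quick sanity-check sketch, based on the `__all__` shown in this hunk:

```python
# Names re-exported from the package root are unchanged in 0.0.7,
# so downstream imports like these keep working.
from scorebook import EvalDataset, InferencePipeline, build_prompt, evaluate, login, whoami
```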
{scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/evaluate.py
@@ -15,7 +15,7 @@ models on datasets and computing metric scores.

 import asyncio
 import logging
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union

 from scorebook.eval_dataset import EvalDataset
 from scorebook.exceptions import (
@@ -23,8 +23,11 @@ from scorebook.exceptions import (
     MetricComputationError,
     ParallelExecutionError,
     ParameterValidationError,
+    ScoreBookError,
 )
-from scorebook.
+from scorebook.trismik_services import run_adaptive_evaluation
+from scorebook.trismik_services.login import get_token
+from scorebook.trismik_services.upload_classic_eval_run import upload_classic_eval_run
 from scorebook.types import (
     AdaptiveEvalDataset,
     AdaptiveEvalRunResult,
@@ -39,60 +42,43 @@ logger = logging.getLogger(__name__)


 def evaluate(
-
-
+    inference: Callable,
+    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
-    metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    upload_results: Union[Literal["auto"], bool] = "auto",
+    sample_size: Optional[int] = None,
     parallel: bool = False,
     return_dict: bool = True,
     return_aggregates: bool = True,
     return_items: bool = False,
     return_output: bool = False,
-    sample_size: Optional[int] = None,
 ) -> Union[Dict, List]:
     """
-    Evaluate model
-
-    This function runs the provided inference callable on one or more evaluation datasets,
-    computes metric scores, and returns the evaluation results. It supports batch processing,
-    parameter sweeping, and different result formatting options.
+    Evaluate a model and collection of hyperparameters over datasets with specified metrics.

     Args:
-
-
-
-
-
-
-
-
-
-        metadata: Optional dictionary containing evaluation metadata.
-        experiment_id: Optional string identifier for tracking multiple evaluation runs.
+        inference: A callable that runs model inference over a list of evaluation items
+        datasets: One or more evaluation datasets to run evaluation on.
+        hyperparameters: Optional list of hyperparameter configurations or grid to evaluate
+        experiment_id: Optional ID of the experiment to upload results to on Trismik's dashboard.
+        project_id: Optional ID of the project to upload results to on Trismik's dashboard.
+        metadata: Optional metadata to attach to the evaluation.
+        upload_results: If True, uploads results to Trismik's dashboard.
+        sample_size: Optional number of items to sample from each dataset.
+        parallel: If True, runs evaluation in parallel. Requires the inference callable to be async.
         return_dict: If True, returns eval results as a dict
         return_aggregates: If True, returns aggregate scores for each dataset
         return_items: If True, returns individual items for each dataset
         return_output: If True, returns model outputs for each dataset item evaluated
-        sample_size: If set, only return a sample of the dataset items (for debugging)
-        parallel: If True, run inference functions in parallel (requires all functions to be async)

     Returns:
-
-
-
-
-
-    Example:
-
-        python
-        dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])
-        def inference_fn(items):
-            # Model inference logic here - process all items at once
-            return [prediction for item in items]
-
-        results = evaluate(inference_fn, dataset, item_limit=100)
+        Union[Dict, List, EvalResult]:
+            The evaluation results in the format specified by return parameters:
+            - If return_dict=False: Returns an EvalResult object containing all run results
+            - If return_dict=True Returns the evaluation results as a dict
     """

     logger.info(
@@ -104,8 +90,8 @@ def evaluate(

     return asyncio.run(
         _evaluate_async(
-
-
+            inference=inference,
+            datasets=datasets,
             hyperparameters=hyperparameters,
             metadata=metadata,
             experiment_id=experiment_id,
@@ -115,14 +101,15 @@ def evaluate(
             return_aggregates=return_aggregates,
             return_items=return_items,
             return_output=return_output,
+            upload_results=upload_results,
             sample_size=sample_size,
         )
     )


 async def _evaluate_async(
-
-
+    inference: Callable,
+    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
     experiment_id: Optional[str] = None,
@@ -132,10 +119,15 @@ async def _evaluate_async(
     return_items: bool = False,
     return_output: bool = False,
     parallel: bool = False,
+    upload_results: Union[Literal["auto"], bool] = "auto",
     sample_size: Optional[int] = None,
 ) -> Union[Dict, List]:
+    """Run evaluation asynchronously."""
+
+    upload_results = _resolve_upload_results(upload_results)
+
     _validate_parameters(locals())
-    datasets = _prepare_datasets(
+    datasets = _prepare_datasets(datasets, sample_size)
     hyperparameter_configs = _prepare_hyperparameter_configs(hyperparameters)

     logger.info(
@@ -155,29 +147,29 @@ async def _evaluate_async(
         datasets, len(hyperparameter_configs), parallel, len(eval_run_specs)
     ) as progress_bars:
         if parallel:
-
-
+            eval_result = await _run_parallel(
+                inference,
                 eval_run_specs,
                 progress_bars,
                 experiment_id,
                 project_id,
                 metadata,
+                upload_results,
             )
         else:
-
-
+            eval_result = await _run_sequential(
+                inference,
                 eval_run_specs,
                 progress_bars,
                 experiment_id,
                 project_id,
                 metadata,
+                upload_results,
             )

     logger.info("Evaluation completed successfully")

-    return _format_results(
-        eval_results, return_dict, return_aggregates, return_items, return_output
-    )
+    return _format_results(eval_result, return_dict, return_aggregates, return_items, return_output)


 # ===== ORCHESTRATION PATHS =====
@@ -190,16 +182,35 @@ async def _run_parallel(
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
+    upload_results: bool = False,
 ) -> EvalResult:
+    """Run evaluation in parallel."""
+
     logger.debug("Running inference in parallel")

+    # Worker function to execute individual runs and handle uploads
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
         run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
         progress_bars.on_eval_run_completed(run.dataset_index)
+
+        if (
+            upload_results
+            and isinstance(run_result, ClassicEvalRunResult)
+            and experiment_id
+            and project_id
+        ):
+            # Only upload runs that completed successfully
+            if run_result.run_completed:
+                run_id = await _upload_classic_run(
+                    run_result, experiment_id, project_id, inference, metadata
+                )
+                run_result.run_id = run_id
+
         return run_result

+    # Execute all runs concurrently
     run_results = await asyncio.gather(*[worker(run) for run in runs])
     # Return in canonical (dataset_idx, hp_idx) order for stability
     run_results.sort(
@@ -215,13 +226,32 @@ async def _run_sequential(
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
+    upload_results: bool = False,
 ) -> EvalResult:
+    """Run evaluation sequentially."""
+
     logger.debug("Running inference sequentially")
+
     run_results: List[Union[ClassicEvalRunResult, AdaptiveEvalRunResult]] = []
     for run in runs:
         run_result = await _execute_run(inference, run, experiment_id, project_id, metadata)
         run_results.append(run_result)
         progress_bars.on_hyperparam_completed(run_result.run_spec.dataset_index)
+
+        # Upload a classic eval run result immediately if upload_results is enabled
+        if (
+            upload_results
+            and isinstance(run_result, ClassicEvalRunResult)
+            and experiment_id
+            and project_id
+        ):
+            # Only upload runs that completed successfully
+            if run_result.run_completed:
+                run_id = await _upload_classic_run(
+                    run_result, experiment_id, project_id, inference, metadata
+                )
+                run_result.run_id = run_id
+
     return EvalResult(run_results)


@@ -236,29 +266,39 @@ async def _execute_run(
     metadata: Optional[Dict[str, Any]] = None,
 ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
     """Execute a single evaluation run."""
+
     if isinstance(run, EvalRunSpec):
         return await _execute_classic_eval_run(inference, run)
+
     elif isinstance(run, AdaptiveEvalRunSpec):
-        if experiment_id
-            raise
-                "experiment_id and project_id are required for adaptive
+        if not experiment_id or not project_id:
+            raise ScoreBookError(
+                "experiment_id and project_id are required for adaptive evaluations"
             )
         return await _execute_adaptive_eval_run(inference, run, experiment_id, project_id, metadata)
+
     else:
-        raise
+        raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")


 async def _execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
     """Execute a classic evaluation run."""
     logger.debug("Executing classic eval run for %s", run)

-    inference_outputs =
-
-
-
+    inference_outputs = None
+    metric_scores = None
+
+    try:
+        inference_outputs = await _run_inference_callable(
+            inference, run.dataset.items, run.hyperparameter_config
+        )
+        metric_scores = _score_metrics(run.dataset, inference_outputs, run.labels)
+        logger.debug("Classic evaluation completed for run %s", run)
+        return ClassicEvalRunResult(run, True, inference_outputs, metric_scores)

-
-
+    except Exception as e:
+        logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
+        return ClassicEvalRunResult(run, False, inference_outputs, metric_scores)


 async def _execute_adaptive_eval_run(
@@ -282,20 +322,41 @@ async def _execute_adaptive_eval_run(
 # ===== HELPER FUNCTIONS =====


+def _resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
+    """Resolve the upload_results parameter based on trismik login status."""
+
+    if upload_results == "auto":
+        upload_results = get_token() is not None
+        logger.debug("Auto upload results resolved to: %s", upload_results)
+
+    return upload_results
+
+
 def _validate_parameters(params: Dict[str, Any]) -> None:
     """Validate all parameters for evaluation."""

+    # If returning a dict, it must contain items and/or aggregates
     if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
         raise ParameterValidationError(
             "When return_dict=True, at least one of return_aggregates or return_items must be True"
         )

-
+    # Parallel runs require an asynchronous inference callable
+    if params["parallel"] and not is_awaitable(params["inference"]):
         raise ParallelExecutionError(
             "parallel=True requires the inference_callable to be async. "
             "Please make your inference function async or set parallel=False."
         )
+
+    # If uploading results, experiment_id and project_id must be specified
+    if params["upload_results"]:
+        if params["experiment_id"] is None or params["project_id"] is None:
+            raise ParameterValidationError(
+                "experiment_id and project_id are required for upload_results=True"
+            )
+
+    logger.debug("Parameter validation successful")
+

 def _prepare_datasets(
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
@@ -354,7 +415,8 @@ def _build_eval_run_specs(
     project_id: Optional[str],
     metadata: Optional[Dict[str, Any]] = None,
 ) -> List[Union[EvalRunSpec, AdaptiveEvalRunSpec]]:
-    """Build RunSpec objects for each dataset/hyperparameter combination."""
+    """Build All RunSpec objects for each dataset/hyperparameter combination."""
+
     eval_run_specs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]] = []
     for dataset_index, dataset in enumerate(datasets):
         for hyperparameters_index, hyperparameter_config in enumerate(hyperparameters):
@@ -369,9 +431,9 @@ def _build_eval_run_specs(

             # Create adaptive eval run spec from string
             elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
-                if experiment_id
-                    raise
-                        "experiment_id and project_id are required for adaptive
+                if not experiment_id or not project_id:
+                    raise ScoreBookError(
+                        "experiment_id and project_id are required for adaptive evaluations"
                     )
                 eval_run_specs.append(
                     _build_adaptive_eval_run_spec(
@@ -385,24 +447,6 @@ def _build_eval_run_specs(
                     )
                 )

-            # Create adaptive eval run spec from AdaptiveEvalDataset
-            elif isinstance(dataset, AdaptiveEvalDataset):
-                if experiment_id is None or project_id is None:
-                    raise ParameterValidationError(
-                        "experiment_id and project_id are required for adaptive evaluation"
-                    )
-                eval_run_specs.append(
-                    _build_adaptive_eval_run_spec(
-                        dataset.name,
-                        dataset_index,
-                        hyperparameter_config,
-                        hyperparameters_index,
-                        experiment_id,
-                        project_id,
-                        metadata,
-                    )
-                )
-
             # Log warning - should never happen
             else:
                 logger.warning("Unrecognized dataset type: %s", dataset)
@@ -416,7 +460,7 @@ def _build_classic_eval_run_spec(
     hyperparameters: Dict[str, Any],
     hyperparameters_index: int,
 ) -> EvalRunSpec:
-    """Build
+    """Build EvalRunSpec objects for a classic dataset and hyperparameter combination."""
     items = dataset.items
     labels = [item.get(dataset.label) for item in items]
     eval_run_spec = EvalRunSpec(
@@ -440,6 +484,7 @@ def _build_adaptive_eval_run_spec(
     project_id: str,
     metadata: Optional[Dict[str, Any]] = None,
 ) -> AdaptiveEvalRunSpec:
+    """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
     dataset = adaptive_dataset.replace(":adaptive", "")
     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
         dataset,
@@ -459,6 +504,7 @@ async def _run_inference_callable(
     items: List[Dict[str, Any]],
     hyperparameter_config: Dict[str, Any],
 ) -> Any:
+    """Run inference on a given dataset and hyperparameter configuration."""
     if is_awaitable(inference):
         return await inference(items, **hyperparameter_config)
     else:
@@ -493,6 +539,50 @@ def _score_metrics(
     return metric_scores


+async def _upload_classic_run(
+    run_result: ClassicEvalRunResult,
+    experiment_id: str,
+    project_id: str,
+    inference_callable: Optional[Callable] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+) -> Optional[str]:
+    """Upload a ClassicEvalRunResult to Trismik."""
+
+    logger.debug("Uploading classic eval run: %s", run_result.run_spec)
+    try:
+        model_name = _get_model_name(inference_callable, metadata)
+        response = await upload_classic_eval_run(
+            run=run_result,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            model=model_name,
+            metadata=metadata,
+        )
+        logger.info("Successfully uploaded classic eval run: %s", response.id)
+        return str(response.id)
+
+    except Exception as e:
+        logger.error("Failed to upload classic eval run: %s", str(e))
+        return None
+
+
+def _get_model_name(
+    inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
+) -> str:
+    """Determine a model's name with the fallback "unspecified"."""
+
+    # First priority: metadata.model
+    if metadata and "model" in metadata:
+        return str(metadata["model"])
+
+    # Second priority: inference_pipeline.model (if callable is an InferencePipeline)
+    if inference_callable and hasattr(inference_callable, "model"):
+        return str(inference_callable.model)
+
+    # Fallback: "unspecified"
+    return "unspecified"
+
+
 def _format_results(
     eval_result: EvalResult,
     return_dict: bool,
@@ -510,10 +600,12 @@ def _format_results(

     if return_items:
         item_scores = eval_result.item_scores
+
         # Remove inference output if not requested
         if not return_output:
             for item in item_scores:
                 item.pop("inference_output", None)
+
         results["item_results"] = item_scores

         # If both are requested, return the combined structure
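For orientation, here is a hedged usage sketch of the reworked `evaluate()` call, adapted from the example that was removed from the 0.0.5 docstring. The dataset name, the `Precision` import path, and the Trismik experiment/project IDs are placeholders rather than values taken from this diff; per `_resolve_upload_results` above, `upload_results="auto"` only uploads when a Trismik token is available.

```python
# Sketch only: adapted from the old docstring example; identifiers below are placeholders.
from scorebook import EvalDataset, evaluate
from scorebook.metrics import Precision  # assumed import path for the bundled Precision metric

dataset = EvalDataset.from_huggingface("dataset_name", label="answer", metrics=[Precision])

def inference_fn(items, **hyperparameters):
    # Model inference logic here - process all items at once
    return ["placeholder prediction" for _ in items]

results = evaluate(
    inference_fn,
    dataset,
    hyperparameters=[{"temperature": 0.0}, {"temperature": 0.7}],  # one run per configuration
    experiment_id="example-experiment-id",  # required when results are uploaded
    project_id="example-project-id",
    upload_results="auto",  # True/False, or "auto" to upload only when logged in to Trismik
    sample_size=100,        # cap on items taken from each dataset
)
```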
scorebook-0.0.7/src/scorebook/trismik_services/upload_classic_eval_run.py (new file)
@@ -0,0 +1,102 @@
+"""Upload classic evaluation run results to Trismik platform."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from trismik.adaptive_test import AdaptiveTest
+from trismik.client_async import TrismikAsyncClient
+from trismik.types import (
+    TrismikClassicEvalItem,
+    TrismikClassicEvalMetric,
+    TrismikClassicEvalRequest,
+    TrismikClassicEvalResponse,
+)
+
+from scorebook.trismik_services.login import get_token
+from scorebook.types import ClassicEvalRunResult
+
+logger = logging.getLogger(__name__)
+
+
+async def upload_classic_eval_run(
+    run: ClassicEvalRunResult,
+    experiment_id: str,
+    project_id: str,
+    model: str,
+    metadata: Optional[Dict[str, Any]],
+) -> TrismikClassicEvalResponse:
+    """Upload a classic evaluation run result to Trismik platform.
+
+    Args:
+        run: The evaluation run result to upload
+        experiment_id: Trismik experiment identifier
+        project_id: Trismik project identifier
+        model: Model name used for evaluation
+        metadata: Optional metadata dictionary
+
+    Returns:
+        Response from Trismik API containing the upload result
+    """
+    runner = AdaptiveTest(
+        lambda x: None,
+        client=TrismikAsyncClient(
+            service_url="https://api-stage.trismik.com/adaptive-testing", api_key=get_token()
+        ),
+    )
+
+    # Create eval items from run_spec items, outputs, and labels
+    items: List[TrismikClassicEvalItem] = []
+    for idx, (item, output) in enumerate(zip(run.run_spec.items, run.outputs)):
+        label = run.run_spec.labels[idx] if idx < len(run.run_spec.labels) else ""
+
+        # Calculate item-level metrics for this item
+        item_metrics: Dict[str, Any] = {}
+        for metric_name, metric_data in run.scores.items():
+            if isinstance(metric_data, dict) and "item_scores" in metric_data:
+                if idx < len(metric_data["item_scores"]):
+                    item_metrics[metric_name] = metric_data["item_scores"][idx]
+            else:
+                # If scores is just a single value, use it for all items
+                item_metrics[metric_name] = metric_data
+
+        eval_item = TrismikClassicEvalItem(
+            datasetItemId=str(idx),
+            modelInput=str(item),
+            modelOutput=str(output),
+            goldOutput=str(label),
+            metrics=item_metrics,
+        )
+        items.append(eval_item)
+
+    # Create eval metrics from run aggregate scores
+    metrics: List[TrismikClassicEvalMetric] = []
+    for metric_name, metric_data in run.scores.items():
+        if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
+            # Handle structured metric data with aggregate scores
+            for agg_name, agg_value in metric_data["aggregate_scores"].items():
+                metric_id = f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
+                metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
+                metrics.append(metric)
+        else:
+            # Handle simple metric data (single value)
+            metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
+            metrics.append(metric)
+
+    classic_eval_request = TrismikClassicEvalRequest(
+        project_id,
+        experiment_id,
+        run.run_spec.dataset.name,
+        model,
+        run.run_spec.hyperparameter_config,
+        items,
+        metrics,
+    )
+
+    response: TrismikClassicEvalResponse = await runner.submit_classic_eval_async(
+        classic_eval_request
+    )
+
+    run_id: str = response.id
+    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
+
+    return response
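The new module branches on the shape of `run.scores`: each metric entry is either a plain value or a dict carrying `item_scores` and `aggregate_scores`. A minimal sketch of the structure those branches expect (key names mirror the code above; the values are illustrative only):

```python
# Illustrative only: key names follow the branches in upload_classic_eval_run; numbers are made up.
scores = {
    "accuracy": {
        "aggregate_scores": {"accuracy": 0.82},  # uploaded as one TrismikClassicEvalMetric per aggregate
        "item_scores": [1, 0, 1, 1],             # attached per item via datasetItemId=str(idx)
    },
    "notes": "plain value",                      # non-dict entries are reused for every item and as a single metric
}
```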
{scorebook-0.0.5 → scorebook-0.0.7}/src/scorebook/types.py
@@ -52,35 +52,43 @@ class ClassicEvalRunResult:
     """Results from executing a classic evaluation run."""

     run_spec: EvalRunSpec
-
-
+    run_completed: bool
+    outputs: Optional[List[Any]]
+    scores: Optional[Dict[str, Any]]
+    run_id: Optional[str] = None

     @property
     def item_scores(self) -> List[Dict[str, Any]]:
         """Return a list of dictionaries containing scores for each evaluated item."""
         results = []

-
-
-
-
-
-
-
-
-
-
-
-
-
-        if
-
-
-
-
-
-
-
+        if self.outputs:
+            for idx, output in enumerate(self.outputs):
+                if idx >= len(self.run_spec.items):
+                    break
+
+                result = {
+                    "item_id": idx,
+                    "dataset_name": self.run_spec.dataset.name,
+                    "inference_output": output,
+                    **self.run_spec.hyperparameter_config,
+                }
+
+                # Add run_id if available
+                if self.run_id is not None:
+                    result["run_id"] = self.run_id
+
+                # Add individual item scores if available
+                if self.scores is not None:
+                    for metric_name, metric_data in self.scores.items():
+                        if isinstance(metric_data, dict) and "item_scores" in metric_data:
+                            if idx < len(metric_data["item_scores"]):
+                                result[metric_name] = metric_data["item_scores"][idx]
+                        else:
+                            # If scores is just a single value, replicate it for each item
+                            result[metric_name] = metric_data
+
+                results.append(result)

         return results

@@ -89,19 +97,25 @@ class ClassicEvalRunResult:
         """Return the aggregated scores for this run."""
         result = {
             "dataset": self.run_spec.dataset.name,
+            "run_completed": self.run_completed,
             **self.run_spec.hyperparameter_config,
         }

+        # Add run_id if available
+        if self.run_id is not None:
+            result["run_id"] = self.run_id
+
         # Add aggregate scores from metrics
-
-
-
-
-
-
-
-
-
+        if self.scores is not None:
+            for metric_name, metric_data in self.scores.items():
+                if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
+                    # Flatten the aggregate scores from each metric
+                    for key, value in metric_data["aggregate_scores"].items():
+                        score_key = key if key == metric_name else f"{metric_name}_{key}"
+                        result[score_key] = value
+                else:
+                    # If scores is just a single value, use it as is
+                    result[metric_name] = metric_data

         return result

@@ -149,7 +163,7 @@ class EvalResult:
         results = []

         for run_result in self.run_results:
-            if isinstance(run_result, ClassicEvalRunResult):
+            if isinstance(run_result, ClassicEvalRunResult) and run_result.run_completed:
                 results.extend(run_result.item_scores)

         return results
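With the new `run_completed`, `outputs`, `scores`, and `run_id` fields, a run result can be inspected directly. A hypothetical sketch, assuming `eval_result` is the `EvalResult` returned by `evaluate(..., return_dict=False)`:

```python
# Hypothetical inspection; attribute names follow the ClassicEvalRunResult definition above.
for run_result in eval_result.run_results:
    if run_result.run_completed:
        summary = run_result.aggregate_scores  # includes "dataset", "run_completed", hyperparameters,
        print(summary.get("run_id"), summary)  # and "run_id" once an upload has succeeded
    else:
        print("Run failed:", run_result.run_spec)
```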
The remaining files in the list above are carried over without content changes; the modules under src/scorebook/trismik are renamed to src/scorebook/trismik_services as-is.