scorebook 0.0.11.tar.gz → 0.0.12.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scorebook-0.0.11 → scorebook-0.0.12}/PKG-INFO +2 -2
- {scorebook-0.0.11 → scorebook-0.0.12}/pyproject.toml +18 -12
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/__init__.py +8 -1
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/evaluate/_async/evaluate_async.py +100 -125
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/evaluate/_sync/evaluate.py +100 -126
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/evaluate/evaluate_helpers.py +24 -24
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/exceptions.py +6 -2
- scorebook-0.0.12/src/scorebook/score/__init__.py +6 -0
- scorebook-0.0.12/src/scorebook/score/_async/__init__.py +0 -0
- scorebook-0.0.12/src/scorebook/score/_async/score_async.py +145 -0
- scorebook-0.0.12/src/scorebook/score/_sync/__init__.py +0 -0
- scorebook-0.0.12/src/scorebook/score/_sync/score.py +145 -0
- scorebook-0.0.12/src/scorebook/score/score_helpers.py +207 -0
- scorebook-0.0.12/src/scorebook/trismik/upload_results.py +254 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/types.py +33 -54
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/__init__.py +8 -1
- scorebook-0.0.12/src/scorebook/utils/common_helpers.py +41 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/progress_bars.py +67 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/LICENSE +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/README.md +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/cli/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/cli/auth.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/cli/main.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/eval_datasets/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/eval_datasets/eval_dataset.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/evaluate/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/evaluate/_async/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/evaluate/_sync/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/inference/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/inference/clients/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/inference/clients/bedrock.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/inference/clients/openai.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/inference/clients/portkey.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/inference/clients/vertex.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/inference/inference_pipeline.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/metrics/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/metrics/accuracy.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/metrics/metric_base.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/metrics/metric_registry.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/metrics/precision.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/settings.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/trismik/__init__.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/trismik/credentials.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/async_utils.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/io_helpers.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/jinja_helpers.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/mappers.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/render_template.py +0 -0
- {scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/utils/transform_helpers.py +0 -0
{scorebook-0.0.11 → scorebook-0.0.12}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scorebook
-Version: 0.0.11
+Version: 0.0.12
 Summary: A Python project for LLM evaluation.
 License-File: LICENSE
 Author: Euan Campbell
```
```diff
@@ -37,7 +37,7 @@ Requires-Dist: torch ; extra == "examples"
 Requires-Dist: torchaudio ; extra == "examples"
 Requires-Dist: torchvision ; extra == "examples"
 Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (
+Requires-Dist: trismik (==1.0.1)
 Description-Content-Type: text/markdown
 
 # Scorebook
```
{scorebook-0.0.11 → scorebook-0.0.12}/pyproject.toml

```diff
@@ -11,7 +11,7 @@ requires-python = ">=3.9, <3.14"
 dependencies = [
     "datasets>=3.6.0",
     "notebook (>=7.4.5,<8.0.0)",
-    "trismik
+    "trismik==1.0.1",
     "ipywidgets>=8.0.0",
 ]
 
```
```diff
@@ -19,14 +19,14 @@ dependencies = [
 scorebook = "scorebook.cli.main:main"
 
 [tool.poetry]
-version = "0.0.11"
+version = "0.0.12" # base version
 packages = [{ include = "scorebook", from = "src" }]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.14"
 datasets = ">=3.6.0"
 notebook = ">=7.4.5,<8.0.0"
-trismik = "
+trismik = "1.0.1"
 ipywidgets = ">=8.0.0"
 
 # Optional dependencies
```
```diff
@@ -119,16 +119,22 @@ install_types = true
 [tool.flake8] # note that this depends on Flake8-pyproject
 ignore = ["D202", "W503", "W504"]
 
-[tool.unasync]
 [[tool.unasync.rules]]
 fromdir = "src/scorebook/evaluate/_async/"
 todir = "src/scorebook/evaluate/_sync/"
+replacements."scorebook.score._async.score_async" = "scorebook.score._sync.score"
+replacements."scorebook.score._async" = "scorebook.score._sync"
+replacements.evaluate_async = "evaluate"
+replacements."Asynchronous evaluation complete" = "Synchronous evaluation complete"
+replacements." run_results = asyncio.gather(*[worker(run) for run in runs])" = " run_results = [worker(run) for run in runs]"
+replacements.async_nullcontext = "nullcontext"
+replacements.create_trismik_async_client = "create_trismik_sync_client"
+replacements.score_async = "score"
 
-
-
-
-
-"
-
-
-"create_trismik_async_client" = "create_trismik_sync_client"
+[[tool.unasync.rules]]
+fromdir = "src/scorebook/score/_async/"
+todir = "src/scorebook/score/_sync/"
+replacements.score_async = "score"
+replacements."Async scoring complete" = "Scoring complete"
+replacements.calculate_metric_scores_async = "calculate_metric_scores"
+replacements.upload_result_async = "upload_result"
```
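The two `[[tool.unasync.rules]]` tables above drive the generation of the synchronous `evaluate` and, new in this release, `score` modules from their `_async` sources. The sketch below shows roughly how identifier-level rules like these map onto the unasync library's API. It is illustrative only: the file discovery is an assumption, and the phrase-level replacements in the pyproject tables (whole strings and source lines) go beyond unasync's token-level `additional_replacements`, so scorebook presumably applies them through its own build tooling.

```python
# Illustrative sketch, not the project's actual build hook: applies
# async -> sync rewrites similar to the [[tool.unasync.rules]] tables above.
import pathlib

import unasync

rules = [
    unasync.Rule(
        fromdir="src/scorebook/evaluate/_async/",
        todir="src/scorebook/evaluate/_sync/",
        additional_replacements={
            "evaluate_async": "evaluate",
            "async_nullcontext": "nullcontext",
            "create_trismik_async_client": "create_trismik_sync_client",
            "score_async": "score",
        },
    ),
    unasync.Rule(
        fromdir="src/scorebook/score/_async/",
        todir="src/scorebook/score/_sync/",
        additional_replacements={
            "score_async": "score",
            "calculate_metric_scores_async": "calculate_metric_scores",
            "upload_result_async": "upload_result",
        },
    ),
]

# Assumed discovery: every module under an _async/ directory gets converted.
files = [str(p) for p in pathlib.Path("src/scorebook").rglob("_async/*.py")]
unasync.unasync_files(files, rules)
```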
{scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/__init__.py

```diff
@@ -12,15 +12,22 @@ __version__ = importlib.metadata.version(__package__ or __name__)
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate import evaluate, evaluate_async
 from scorebook.inference.inference_pipeline import InferencePipeline
-from scorebook.
+from scorebook.score import score, score_async
+from scorebook.trismik.credentials import login, logout, whoami
+from scorebook.trismik.upload_results import upload_result, upload_result_async
 from scorebook.utils.render_template import render_template
 
 __all__ = [
     "EvalDataset",
     "evaluate",
     "evaluate_async",
+    "score",
+    "score_async",
     "render_template",
     "login",
+    "logout",
     "whoami",
     "InferencePipeline",
+    "upload_result",
+    "upload_result_async",
 ]
```
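The public API grows accordingly: alongside `evaluate`/`evaluate_async`, the package now re-exports `score`/`score_async`, `upload_result`/`upload_result_async`, and `logout`. A minimal, illustrative use of the new synchronous `score` entry point is sketched below; the item values and the `"accuracy"` metric name are placeholders, and the exact signature and defaults of `score()` live in the new `scorebook/score/` modules, which this diff only lists.

```python
# Illustrative sketch only: the column keyword names mirror those used later
# in this diff; "accuracy" is assumed to be a metric resolvable via
# scorebook's metric registry.
from scorebook import score

items = [
    {"input": "2 + 2 = ?", "output": "4", "label": "4"},
    {"input": "Capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics=["accuracy"],   # assumed metric identifier
    output_column="output",
    label_column="label",
    input_column="input",
    upload_results=False,   # local scoring only, no Trismik upload
)
print(results["aggregate_results"])
```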
{scorebook-0.0.11 → scorebook-0.0.12}/src/scorebook/evaluate/_async/evaluate_async.py

```diff
@@ -3,13 +3,8 @@ import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
 from trismik import TrismikAsyncClient, TrismikClient
-from trismik.
-
-    TrismikClassicEvalMetric,
-    TrismikClassicEvalRequest,
-    TrismikClassicEvalResponse,
-    TrismikRunMetadata,
-)
+from trismik.settings import evaluation_settings
+from trismik.types import TrismikRunMetadata
 
 from scorebook.eval_datasets import EvalDataset
 from scorebook.evaluate.evaluate_helpers import (
```
```diff
@@ -20,12 +15,11 @@ from scorebook.evaluate.evaluate_helpers import (
     make_trismik_inference,
     prepare_datasets,
     prepare_hyperparameter_configs,
-    resolve_show_progress,
-    resolve_upload_results,
-    score_metrics,
     validate_parameters,
 )
 from scorebook.exceptions import InferenceError, ScoreBookError
+from scorebook.inference.inference_pipeline import InferencePipeline
+from scorebook.score._async.score_async import score_async
 from scorebook.types import (
     AdaptiveEvalRunResult,
     AdaptiveEvalRunSpec,
```
```diff
@@ -33,13 +27,18 @@ from scorebook.types import (
     EvalResult,
     EvalRunSpec,
 )
-from scorebook.utils import
+from scorebook.utils import (
+    async_nullcontext,
+    evaluation_progress_context,
+    resolve_show_progress,
+    resolve_upload_results,
+)
 
 logger = logging.getLogger(__name__)
 
 
 async def evaluate_async(
-    inference: Callable,
+    inference: Union[Callable, InferencePipeline],
     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
```
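With this change `evaluate_async` accepts either a plain callable or an `InferencePipeline`. A hedged usage sketch follows, using only the parameters visible in the signature fragment above; the callable's own signature and the dataset identifier are placeholders, not taken from this diff.

```python
# Illustrative only: how the inference callable is invoked (inputs plus the
# hyperparameter config) is inferred from run_inference_callable() later in
# this file, so treat this signature as an assumption.
import asyncio

from scorebook import evaluate_async


def my_model(inputs, **hyperparameters):
    # Placeholder model: returns a fixed answer for every input item.
    return ["answer"] * len(inputs)


async def main():
    results = await evaluate_async(
        my_model,                        # or an InferencePipeline instance
        datasets="example-dataset",      # placeholder; EvalDataset objects also work
        hyperparameters={"temperature": 0.0},
        metadata={"notes": "smoke test"},
    )
    return results


asyncio.run(main())
```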
```diff
@@ -90,7 +89,7 @@ async def evaluate_async(
         key=lambda run: (run.dataset_index, run.hyperparameters_index),
     )
 
-    # Create Trismik client if needed (for adaptive evals or uploads)
+    # Create a Trismik client if needed (for adaptive evals or uploads)
     needs_client = upload_results or any(
         isinstance(run, AdaptiveEvalRunSpec) for run in eval_run_specs
     )
```
```diff
@@ -101,7 +100,14 @@
     async with trismik_client or async_nullcontext():
         # Execute evaluation runs
         # Calculate total items across all runs
-        total_items = sum(
+        total_items = sum(
+            (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )  # Adaptive evals use max_iterations
+            for run in eval_run_specs
+        )
         model_display = get_model_name(inference)
 
         with evaluation_progress_context(
```
```diff
@@ -145,34 +151,32 @@ async def execute_runs(
     async def worker(
         run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
     ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
+        # Execute run (score_async handles upload internally for classic evals)
         run_result = await execute_run(
-            inference, run, experiment_id, project_id, metadata, trismik_client
+            inference, run, upload_results, experiment_id, project_id, metadata, trismik_client
         )
+
         # Update progress bars with items processed and success status
         if progress_bars is not None:
-
+            # Classic evals have .items; adaptive evals use max_iterations
+            items_processed = (
+                len(run.dataset.items)
+                if isinstance(run, EvalRunSpec)
+                else evaluation_settings["max_iterations"]
+            )
             progress_bars.on_run_completed(items_processed, run_result.run_completed)
 
+        # Update upload progress for classic evals
         if (
             upload_results
             and isinstance(run_result, ClassicEvalRunResult)
-            and experiment_id
-            and project_id
             and run_result.run_completed
-            and trismik_client is not None
         ):
-
-
-
-                )
-                run_result.run_id = run_id
-                if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=True)
-            except Exception as e:
-                logger.warning(f"Failed to upload run results: {e}")
+            # Check if upload succeeded by checking for run_id
+            if experiment_id and project_id:
+                upload_succeeded = run_result.run_id is not None
                 if progress_bars is not None:
-                    progress_bars.on_upload_completed(succeeded=
-            # Continue evaluation even if upload fails
+                    progress_bars.on_upload_completed(succeeded=upload_succeeded)
 
         return run_result
 
```
```diff
@@ -191,6 +195,7 @@ async def execute_runs(
 async def execute_run(
     inference: Callable,
     run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
+    upload_results: bool,  # NEW PARAMETER
     experiment_id: Optional[str] = None,
     project_id: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
```
```diff
@@ -199,7 +204,9 @@
     """Execute a single evaluation run."""
 
     if isinstance(run, EvalRunSpec):
-        return await execute_classic_eval_run(
+        return await execute_classic_eval_run(
+            inference, run, upload_results, experiment_id, project_id, metadata
+        )
 
     elif isinstance(run, AdaptiveEvalRunSpec):
         resolved_experiment_id = experiment_id if experiment_id is not None else run.experiment_id
```
```diff
@@ -217,24 +224,79 @@ async def execute_run(
         raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")
 
 
-async def execute_classic_eval_run(
-
+async def execute_classic_eval_run(
+    inference: Callable,
+    run: EvalRunSpec,
+    upload_results: bool,
+    experiment_id: Optional[str],
+    project_id: Optional[str],
+    metadata: Optional[Dict[str, Any]],
+) -> ClassicEvalRunResult:
+    """Execute a classic evaluation run using score_async() for scoring and uploading."""
     logger.debug("Executing classic eval run for %s", run)
 
     inference_outputs = None
-
+    scores = None
 
     try:
+        # 1. Run inference
         inference_outputs = await run_inference_callable(
             inference, run.inputs, run.hyperparameter_config
         )
-
-
-
+
+        # 2. Build items for score_async
+        items = [
+            {
+                "input": run.inputs[i] if i < len(run.inputs) else None,
+                "output": inference_outputs[i],
+                "label": run.labels[i] if i < len(run.labels) else "",
+            }
+            for i in range(len(inference_outputs))
+        ]
+
+        # 3. Get the model name for upload
+        model_name = get_model_name(inference, metadata)
+
+        # 4. Call score_async
+        scores = await score_async(
+            items=items,
+            metrics=run.dataset.metrics,
+            output_column="output",  # Explicit parameter
+            label_column="label",  # Explicit parameter
+            input_column="input",  # Explicit parameter
+            hyperparameters=run.hyperparameter_config,
+            dataset_name=run.dataset.name,
+            model_name=model_name,
+            metadata=metadata,
+            experiment_id=experiment_id,
+            project_id=project_id,
+            upload_results=upload_results,
+            show_progress=False,
+        )
+
+        # 5. Extract run_id if upload succeeded
+        run_id = None
+        if scores.get("aggregate_results") and len(scores["aggregate_results"]) > 0:
+            run_id = scores["aggregate_results"][0].get("run_id")
+
+        logger.debug("Classic evaluation completed for run %s (run_id: %s)", run, run_id)
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=True,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=run_id,
+        )
 
     except Exception as e:
         logger.warning("Failed to complete classic eval run for %s: %s", run, str(e))
-        return ClassicEvalRunResult(
+        return ClassicEvalRunResult(
+            run_spec=run,
+            run_completed=False,
+            outputs=inference_outputs,
+            scores=scores,
+            run_id=None,
+        )
 
 
 async def run_inference_callable(
```
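Because `execute_classic_eval_run` now delegates scoring and uploading to `score_async`, the same coroutine can be awaited directly on precomputed outputs. A hedged sketch follows, reusing the keyword arguments shown above; the ids, dataset/model names, and metric name are placeholders, and `upload_results=True` assumes prior authentication (for example via `scorebook.login()`).

```python
# Illustrative only: scoring precomputed outputs and reading back the run id
# that score_async reports when an upload happens, mirroring what the new
# execute_classic_eval_run does above.
import asyncio

from scorebook import score_async


async def main():
    items = [{"input": "1 + 1 = ?", "output": "2", "label": "2"}]
    scores = await score_async(
        items=items,
        metrics=["accuracy"],         # assumed metric identifier
        output_column="output",
        label_column="label",
        input_column="input",
        dataset_name="demo-dataset",  # placeholder
        model_name="my-model",        # placeholder
        experiment_id="exp_123",      # placeholder
        project_id="proj_456",        # placeholder
        upload_results=True,
        show_progress=False,
    )
    run_id = None
    if scores.get("aggregate_results"):
        run_id = scores["aggregate_results"][0].get("run_id")
    print("uploaded run:", run_id)


asyncio.run(main())
```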
```diff
@@ -296,93 +358,6 @@ async def execute_adaptive_eval_run(
         return AdaptiveEvalRunResult(run, False, {})
 
 
-async def upload_classic_run_results(
-    run_result: ClassicEvalRunResult,
-    experiment_id: str,
-    project_id: str,
-    inference_callable: Optional[Callable],
-    metadata: Optional[Dict[str, Any]],
-    trismik_client: Union[TrismikClient, TrismikAsyncClient],
-) -> str:
-    """Upload a classic evaluation run result to Trismik platform.
-
-    Args:
-        run: The evaluation run result to upload
-        experiment_id: Trismik experiment identifier
-        project_id: Trismik project identifier
-        model: Model name used for evaluation
-        metadata: Optional metadata dictionary
-        trismik_client: Trismik client instance
-
-    Returns:
-        Run id
-    """
-    model = get_model_name(inference_callable)
-
-    # Create eval items from run_spec inputs, outputs, and labels
-    items: List[TrismikClassicEvalItem] = []
-    inputs_outputs = zip(run_result.run_spec.inputs, run_result.outputs)
-    for idx, (input_value, output) in enumerate(inputs_outputs):
-        labels = run_result.run_spec.labels
-        label = labels[idx] if idx < len(labels) else ""
-
-        # Calculate item-level metrics for this item
-        item_metrics: Dict[str, Any] = {}
-        if run_result.scores:
-            for metric_name, metric_data in run_result.scores.items():
-                if isinstance(metric_data, dict) and "item_scores" in metric_data:
-                    if idx < len(metric_data["item_scores"]):
-                        item_metrics[metric_name] = metric_data["item_scores"][idx]
-                else:
-                    # If scores is just a single value, use it for all items
-                    item_metrics[metric_name] = metric_data
-
-        eval_item = TrismikClassicEvalItem(
-            datasetItemId=str(idx),
-            modelInput=str(input_value),
-            modelOutput=str(output),
-            goldOutput=str(label),
-            metrics=item_metrics,
-        )
-        items.append(eval_item)
-
-    # Create eval metrics from run aggregate scores
-    metrics: List[TrismikClassicEvalMetric] = []
-    if run_result.scores:
-        for metric_name, metric_data in run_result.scores.items():
-            if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                # Handle structured metric data with aggregate scores
-                for agg_name, agg_value in metric_data["aggregate_scores"].items():
-                    metric_id = (
-                        f"{metric_name}_{agg_name}" if agg_name != metric_name else metric_name
-                    )
-                    metric = TrismikClassicEvalMetric(metricId=metric_id, value=agg_value)
-                    metrics.append(metric)
-            else:
-                # Handle simple metric data (single value)
-                metric = TrismikClassicEvalMetric(metricId=metric_name, value=metric_data)
-                metrics.append(metric)
-
-    classic_eval_request = TrismikClassicEvalRequest(
-        project_id,
-        experiment_id,
-        run_result.run_spec.dataset.name,
-        model,
-        run_result.run_spec.hyperparameter_config,
-        items,
-        metrics,
-    )
-
-    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
-        classic_eval_request
-    )
-
-    run_id: str = response.id
-    logger.info(f"Classic eval run uploaded successfully with run_id: {run_id}")
-
-    return run_id
-
-
 async def run_adaptive_evaluation(
     inference: Callable,
     adaptive_run_spec: AdaptiveEvalRunSpec,
```