nemo-evaluator-launcher 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/__init__.py +79 -0
- nemo_evaluator_launcher/api/__init__.py +24 -0
- nemo_evaluator_launcher/api/functional.py +698 -0
- nemo_evaluator_launcher/api/types.py +98 -0
- nemo_evaluator_launcher/api/utils.py +19 -0
- nemo_evaluator_launcher/cli/__init__.py +15 -0
- nemo_evaluator_launcher/cli/export.py +267 -0
- nemo_evaluator_launcher/cli/info.py +512 -0
- nemo_evaluator_launcher/cli/kill.py +41 -0
- nemo_evaluator_launcher/cli/ls_runs.py +134 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
- nemo_evaluator_launcher/cli/main.py +226 -0
- nemo_evaluator_launcher/cli/run.py +200 -0
- nemo_evaluator_launcher/cli/status.py +164 -0
- nemo_evaluator_launcher/cli/version.py +55 -0
- nemo_evaluator_launcher/common/__init__.py +16 -0
- nemo_evaluator_launcher/common/execdb.py +283 -0
- nemo_evaluator_launcher/common/helpers.py +366 -0
- nemo_evaluator_launcher/common/logging_utils.py +357 -0
- nemo_evaluator_launcher/common/mapping.py +295 -0
- nemo_evaluator_launcher/common/printing_utils.py +93 -0
- nemo_evaluator_launcher/configs/__init__.py +15 -0
- nemo_evaluator_launcher/configs/default.yaml +28 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
- nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
- nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
- nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
- nemo_evaluator_launcher/executors/__init__.py +22 -0
- nemo_evaluator_launcher/executors/base.py +120 -0
- nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
- nemo_evaluator_launcher/executors/local/__init__.py +15 -0
- nemo_evaluator_launcher/executors/local/executor.py +605 -0
- nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
- nemo_evaluator_launcher/executors/registry.py +38 -0
- nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
- nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
- nemo_evaluator_launcher/exporters/__init__.py +36 -0
- nemo_evaluator_launcher/exporters/base.py +121 -0
- nemo_evaluator_launcher/exporters/gsheets.py +409 -0
- nemo_evaluator_launcher/exporters/local.py +502 -0
- nemo_evaluator_launcher/exporters/mlflow.py +619 -0
- nemo_evaluator_launcher/exporters/registry.py +40 -0
- nemo_evaluator_launcher/exporters/utils.py +624 -0
- nemo_evaluator_launcher/exporters/wandb.py +490 -0
- nemo_evaluator_launcher/package_info.py +38 -0
- nemo_evaluator_launcher/resources/mapping.toml +380 -0
- nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
- nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
- nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
- nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
- nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
nemo_evaluator_launcher/exporters/local.py
@@ -0,0 +1,502 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Export evaluation artifacts to local filesystem."""

import csv
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

from nemo_evaluator_launcher.common.execdb import JobData
from nemo_evaluator_launcher.common.logging_utils import logger
from nemo_evaluator_launcher.exporters.base import BaseExporter, ExportResult
from nemo_evaluator_launcher.exporters.registry import register_exporter
from nemo_evaluator_launcher.exporters.utils import (
    download_gitlab_artifacts,
    extract_accuracy_metrics,
    extract_exporter_config,
    get_benchmark_info,
    get_container_from_mapping,
    get_model_name,
    get_relevant_artifacts,
    get_task_name,
    ssh_cleanup_masters,
    ssh_download_artifacts,
    ssh_setup_masters,
    validate_artifacts,
)


@register_exporter("local")
class LocalExporter(BaseExporter):
    """Export all artifacts to local/remote filesystem with optional JSON/CSV summaries.

    Config keys:
        output_dir (str): Output directory for exported results (default: "./nemo-evaluator-launcher-results")
        copy_logs (bool): Whether to copy logs with artifacts (default: False)
        only_required (bool): Copy only required+optional artifacts (default: True)
        format (str or None): Summary format, one of None, "json", or "csv" (default: None; no summary, only original artifacts)
        log_metrics (list[str]): Filters for metric names; includes full metric name or substring pattern
        output_filename (str): Overrides default processed_results.json/csv filename
    """

    def supports_executor(self, executor_type: str) -> bool:
        return True  # Local export compatible with all executors

    def export_job(self, job_data: JobData) -> ExportResult:
        """Export job artifacts to local directory."""
        # Merge auto-export + CLI config
        cfg = extract_exporter_config(job_data, "local", self.config)
        skip_validation = bool(cfg.get("skip_validation", False))

        output_dir = Path(cfg.get("output_dir", "./nemo-evaluator-launcher-results"))
        job_export_dir = output_dir / job_data.invocation_id / job_data.job_id
        job_export_dir.mkdir(parents=True, exist_ok=True)

        try:
            paths = self.get_job_paths(job_data)
            exported_files: List[str] = []

            # Stage artifacts per storage type
            if paths["storage_type"] == "local_filesystem":
                exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
            elif paths["storage_type"] == "remote_local":
                # Same as local_filesystem (we're on the remote machine, accessing locally)
                exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
            elif paths["storage_type"] == "remote_ssh":
                cp = ssh_setup_masters({job_data.job_id: job_data})
                try:
                    exported_files = ssh_download_artifacts(
                        paths, job_export_dir, cfg, cp
                    )
                finally:
                    ssh_cleanup_masters(cp)
            else:
                raise NotImplementedError(
                    f"Export not implemented for storage type: {paths['storage_type']}"
                )

            # Validate artifacts
            artifacts_dir = job_export_dir / "artifacts"
            validation = (
                validate_artifacts(artifacts_dir)
                if not skip_validation
                else {
                    "can_export": True,
                    "missing_required": [],
                    "missing_optional": [],
                    "message": "Validation skipped",
                }
            )

            # Save metadata
            self._save_job_metadata(job_data, job_export_dir)
            exported_files.append(str(job_export_dir / "job_metadata.json"))

            if not validation["can_export"]:
                return ExportResult(
                    success=False,
                    dest=str(job_export_dir),
                    message=validation["message"],
                    metadata=validation,
                )

            if validation["missing_optional"]:
                logger.info(
                    f"Exporting without optional artifacts: {', '.join(validation['missing_optional'])}",
                    job_id=job_data.job_id,
                )

            # Optional summary (JSON/CSV) at invocation level
            msg = f"Exported {len(exported_files)} files. {validation['message']}"
            meta: Dict[str, Any] = {"files_count": len(exported_files)}
            fmt = cfg.get("format")
            if fmt in ["json", "csv"]:
                try:
                    summary_path = self._write_summary(job_data, cfg)
                    meta["summary_path"] = str(summary_path)
                    msg += f". Summary: {summary_path.name}"
                except Exception as e:
                    logger.warning(f"Failed to create {fmt} summary: {e}")
                    msg += " (summary failed)"

            meta["output_dir"] = str(job_export_dir.resolve())

            return ExportResult(
                success=True, dest=str(job_export_dir), message=msg, metadata=meta
            )

        except Exception as e:
            logger.error(f"Failed to export job {job_data.job_id}: {e}")
            return ExportResult(
                success=False,
                dest=str(job_export_dir),
                message=f"Export failed: {str(e)}",
                metadata={},
            )

    def export_invocation(self, invocation_id: str) -> Dict[str, Any]:
        """Export all jobs in an invocation (with connection reuse)."""
        jobs = self.db.get_jobs(invocation_id)
        if not jobs:
            return {
                "success": False,
                "error": f"No jobs found for invocation {invocation_id}",
            }

        control_paths = ssh_setup_masters(jobs)
        try:
            results = {}
            for job_id, job_data in jobs.items():
                result = self.export_job(job_data)
                results[job_id] = result.__dict__
            return {"success": True, "invocation_id": invocation_id, "jobs": results}
        finally:
            ssh_cleanup_masters(control_paths)

    def export_multiple_invocations(self, invocation_ids: List[str]) -> Dict[str, Any]:
        db_jobs: Dict[str, JobData] = {}
        results: Dict[str, Any] = {}
        for inv in invocation_ids:
            jobs = self.db.get_jobs(inv)
            if jobs:
                db_jobs.update(jobs)
                results[inv] = {"success": True, "job_count": len(jobs)}
            else:
                results[inv] = {"success": False, "error": f"No jobs found for {inv}"}
        if not db_jobs:
            return {
                "success": False,
                "error": "No jobs to export",
                "invocations": results,
            }

        # Reuse SSH masters across all jobs/hosts and stage artifacts locally
        cp = ssh_setup_masters(db_jobs)
        try:
            first = next(iter(db_jobs.values()))
            cfg = extract_exporter_config(first, "local", self.config)
            fmt = cfg.get("format")
            output_dir = Path(
                cfg.get("output_dir", "./nemo-evaluator-launcher-results")
            )
            filename = cfg.get("output_filename", f"processed_results.{fmt}")
            out_path = output_dir / filename  # consolidated file at output_dir

            # Stage artifacts for all jobs into <output_dir>/<inv>/<job>/
            for jd in db_jobs.values():
                try:
                    self.export_job(jd)
                except Exception:
                    pass  # keep going; remaining jobs may still contribute

            # Build consolidated summary from staged artifacts
            all_metrics, jobs_list = {}, []
            for jd in db_jobs.values():
                artifacts_dir = output_dir / jd.invocation_id / jd.job_id / "artifacts"
                metrics = extract_accuracy_metrics(
                    jd,
                    lambda _: {
                        "artifacts_dir": artifacts_dir,
                        "storage_type": "local_filesystem",
                    },
                    cfg.get("log_metrics", []),
                )
                all_metrics[jd.job_id] = metrics
                jobs_list.append(jd)

            if fmt == "json":
                if out_path.exists():
                    data = json.loads(out_path.read_text(encoding="utf-8"))
                else:
                    data = {
                        "export_timestamp": datetime.now().isoformat(),
                        "benchmarks": {},
                    }
                for jd in jobs_list:
                    bench, model, entry = self._build_entry(
                        jd, all_metrics.get(jd.job_id, {}), cfg
                    )
                    m = (
                        data.setdefault("benchmarks", {})
                        .setdefault(bench, {})
                        .setdefault("models", {})
                    )
                    lst = m.setdefault(model, [])
                    idx = next(
                        (
                            i
                            for i, e in enumerate(lst)
                            if e.get("invocation_id") == jd.invocation_id
                            and e.get("job_id") == jd.job_id
                        ),
                        None,
                    )
                    if idx is None:
                        lst.append(entry)
                    else:
                        lst[idx] = entry
                data["export_timestamp"] = datetime.now().isoformat()
                out_path.parent.mkdir(parents=True, exist_ok=True)
                out_path.write_text(
                    json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
                )
            elif fmt == "csv":
                for jd in jobs_list:
                    self._csv_upsert(out_path, jd, all_metrics.get(jd.job_id, {}), cfg)

            return {
                "success": True,
                "invocations": results,
                "metadata": {
                    "total_invocations": len(invocation_ids),
                    "total_jobs": len(db_jobs),
                    "summary_path": str(out_path.resolve()),
                },
            }
        finally:
            ssh_cleanup_masters(cp)

    # Artifact staging helpers
    def _copy_local_artifacts(
        self, paths: Dict[str, Any], export_dir: Path, cfg: Dict[str, Any]
    ) -> List[str]:
        exported_files: List[str] = []
        copy_logs = bool(cfg.get("copy_logs", False))
        copy_artifacts = bool(cfg.get("copy_artifacts", True))
        only_required = bool(cfg.get("only_required", True))

        # separate logic for artifacts and logs
        # artifacts/
        if copy_artifacts and paths["artifacts_dir"].exists():
            if only_required:
                names = [
                    a
                    for a in get_relevant_artifacts()
                    if (paths["artifacts_dir"] / a).exists()
                ]
                (export_dir / "artifacts").mkdir(parents=True, exist_ok=True)
                for name in names:
                    src = paths["artifacts_dir"] / name
                    dst = export_dir / "artifacts" / name
                    shutil.copy2(src, dst)
                    exported_files.append(str(dst))
            else:
                # Restore recursive copy (test_copy_all_tree expects nested files)
                shutil.copytree(
                    paths["artifacts_dir"], export_dir / "artifacts", dirs_exist_ok=True
                )
                exported_files.extend(
                    [
                        str(f)
                        for f in (export_dir / "artifacts").rglob("*")
                        if f.is_file()
                    ]
                )

        # logs/
        # If only_required is False → always copy logs; otherwise respect copy_logs
        if ((not only_required) or copy_logs) and paths["logs_dir"].exists():
            shutil.copytree(paths["logs_dir"], export_dir / "logs", dirs_exist_ok=True)
            exported_files.extend(
                [str(f) for f in (export_dir / "logs").rglob("*") if f.is_file()]
            )

        return exported_files

    def _download_gitlab_remote_artifacts(
        self, paths: Dict[str, Any], export_dir: Path
    ) -> List[str]:
        artifacts = download_gitlab_artifacts(paths, export_dir, extract_specific=True)
        return [str(p) for p in artifacts.values()]

    def _save_job_metadata(self, job_data: JobData, export_dir: Path):
        metadata = {
            "invocation_id": job_data.invocation_id,
            "job_id": job_data.job_id,
            "executor": job_data.executor,
            "timestamp": job_data.timestamp,
        }
        (export_dir / "job_metadata.json").write_text(
            json.dumps(metadata, indent=2, default=str)
        )

    # Summary JSON/CSV helpers
    def _write_summary(self, job_data: JobData, cfg: Dict[str, Any]) -> Path:
        """Read per-job artifacts, extract metrics, and update invocation-level summary."""
        output_dir = Path(cfg.get("output_dir", "./nemo-evaluator-launcher-results"))
        artifacts_dir = (
            output_dir / job_data.invocation_id / job_data.job_id / "artifacts"
        )
        fmt = cfg.get("format")
        filename = cfg.get("output_filename", f"processed_results.{fmt}")
        out_path = output_dir / job_data.invocation_id / filename

        # Extract metrics
        metrics = extract_accuracy_metrics(
            job_data,
            lambda jd: {
                "artifacts_dir": artifacts_dir,
                "storage_type": "local_filesystem",
            },
            cfg.get("log_metrics", []),
        )

        if fmt == "json":
            self._json_upsert(out_path, job_data, metrics, cfg)
        elif fmt == "csv":
            self._csv_upsert(out_path, job_data, metrics, cfg)
        return out_path.resolve()

    def _build_entry(
        self, job_data: JobData, metrics: Dict[str, float], cfg: Dict[str, Any]
    ) -> tuple[str, str, dict]:
        bench = get_benchmark_info(job_data)
        benchmark_name = bench["benchmark"]
        model_name = get_model_name(job_data, cfg)
        entry = {
            "invocation_id": job_data.invocation_id,
            "job_id": job_data.job_id,
            "harness": bench.get("harness", "unknown"),
            "container": get_container_from_mapping(job_data),
            "scores": metrics,
            "timestamp": datetime.now().isoformat(),
            "executor": job_data.executor,
        }
        return benchmark_name, model_name, entry

    def _json_upsert(
        self,
        out_path: Path,
        job_data: JobData,
        metrics: Dict[str, float],
        cfg: Dict[str, Any],
    ) -> None:
        if out_path.exists():
            data = json.loads(out_path.read_text(encoding="utf-8"))
        else:
            data = {"export_timestamp": datetime.now().isoformat(), "benchmarks": {}}

        benchmark_name, model_name, entry = self._build_entry(job_data, metrics, cfg)
        bench = data.setdefault("benchmarks", {}).setdefault(benchmark_name, {})
        models = bench.setdefault("models", {})

        # Switch to list semantics
        lst = models.setdefault(model_name, [])
        # Upsert by unique combination
        idx = next(
            (
                i
                for i, e in enumerate(lst)
                if e.get("invocation_id") == job_data.invocation_id
                and e.get("job_id") == job_data.job_id
            ),
            None,
        )
        if idx is None:
            lst.append(entry)  # append
        else:
            lst[idx] = entry  # override

        data["export_timestamp"] = datetime.now().isoformat()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(
            json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    def _csv_upsert(
        self,
        out_path: Path,
        job_data: JobData,
        metrics: Dict[str, float],
        cfg: Dict[str, Any],
    ) -> None:
        base_cols = [
            "Model Name",
            "Harness",
            "Task Name",
            "Executor",
            "Container",
            "Invocation ID",
            "Job ID",
        ]
        rows, headers = [], []
        if out_path.exists():
            with out_path.open("r", newline="", encoding="utf-8") as f:
                r = csv.reader(f)
                headers = next(r, [])
                rows = list(r)
        else:
            headers = base_cols.copy()

        # Build metric names by stripping <benchmark>_ prefix from keys
        benchmark, model_name, entry = self._build_entry(job_data, metrics, cfg)

        # clean headers using bare benchmark
        task_prefix = benchmark  # no harness prefix
        clean_metrics = []
        for full_key in metrics.keys():
            if full_key.startswith(f"{task_prefix}_"):
                clean_metrics.append(full_key[len(task_prefix) + 1 :])
            else:
                clean_metrics.append(full_key)

        # Extend headers if new metrics appear
        metric_cols_existing = [h for h in headers if h not in base_cols]
        new_metric_cols = [
            m for m in sorted(set(clean_metrics)) if m not in metric_cols_existing
        ]
        if new_metric_cols:
            headers = headers + new_metric_cols
            for row in rows:
                row.extend([""] * len(new_metric_cols))

        # Build row for this job (upsert keyed by invocation_id + job_id)
        bench = get_benchmark_info(job_data)
        task_name = get_task_name(job_data)
        row = [
            model_name,
            bench.get("harness", "unknown"),
            task_name,
            job_data.executor,
            get_container_from_mapping(job_data),
            job_data.invocation_id,
            job_data.job_id,
        ]
        # Fill metric columns from <benchmark>_<...>
        for col in headers[len(base_cols) :]:
            full_key = f"{task_prefix}_{col}"
            val = metrics.get(full_key, "")
            try:
                row.append("" if val == "" else float(val))
            except Exception:
                row.append(val)

        # Upsert row
        idx_by_key = {(r[5], r[6]): i for i, r in enumerate(rows) if len(r) >= 7}
        key = (job_data.invocation_id, job_data.job_id)
        if key in idx_by_key:
            rows[idx_by_key[key]] = row
        else:
            rows.append(row)

        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(headers)
            w.writerows(rows)
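
For orientation, a minimal sketch of driving this exporter from Python. It assumes BaseExporter.__init__ accepts the exporter config dict (the diff above only shows that self.config and self.db exist on the instance), and the invocation id is hypothetical:

    from nemo_evaluator_launcher.exporters.local import LocalExporter

    # Config keys mirror the LocalExporter docstring above.
    # Assumption: the base class takes this dict at construction.
    exporter = LocalExporter(
        {
            "output_dir": "./nemo-evaluator-launcher-results",
            "format": "json",       # None, "json", or "csv"
            "copy_logs": True,      # also stage logs/ next to artifacts/
            "only_required": True,  # copy only required+optional artifacts
            "log_metrics": ["accuracy"],  # full names or substring patterns
        }
    )

    # Stages artifacts for every job in the invocation (reusing SSH masters)
    # and, since format="json", upserts the per-invocation summary file.
    result = exporter.export_invocation("abcd1234")  # hypothetical invocation id
    print(result["success"], list(result.get("jobs", {})))

Judging by _build_entry and _json_upsert above, the processed_results.json summary takes roughly this shape; every value below is a placeholder, and the <benchmark>_ prefix on score keys is inferred from _csv_upsert stripping it:

    {
      "export_timestamp": "2025-08-01T12:00:00",
      "benchmarks": {
        "<benchmark>": {
          "models": {
            "<model-name>": [
              {
                "invocation_id": "<invocation-id>",
                "job_id": "<job-id>",
                "harness": "<harness>",
                "container": "<container>",
                "scores": {"<benchmark>_<metric>": 0.71},
                "timestamp": "2025-08-01T12:00:00",
                "executor": "local"
              }
            ]
          }
        }
      }
    }

Each model maps to a list of entries upserted by the (invocation_id, job_id) pair, so re-exporting a job overwrites its entry rather than duplicating it. The CSV variant writes one row per job: the seven base columns (Model Name through Job ID) followed by one column per metric, with the benchmark prefix stripped from the header.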