nemo_evaluator_launcher-0.1.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of nemo-evaluator-launcher has been marked as potentially problematic.

Files changed (57)
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +641 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
  12. nemo_evaluator_launcher/cli/main.py +136 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +118 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +157 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +310 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +35 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
  53. nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
nemo_evaluator_launcher/exporters/utils.py
@@ -0,0 +1,669 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Shared utilities for metrics and configuration handling."""

import json
import subprocess
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple

import yaml

from nemo_evaluator_launcher.common.execdb import JobData
from nemo_evaluator_launcher.common.logging_utils import logger
from nemo_evaluator_launcher.common.mapping import (
    get_task_from_mapping,
    load_tasks_mapping,
)

# =============================================================================
# ARTIFACTS
# =============================================================================

# Artifacts to be logged by default
REQUIRED_ARTIFACTS = ["results.yml", "eval_factory_metrics.json"]
OPTIONAL_ARTIFACTS = ["omni-info.json"]


def get_relevant_artifacts() -> List[str]:
    """Get relevant artifacts (required + optional)."""
    return REQUIRED_ARTIFACTS + OPTIONAL_ARTIFACTS


def validate_artifacts(artifacts_dir: Path) -> Dict[str, Any]:
    """Check which artifacts are available."""
    if not artifacts_dir or not artifacts_dir.exists():
        return {
            "can_export": False,
            "missing_required": REQUIRED_ARTIFACTS.copy(),
            "missing_optional": OPTIONAL_ARTIFACTS.copy(),
            "message": "Artifacts directory not found",
        }

    missing_required = [
        f for f in REQUIRED_ARTIFACTS if not (artifacts_dir / f).exists()
    ]
    missing_optional = [
        f for f in OPTIONAL_ARTIFACTS if not (artifacts_dir / f).exists()
    ]
    can_export = len(missing_required) == 0

    message_parts = []
    if missing_required:
        message_parts.append(f"Missing required: {', '.join(missing_required)}")
    if missing_optional:
        message_parts.append(f"Missing optional: {', '.join(missing_optional)}")

    return {
        "can_export": can_export,
        "missing_required": missing_required,
        "missing_optional": missing_optional,
        "message": (
            ". ".join(message_parts) if message_parts else "All artifacts available"
        ),
    }


def get_available_artifacts(artifacts_dir: Path) -> List[str]:
    """Get list of artifacts available in artifacts directory."""
    if not artifacts_dir or not artifacts_dir.exists():
        return []
    return [
        filename
        for filename in get_relevant_artifacts()
        if (artifacts_dir / filename).exists()
    ]

# =============================================================================
# METRICS EXTRACTION
# =============================================================================


class MetricConflictError(Exception):
    """Raised when attempting to set the same metric key with a different value."""


def extract_accuracy_metrics(
    job_data: JobData, get_job_paths_func: Callable, log_metrics: List[str] = None
) -> Dict[str, float]:
    """Extract accuracy metrics from job results."""
    try:
        paths = get_job_paths_func(job_data)
        artifacts_dir = _get_artifacts_dir(paths)

        if not artifacts_dir or not artifacts_dir.exists():
            logger.warning(f"Artifacts directory not found for job {job_data.job_id}")
            return {}

        # Prefer results.yml, but also merge JSON metrics to avoid missing values
        metrics: Dict[str, float] = {}
        results_yml = artifacts_dir / "results.yml"
        if results_yml.exists():
            yml_metrics = _extract_from_results_yml(results_yml)
            if yml_metrics:
                metrics.update(yml_metrics)

        # Merge in JSON metrics (handles tasks that only emit JSON or extra fields)
        json_metrics = _extract_from_json_files(artifacts_dir)
        for k, v in json_metrics.items():
            metrics.setdefault(k, v)

        # Filter metrics if specified
        if log_metrics:
            filtered_metrics = {}
            for metric_name, metric_value in metrics.items():
                if any(filter_key in metric_name.lower() for filter_key in log_metrics):
                    filtered_metrics[metric_name] = metric_value
            return filtered_metrics

        return metrics

    except Exception as e:
        logger.error(f"Failed to extract metrics for job {job_data.job_id}: {e}")
        return {}

# =============================================================================
# CONFIG EXTRACTION
# =============================================================================


def extract_exporter_config(
    job_data: JobData, exporter_name: str, constructor_config: Dict[str, Any] = None
) -> Dict[str, Any]:
    """Extract and merge exporter configuration from multiple sources."""
    config = {}

    # Get config from dedicated field
    if job_data.config:
        execution_config = job_data.config.get("execution", {})
        auto_export_config = execution_config.get("auto_export", {})
        exporter_configs = auto_export_config.get("configs", {})
        yaml_config = exporter_configs.get(exporter_name, {})

        # No conversion needed
        config.update(yaml_config)

    # From webhook metadata (if triggered by webhook)
    if "webhook_metadata" in job_data.data:
        webhook_data = job_data.data["webhook_metadata"]
        webhook_config = {
            "triggered_by_webhook": True,
            "webhook_source": webhook_data.get("webhook_source", "unknown"),
            "source_artifact": f"{webhook_data.get('artifact_name', 'unknown')}:{webhook_data.get('artifact_version', 'unknown')}",
            "config_source": webhook_data.get("config_file", "unknown"),
        }

        # For W&B specifically, extract run info if available
        if exporter_name == "wandb" and webhook_data.get("webhook_source") == "wandb":
            wandb_specific = {
                "entity": webhook_data.get("entity"),
                "project": webhook_data.get("project"),
                "run_id": webhook_data.get("run_id"),
            }
            webhook_config.update({k: v for k, v in wandb_specific.items() if v})

        config.update(webhook_config)

    # Constructor config: allows CLI overrides
    if constructor_config:
        config.update(constructor_config)

    return config

# =============================================================================
# JOB DATA EXTRACTION
# =============================================================================


def get_task_name(job_data: JobData) -> str:
    """Get task name from job data."""
    if "." in job_data.job_id:
        try:
            idx = int(job_data.job_id.split(".")[-1])
            return job_data.config["evaluation"]["tasks"][idx]["name"]
        except Exception:
            return f"job_{job_data.job_id}"
    return "all_tasks"


def get_model_name(job_data: JobData, config: Dict[str, Any] = None) -> str:
    """Extract model name from config or job data."""
    if config and "model_name" in config:
        return config["model_name"]

    job_config = job_data.config or {}
    model_sources = [
        job_config.get("target", {}).get("api_endpoint", {}).get("model_id"),
        job_config.get("deployment", {}).get("served_model_name"),
        job_data.data.get("served_model_name"),
        job_data.data.get("model_name"),
        job_data.data.get("model_id"),
    ]

    for source in model_sources:
        if source:
            return str(source)

    return f"unknown_model_{job_data.job_id}"


def get_pipeline_id(job_data: JobData) -> str:
    """Get pipeline ID for GitLab jobs."""
    return job_data.data.get("pipeline_id") if job_data.executor == "gitlab" else None


def get_benchmark_info(job_data: JobData) -> Dict[str, str]:
    """Get harness and benchmark info from mapping."""
    try:
        task_name = get_task_name(job_data)
        if task_name in ["all_tasks", f"job_{job_data.job_id}"]:
            return {"harness": "unknown", "benchmark": task_name}

        # Use mapping to get harness info
        mapping = load_tasks_mapping()
        task_definition = get_task_from_mapping(task_name, mapping)
        harness = task_definition.get("harness", "unknown")

        # Extract benchmark name (remove harness prefix)
        if "." in task_name:
            benchmark = ".".join(task_name.split(".")[1:])
        else:
            benchmark = task_name

        return {"harness": harness, "benchmark": benchmark}

    except Exception as e:
        logger.warning(f"Failed to get benchmark info: {e}")
        return {"harness": "unknown", "benchmark": get_task_name(job_data)}


def get_container_from_mapping(job_data: JobData) -> str:
    """Get container from mapping."""
    try:
        task_name = get_task_name(job_data)
        if task_name in ["all_tasks", f"job_{job_data.job_id}"]:
            return None

        mapping = load_tasks_mapping()
        task_definition = get_task_from_mapping(task_name, mapping)
        return task_definition.get("container")

    except Exception as e:
        logger.warning(f"Failed to get container from mapping: {e}")
        return None

# =============================================================================
# GITLAB DOWNLOAD
# =============================================================================


def download_gitlab_artifacts(
    paths: Dict[str, Any], export_dir: Path, extract_specific: bool = False
) -> Dict[str, Path]:
    """Download artifacts from GitLab API.

    Args:
        paths: Dictionary containing pipeline_id and project_id
        export_dir: Local directory to save artifacts
        extract_specific: If True, extract individual files; if False, keep as ZIP files

    Returns:
        Dictionary mapping artifact names to local file paths
    """
    raise NotImplementedError("Downloading from gitlab is not implemented")
    # TODO: rework this logic
    # pipeline_id = paths["pipeline_id"]
    # project_id = paths["project_id"]
    # gitlab_token = os.getenv("GITLAB_TOKEN")
    #
    # if not gitlab_token:
    #     raise RuntimeError(
    #         "GITLAB_TOKEN environment variable required for GitLab remote downloads"
    #     )
    #
    # # GitLab API endpoint for artifacts
    # base_url = "TODO: replace"
    # artifacts_url = "TODO: replace"
    #
    # headers = {"Private-Token": gitlab_token}
    # downloaded_artifacts = {}
    #
    # try:
    #     # Get pipeline jobs
    #     response = requests.get(artifacts_url, headers=headers, timeout=30)
    #     response.raise_for_status()
    #     jobs = response.json()
    #
    #     for job in jobs:
    #         if job.get("artifacts_file"):
    #             job_id = job["id"]
    #             job_name = job.get("name", f"job_{job_id}")
    #             artifacts_download_url = (
    #                 f"{base_url}/api/v4/projects/{project_id}/jobs/{job_id}/artifacts"
    #             )
    #
    #             logger.info(f"Downloading artifacts from job: {job_name}")
    #
    #             # Download job artifacts
    #             response = requests.get(
    #                 artifacts_download_url, headers=headers, timeout=300
    #             )
    #             response.raise_for_status()
    #
    #             if extract_specific:
    #                 # Extract specific files from ZIP
    #                 with tempfile.NamedTemporaryFile(
    #                     suffix=".zip", delete=False
    #                 ) as temp_zip:
    #                     temp_zip.write(response.content)
    #                     temp_zip_path = temp_zip.name
    #
    #                 try:
    #                     with zipfile.ZipFile(temp_zip_path, "r") as zip_ref:
    #                         # Create artifacts directory
    #                         artifacts_dir = export_dir / "artifacts"
    #                         artifacts_dir.mkdir(parents=True, exist_ok=True)
    #
    #                         # Extract to be logged artifacts
    #                         for member in zip_ref.namelist():
    #                             filename = Path(member).name
    #                             if filename in get_relevant_artifacts():
    #                                 # Extract the file
    #                                 source = zip_ref.open(member)
    #                                 target_path = artifacts_dir / filename
    #                                 with open(target_path, "wb") as f:
    #                                     f.write(source.read())
    #                                 source.close()
    #
    #                                 downloaded_artifacts[filename] = target_path
    #                                 logger.info(f"Extracted: {filename}")
    #                 finally:
    #                     os.unlink(temp_zip_path)
    #             else:
    #                 # Save as ZIP files (original behavior)
    #                 artifacts_zip = export_dir / f"job_{job_id}_artifacts.zip"
    #                 with open(artifacts_zip, "wb") as f:
    #                     f.write(response.content)
    #
    #                 downloaded_artifacts[f"job_{job_id}_artifacts.zip"] = artifacts_zip
    #                 logger.info(f"Downloaded: {artifacts_zip.name}")
    #
    # except requests.RequestException as e:
    #     logger.error(f"GitLab API request failed: {e}")
    #     raise RuntimeError(f"GitLab API request failed: {e}")
    # except Exception as e:
    #     logger.error(f"GitLab remote download failed: {e}")
    #     raise RuntimeError(f"GitLab remote download failed: {e}")
    #
    # return downloaded_artifacts

# =============================================================================
# SSH UTILS
# =============================================================================


# SSH connections directory
CONNECTIONS_DIR = Path.home() / ".nemo-evaluator" / "connections"


def ssh_setup_masters(jobs: Dict[str, JobData]) -> Dict[Tuple[str, str], str]:
    """Start SSH master connections for remote jobs, returns control_paths."""
    remote_pairs: set[tuple[str, str]] = set()
    for jd in jobs.values():
        try:
            paths = jd.data.get("paths") or {}
            if paths.get("storage_type") == "remote_ssh":
                remote_pairs.add((paths["username"], paths["hostname"]))
        except Exception:
            pass

    if not remote_pairs:
        return {}  # no remote jobs

    # Ensure connections directory exists (like execDB does)
    CONNECTIONS_DIR.mkdir(parents=True, exist_ok=True)

    control_paths: Dict[Tuple[str, str], str] = {}
    for username, hostname in remote_pairs:
        # Simple socket name
        socket_path = CONNECTIONS_DIR / f"{username}_{hostname}.sock"
        try:
            cmd = [
                "ssh",
                "-N",
                "-f",
                "-o",
                "ControlMaster=auto",
                "-o",
                "ControlPersist=60",
                "-o",
                f"ControlPath={socket_path}",
                f"{username}@{hostname}",
            ]
            subprocess.run(cmd, check=False, capture_output=True)
            control_paths[(username, hostname)] = str(socket_path)
        except Exception as e:
            logger.warning(f"Failed to start SSH master for {username}@{hostname}: {e}")
    return control_paths


def ssh_cleanup_masters(control_paths: Dict[Tuple[str, str], str]) -> None:
    """Clean up SSH master connections from control_paths."""
    for (username, hostname), socket_path in (control_paths or {}).items():
        try:
            cmd = [
                "ssh",
                "-O",
                "exit",
                "-o",
                f"ControlPath={socket_path}",
                f"{username}@{hostname}",
            ]
            subprocess.run(cmd, check=False, capture_output=True)
        except Exception as e:
            logger.warning(f"Failed to stop SSH master for {username}@{hostname}: {e}")

        # Clean up
        try:
            Path(socket_path).unlink(missing_ok=True)
        except Exception as e:
            logger.warning(f"Failed to clean up file: {e}")


def ssh_download_artifacts(
    paths: Dict[str, Any],
    export_dir: Path,
    config: Dict[str, Any] | None = None,
    control_paths: Dict[Tuple[str, str], str] | None = None,
) -> List[str]:
    """Download artifacts via SSH with optional connection reuse."""
    exported_files: List[str] = []
    copy_logs = bool((config or {}).get("copy_logs", False))
    only_required = bool((config or {}).get("only_required", True))

    control_path = None
    if control_paths:
        control_path = control_paths.get((paths["username"], paths["hostname"]))
    ssh_opts = ["-o", f"ControlPath={control_path}"] if control_path else []

    def scp_file(remote_path: str, local_path: Path) -> bool:
        cmd = (
            ["scp"]
            + ssh_opts
            + [
                f"{paths['username']}@{paths['hostname']}:{remote_path}",
                str(local_path),
            ]
        )
        result = subprocess.run(cmd, capture_output=True)
        return result.returncode == 0

    export_dir.mkdir(parents=True, exist_ok=True)
    (export_dir / "artifacts").mkdir(parents=True, exist_ok=True)

    available_local = (
        get_available_artifacts(paths.get("artifacts_dir", Path()))
        if not only_required
        else None
    )
    artifact_names = (
        [a for a in get_relevant_artifacts()]
        if only_required
        else (available_local or [])
    )

    for artifact in artifact_names:
        remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
        local_file = export_dir / "artifacts" / artifact
        if scp_file(remote_file, local_file):
            exported_files.append(str(local_file))

    if copy_logs:
        remote_logs = f"{paths['remote_path']}/logs"
        local_logs = export_dir / "logs"
        cmd = (
            ["scp", "-r"]
            + ssh_opts
            + [
                f"{paths['username']}@{paths['hostname']}:{remote_logs}",
                str(local_logs),
            ]
        )
        if subprocess.run(cmd, capture_output=True).returncode == 0:
            exported_files.extend(
                [str(f) for f in local_logs.rglob("*") if f.is_file()]
            )

    return exported_files

# =============================================================================
# PRIVATE HELPER FUNCTIONS
# =============================================================================


def _get_artifacts_dir(paths: Dict[str, Any]) -> Path:
    """Get artifacts directory from paths."""
    if paths["storage_type"] == "local_filesystem":
        return paths["artifacts_dir"]
    elif paths["storage_type"] == "gitlab_ci_local":
        return paths["artifacts_dir"]
    elif paths["storage_type"] == "remote_ssh":
        return None
    else:
        logger.error(f"Unsupported storage type: {paths['storage_type']}")
        return None


def _extract_metrics_from_results(results: dict) -> Dict[str, float]:
    """Extract metrics from a 'results' dict (with optional 'groups'/'tasks')."""
    metrics: Dict[str, float] = {}
    for section in ["groups", "tasks"]:
        section_data = results.get(section)
        if isinstance(section_data, dict):
            for task_name, task_data in section_data.items():
                if isinstance(task_data, dict) and "metrics" in task_data:
                    task_metrics = _extract_task_metrics(
                        task_name, task_data["metrics"]
                    )
                    _safe_update_metrics(
                        target=metrics,
                        source=task_metrics,
                        context=f" while extracting results for task '{task_name}'",
                    )
    return metrics


def _extract_from_results_yml(results_yml: Path) -> Dict[str, float]:
    """Extract metrics from results.yml file."""
    try:
        with open(results_yml, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        if not isinstance(data, dict) or "results" not in data:
            return {}
        return _extract_metrics_from_results(data.get("results"))
    except Exception as e:
        logger.warning(f"Failed to parse results.yml: {e}")
        return {}


def _extract_from_json_files(artifacts_dir: Path) -> Dict[str, float]:
    """Extract metrics from individual JSON result files."""
    metrics = {}

    for json_file in artifacts_dir.glob("*.json"):
        if json_file.name in get_relevant_artifacts():
            continue  # Skip known artifact files, focus on task result files

        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            if isinstance(data, dict) and "score" in data:
                task_name = json_file.stem
                metrics[f"{task_name}_score"] = float(data["score"])

        except Exception as e:
            logger.warning(f"Failed to parse {json_file}: {e}")

    return metrics


def _extract_task_metrics(task_name: str, metrics_data: dict) -> Dict[str, float]:
    """Extract metrics from a task's metrics data."""
    extracted = {}
    score_patterns = [
        "acc",
        "accuracy",
        "score",
        "exact_match",
        "f1",
        "em",
        "pass@1",
        "pass@k",
    ]

    for metric_name, metric_data in metrics_data.items():
        # Only extract score-like metrics
        if not any(pattern in metric_name.lower() for pattern in score_patterns):
            continue

        try:
            if isinstance(metric_data, dict):
                if "scores" in metric_data:
                    # Handle nested scores (e.g., mmlu macro/micro)
                    for score_type, score_data in metric_data["scores"].items():
                        if isinstance(score_data, dict) and "value" in score_data:
                            key = f"{task_name}_{metric_name}_{score_type}"
                            _safe_set_metric(
                                container=extracted,
                                key=key,
                                new_value=score_data["value"],
                                context=f" in task '{task_name}'",
                            )
                elif "value" in metric_data:
                    key = f"{task_name}_{metric_name}"
                    _safe_set_metric(
                        container=extracted,
                        key=key,
                        new_value=metric_data["value"],
                        context=f" in task '{task_name}'",
                    )
            elif isinstance(metric_data, (int, float)):
                key = f"{task_name}_{metric_name}"
                _safe_set_metric(
                    container=extracted,
                    key=key,
                    new_value=metric_data,
                    context=f" in task '{task_name}'",
                )
        except (ValueError, TypeError) as e:
            logger.warning(
                f"Failed to extract metric {metric_name} for task {task_name}: {e}"
            )

    return extracted


def _safe_set_metric(
    container: Dict[str, float], key: str, new_value: float, context: str
) -> None:
    """Set a metric into container; raise with details if key exists."""
    if key in container:
        # Allow exact matches; warn and keep existing
        if container[key] == float(new_value):
            logger.warning(
                f"Metric rewrite{context}: '{key}' has identical value; keeping existing. value={container[key]}"
            )
            return
        # Different value is an error we want to surface distinctly
        raise MetricConflictError(
            f"Metric key collision{context}: '{key}' already set. existing={container[key]} new={new_value}"
        )
    container[key] = float(new_value)


def _safe_update_metrics(
    target: Dict[str, float], source: Dict[str, float], context: str
) -> None:
    """Update target from source safely, raising on collisions with detailed values."""
    for k, v in source.items():
        _safe_set_metric(target, k, v, context)
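
The sketches below are not part of the packaged file; they are illustrative notes added for review. First, a minimal exporter-side flow combining validate_artifacts, extract_accuracy_metrics, and extract_exporter_config. The SimpleNamespace job stand-in, the get_job_paths helper, and the results/ directory layout are assumptions made for the example; a real caller would pass an actual JobData record and the executor's own paths function.

# Illustrative only: hypothetical exporter-side flow using the helpers above.
from pathlib import Path
from types import SimpleNamespace

from nemo_evaluator_launcher.exporters.utils import (
    extract_accuracy_metrics,
    extract_exporter_config,
    validate_artifacts,
)

# Stand-in for a JobData record; attribute names mirror what the helpers read.
job = SimpleNamespace(
    job_id="abc123.0",
    executor="local",
    config={"evaluation": {"tasks": [{"name": "mmlu"}]}},
    data={},
)

# The paths dict shape ("storage_type", "artifacts_dir") follows _get_artifacts_dir.
def get_job_paths(job_data):
    return {
        "storage_type": "local_filesystem",
        "artifacts_dir": Path("results") / job_data.job_id / "artifacts",
    }

check = validate_artifacts(get_job_paths(job)["artifacts_dir"])
if check["can_export"]:
    metrics = extract_accuracy_metrics(job, get_job_paths)
    config = extract_exporter_config(job, exporter_name="wandb")
    print(metrics, config)
else:
    print(check["message"])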
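
Next, a sketch of the nested results structure that the extraction helpers appear to walk, inferred from _extract_metrics_from_results and _extract_task_metrics; the task names and values are made up. Nested "scores" entries flatten to <task>_<metric>_<score_type> keys and plain "value" entries to <task>_<metric>; a duplicate key with an identical value only logs a warning, while a conflicting value raises MetricConflictError.

# Illustrative only: example of the "results" block shape and the flattened output.
from nemo_evaluator_launcher.exporters.utils import _extract_metrics_from_results

results = {
    "tasks": {
        "mmlu": {
            "metrics": {
                # Nested scores become "<task>_<metric>_<score_type>"
                "acc": {"scores": {"macro": {"value": 0.71}, "micro": {"value": 0.69}}},
            }
        }
    },
    "groups": {
        "reasoning": {
            # A plain value becomes "<task>_<metric>"
            "metrics": {"exact_match": {"value": 0.55}},
        }
    },
}

print(_extract_metrics_from_results(results))
# {'reasoning_exact_match': 0.55, 'mmlu_acc_macro': 0.71, 'mmlu_acc_micro': 0.69}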
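
Finally, a sketch of how the SSH helpers appear intended to be combined when artifacts live on a remote_ssh storage backend: open ControlMaster connections once, reuse them for every scp, then tear them down. The username, hostname, and remote_path are placeholders, and the SimpleNamespace stand-in only mimics the JobData.data layout; running this would attempt real SSH connections.

# Illustrative only: setup -> download -> cleanup lifecycle for remote artifacts.
from pathlib import Path
from types import SimpleNamespace

from nemo_evaluator_launcher.exporters.utils import (
    ssh_cleanup_masters,
    ssh_download_artifacts,
    ssh_setup_masters,
)

paths = {
    "storage_type": "remote_ssh",
    "username": "jdoe",
    "hostname": "cluster.example.com",
    "remote_path": "/scratch/jdoe/eval-runs/abc123.0",
}
jobs = {"abc123.0": SimpleNamespace(data={"paths": paths})}

control_paths = ssh_setup_masters(jobs)  # one ControlMaster per (user, host)
try:
    files = ssh_download_artifacts(
        paths,
        export_dir=Path("exports/abc123.0"),
        config={"copy_logs": True},
        control_paths=control_paths,  # reuse the master connection for each scp
    )
    print(files)
finally:
    ssh_cleanup_masters(control_paths)  # close masters and remove socket files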