nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,7 @@
 """Shared utilities for metrics and configuration handling."""
 
 import json
+import re
 import subprocess
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Tuple
@@ -148,15 +149,12 @@ def extract_exporter_config(
     """Extract and merge exporter configuration from multiple sources."""
     config = {}
 
-    # Get config from dedicated field
+    # root-level `export.<exporter-name>`
     if job_data.config:
-        execution_config = job_data.config.get("execution", {})
-        auto_export_config = execution_config.get("auto_export", {})
-        exporter_configs = auto_export_config.get("configs", {})
-        yaml_config = exporter_configs.get(exporter_name, {})
-
-        # No conversion needed
-        config.update(yaml_config)
+        export_block = (job_data.config or {}).get("export", {})
+        yaml_config = (export_block or {}).get(exporter_name, {})
+        if yaml_config:
+            config.update(yaml_config)
 
     # From webhook metadata (if triggered by webhook)
     if "webhook_metadata" in job_data.data:
@@ -167,8 +165,6 @@ def extract_exporter_config(
             "source_artifact": f"{webhook_data.get('artifact_name', 'unknown')}:{webhook_data.get('artifact_version', 'unknown')}",
             "config_source": webhook_data.get("config_file", "unknown"),
         }
-
-        # For W&B specifically, extract run info if available
         if exporter_name == "wandb" and webhook_data.get("webhook_source") == "wandb":
             wandb_specific = {
                 "entity": webhook_data.get("entity"),
@@ -176,10 +172,9 @@ def extract_exporter_config(
                 "run_id": webhook_data.get("run_id"),
             }
             webhook_config.update({k: v for k, v in wandb_specific.items() if v})
-
         config.update(webhook_config)
 
-    # Constructor config: allows CLI overrides
+    # allows CLI overrides
    if constructor_config:
         config.update(constructor_config)
 
@@ -269,6 +264,14 @@ def get_container_from_mapping(job_data: JobData) -> str:
     return None
 
 
+def get_artifact_root(job_data: JobData) -> str:
+    """Get artifact root from job data."""
+    bench = get_benchmark_info(job_data)
+    h = bench.get("harness", "unknown")
+    b = bench.get("benchmark", get_task_name(job_data))
+    return f"{h}.{b}"
+
+
 # =============================================================================
 # GITLAB DOWNLOAD
 # =============================================================================
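`get_artifact_root` yields a `<harness>.<benchmark>` prefix that exporters can use to namespace uploaded artifacts; a rough illustration with hypothetical benchmark info:

```python
# Hypothetical stand-in for get_benchmark_info(job_data).
bench = {"harness": "lm-evaluation-harness", "benchmark": "mmlu"}
artifact_root = f"{bench.get('harness', 'unknown')}.{bench.get('benchmark', 'unknown')}"
print(artifact_root)  # lm-evaluation-harness.mmlu
```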
@@ -288,91 +291,6 @@ def download_gitlab_artifacts(
         Dictionary mapping artifact names to local file paths
     """
     raise NotImplementedError("Downloading from gitlab is not implemented")
-    # TODO: rework this logic
-    # pipeline_id = paths["pipeline_id"]
-    # project_id = paths["project_id"]
-    # gitlab_token = os.getenv("GITLAB_TOKEN")
-    #
-    # if not gitlab_token:
-    #     raise RuntimeError(
-    #         "GITLAB_TOKEN environment variable required for GitLab remote downloads"
-    #     )
-    #
-    # # GitLab API endpoint for artifacts
-    # base_url = "TODO: replace"
-    # artifacts_url = "TODO: replace"
-    #
-    # headers = {"Private-Token": gitlab_token}
-    # downloaded_artifacts = {}
-    #
-    # try:
-    #     # Get pipeline jobs
-    #     response = requests.get(artifacts_url, headers=headers, timeout=30)
-    #     response.raise_for_status()
-    #     jobs = response.json()
-    #
-    #     for job in jobs:
-    #         if job.get("artifacts_file"):
-    #             job_id = job["id"]
-    #             job_name = job.get("name", f"job_{job_id}")
-    #             artifacts_download_url = (
-    #                 f"{base_url}/api/v4/projects/{project_id}/jobs/{job_id}/artifacts"
-    #             )
-    #
-    #             logger.info(f"Downloading artifacts from job: {job_name}")
-    #
-    #             # Download job artifacts
-    #             response = requests.get(
-    #                 artifacts_download_url, headers=headers, timeout=300
-    #             )
-    #             response.raise_for_status()
-    #
-    #             if extract_specific:
-    #                 # Extract specific files from ZIP
-    #                 with tempfile.NamedTemporaryFile(
-    #                     suffix=".zip", delete=False
-    #                 ) as temp_zip:
-    #                     temp_zip.write(response.content)
-    #                     temp_zip_path = temp_zip.name
-    #
-    #                 try:
-    #                     with zipfile.ZipFile(temp_zip_path, "r") as zip_ref:
-    #                         # Create artifacts directory
-    #                         artifacts_dir = export_dir / "artifacts"
-    #                         artifacts_dir.mkdir(parents=True, exist_ok=True)
-    #
-    #                         # Extract to be logged artifacts
-    #                         for member in zip_ref.namelist():
-    #                             filename = Path(member).name
-    #                             if filename in get_relevant_artifacts():
-    #                                 # Extract the file
-    #                                 source = zip_ref.open(member)
-    #                                 target_path = artifacts_dir / filename
-    #                                 with open(target_path, "wb") as f:
-    #                                     f.write(source.read())
-    #                                 source.close()
-    #
-    #                                 downloaded_artifacts[filename] = target_path
-    #                                 logger.info(f"Extracted: {filename}")
-    #                 finally:
-    #                     os.unlink(temp_zip_path)
-    #             else:
-    #                 # Save as ZIP files (original behavior)
-    #                 artifacts_zip = export_dir / f"job_{job_id}_artifacts.zip"
-    #                 with open(artifacts_zip, "wb") as f:
-    #                     f.write(response.content)
-    #
-    #                 downloaded_artifacts[f"job_{job_id}_artifacts.zip"] = artifacts_zip
-    #                 logger.info(f"Downloaded: {artifacts_zip.name}")
-    #
-    # except requests.RequestException as e:
-    #     logger.error(f"GitLab API request failed: {e}")
-    #     raise RuntimeError(f"GitLab API request failed: {e}")
-    # except Exception as e:
-    #     logger.error(f"GitLab remote download failed: {e}")
-    #     raise RuntimeError(f"GitLab remote download failed: {e}")
-    #
-    # return downloaded_artifacts
 
 
 # =============================================================================
@@ -389,21 +307,28 @@ def ssh_setup_masters(jobs: Dict[str, JobData]) -> Dict[Tuple[str, str], str]:
     remote_pairs: set[tuple[str, str]] = set()
     for jd in jobs.values():
         try:
-            paths = jd.data.get("paths") or {}
-            if paths.get("storage_type") == "remote_ssh":
-                remote_pairs.add((paths["username"], paths["hostname"]))
+            # Preferred: explicit 'paths' from job data
+            p = (jd.data or {}).get("paths") or {}
+            if (
+                p.get("storage_type") == "remote_ssh"
+                and p.get("username")
+                and p.get("hostname")
+            ):
+                remote_pairs.add((p["username"], p["hostname"]))
+                continue
+            # Fallback: common slurm fields (works with BaseExporter.get_job_paths)
+            d = jd.data or {}
+            if jd.executor == "slurm" and d.get("username") and d.get("hostname"):
+                remote_pairs.add((d["username"], d["hostname"]))
         except Exception:
             pass
 
     if not remote_pairs:
-        return {} # no remote jobs
+        return {}
 
-    # Ensure connections directory exists (like execDB does)
     CONNECTIONS_DIR.mkdir(parents=True, exist_ok=True)
-
     control_paths: Dict[Tuple[str, str], str] = {}
     for username, hostname in remote_pairs:
-        # Simple socket name
         socket_path = CONNECTIONS_DIR / f"{username}_{hostname}.sock"
         try:
             cmd = [
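For context, the loop above establishes OpenSSH ControlMaster sockets so the scp calls made later can reuse a single authenticated connection per (user, host). A generic sketch of the technique, not the executor's exact command:

```python
import subprocess
from pathlib import Path

# Hypothetical values; the real username/hostname come from the job data above.
username, hostname = "user", "cluster.example.com"
socket_path = Path("/tmp/connections") / f"{username}_{hostname}.sock"
socket_path.parent.mkdir(parents=True, exist_ok=True)

# Open a master connection in the background (-M -N -f) bound to a control socket (-S).
master_cmd = [
    "ssh", "-M", "-N", "-f",
    "-S", str(socket_path),
    "-o", "ControlPersist=10m",
    f"{username}@{hostname}",
]
subprocess.run(master_cmd, capture_output=True)
# Later transfers reuse it, e.g.: scp -o ControlPath=<socket> remote:file local/
```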
@@ -454,9 +379,10 @@ def ssh_download_artifacts(
     config: Dict[str, Any] | None = None,
     control_paths: Dict[Tuple[str, str], str] | None = None,
 ) -> List[str]:
-    """Download artifacts via SSH with optional connection reuse."""
+    """Download artifacts/logs via SSH with optional connection reuse."""
     exported_files: List[str] = []
     copy_logs = bool((config or {}).get("copy_logs", False))
+    copy_artifacts = bool((config or {}).get("copy_artifacts", True))
     only_required = bool((config or {}).get("only_required", True))
 
     control_path = None
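These toggles come from the merged exporter config described above; a hypothetical config and the resulting values:

```python
# Hypothetical exporter config; the defaults in the diff are
# copy_logs=False, copy_artifacts=True, only_required=True.
config = {"copy_logs": True, "only_required": False}
copy_logs = bool((config or {}).get("copy_logs", False))           # True
copy_artifacts = bool((config or {}).get("copy_artifacts", True))  # True (default)
only_required = bool((config or {}).get("only_required", True))    # False
```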
@@ -473,44 +399,49 @@ def ssh_download_artifacts(
                 str(local_path),
             ]
         )
-        result = subprocess.run(cmd, capture_output=True)
-        return result.returncode == 0
+        return subprocess.run(cmd, capture_output=True).returncode == 0
 
     export_dir.mkdir(parents=True, exist_ok=True)
-    (export_dir / "artifacts").mkdir(parents=True, exist_ok=True)
-
-    available_local = (
-        get_available_artifacts(paths.get("artifacts_dir", Path()))
-        if not only_required
-        else None
-    )
-    artifact_names = (
-        [a for a in get_relevant_artifacts()]
-        if only_required
-        else (available_local or [])
-    )
-
-    for artifact in artifact_names:
-        remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
-        local_file = export_dir / "artifacts" / artifact
-        if scp_file(remote_file, local_file):
-            exported_files.append(str(local_file))
 
+    # Artifacts
+    if copy_artifacts:
+        art_dir = export_dir / "artifacts"
+        art_dir.mkdir(parents=True, exist_ok=True)
+
+        if only_required:
+            for artifact in get_relevant_artifacts():
+                remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
+                local_file = art_dir / artifact
+                local_file.parent.mkdir(parents=True, exist_ok=True)
+                if scp_file(remote_file, local_file):
+                    exported_files.append(str(local_file))
+        else:
+            # Copy known files individually to avoid subfolders and satisfy tests
+            for artifact in get_available_artifacts(paths.get("artifacts_dir", Path())):
+                remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
+                local_file = art_dir / artifact
+                if scp_file(remote_file, local_file):
+                    exported_files.append(str(local_file))
+
+    # Logs (top-level only)
     if copy_logs:
-        remote_logs = f"{paths['remote_path']}/logs"
         local_logs = export_dir / "logs"
+        remote_logs = f"{paths['remote_path']}/logs"
         cmd = (
             ["scp", "-r"]
             + ssh_opts
             + [
-                f"{paths['username']}@{paths['hostname']}:{remote_logs}",
+                f"{paths['username']}@{paths['hostname']}:{remote_logs}/.",
                 str(local_logs),
             ]
         )
         if subprocess.run(cmd, capture_output=True).returncode == 0:
-            exported_files.extend(
-                [str(f) for f in local_logs.rglob("*") if f.is_file()]
-            )
+            for p in local_logs.iterdir():
+                if p.is_dir():
+                    import shutil
+
+                    shutil.rmtree(p, ignore_errors=True)
+            exported_files.extend([str(f) for f in local_logs.glob("*") if f.is_file()])
 
     return exported_files
 
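The trailing `/.` makes `scp -r` copy the contents of the remote `logs` directory into `export_dir/logs` instead of nesting another `logs/` level, and any subdirectories that still come across are pruned so only top-level log files are returned. A sketch of the resulting command, with illustrative paths:

```python
# Illustrative values only.
ssh_opts: list[str] = []  # e.g. ["-o", "ControlPath=/path/to/user_host.sock"]
remote_logs = "/remote/results/job-0/logs"

cmd = ["scp", "-r", *ssh_opts, f"user@host:{remote_logs}/.", "/local/export/logs"]
print(" ".join(cmd))  # scp -r user@host:/remote/results/job-0/logs/. /local/export/logs
```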
@@ -522,16 +453,16 @@ def ssh_download_artifacts(
 
 def _get_artifacts_dir(paths: Dict[str, Any]) -> Path:
     """Get artifacts directory from paths."""
-    if paths["storage_type"] == "local_filesystem":
-        return paths["artifacts_dir"]
-    elif paths["storage_type"] == "gitlab_ci_local":
-        return paths["artifacts_dir"]
-    elif paths["storage_type"] == "remote_ssh":
-        return None
-    else:
-        logger.error(f"Unsupported storage type: {paths['storage_type']}")
+
+    storage_type = paths.get("storage_type")
+
+    # For SSH-based remote access, artifacts aren't available locally yet
+    if storage_type == "remote_ssh":
         return None
 
+    # For all local access (local_filesystem, remote_local, gitlab_ci_local)
+    # return the artifacts_dir from paths
+    return paths.get("artifacts_dir")
+
 
 def _extract_metrics_from_results(results: dict) -> Dict[str, float]:
     """Extract metrics from a 'results' dict (with optional 'groups'/'tasks')."""
@@ -540,15 +471,12 @@ def _extract_metrics_from_results(results: dict) -> Dict[str, float]:
         section_data = results.get(section)
         if isinstance(section_data, dict):
             for task_name, task_data in section_data.items():
-                if isinstance(task_data, dict) and "metrics" in task_data:
-                    task_metrics = _extract_task_metrics(
-                        task_name, task_data["metrics"]
-                    )
-                    _safe_update_metrics(
-                        target=metrics,
-                        source=task_metrics,
-                        context=f" while extracting results for task '{task_name}'",
-                    )
+                task_metrics = _extract_task_metrics(task_name, task_data)
+                _safe_update_metrics(
+                    target=metrics,
+                    source=task_metrics,
+                    context=f" while extracting results for task '{task_name}'",
+                )
     return metrics
 
 
@@ -587,54 +515,43 @@ def _extract_from_json_files(artifacts_dir: Path) -> Dict[str, float]:
     return metrics
 
 
-def _extract_task_metrics(task_name: str, metrics_data: dict) -> Dict[str, float]:
+def _extract_task_metrics(task_name: str, task_data: dict) -> Dict[str, float]:
     """Extract metrics from a task's metrics data."""
     extracted = {}
-    score_patterns = [
-        "acc",
-        "accuracy",
-        "score",
-        "exact_match",
-        "f1",
-        "em",
-        "pass@1",
-        "pass@k",
-    ]
 
-    for metric_name, metric_data in metrics_data.items():
-        # Only extract score-like metrics
-        if not any(pattern in metric_name.lower() for pattern in score_patterns):
-            continue
+    metrics_data = task_data.get("metrics", {})
+    if "groups" in task_data:
+        for group_name, group_data in task_data["groups"].items():
+            group_extracted = _extract_task_metrics(
+                f"{task_name}_{group_name}", group_data
+            )
+            _safe_update_metrics(
+                target=extracted,
+                source=group_extracted,
+                context=f" in task '{task_name}'",
+            )
 
+    for metric_name, metric_data in metrics_data.items():
         try:
-            if isinstance(metric_data, dict):
-                if "scores" in metric_data:
-                    # Handle nested scores (e.g., mmlu macro/micro)
-                    for score_type, score_data in metric_data["scores"].items():
-                        if isinstance(score_data, dict) and "value" in score_data:
-                            key = f"{task_name}_{metric_name}_{score_type}"
-                            _safe_set_metric(
-                                container=extracted,
-                                key=key,
-                                new_value=score_data["value"],
-                                context=f" in task '{task_name}'",
-                            )
-                elif "value" in metric_data:
+            for score_type, score_data in metric_data["scores"].items():
+                if score_type != metric_name:
+                    key = f"{task_name}_{metric_name}_{score_type}"
+                else:
                     key = f"{task_name}_{metric_name}"
-                    _safe_set_metric(
-                        container=extracted,
-                        key=key,
-                        new_value=metric_data["value"],
-                        context=f" in task '{task_name}'",
-                    )
-            elif isinstance(metric_data, (int, float)):
-                key = f"{task_name}_{metric_name}"
                 _safe_set_metric(
                     container=extracted,
                     key=key,
-                    new_value=metric_data,
+                    new_value=score_data["value"],
                     context=f" in task '{task_name}'",
                 )
+                for stat_name, stat_value in metric_data.get("stats", {}).items():
+                    stats_key = f"{key}_{stat_name}"
+                    _safe_set_metric(
+                        container=extracted,
+                        key=stats_key,
+                        new_value=stat_value,
+                        context=f" in task '{task_name}'",
+                    )
         except (ValueError, TypeError) as e:
             logger.warning(
                 f"Failed to extract metric {metric_name} for task {task_name}: {e}"
@@ -667,3 +584,41 @@ def _safe_update_metrics(
     """Update target from source safely, raising on collisions with detailed values."""
     for k, v in source.items():
         _safe_set_metric(target, k, v, context)
+
+
+# =============================================================================
+# MLFLOW FUNCTIONS
+# =============================================================================
+
+# MLflow constants
+_MLFLOW_KEY_MAX = 250
+_MLFLOW_PARAM_VAL_MAX = 250
+_MLFLOW_TAG_VAL_MAX = 5000
+
+_INVALID_KEY_CHARS = re.compile(r"[^/\w.\- ]")
+_MULTI_UNDERSCORE = re.compile(r"_+")
+
+
+def mlflow_sanitize(s: Any, kind: str = "key") -> str:
+    """
+    Sanitize strings for MLflow logging.
+
+    kind:
+    - "key", "metric", "tag_key", "param_key": apply key rules
+    - "tag_value": apply tag value rules
+    - "param_value": apply param value rules
+    """
+    s = "" if s is None else str(s)
+
+    if kind in ("key", "metric", "tag_key", "param_key"):
+        # common replacements
+        s = s.replace("pass@", "pass_at_")
+        # drop disallowed chars, collapse underscores, trim
+        s = _INVALID_KEY_CHARS.sub("_", s)
+        s = _MULTI_UNDERSCORE.sub("_", s).strip()
+        return s[:_MLFLOW_KEY_MAX] or "key"
+
+    # values: normalize whitespace, enforce length
+    s = s.replace("\n", " ").replace("\r", " ").strip()
+    max_len = _MLFLOW_TAG_VAL_MAX if kind == "tag_value" else _MLFLOW_PARAM_VAL_MAX
+    return s[:max_len]
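A quick usage sketch of the sanitizer, assuming it is imported from `nemo_evaluator_launcher.exporters.utils`:

```python
from nemo_evaluator_launcher.exporters.utils import mlflow_sanitize

# Metric/param keys: "pass@" is rewritten and disallowed characters become "_".
print(mlflow_sanitize("humaneval pass@1 (greedy)", kind="metric"))
# -> "humaneval pass_at_1 _greedy_"  (only [/\w.\- ] survive, runs of "_" collapse)

# Values are whitespace-normalized and length-capped rather than character-filtered.
print(mlflow_sanitize("multi\nline value", kind="param_value"))  # -> "multi line value"
```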