nemo-evaluator-launcher 0.1.13__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)
  1. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/PKG-INFO +1 -1
  2. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/functional.py +19 -29
  3. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -1
  4. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/execution/local.yaml +1 -0
  5. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/base.py +23 -0
  6. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/executor.py +17 -71
  7. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/local/executor.py +48 -7
  8. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/local/run.template.sh +18 -6
  9. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/slurm/executor.py +40 -22
  10. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/local.py +25 -16
  11. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/mlflow.py +168 -70
  12. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/utils.py +85 -33
  13. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/wandb.py +40 -5
  14. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/package_info.py +1 -1
  15. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
  16. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/LICENSE +0 -0
  17. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/README.md +0 -0
  18. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/pyproject.toml +0 -0
  19. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/setup.cfg +0 -0
  20. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/__init__.py +0 -0
  21. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
  22. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/types.py +0 -0
  23. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/utils.py +0 -0
  24. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
  25. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/export.py +0 -0
  26. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
  27. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
  28. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
  29. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/main.py +0 -0
  30. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/run.py +0 -0
  31. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/status.py +0 -0
  32. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/version.py +0 -0
  33. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
  34. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
  35. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/helpers.py +0 -0
  36. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
  37. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
  38. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
  39. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
  40. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
  41. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
  42. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
  43. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
  44. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
  45. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
  46. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +0 -0
  47. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
  48. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
  49. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
  50. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
  51. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
  52. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
  53. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
  54. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
  55. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
  56. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
  57. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
  58. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
  59. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +0 -0
  60. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
  61. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
  62. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
  63. {nemo_evaluator_launcher-0.1.13 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nemo-evaluator-launcher
-Version: 0.1.13
+Version: 0.1.15
 Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
 Author: NVIDIA
 Author-email: nemo-toolkit@nvidia.com

src/nemo_evaluator_launcher/api/functional.py
@@ -440,23 +440,28 @@ def export_results(
         single_id = invocation_ids[0]

         if "." in single_id:  # job_id
+            # Try reading config from artifacts working dir (auto-export on remote node)
+            cfg_file = None
+            for name in ("run_config.yml", "config.yml"):
+                p = Path(name)
+                if p.exists():
+                    cfg_file = p
+                    break
+
             md_job_data = None
-            # Use artifacts/run_config.yml if present
-            ypath_artifacts = Path("run_config.yml")
-            if ypath_artifacts.exists():
+            if cfg_file:
                 try:
                     cfg_yaml = (
-                        yaml.safe_load(ypath_artifacts.read_text(encoding="utf-8"))
-                        or {}
+                        yaml.safe_load(cfg_file.read_text(encoding="utf-8")) or {}
                     )
-                    # merge exporter config if present
+
+                    # Merge exporter override file if present
                     ypath_export = Path("export_config.yml")
                     if ypath_export.exists():
                         exp_yaml = (
                             yaml.safe_load(ypath_export.read_text(encoding="utf-8"))
                             or {}
                         )
-                        # execution.auto_export contains auto-export destinations
                         exec_cfg = cfg_yaml.get("execution") or {}
                         auto_exp = (exp_yaml.get("execution") or {}).get(
                             "auto_export"
@@ -464,42 +469,30 @@ def export_results(
                         if auto_exp is not None:
                             exec_cfg["auto_export"] = auto_exp
                         cfg_yaml["execution"] = exec_cfg
-
-                        # top-level export block contains exporter config
                         if "export" in exp_yaml:
                             cfg_yaml["export"] = exp_yaml["export"]
-
-                        # Merge evaluation.tasks from export_config (Slurm writes it there)
                         if "evaluation" in exp_yaml and exp_yaml["evaluation"]:
                             eval_cfg = cfg_yaml.get("evaluation") or {}
                             eval_cfg.update(exp_yaml["evaluation"])
                             cfg_yaml["evaluation"] = eval_cfg

-                    # metadata
                     executor_name = (cfg_yaml.get("execution") or {}).get(
                         "type", "local"
                     )
-
                     md_job_data = JobData(
                         invocation_id=single_id.split(".")[0],
                         job_id=single_id,
                         timestamp=0.0,
-                        executor=executor_name,
+                        executor=executor_name,  # ensures slurm tag is preserved
                         data={
                             "output_dir": str(Path.cwd().parent),
-                            "storage_type": "remote_local",
+                            "storage_type": "remote_local",  # no SSH in auto-export path
                         },
                         config=cfg_yaml,
                     )
-                    # DEBUG: print what we loaded
-                    print(f"DEBUG: cfg_yaml keys: {list(cfg_yaml.keys())}")
-                    if "evaluation" in cfg_yaml:
-                        print(
-                            f"DEBUG: evaluation.tasks: {cfg_yaml.get('evaluation', {}).get('tasks')}"
-                        )
                 except Exception:
                     md_job_data = None
-                    # fallback to execDB only
+
             job_data = md_job_data or ExecutionDB().get_job(single_id)
             if job_data is None:
                 return {
@@ -507,7 +500,6 @@ def export_results(
                     "error": f"Job {single_id} not found in ExecutionDB",
                 }

-            # Convert single job result to invocation-like structure
            job_result = exporter.export_job(job_data)
            return {
                "success": job_result.success,
@@ -522,10 +514,9 @@ def export_results(
                },
                "metadata": job_result.metadata or {},
            }
+
        elif single_id.isdigit():  # pipeline_id
-            # Find job by pipeline_id
            db = ExecutionDB()
-            # Search all jobs for matching pipeline_id
            for job_id, job_data in db._jobs.items():
                if job_data.data.get("pipeline_id") == int(single_id):
                    job_result = exporter.export_job(job_data)
@@ -542,14 +533,13 @@ def export_results(
                        "metadata": job_result.metadata or {},
                    }
            return {"success": False, "error": f"Pipeline {single_id} not found"}
+
        else:  # invocation_id
            result = exporter.export_invocation(single_id)
-            # Ensure metadata is present in job results to prevent KeyError
            if "jobs" in result:
                for job_id, job_result in result["jobs"].items():
-                    if "metadata" not in job_result:
-                        job_result["metadata"] = {}
-            return result  # type: ignore[no-any-return]
+                    job_result.setdefault("metadata", {})
+            return result
    else:
        # Multiple IDs - parse and group
        db = ExecutionDB()

src/nemo_evaluator_launcher/configs/deployment/vllm.yaml
@@ -21,6 +21,7 @@ port: 8000
 tensor_parallel_size: 8
 pipeline_parallel_size: 1
 data_parallel_size: 1
+gpu_memory_utilization: 0.95
 extra_args: ""
 env_vars: {} # {name: value} dict

@@ -37,5 +38,5 @@ command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
   --trust-remote-code
   --served-model-name ${deployment.served_model_name}
   --enforce-eager
-  --gpu-memory-utilization 0.95
+  --gpu-memory-utilization ${deployment.gpu_memory_utilization}
   ${deployment.extra_args}
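
The GPU memory fraction is now a config key instead of a hardcoded flag. A minimal OmegaConf sketch of how the interpolation resolves (illustrative only; the launcher's real config assembly has more layers than shown here):

    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "deployment": {
                "gpu_memory_utilization": 0.95,  # new default from vllm.yaml
                "command": "vllm serve --gpu-memory-utilization ${deployment.gpu_memory_utilization}",
            }
        }
    )
    cfg.deployment.gpu_memory_utilization = 0.8  # override without editing the YAML
    print(cfg.deployment.command)  # vllm serve --gpu-memory-utilization 0.8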

src/nemo_evaluator_launcher/configs/execution/local.yaml
@@ -16,3 +16,4 @@
 type: local
 output_dir: ???
 extra_docker_args: ""
+mode: sequential

src/nemo_evaluator_launcher/executors/base.py
@@ -95,3 +95,26 @@ class BaseExecutor(ABC):
             NotImplementedError: If not implemented by a subclass.
         """
         raise NotImplementedError("Subclasses must implement this method")
+
+    @staticmethod
+    def get_kill_failure_message(
+        job_id: str, container_or_id: str, status: Optional[ExecutionState] = None
+    ) -> str:
+        """Generate an informative error message when kill fails based on job status.
+
+        Args:
+            job_id: The job ID that failed to kill.
+            container_or_id: Container name, SLURM job ID, or other identifier.
+            status: Optional execution state of the job.
+
+        Returns:
+            str: An informative error message with job status context.
+        """
+        if status == ExecutionState.SUCCESS:
+            return f"Could not find or kill job {job_id} ({container_or_id}) - job already completed successfully"
+        elif status == ExecutionState.FAILED:
+            return f"Could not find or kill job {job_id} ({container_or_id}) - job already failed"
+        elif status == ExecutionState.KILLED:
+            return f"Could not find or kill job {job_id} ({container_or_id}) - job was already killed"
+        # Generic error message
+        return f"Could not find or kill job {job_id} ({container_or_id})"
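
A minimal sketch of how executors consume the new helper (the import path follows the file layout above; that ExecutionState lives in the same module is an assumption based on the type hints in this hunk):

    from nemo_evaluator_launcher.executors.base import BaseExecutor, ExecutionState

    msg = BaseExecutor.get_kill_failure_message(
        "abc12345.0", "container: eval-abc12345-0", ExecutionState.SUCCESS
    )
    print(msg)
    # Could not find or kill job abc12345.0 (container: eval-abc12345-0) - job already completed successfully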

src/nemo_evaluator_launcher/executors/lepton/executor.py
@@ -622,76 +622,14 @@ class LeptonExecutor(BaseExecutor):
     def kill_job(job_id: str) -> None:
         """Kill Lepton evaluation jobs and clean up endpoints.

-        For invocation IDs, this will kill all jobs and clean up all
-        dedicated endpoints created for the invocation.
-
         Args:
-            job_id: The job ID or invocation ID to kill.
+            job_id: The job ID to kill.

         Raises:
             ValueError: If job is not found or invalid.
             RuntimeError: If job cannot be killed.
         """
         db = ExecutionDB()
-
-        # If it looks like an invocation_id, kill all jobs for that invocation
-        if len(job_id) == 8 and "." not in job_id:
-            jobs = db.get_jobs(job_id)
-            if not jobs:
-                raise ValueError(f"No jobs found for invocation {job_id}")
-
-            endpoint_names = (
-                set()
-            )  # Use set to avoid duplicates (though each should be unique)
-            lepton_job_names = []
-
-            # Collect all Lepton jobs and endpoint info
-            for curr_job_data in jobs.values():
-                if curr_job_data.executor != "lepton":
-                    continue
-
-                # Collect endpoint name for this job (each task may have its own)
-                endpoint_name = curr_job_data.data.get("endpoint_name")
-                if endpoint_name:
-                    endpoint_names.add(endpoint_name)
-
-                lepton_job_name = curr_job_data.data.get("lepton_job_name")
-                if lepton_job_name:
-                    lepton_job_names.append(lepton_job_name)
-
-                # Mark job as killed in database
-                curr_job_data.data["status"] = "killed"
-                curr_job_data.data["killed_time"] = time.time()
-                db.write_job(curr_job_data)
-
-            print(
-                f"🛑 Killing {len(lepton_job_names)} Lepton jobs for invocation {job_id}"
-            )
-
-            # Cancel all Lepton jobs
-            for lepton_job_name in lepton_job_names:
-                success = delete_lepton_job(lepton_job_name)
-                if success:
-                    print(f"✅ Cancelled Lepton job: {lepton_job_name}")
-                else:
-                    print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
-
-            # Clean up all dedicated endpoints
-            if endpoint_names:
-                print(f"🧹 Cleaning up {len(endpoint_names)} dedicated endpoints")
-                for endpoint_name in endpoint_names:
-                    success = delete_lepton_endpoint(endpoint_name)
-                    if success:
-                        print(f"✅ Cleaned up endpoint: {endpoint_name}")
-                    else:
-                        print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
-            else:
-                print("📌 No dedicated endpoints to clean up (using shared endpoint)")
-
-            print(f"🛑 Killed all resources for invocation {job_id}")
-            return
-
-        # Otherwise, treat as individual job_id
         job_data = db.get_job(job_id)
         if job_data is None:
             raise ValueError(f"Job {job_id} not found")
@@ -703,17 +641,25 @@ class LeptonExecutor(BaseExecutor):

         # Cancel the specific Lepton job
         lepton_job_name = job_data.data.get("lepton_job_name")
+
         if lepton_job_name:
-            success = delete_lepton_job(lepton_job_name)
-            if success:
+            cancel_success = delete_lepton_job(lepton_job_name)
+            if cancel_success:
                 print(f"✅ Cancelled Lepton job: {lepton_job_name}")
+                # Mark job as killed in database
+                job_data.data["status"] = "killed"
+                job_data.data["killed_time"] = time.time()
+                db.write_job(job_data)
             else:
-                print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
-
-        # Mark job as killed in database
-        job_data.data["status"] = "killed"
-        job_data.data["killed_time"] = time.time()
-        db.write_job(job_data)
+                # Use common helper to get informative error message based on job status
+                status_list = LeptonExecutor.get_status(job_id)
+                current_status = status_list[0].state if status_list else None
+                error_msg = LeptonExecutor.get_kill_failure_message(
+                    job_id, f"lepton_job: {lepton_job_name}", current_status
+                )
+                raise RuntimeError(error_msg)
+        else:
+            raise ValueError(f"No Lepton job name found for job {job_id}")

         print(f"🛑 Killed Lepton job {job_id}")
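
Note that kill_job no longer fans out over an 8-character invocation ID; it now handles exactly one job. A hedged sketch of recovering the old invocation-wide behavior from the caller's side, using only APIs visible in this diff (that kill_job is callable as a staticmethod here is an assumption):

    from nemo_evaluator_launcher.common.execdb import ExecutionDB
    from nemo_evaluator_launcher.executors.lepton.executor import LeptonExecutor

    invocation_id = "abc12345"
    for job_id in ExecutionDB().get_jobs(invocation_id):  # dict of job_id -> JobData
        try:
            LeptonExecutor.kill_job(job_id)
        except (ValueError, RuntimeError) as err:
            print(f"Skipping {job_id}: {err}")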
 

src/nemo_evaluator_launcher/executors/local/executor.py
@@ -415,10 +415,10 @@ class LocalExecutor(BaseExecutor):

     @staticmethod
     def kill_job(job_id: str) -> None:
-        """Kill a local job by stopping its Docker container and related processes.
+        """Kill a local job.

         Args:
-            job_id: The job ID to kill.
+            job_id: The job ID (e.g., abc123.0) to kill.

         Raises:
             ValueError: If job is not found or invalid.
@@ -463,14 +463,55 @@ class LocalExecutor(BaseExecutor):
             if result.returncode == 0:
                 killed_something = True

-        # Mark job as killed in database if we killed something
+        # If we successfully killed something, mark as killed
         if killed_something:
             job_data.data["killed"] = True
             db.write_job(job_data)
-        else:
-            raise RuntimeError(
-                f"Could not find or kill job {job_id} (container: {container_name})"
-            )
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # If nothing was killed, check if this is a pending job
+        status_list = LocalExecutor.get_status(job_id)
+        if status_list and status_list[0].state == ExecutionState.PENDING:
+            # For pending jobs, mark as killed even though there's nothing to kill yet
+            job_data.data["killed"] = True
+            db.write_job(job_data)
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # Use common helper to get informative error message based on job status
+        current_status = status_list[0].state if status_list else None
+        error_msg = LocalExecutor.get_kill_failure_message(
+            job_id, f"container: {container_name}", current_status
+        )
+        raise RuntimeError(error_msg)
+
+    @staticmethod
+    def _add_to_killed_jobs(invocation_id: str, job_id: str) -> None:
+        """Add a job ID to the killed jobs file for this invocation.
+
+        Args:
+            invocation_id: The invocation ID.
+            job_id: The job ID to mark as killed.
+        """
+        db = ExecutionDB()
+        jobs = db.get_jobs(invocation_id)
+        if not jobs:
+            return
+
+        # Get invocation output directory from any job's output_dir
+        first_job_data = next(iter(jobs.values()))
+        job_output_dir = pathlib.Path(first_job_data.data.get("output_dir", ""))
+        if not job_output_dir.exists():
+            return
+
+        # Invocation dir is parent of job output dir
+        invocation_dir = job_output_dir.parent
+        killed_jobs_file = invocation_dir / "killed_jobs.txt"
+
+        # Append job_id to file
+        with open(killed_jobs_file, "a") as f:
+            f.write(f"{job_id}\n")


 def _get_progress(artifacts_dir: pathlib.Path) -> Optional[float]:

src/nemo_evaluator_launcher/executors/local/run.template.sh
@@ -17,6 +17,11 @@
 # check if docker exists
 command -v docker >/dev/null 2>&1 || { echo 'docker not found'; exit 1; }

+# Initialize: remove killed jobs file from previous runs
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+killed_jobs_file="$script_dir/killed_jobs.txt"
+rm -f "$killed_jobs_file"
+
 {% for task in evaluation_tasks %}
 # {{ task.job_id }} {{ task.name }}

@@ -28,13 +33,17 @@ mkdir -m 777 -p "$task_dir"
 mkdir -m 777 -p "$artifacts_dir"
 mkdir -m 777 -p "$logs_dir"

-# Create pre-start stage file
-echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
+# Check if this job was killed
+if [ -f "$killed_jobs_file" ] && grep -q "^{{ task.job_id }}$" "$killed_jobs_file"; then
+  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} ({{ task.name }}) was killed, skipping execution" | tee -a "$logs_dir/stdout.log"
+else
+  # Create pre-start stage file
+  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"

-# Docker run with eval factory command
-(
-  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
-  docker run --rm --shm-size=100g {{ extra_docker_args }} \
+  # Docker run with eval factory command
+  (
+    echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
+    docker run --rm --shm-size=100g {{ extra_docker_args }} \
     --name {{ task.container_name }} \
     --volume "$artifacts_dir":/results \
     {% for env_var in task.env_vars -%}
85
94
  )
86
95
 
87
96
  {% endif %}
97
+ fi
98
+
99
+
88
100
  {% endfor %}
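
The kill path and the generated run script now coordinate through a plain-text killed_jobs.txt in the invocation directory: LocalExecutor._add_to_killed_jobs appends job IDs, and the template skips any task whose ID appears in the file. A minimal Python sketch of the handshake (paths are illustrative; the real reader side is the grep in run.template.sh above):

    from pathlib import Path

    invocation_dir = Path("/tmp/inv-demo")
    invocation_dir.mkdir(exist_ok=True)
    killed_jobs_file = invocation_dir / "killed_jobs.txt"

    # Writer side (kill_job): append the killed job ID, one per line.
    with open(killed_jobs_file, "a") as f:
        f.write("abc12345.0\n")

    # Reader side (run script): exact-match the ID before launching the task.
    job_id = "abc12345.0"
    killed = killed_jobs_file.exists() and job_id in killed_jobs_file.read_text().splitlines()
    print("skip" if killed else "run")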

src/nemo_evaluator_launcher/executors/slurm/executor.py
@@ -389,7 +389,7 @@ class SlurmExecutor(BaseExecutor):
         """Kill a SLURM job.

         Args:
-            job_id: The job ID to kill.
+            job_id: The job ID (e.g., abc123.0) to kill.
         """
         db = ExecutionDB()
         job_data = db.get_job(job_id)
@@ -402,26 +402,31 @@ class SlurmExecutor(BaseExecutor):
                 f"Job {job_id} is not a slurm job (executor: {job_data.executor})"
             )

-        killed_something = False
-
-        result = _kill_slurm_job(
+        # OPTIMIZATION: Query status AND kill in ONE SSH call
+        slurm_status, result = _kill_slurm_job(
             slurm_job_ids=[job_data.data.get("slurm_job_id")],
             username=job_data.data.get("username"),
             hostname=job_data.data.get("hostname"),
             socket=job_data.data.get("socket"),
         )

+        # Mark job as killed in database if kill succeeded
         if result.returncode == 0:
-            killed_something = True
-
-        # Mark job as killed in database if we killed something
-        if killed_something:
             job_data.data["killed"] = True
             db.write_job(job_data)
         else:
-            raise RuntimeError(
-                f"Could not find or kill job {job_id} (slurm_job_id: {job_data.data.get('slurm_job_id')})"
+            # Use the pre-fetched status for better error message
+            current_status = None
+            if slurm_status:
+                current_status = SlurmExecutor._map_slurm_state_to_execution_state(
+                    slurm_status
+                )
+            error_msg = SlurmExecutor.get_kill_failure_message(
+                job_id,
+                f"slurm_job_id: {job_data.data.get('slurm_job_id')}",
+                current_status,
             )
+            raise RuntimeError(error_msg)


 def _create_slurm_sbatch_script(
@@ -880,34 +885,47 @@ def _query_slurm_jobs_status(

 def _kill_slurm_job(
     slurm_job_ids: List[str], username: str, hostname: str, socket: str | None
-) -> None:
-    """Kill a SLURM job.
+) -> tuple[str | None, subprocess.CompletedProcess]:
+    """Kill a SLURM job, querying status first in one SSH call for efficiency.

     Args:
         slurm_job_ids: List of SLURM job IDs to kill.
         username: SSH username.
         hostname: SSH hostname.
         socket: control socket location or None
+
+    Returns:
+        Tuple of (status_string, completed_process) where status_string is the SLURM status or None
     """
     if len(slurm_job_ids) == 0:
-        return {}
-    kill_command = "scancel {}".format(",".join(slurm_job_ids))
+        return None, subprocess.CompletedProcess(args=[], returncode=0)
+
+    jobs_str = ",".join(slurm_job_ids)
+    # Combine both commands in one SSH call: query THEN kill
+    combined_command = (
+        f"sacct -j {jobs_str} --format='JobID,State%32' --noheader -P 2>/dev/null; "
+        f"scancel {jobs_str}"
+    )
+
     ssh_command = ["ssh"]
     if socket is not None:
         ssh_command.append(f"-S {socket}")
     ssh_command.append(f"{username}@{hostname}")
-    ssh_command.append(kill_command)
+    ssh_command.append(combined_command)
     ssh_command = " ".join(ssh_command)
+
     completed_process = subprocess.run(
         args=shlex.split(ssh_command), capture_output=True
     )
-    if completed_process.returncode != 0:
-        raise RuntimeError(
-            "failed to kill slurm job\n{}".format(
-                completed_process.stderr.decode("utf-8")
-            )
-        )
-    return completed_process
+
+    # Parse the sacct output (before scancel runs)
+    sacct_output = completed_process.stdout.decode("utf-8")
+    sacct_output_lines = sacct_output.strip().split("\n")
+    slurm_status = None
+    if sacct_output_lines and len(slurm_job_ids) == 1:
+        slurm_status = _parse_slurm_job_status(slurm_job_ids[0], sacct_output_lines)
+
+    return slurm_status, completed_process


 def _parse_slurm_job_status(slurm_job_id: str, sacct_output_lines: List[str]) -> str:
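
The sacct half of the combined command emits pipe-delimited "JobID|State" pairs (the effect of -P with --noheader), which is what _parse_slurm_job_status receives. That function's body is not part of this diff; a hypothetical equivalent, just to illustrate the expected input format:

    def parse_status(slurm_job_id: str, sacct_output_lines: list[str]) -> str | None:
        # sacct -P output: one "JobID|State" pair per line, including .batch/.extern steps
        for line in sacct_output_lines:
            parts = line.split("|")
            if len(parts) >= 2 and parts[0] == slurm_job_id:
                return parts[1].strip()
        return None

    print(parse_status("12345", ["12345|RUNNING", "12345.batch|RUNNING"]))  # RUNNING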

src/nemo_evaluator_launcher/exporters/local.py
@@ -62,6 +62,7 @@ class LocalExporter(BaseExporter):
         """Export job artifacts to local directory."""
         # Merge auto-export + CLI config
         cfg = extract_exporter_config(job_data, "local", self.config)
+        skip_validation = bool(cfg.get("skip_validation", False))

         output_dir = Path(cfg.get("output_dir", "./nemo-evaluator-launcher-results"))
         job_export_dir = output_dir / job_data.invocation_id / job_data.job_id
@@ -78,24 +79,30 @@ class LocalExporter(BaseExporter):
             # Same as local_filesystem (we're on the remote machine, accessing locally)
             exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
         elif paths["storage_type"] == "remote_ssh":
-            exported_files = ssh_download_artifacts(
-                paths, job_export_dir, cfg, None
-            )
-        elif paths["storage_type"] == "gitlab_ci_local":
-            exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
-        elif paths["storage_type"] == "gitlab_remote":
-            raise NotImplementedError("Unsupported storage type")
-            # exported_files = self._download_gitlab_remote_artifacts(
-            #     paths, job_export_dir
-            # )
+            cp = ssh_setup_masters({job_data.job_id: job_data})
+            try:
+                exported_files = ssh_download_artifacts(
+                    paths, job_export_dir, cfg, cp
+                )
+            finally:
+                ssh_cleanup_masters(cp)
         else:
-            raise ValueError(
-                f"Cannot export from storage type: {paths['storage_type']}"
+            raise NotImplementedError(
+                f"Export not implemented for storage type: {paths['storage_type']}"
             )

         # Validate artifacts
         artifacts_dir = job_export_dir / "artifacts"
-        validation = validate_artifacts(artifacts_dir)
+        validation = (
+            validate_artifacts(artifacts_dir)
+            if not skip_validation
+            else {
+                "can_export": True,
+                "missing_required": [],
+                "missing_optional": [],
+                "message": "Validation skipped",
+            }
+        )

         # Save metadata
         self._save_job_metadata(job_data, job_export_dir)
@@ -271,10 +278,12 @@ class LocalExporter(BaseExporter):
     ) -> List[str]:
         exported_files: List[str] = []
         copy_logs = bool(cfg.get("copy_logs", False))
+        copy_artifacts = bool(cfg.get("copy_artifacts", True))
         only_required = bool(cfg.get("only_required", True))

+        # separate logic for artifacts and logs
         # artifacts/
-        if paths["artifacts_dir"].exists():
+        if copy_artifacts and paths["artifacts_dir"].exists():
             if only_required:
                 names = [
                     a
288
297
  shutil.copy2(src, dst)
289
298
  exported_files.append(str(dst))
290
299
  else:
291
- # Copy everything under artifacts/ recursively
300
+ # Restore recursive copy (test_copy_all_tree expects nested files)
292
301
  shutil.copytree(
293
302
  paths["artifacts_dir"], export_dir / "artifacts", dirs_exist_ok=True
294
303
  )
@@ -302,7 +311,7 @@ class LocalExporter(BaseExporter):

         # logs/
         # If only_required is False → always copy logs; otherwise respect copy_logs
-        if (not only_required or copy_logs) and paths["logs_dir"].exists():
+        if ((not only_required) or copy_logs) and paths["logs_dir"].exists():
             shutil.copytree(paths["logs_dir"], export_dir / "logs", dirs_exist_ok=True)
             exported_files.extend(
                 [str(f) for f in (export_dir / "logs").rglob("*") if f.is_file()]
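
Taken together, the local exporter now honors several knobs when copying results. An illustrative sketch of the merged config dict it consumes (key names follow the hunks above; how the dict is assembled from CLI flags and auto-export settings is outside this diff):

    cfg = {
        "output_dir": "./nemo-evaluator-launcher-results",  # export destination root
        "skip_validation": False,  # new: report "Validation skipped" instead of checking artifacts
        "copy_artifacts": True,    # new: set False for a logs-only export
        "copy_logs": False,        # copy logs/ even when only_required is True
        "only_required": True,     # restrict artifacts/ to the required file set
    }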