nemo-evaluator-launcher 0.1.14__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nemo-evaluator-launcher has been flagged as possibly problematic.
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/functional.py +19 -29
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/execution/local.yaml +1 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/base.py +23 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/executor.py +17 -71
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/local/executor.py +48 -7
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/local/run.template.sh +18 -6
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/slurm/executor.py +40 -22
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/local.py +25 -16
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/mlflow.py +168 -70
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/utils.py +85 -33
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/wandb.py +40 -5
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/README.md +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/pyproject.toml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/setup.cfg +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/types.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/api/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/export.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/main.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/run.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/status.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/cli/version.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.15}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
src/nemo_evaluator_launcher/api/functional.py:

@@ -440,23 +440,28 @@ def export_results(
         single_id = invocation_ids[0]

         if "." in single_id:  # job_id
+            # Try reading config from artifacts working dir (auto-export on remote node)
+            cfg_file = None
+            for name in ("run_config.yml", "config.yml"):
+                p = Path(name)
+                if p.exists():
+                    cfg_file = p
+                    break
+
             md_job_data = None
-
-            ypath_artifacts = Path("run_config.yml")
-            if ypath_artifacts.exists():
+            if cfg_file:
                 try:
                     cfg_yaml = (
-                        yaml.safe_load(
-                        or {}
+                        yaml.safe_load(cfg_file.read_text(encoding="utf-8")) or {}
                     )
-
+
+                    # Merge exporter override file if present
                     ypath_export = Path("export_config.yml")
                     if ypath_export.exists():
                         exp_yaml = (
                             yaml.safe_load(ypath_export.read_text(encoding="utf-8"))
                             or {}
                         )
-                        # execution.auto_export contains auto-export destinations
                         exec_cfg = cfg_yaml.get("execution") or {}
                         auto_exp = (exp_yaml.get("execution") or {}).get(
                             "auto_export"
@@ -464,42 +469,30 @@ def export_results(
                         if auto_exp is not None:
                             exec_cfg["auto_export"] = auto_exp
                             cfg_yaml["execution"] = exec_cfg
-
-                        # top-level export block contains exporter config
                         if "export" in exp_yaml:
                             cfg_yaml["export"] = exp_yaml["export"]
-
-                        # Merge evaluation.tasks from export_config (Slurm writes it there)
                         if "evaluation" in exp_yaml and exp_yaml["evaluation"]:
                             eval_cfg = cfg_yaml.get("evaluation") or {}
                             eval_cfg.update(exp_yaml["evaluation"])
                             cfg_yaml["evaluation"] = eval_cfg

-                    # metadata
                     executor_name = (cfg_yaml.get("execution") or {}).get(
                         "type", "local"
                     )
-
                     md_job_data = JobData(
                         invocation_id=single_id.split(".")[0],
                         job_id=single_id,
                         timestamp=0.0,
-                        executor=executor_name,
+                        executor=executor_name,  # ensures slurm tag is preserved
                         data={
                             "output_dir": str(Path.cwd().parent),
-                            "storage_type": "remote_local",
+                            "storage_type": "remote_local",  # no SSH in auto-export path
                         },
                         config=cfg_yaml,
                     )
-                    # DEBUG: print what we loaded
-                    print(f"DEBUG: cfg_yaml keys: {list(cfg_yaml.keys())}")
-                    if "evaluation" in cfg_yaml:
-                        print(
-                            f"DEBUG: evaluation.tasks: {cfg_yaml.get('evaluation', {}).get('tasks')}"
-                        )
                 except Exception:
                     md_job_data = None
-
+
             job_data = md_job_data or ExecutionDB().get_job(single_id)
             if job_data is None:
                 return {
@@ -507,7 +500,6 @@ def export_results(
                     "error": f"Job {single_id} not found in ExecutionDB",
                 }

-            # Convert single job result to invocation-like structure
             job_result = exporter.export_job(job_data)
             return {
                 "success": job_result.success,
@@ -522,10 +514,9 @@ def export_results(
                 },
                 "metadata": job_result.metadata or {},
             }
+
         elif single_id.isdigit():  # pipeline_id
-            # Find job by pipeline_id
             db = ExecutionDB()
-            # Search all jobs for matching pipeline_id
             for job_id, job_data in db._jobs.items():
                 if job_data.data.get("pipeline_id") == int(single_id):
                     job_result = exporter.export_job(job_data)
@@ -542,14 +533,13 @@ def export_results(
                         "metadata": job_result.metadata or {},
                     }
             return {"success": False, "error": f"Pipeline {single_id} not found"}
+
         else:  # invocation_id
             result = exporter.export_invocation(single_id)
-            # Ensure metadata is present in job results to prevent KeyError
             if "jobs" in result:
                 for job_id, job_result in result["jobs"].items():
-
-
-            return result  # type: ignore[no-any-return]
+                    job_result.setdefault("metadata", {})
+            return result
    else:
        # Multiple IDs - parse and group
        db = ExecutionDB()
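Taken together, this hunk lets auto-export running on a remote node rebuild a JobData record from config files left in the job's working directory (run_config.yml or config.yml, optionally overridden by export_config.yml) before falling back to the ExecutionDB. A standalone sketch of that discovery-and-merge step, re-implemented here purely for illustration (it is not the launcher's public API):

```python
from pathlib import Path
import yaml

# Look for the run config dropped next to the artifacts:
# run_config.yml is preferred, config.yml is the fallback.
cfg_file = next(
    (Path(n) for n in ("run_config.yml", "config.yml") if Path(n).exists()),
    None,
)
cfg_yaml = (
    (yaml.safe_load(cfg_file.read_text(encoding="utf-8")) or {}) if cfg_file else {}
)

# export_config.yml, when present, overrides three things:
# execution.auto_export, the top-level export block, and evaluation.*
export_file = Path("export_config.yml")
if export_file.exists():
    exp_yaml = yaml.safe_load(export_file.read_text(encoding="utf-8")) or {}
    auto_exp = (exp_yaml.get("execution") or {}).get("auto_export")
    if auto_exp is not None:
        exec_cfg = cfg_yaml.get("execution") or {}
        exec_cfg["auto_export"] = auto_exp
        cfg_yaml["execution"] = exec_cfg
    if "export" in exp_yaml:
        cfg_yaml["export"] = exp_yaml["export"]
    if exp_yaml.get("evaluation"):
        eval_cfg = cfg_yaml.get("evaluation") or {}
        eval_cfg.update(exp_yaml["evaluation"])
        cfg_yaml["evaluation"] = eval_cfg
```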
src/nemo_evaluator_launcher/configs/deployment/vllm.yaml:

@@ -21,6 +21,7 @@ port: 8000
 tensor_parallel_size: 8
 pipeline_parallel_size: 1
 data_parallel_size: 1
+gpu_memory_utilization: 0.95
 extra_args: ""
 env_vars: {}  # {name: value} dict

@@ -37,5 +38,5 @@ command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
   --trust-remote-code
   --served-model-name ${deployment.served_model_name}
   --enforce-eager
-  --gpu-memory-utilization
+  --gpu-memory-utilization ${deployment.gpu_memory_utilization}
   ${deployment.extra_args}
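The serve command now pulls the memory fraction from the new deployment.gpu_memory_utilization field instead of carrying it inline, so it can be overridden like any other deployment value. A minimal sketch of how that interpolation resolves, assuming OmegaConf semantics (the config above already uses ${...} and oc.select interpolation); the "flags" key below is a stand-in for illustration, not a real config field:

```python
from omegaconf import OmegaConf

# Trimmed-down stand-in for the deployment config above; only what is needed
# to show how ${deployment.gpu_memory_utilization} resolves inside a string.
cfg = OmegaConf.create(
    {
        "deployment": {
            "gpu_memory_utilization": 0.95,
            "flags": "--gpu-memory-utilization ${deployment.gpu_memory_utilization}",
        }
    }
)
print(OmegaConf.to_container(cfg, resolve=True)["deployment"]["flags"])
# --gpu-memory-utilization 0.95
```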
src/nemo_evaluator_launcher/executors/base.py:

@@ -95,3 +95,26 @@ class BaseExecutor(ABC):
             NotImplementedError: If not implemented by a subclass.
         """
         raise NotImplementedError("Subclasses must implement this method")
+
+    @staticmethod
+    def get_kill_failure_message(
+        job_id: str, container_or_id: str, status: Optional[ExecutionState] = None
+    ) -> str:
+        """Generate an informative error message when kill fails based on job status.
+
+        Args:
+            job_id: The job ID that failed to kill.
+            container_or_id: Container name, SLURM job ID, or other identifier.
+            status: Optional execution state of the job.
+
+        Returns:
+            str: An informative error message with job status context.
+        """
+        if status == ExecutionState.SUCCESS:
+            return f"Could not find or kill job {job_id} ({container_or_id}) - job already completed successfully"
+        elif status == ExecutionState.FAILED:
+            return f"Could not find or kill job {job_id} ({container_or_id}) - job already failed"
+        elif status == ExecutionState.KILLED:
+            return f"Could not find or kill job {job_id} ({container_or_id}) - job was already killed"
+        # Generic error message
+        return f"Could not find or kill job {job_id} ({container_or_id})"
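Usage sketch for the new shared helper that the Lepton, local, and Slurm executors call when a kill attempt finds nothing to kill. The import path is inferred from the file layout above, the container name is a made-up placeholder, and passing status=None exercises the generic fallback message:

```python
from nemo_evaluator_launcher.executors.base import BaseExecutor

msg = BaseExecutor.get_kill_failure_message(
    job_id="abc12345.0",
    container_or_id="container: eval-abc12345.0",  # placeholder identifier
    status=None,
)
print(msg)
# Could not find or kill job abc12345.0 (container: eval-abc12345.0)
```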
src/nemo_evaluator_launcher/executors/lepton/executor.py:

@@ -622,76 +622,14 @@ class LeptonExecutor(BaseExecutor):
     def kill_job(job_id: str) -> None:
         """Kill Lepton evaluation jobs and clean up endpoints.

-        For invocation IDs, this will kill all jobs and clean up all
-        dedicated endpoints created for the invocation.
-
         Args:
-            job_id: The job ID
+            job_id: The job ID to kill.

         Raises:
             ValueError: If job is not found or invalid.
             RuntimeError: If job cannot be killed.
         """
         db = ExecutionDB()
-
-        # If it looks like an invocation_id, kill all jobs for that invocation
-        if len(job_id) == 8 and "." not in job_id:
-            jobs = db.get_jobs(job_id)
-            if not jobs:
-                raise ValueError(f"No jobs found for invocation {job_id}")
-
-            endpoint_names = (
-                set()
-            )  # Use set to avoid duplicates (though each should be unique)
-            lepton_job_names = []
-
-            # Collect all Lepton jobs and endpoint info
-            for curr_job_data in jobs.values():
-                if curr_job_data.executor != "lepton":
-                    continue
-
-                # Collect endpoint name for this job (each task may have its own)
-                endpoint_name = curr_job_data.data.get("endpoint_name")
-                if endpoint_name:
-                    endpoint_names.add(endpoint_name)
-
-                lepton_job_name = curr_job_data.data.get("lepton_job_name")
-                if lepton_job_name:
-                    lepton_job_names.append(lepton_job_name)
-
-                # Mark job as killed in database
-                curr_job_data.data["status"] = "killed"
-                curr_job_data.data["killed_time"] = time.time()
-                db.write_job(curr_job_data)
-
-            print(
-                f"🛑 Killing {len(lepton_job_names)} Lepton jobs for invocation {job_id}"
-            )
-
-            # Cancel all Lepton jobs
-            for lepton_job_name in lepton_job_names:
-                success = delete_lepton_job(lepton_job_name)
-                if success:
-                    print(f"✅ Cancelled Lepton job: {lepton_job_name}")
-                else:
-                    print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
-
-            # Clean up all dedicated endpoints
-            if endpoint_names:
-                print(f"🧹 Cleaning up {len(endpoint_names)} dedicated endpoints")
-                for endpoint_name in endpoint_names:
-                    success = delete_lepton_endpoint(endpoint_name)
-                    if success:
-                        print(f"✅ Cleaned up endpoint: {endpoint_name}")
-                    else:
-                        print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
-            else:
-                print("📌 No dedicated endpoints to clean up (using shared endpoint)")
-
-            print(f"🛑 Killed all resources for invocation {job_id}")
-            return
-
-        # Otherwise, treat as individual job_id
         job_data = db.get_job(job_id)
         if job_data is None:
             raise ValueError(f"Job {job_id} not found")
@@ -703,17 +641,25 @@ class LeptonExecutor(BaseExecutor):

         # Cancel the specific Lepton job
         lepton_job_name = job_data.data.get("lepton_job_name")
+
         if lepton_job_name:
-
-            if
+            cancel_success = delete_lepton_job(lepton_job_name)
+            if cancel_success:
                 print(f"✅ Cancelled Lepton job: {lepton_job_name}")
+                # Mark job as killed in database
+                job_data.data["status"] = "killed"
+                job_data.data["killed_time"] = time.time()
+                db.write_job(job_data)
             else:
-
-
-
-
-
-
+                # Use common helper to get informative error message based on job status
+                status_list = LeptonExecutor.get_status(job_id)
+                current_status = status_list[0].state if status_list else None
+                error_msg = LeptonExecutor.get_kill_failure_message(
+                    job_id, f"lepton_job: {lepton_job_name}", current_status
+                )
+                raise RuntimeError(error_msg)
+        else:
+            raise ValueError(f"No Lepton job name found for job {job_id}")

         print(f"🛑 Killed Lepton job {job_id}")

src/nemo_evaluator_launcher/executors/local/executor.py:

@@ -415,10 +415,10 @@ class LocalExecutor(BaseExecutor):

     @staticmethod
     def kill_job(job_id: str) -> None:
-        """Kill a local job
+        """Kill a local job.

         Args:
-            job_id: The job ID to kill.
+            job_id: The job ID (e.g., abc123.0) to kill.

         Raises:
             ValueError: If job is not found or invalid.
@@ -463,14 +463,55 @@ class LocalExecutor(BaseExecutor):
         if result.returncode == 0:
             killed_something = True

-        #
+        # If we successfully killed something, mark as killed
         if killed_something:
             job_data.data["killed"] = True
             db.write_job(job_data)
-
-
-
-
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # If nothing was killed, check if this is a pending job
+        status_list = LocalExecutor.get_status(job_id)
+        if status_list and status_list[0].state == ExecutionState.PENDING:
+            # For pending jobs, mark as killed even though there's nothing to kill yet
+            job_data.data["killed"] = True
+            db.write_job(job_data)
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # Use common helper to get informative error message based on job status
+        current_status = status_list[0].state if status_list else None
+        error_msg = LocalExecutor.get_kill_failure_message(
+            job_id, f"container: {container_name}", current_status
+        )
+        raise RuntimeError(error_msg)
+
+    @staticmethod
+    def _add_to_killed_jobs(invocation_id: str, job_id: str) -> None:
+        """Add a job ID to the killed jobs file for this invocation.
+
+        Args:
+            invocation_id: The invocation ID.
+            job_id: The job ID to mark as killed.
+        """
+        db = ExecutionDB()
+        jobs = db.get_jobs(invocation_id)
+        if not jobs:
+            return
+
+        # Get invocation output directory from any job's output_dir
+        first_job_data = next(iter(jobs.values()))
+        job_output_dir = pathlib.Path(first_job_data.data.get("output_dir", ""))
+        if not job_output_dir.exists():
+            return
+
+        # Invocation dir is parent of job output dir
+        invocation_dir = job_output_dir.parent
+        killed_jobs_file = invocation_dir / "killed_jobs.txt"
+
+        # Append job_id to file
+        with open(killed_jobs_file, "a") as f:
+            f.write(f"{job_id}\n")


 def _get_progress(artifacts_dir: pathlib.Path) -> Optional[float]:
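The new bookkeeping is a plain killed_jobs.txt file in the invocation directory: kill_job appends the job ID, and the generated run script (next diff) greps the file before starting each task so jobs killed while still pending are skipped. A small Python sketch of that handshake, with illustrative paths (the real writer is _add_to_killed_jobs above; the real reader is the shell template):

```python
from pathlib import Path

# Illustrative invocation directory; in the launcher this is the parent of the
# job's output_dir.
invocation_dir = Path("/tmp/example-invocation")
invocation_dir.mkdir(parents=True, exist_ok=True)
killed_jobs_file = invocation_dir / "killed_jobs.txt"


def mark_killed(job_id: str) -> None:
    # Mirrors _add_to_killed_jobs: append one job ID per line.
    with open(killed_jobs_file, "a") as f:
        f.write(f"{job_id}\n")


def was_killed(job_id: str) -> bool:
    # Mirrors the grep the run template performs before launching a task.
    if not killed_jobs_file.exists():
        return False
    return job_id in killed_jobs_file.read_text().splitlines()


mark_killed("abc12345.0")
print(was_killed("abc12345.0"))  # True
print(was_killed("abc12345.1"))  # False
```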
src/nemo_evaluator_launcher/executors/local/run.template.sh:

@@ -17,6 +17,11 @@
 # check if docker exists
 command -v docker >/dev/null 2>&1 || { echo 'docker not found'; exit 1; }

+# Initialize: remove killed jobs file from previous runs
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+killed_jobs_file="$script_dir/killed_jobs.txt"
+rm -f "$killed_jobs_file"
+
 {% for task in evaluation_tasks %}
 # {{ task.job_id }} {{ task.name }}

@@ -28,13 +33,17 @@ mkdir -m 777 -p "$task_dir"
 mkdir -m 777 -p "$artifacts_dir"
 mkdir -m 777 -p "$logs_dir"

-#
-
+# Check if this job was killed
+if [ -f "$killed_jobs_file" ] && grep -q "^{{ task.job_id }}$" "$killed_jobs_file"; then
+  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} ({{ task.name }}) was killed, skipping execution" | tee -a "$logs_dir/stdout.log"
+else
+  # Create pre-start stage file
+  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"

-# Docker run with eval factory command
-(
-
-
+  # Docker run with eval factory command
+  (
+    echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
+    docker run --rm --shm-size=100g {{ extra_docker_args }} \
     --name {{ task.container_name }} \
     --volume "$artifacts_dir":/results \
     {% for env_var in task.env_vars -%}
@@ -85,4 +94,7 @@ echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
 )

 {% endif %}
+fi
+
+
 {% endfor %}
src/nemo_evaluator_launcher/executors/slurm/executor.py:

@@ -389,7 +389,7 @@ class SlurmExecutor(BaseExecutor):
         """Kill a SLURM job.

         Args:
-            job_id: The job ID to kill.
+            job_id: The job ID (e.g., abc123.0) to kill.
         """
         db = ExecutionDB()
         job_data = db.get_job(job_id)
@@ -402,26 +402,31 @@ class SlurmExecutor(BaseExecutor):
                 f"Job {job_id} is not a slurm job (executor: {job_data.executor})"
             )

-
-
-        result = _kill_slurm_job(
+        # OPTIMIZATION: Query status AND kill in ONE SSH call
+        slurm_status, result = _kill_slurm_job(
             slurm_job_ids=[job_data.data.get("slurm_job_id")],
             username=job_data.data.get("username"),
             hostname=job_data.data.get("hostname"),
             socket=job_data.data.get("socket"),
         )

+        # Mark job as killed in database if kill succeeded
         if result.returncode == 0:
-            killed_something = True
-
-        # Mark job as killed in database if we killed something
-        if killed_something:
             job_data.data["killed"] = True
             db.write_job(job_data)
         else:
-
-
+            # Use the pre-fetched status for better error message
+            current_status = None
+            if slurm_status:
+                current_status = SlurmExecutor._map_slurm_state_to_execution_state(
+                    slurm_status
+                )
+            error_msg = SlurmExecutor.get_kill_failure_message(
+                job_id,
+                f"slurm_job_id: {job_data.data.get('slurm_job_id')}",
+                current_status,
             )
+            raise RuntimeError(error_msg)


     def _create_slurm_sbatch_script(
@@ -880,34 +885,47 @@ def _query_slurm_jobs_status(

 def _kill_slurm_job(
     slurm_job_ids: List[str], username: str, hostname: str, socket: str | None
-) -> None:
-    """Kill a SLURM job.
+) -> tuple[str | None, subprocess.CompletedProcess]:
+    """Kill a SLURM job, querying status first in one SSH call for efficiency.

     Args:
         slurm_job_ids: List of SLURM job IDs to kill.
         username: SSH username.
         hostname: SSH hostname.
         socket: control socket location or None
+
+    Returns:
+        Tuple of (status_string, completed_process) where status_string is the SLURM status or None
     """
     if len(slurm_job_ids) == 0:
-        return
-
+        return None, subprocess.CompletedProcess(args=[], returncode=0)
+
+    jobs_str = ",".join(slurm_job_ids)
+    # Combine both commands in one SSH call: query THEN kill
+    combined_command = (
+        f"sacct -j {jobs_str} --format='JobID,State%32' --noheader -P 2>/dev/null; "
+        f"scancel {jobs_str}"
+    )
+
     ssh_command = ["ssh"]
     if socket is not None:
         ssh_command.append(f"-S {socket}")
     ssh_command.append(f"{username}@{hostname}")
-    ssh_command.append(
+    ssh_command.append(combined_command)
     ssh_command = " ".join(ssh_command)
+
     completed_process = subprocess.run(
         args=shlex.split(ssh_command), capture_output=True
     )
-
-
-
-
-
-
-
+
+    # Parse the sacct output (before scancel runs)
+    sacct_output = completed_process.stdout.decode("utf-8")
+    sacct_output_lines = sacct_output.strip().split("\n")
+    slurm_status = None
+    if sacct_output_lines and len(slurm_job_ids) == 1:
+        slurm_status = _parse_slurm_job_status(slurm_job_ids[0], sacct_output_lines)
+
+    return slurm_status, completed_process


 def _parse_slurm_job_status(slurm_job_id: str, sacct_output_lines: List[str]) -> str:
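The refactored _kill_slurm_job now issues a single remote command that queries sacct and then runs scancel, so the status needed for the failure message comes back in the same SSH round trip. An illustration of the command it assembles, with a placeholder job ID:

```python
# Same string-building logic as the diff above, shown in isolation.
slurm_job_ids = ["4242"]  # placeholder SLURM job ID
jobs_str = ",".join(slurm_job_ids)
combined_command = (
    f"sacct -j {jobs_str} --format='JobID,State%32' --noheader -P 2>/dev/null; "
    f"scancel {jobs_str}"
)
print(combined_command)
# sacct -j 4242 --format='JobID,State%32' --noheader -P 2>/dev/null; scancel 4242
#
# With --noheader and -P, a typical sacct line the status parser would then see
# is pipe-separated, e.g. "4242|RUNNING".
```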
src/nemo_evaluator_launcher/exporters/local.py:

@@ -62,6 +62,7 @@ class LocalExporter(BaseExporter):
         """Export job artifacts to local directory."""
         # Merge auto-export + CLI config
         cfg = extract_exporter_config(job_data, "local", self.config)
+        skip_validation = bool(cfg.get("skip_validation", False))

         output_dir = Path(cfg.get("output_dir", "./nemo-evaluator-launcher-results"))
         job_export_dir = output_dir / job_data.invocation_id / job_data.job_id
@@ -78,24 +79,30 @@ class LocalExporter(BaseExporter):
             # Same as local_filesystem (we're on the remote machine, accessing locally)
             exported_files = self._copy_local_artifacts(paths, job_export_dir, cfg)
         elif paths["storage_type"] == "remote_ssh":
-
-
-
-
-
-
-
-            # exported_files = self._download_gitlab_remote_artifacts(
-            #     paths, job_export_dir
-            # )
+            cp = ssh_setup_masters({job_data.job_id: job_data})
+            try:
+                exported_files = ssh_download_artifacts(
+                    paths, job_export_dir, cfg, cp
+                )
+            finally:
+                ssh_cleanup_masters(cp)
         else:
-            raise
-                f"
+            raise NotImplementedError(
+                f"Export not implemented for storage type: {paths['storage_type']}"
             )

         # Validate artifacts
         artifacts_dir = job_export_dir / "artifacts"
-        validation =
+        validation = (
+            validate_artifacts(artifacts_dir)
+            if not skip_validation
+            else {
+                "can_export": True,
+                "missing_required": [],
+                "missing_optional": [],
+                "message": "Validation skipped",
+            }
+        )

         # Save metadata
         self._save_job_metadata(job_data, job_export_dir)
@@ -271,10 +278,12 @@ class LocalExporter(BaseExporter):
     ) -> List[str]:
         exported_files: List[str] = []
         copy_logs = bool(cfg.get("copy_logs", False))
+        copy_artifacts = bool(cfg.get("copy_artifacts", True))
         only_required = bool(cfg.get("only_required", True))

+        # separate logic for artifacts and logs
         # artifacts/
-        if paths["artifacts_dir"].exists():
+        if copy_artifacts and paths["artifacts_dir"].exists():
            if only_required:
                names = [
                    a
@@ -288,7 +297,7 @@ class LocalExporter(BaseExporter):
                 shutil.copy2(src, dst)
                 exported_files.append(str(dst))
             else:
-                #
+                # Restore recursive copy (test_copy_all_tree expects nested files)
                 shutil.copytree(
                     paths["artifacts_dir"], export_dir / "artifacts", dirs_exist_ok=True
                 )
@@ -302,7 +311,7 @@ class LocalExporter(BaseExporter):

         # logs/
         # If only_required is False → always copy logs; otherwise respect copy_logs
-        if (not only_required or copy_logs) and paths["logs_dir"].exists():
+        if ((not only_required) or copy_logs) and paths["logs_dir"].exists():
             shutil.copytree(paths["logs_dir"], export_dir / "logs", dirs_exist_ok=True)
             exported_files.extend(
                 [str(f) for f in (export_dir / "logs").rglob("*") if f.is_file()]