nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.67__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/api/functional.py +55 -5
- nemo_evaluator_launcher/api/types.py +21 -14
- nemo_evaluator_launcher/cli/ls_task.py +280 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +208 -55
- nemo_evaluator_launcher/cli/main.py +17 -2
- nemo_evaluator_launcher/cli/run.py +43 -52
- nemo_evaluator_launcher/common/container_metadata/__init__.py +61 -0
- nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py +530 -0
- nemo_evaluator_launcher/common/container_metadata/loading.py +1126 -0
- nemo_evaluator_launcher/common/container_metadata/registries.py +824 -0
- nemo_evaluator_launcher/common/container_metadata/utils.py +63 -0
- nemo_evaluator_launcher/common/helpers.py +44 -28
- nemo_evaluator_launcher/common/mapping.py +166 -177
- nemo_evaluator_launcher/common/printing_utils.py +18 -12
- nemo_evaluator_launcher/configs/deployment/nim.yaml +3 -1
- nemo_evaluator_launcher/executors/lepton/executor.py +26 -8
- nemo_evaluator_launcher/executors/local/executor.py +6 -2
- nemo_evaluator_launcher/executors/slurm/executor.py +270 -22
- nemo_evaluator_launcher/package_info.py +1 -1
- nemo_evaluator_launcher/resources/all_tasks_irs.yaml +17016 -0
- nemo_evaluator_launcher/resources/mapping.toml +62 -354
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/METADATA +2 -1
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/RECORD +27 -20
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/executors/lepton/executor.py

@@ -34,7 +34,7 @@ from nemo_evaluator_launcher.common.execdb import (
 from nemo_evaluator_launcher.common.helpers import get_eval_factory_command
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import red
@@ -293,8 +293,10 @@ class LeptonExecutor(BaseExecutor):
            return

        # Construct the full endpoint URL
-        task_definition =
-        task.name,
+        task_definition = get_task_definition_for_job(
+            task_query=task.name,
+            base_mapping=tasks_mapping,
+            container=task.get("container"),
        )
        task_endpoint_type = task_definition["endpoint_type"]
        endpoint_path = cfg.deployment.endpoints[task_endpoint_type]
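Note: the hunks in this file replace a positional lookup with keyword calls to get_task_definition_for_job(task_query=..., base_mapping=..., container=...). The sketch below only illustrates how such a resolver could behave; resolve_task_definition and the example image names are invented here, and the real implementation in nemo_evaluator_launcher.common.mapping may differ.

# Hypothetical stand-in for get_task_definition_for_job; only the keyword
# argument names (task_query, base_mapping, container) come from the diff.
from typing import Any, Dict, Optional


def resolve_task_definition(
    task_query: str,
    base_mapping: Dict[str, Dict[str, Any]],
    container: Optional[str] = None,
) -> Dict[str, Any]:
    if task_query not in base_mapping:
        raise KeyError(f"Unknown task: {task_query}")
    definition = dict(base_mapping[task_query])  # copy so the base mapping stays intact
    if container is not None:
        definition["container"] = container  # per-task image override
    return definition


# Made-up mapping entry: default container vs. an overridden one.
mapping = {"mmlu": {"container": "nvcr.io/example/eval:1.0", "endpoint_type": "chat"}}
print(resolve_task_definition("mmlu", mapping)["container"])                          # nvcr.io/example/eval:1.0
print(resolve_task_definition("mmlu", mapping, "my-registry/eval:dev")["container"])  # my-registry/eval:dev

The Slurm hunk further down additionally keeps its explicit "container" in task check on top of this call, so an explicit per-task image still wins.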
@@ -383,7 +385,11 @@ class LeptonExecutor(BaseExecutor):

        # Submit each evaluation task as a Lepton job
        for idx, task in enumerate(cfg.evaluation.tasks):
-            task_definition =
+            task_definition = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )

            # Create job ID and Lepton job name (max 36 chars)
            job_id = generate_job_id(invocation_id, idx)
@@ -889,9 +895,13 @@ def _dry_run_lepton(
 ) -> None:
     print("DRY RUN: Lepton job configurations prepared")
     try:
-        # validate tasks
+        # validate tasks (container overrides are supported)
         for task in cfg.evaluation.tasks:
-
+            _ = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )

         # nice-to-have checks (existing endpoint URL or endpoints mapping)
         if getattr(cfg.deployment, "type", None) == "none":
@@ -909,7 +919,11 @@ def _dry_run_lepton(
         else:
             endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
             for task in cfg.evaluation.tasks:
-                td =
+                td = get_task_definition_for_job(
+                    task_query=task.name,
+                    base_mapping=tasks_mapping,
+                    container=task.get("container"),
+                )
                 etype = td.get("endpoint_type")
                 if etype not in endpoints_cfg:
                     raise ValueError(
@@ -928,7 +942,11 @@ def _dry_run_lepton(
         getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
     )
     for task in cfg.evaluation.tasks:
-        td =
+        td = get_task_definition_for_job(
+            task_query=task.name,
+            base_mapping=tasks_mapping,
+            container=task.get("container"),
+        )
         required = td.get("required_env_vars", []) or []
         for var in required:
             # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic
nemo_evaluator_launcher/executors/local/executor.py

@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
 )
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -123,7 +123,11 @@ class LocalExecutor(BaseExecutor):

        for idx, task in enumerate(cfg.evaluation.tasks):
            timestamp = get_timestamp_string()
-            task_definition =
+            task_definition = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )

            if cfg.deployment.type != "none":
                # container name
nemo_evaluator_launcher/executors/slurm/executor.py

@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
 )
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -109,7 +109,11 @@ class SlurmExecutor(BaseExecutor):
        (local_task_subdir / "artifacts").mkdir()

        # resolve eval image and pass directly via task override
-        task_definition =
+        task_definition = get_task_definition_for_job(
+            task_query=task.name,
+            base_mapping=tasks_mapping,
+            container=task.get("container"),
+        )
        eval_image = task_definition["container"]
        if "container" in task:
            eval_image = task["container"]
@@ -201,6 +205,22 @@ class SlurmExecutor(BaseExecutor):
            hostname=cfg.execution.hostname,
            socket=socket,
        )
+
+        if socket_or_none is None:
+            raise RuntimeError(
+                f"Failed to connect to the cluster {cfg.execution.hostname} as user {cfg.execution.username}. "
+                "Please check your SSH configuration."
+            )
+
+        # Validate that all mount paths exist on the remote host
+        mount_paths = _collect_mount_paths(cfg)
+        _validate_remote_paths_exist(
+            paths=mount_paths,
+            username=cfg.execution.username,
+            hostname=cfg.execution.hostname,
+            socket=socket_or_none,
+        )
+
        _make_remote_execution_output_dir(
            dirpath=cfg.execution.output_dir,
            username=cfg.execution.username,
@@ -388,10 +408,10 @@ class SlurmExecutor(BaseExecutor):
        )
        statuses = []
        for i, slurm_job_id in enumerate(slurm_job_ids):
-            slurm_status = slurm_jobs_status[slurm_job_id]
+            slurm_status = slurm_jobs_status[slurm_job_id][0]
            if slurm_job_id in latest_slurm_job_ids:
                latest_slurm_job_id = latest_slurm_job_ids[slurm_job_id]
-                slurm_status = latest_slurm_jobs_status[latest_slurm_job_id]
+                slurm_status = latest_slurm_jobs_status[latest_slurm_job_id][0]
            progress = progress_list[i]
            progress = progress if progress is not None else "unknown"
            execution_state = SlurmExecutor._map_slurm_state_to_execution_state(
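Note: the two [0] indexing changes above follow from the new return type of _query_slurm_jobs_status (see the large hunk further down): each entry is now a (state, current_job_id) tuple rather than a bare state string. A minimal illustration of consuming that shape, with made-up job IDs:

# Illustrative only: unpacking the (state, current_job_id) tuples that
# _query_slurm_jobs_status now returns; the job IDs below are fabricated.
slurm_jobs_status = {"123456": ("RUNNING", "123789")}

for original_id, (state, current_id) in slurm_jobs_status.items():
    # current_id differs from original_id when an autoresume follow-up job took over
    print(f"{original_id}: state={state}, tracked via slurm job {current_id}")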
@@ -507,7 +527,11 @@ def _create_slurm_sbatch_script(
     """
     # get task from mapping, overrides, urls
     tasks_mapping = load_tasks_mapping()
-    task_definition =
+    task_definition = get_task_definition_for_job(
+        task_query=task.name,
+        base_mapping=tasks_mapping,
+        container=task.get("container"),
+    )

     # TODO(public release): convert to template
     s = "#!/bin/bash\n"
@@ -531,7 +555,8 @@ def _create_slurm_sbatch_script(
     )
     s += "#SBATCH --job-name {}\n".format(job_name)
     s += "#SBATCH --exclusive\n"
-    s += "#SBATCH --
+    s += "#SBATCH --no-requeue\n"  # We have our own auto-resume logic
+    s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.log")
     s += "\n"
     s += f'TASK_DIR="{str(remote_task_subdir)}"\n'
     s += "\n"
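Note: for context, a sketch of the sbatch preamble these lines now emit, using made-up values for job_name and remote_task_subdir; the real script builder adds further #SBATCH directives around this block.

# Illustrative reconstruction of the generated preamble with fabricated values.
from pathlib import PurePosixPath

job_name = "eval-job"
remote_task_subdir = PurePosixPath("/scratch/run/task-0")

s = "#!/bin/bash\n"
s += "#SBATCH --job-name {}\n".format(job_name)
s += "#SBATCH --exclusive\n"
s += "#SBATCH --no-requeue\n"  # launcher-managed auto-resume instead of Slurm requeue
s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.log")
print(s)
# #!/bin/bash
# #SBATCH --job-name eval-job
# #SBATCH --exclusive
# #SBATCH --no-requeue
# #SBATCH --output /scratch/run/task-0/logs/slurm-%A.log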
@@ -619,7 +644,7 @@ def _create_slurm_sbatch_script(
     s += deployment_srun_cmd

     # wait for the server to initialize
-    health_path = cfg.deployment.get("
+    health_path = cfg.deployment.endpoints.get("health", "/health")
     # For multi-instance check all node IPs, for single instance check localhost
     if cfg.deployment.get("multiple_instances", False):
         ip_list = '"${NODES_IPS_ARRAY[@]}"'
@@ -685,7 +710,7 @@ def _create_slurm_sbatch_script(

     s += "# evaluation client\n"
     s += "srun --mpi pmix --overlap "
-    s += "--nodes 1 --ntasks 1 "
+    s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
     s += "--container-image {} ".format(eval_image)
     evaluation_env_var_names = list(
         cfg.execution.get("env_vars", {}).get("evaluation", {})
@@ -696,7 +721,7 @@ def _create_slurm_sbatch_script(
         s += "--no-container-mount-home "

     s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
-    s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.
+    s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.log")
     s += "bash -c '\n"
     s += eval_factory_command
     s += "'\n\n"
@@ -810,15 +835,15 @@ def _generate_auto_export_section(

     s += " # export\n"
     s += " srun --mpi pmix --overlap "
-    s += "--nodes 1 --ntasks 1 "
+    s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
     s += "--container-image {} ".format(export_image)
     if export_env:
         s += "--container-env {} ".format(",".join(export_env))
     if not cfg.execution.get("mounts", {}).get("mount_home", True):
         s += "--no-container-mount-home "

-    s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts "
-    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.
+    s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts,{remote_task_subdir}/logs:{remote_task_subdir}/logs "
+    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.log")
     s += " bash -c '\n"
     # FIXME(martas): would be good to install specific version
     s += " pip install nemo-evaluator-launcher[all]\n"
@@ -984,8 +1009,121 @@ def _query_slurm_jobs_status(
     username: str,
     hostname: str,
     socket: str | None,
-) -> Dict[str, str]:
-    """Query SLURM for job statuses using sacct
+) -> Dict[str, tuple[str, str]]:
+    """Query SLURM for job statuses using squeue (for active jobs) and sacct (fallback).
+
+    This function first tries squeue which is more accurate for currently running jobs,
+    then falls back to sacct for completed/historical jobs that squeue doesn't show.
+    It also finds follow-up jobs (from autoresume) that depend on our known jobs.
+
+    Args:
+        slurm_job_ids: List of SLURM job IDs to query.
+        username: SSH username.
+        hostname: SSH hostname.
+        socket: control socket location or None
+
+    Returns:
+        Dict mapping from slurm_job_id to tuple of status, current_job_id.
+    """
+    if len(slurm_job_ids) == 0:
+        return {}
+
+    # First, try squeue for active jobs (more accurate for running jobs)
+    squeue_statuses = _query_squeue_for_jobs(slurm_job_ids, username, hostname, socket)
+
+    # For jobs not found in squeue, fall back to sacct
+    missing_jobs = [job_id for job_id in slurm_job_ids if job_id not in squeue_statuses]
+    sacct_statuses = {}
+
+    if missing_jobs:
+        sacct_statuses = _query_sacct_for_jobs(missing_jobs, username, hostname, socket)
+
+    # Combine results, preferring squeue data
+    combined_statuses = {**sacct_statuses, **squeue_statuses}
+
+    return combined_statuses
+
+
+def _query_squeue_for_jobs(
+    slurm_job_ids: List[str],
+    username: str,
+    hostname: str,
+    socket: str | None,
+) -> Dict[str, tuple[str, str]]:
+    """Query SLURM for active job statuses using squeue command.
+
+    This function finds:
+    1. Jobs that directly match our known job IDs
+    2. Follow-up jobs that depend on our known job IDs (from autoresume mechanism)
+
+    For follow-up jobs, returns the status mapped to the original job ID, along with
+    the actual current SLURM job ID.
+
+    Args:
+        slurm_job_ids: List of SLURM job IDs to query.
+        username: SSH username.
+        hostname: SSH hostname.
+        socket: control socket location or None
+
+    Returns:
+        Dict mapping from original slurm_job_id to tuple of status, current_job_id.
+    """
+    if len(slurm_job_ids) == 0:
+        return {}
+
+    # Use squeue to get active jobs - more accurate than sacct for running jobs
+    squeue_command = "squeue -u {} -h -o '%i|%T|%E'".format(username)
+
+    ssh_command = ["ssh"]
+    if socket is not None:
+        ssh_command.append(f"-S {socket}")
+    ssh_command.append(f"{username}@{hostname}")
+    ssh_command.append(squeue_command)
+    ssh_command = " ".join(ssh_command)
+
+    completed_process = subprocess.run(
+        args=shlex.split(ssh_command),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+
+    squeue_statuses = {}
+    dependent_jobs = []
+    if completed_process.returncode == 0:
+        squeue_output = completed_process.stdout.decode("utf-8")
+        squeue_output_lines = squeue_output.strip().split("\n")
+
+        for line in squeue_output_lines:
+            if not line.strip():
+                continue
+            parts = line.split("|")
+            if len(parts) >= 3:
+                job_id = parts[0].strip()
+                status = parts[1].strip()
+                dependency = parts[2].strip()
+                # Extract base job ID (handle array jobs like 123456_0 -> 123456)
+                base_job_id = job_id.split("_")[0].split("[")[0]
+                if base_job_id in slurm_job_ids:
+                    squeue_statuses[base_job_id] = status, base_job_id
+                elif dependency and dependency != "(null)":
+                    dependent_jobs.append((base_job_id, status, dependency))
+
+    for dep_job_id, dep_status, dependency in dependent_jobs:
+        for known_job_id in slurm_job_ids:
+            if known_job_id in dependency and known_job_id not in squeue_statuses:
+                squeue_statuses[known_job_id] = dep_status, dep_job_id
+                break
+
+    return squeue_statuses
+
+
+def _query_sacct_for_jobs(
+    slurm_job_ids: List[str],
+    username: str,
+    hostname: str,
+    socket: str | None,
+) -> Dict[str, tuple[str, str]]:
+    """Query SLURM for job statuses using sacct command (for completed/historical jobs).

     Args:
         slurm_job_ids: List of SLURM job IDs to query.
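Note: squeue's %i|%T|%E format yields one job_id|state|dependency line per job. The self-contained sketch below re-runs the same parsing logic on fabricated squeue output (the job IDs and dependency string are made up) to show how an autoresume follow-up job gets mapped back to the original ID:

# Fabricated squeue output: one directly-known job, one follow-up that depends
# on a known job via an afterany: dependency.
sample_output = """\
123456_0|RUNNING|(null)
987654|PENDING|afterany:123999(unfulfilled)
"""
known_job_ids = ["123456", "123999"]

statuses: dict[str, tuple[str, str]] = {}
dependent_jobs = []
for line in sample_output.strip().split("\n"):
    job_id, state, dependency = (part.strip() for part in line.split("|"))
    base_job_id = job_id.split("_")[0].split("[")[0]  # 123456_0 -> 123456
    if base_job_id in known_job_ids:
        statuses[base_job_id] = (state, base_job_id)
    elif dependency and dependency != "(null)":
        dependent_jobs.append((base_job_id, state, dependency))

for dep_job_id, dep_state, dependency in dependent_jobs:
    for known_job_id in known_job_ids:
        if known_job_id in dependency and known_job_id not in statuses:
            statuses[known_job_id] = (dep_state, dep_job_id)  # autoresume follow-up
            break

print(statuses)  # {'123456': ('RUNNING', '123456'), '123999': ('PENDING', '987654')}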
@@ -994,10 +1132,11 @@ def _query_slurm_jobs_status(
         socket: control socket location or None

     Returns:
-        Dict mapping from slurm_job_id to
+        Dict mapping from slurm_job_id to tuple of status, job_id.
     """
     if len(slurm_job_ids) == 0:
         return {}
+
     sacct_command = "sacct -j {} --format='JobID,State%32' --noheader -P".format(
         ",".join(slurm_job_ids)
     )
@@ -1024,7 +1163,7 @@ def _query_slurm_jobs_status(
     slurm_jobs_status = {}
     for slurm_job_id in slurm_job_ids:
         slurm_job_status = _parse_slurm_job_status(slurm_job_id, sacct_output_lines)
-        slurm_jobs_status[slurm_job_id] = slurm_job_status
+        slurm_jobs_status[slurm_job_id] = slurm_job_status, slurm_job_id
     return slurm_jobs_status


@@ -1239,9 +1378,11 @@ def _generate_haproxy_config_with_placeholders(cfg):
     for i in range(num_nodes):
         nodes.append({"ip": f"{{IP_{i}}}", "port": cfg.deployment.port})

-    # Get health check parameters
+    # Get health check parameters - prefer proxy config, fallback to deployment.endpoints.health
     proxy_config = cfg.execution.get("proxy", {}).get("config", {})
-    health_check_path = proxy_config.get(
+    health_check_path = proxy_config.get(
+        "health_check_path", cfg.deployment.endpoints.get("health", "/health")
+    )
     health_check_status = proxy_config.get("health_check_status", 200)
     haproxy_port = proxy_config.get("haproxy_port", 5009)

@@ -1276,7 +1417,7 @@ def _generate_haproxy_config(cfg, nodes_ips):
     )

     # Get health check parameters from deployment config
-    health_check_path = cfg.deployment.get("
+    health_check_path = cfg.deployment.endpoints.get("health", "/health")
     health_check_status = cfg.deployment.get("health_check_status", 200)
     haproxy_port = cfg.deployment.get("haproxy_port", 5009)

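Note: both health-check hunks above resolve the path the same way: an explicit proxy health_check_path if configured, otherwise deployment.endpoints.health, otherwise "/health". A small sketch of that fallback chain with plain dicts standing in for the OmegaConf config objects (the endpoint value is illustrative):

# Plain-dict stand-ins for the config objects used in the diff.
deployment = {"endpoints": {"health": "/v1/health/ready"}}  # made-up endpoint value
proxy_config = {}  # no explicit proxy override in this example

health_check_path = proxy_config.get(
    "health_check_path", deployment["endpoints"].get("health", "/health")
)
print(health_check_path)  # /v1/health/ready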
@@ -1336,7 +1477,7 @@ def _generate_deployment_srun_command(
     s += "--container-mounts {} ".format(",".join(deployment_mounts_list))
     if not cfg.execution.get("mounts", {}).get("mount_home", True):
         s += "--no-container-mount-home "
-    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.
+    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.log")

     deployment_env_var_names = list(
         cfg.execution.get("env_vars", {}).get("deployment", {})
@@ -1436,10 +1577,10 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
     s += "done\n"
     s += "\n"
     s += "srun --mpi pmix --overlap "
-    s += "--nodes 1 --ntasks 1 "
+    s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
     s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
     s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
-    s += f"--output {remote_task_subdir}/logs/proxy-%A.
+    s += f"--output {remote_task_subdir}/logs/proxy-%A.log "
     s += "haproxy -f /usr/local/etc/haproxy/haproxy.cfg &\n"
     s += "PROXY_PID=$! # capture the PID of the proxy background srun process\n"
     s += 'echo "Proxy started with PID: $PROXY_PID"\n\n'
@@ -1454,3 +1595,110 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
     s += "\n"

     return s
+
+
+def _collect_mount_paths(cfg: DictConfig) -> List[str]:
+    """Collect all mount source paths from the configuration.
+
+    Args:
+        cfg: The configuration object for the evaluation run.
+
+    Returns:
+        List of source paths that need to be mounted.
+    """
+    mount_paths = []
+
+    # Deployment mounts
+    if cfg.deployment.type != "none":
+        if checkpoint_path := cfg.deployment.get("checkpoint_path"):
+            mount_paths.append(checkpoint_path)
+        if cache_path := cfg.deployment.get("cache_path"):
+            mount_paths.append(cache_path)
+        for source_mnt in cfg.execution.get("mounts", {}).get("deployment", {}).keys():
+            mount_paths.append(source_mnt)
+
+    # Evaluation mounts
+    for source_mnt in cfg.execution.get("mounts", {}).get("evaluation", {}).keys():
+        mount_paths.append(source_mnt)
+
+    return mount_paths
+
+
+def _validate_remote_paths_exist(
+    paths: List[str],
+    username: str,
+    hostname: str,
+    socket: str | None,
+) -> None:
+    """Validate that all specified paths exist as directories on the remote host.
+
+    Args:
+        paths: List of directory paths to validate.
+        username: SSH username.
+        hostname: SSH hostname.
+        socket: control socket location or None
+
+    Raises:
+        ValueError: If any paths do not exist as directories on the remote host.
+    """
+    if not paths:
+        return
+
+    # Remove duplicates while preserving order
+    unique_paths = list(dict.fromkeys(paths))
+
+    # Build a single SSH command to check all paths at once
+    test_commands = []
+    for path in unique_paths:
+        # Use test -d to check if directory exists
+        # Escape single quotes in path using POSIX-safe method: ' becomes '"'"'
+        escaped_path = path.replace("'", "'\"'\"'")
+        test_commands.append(
+            f"test -d '{escaped_path}' && echo 'EXISTS:{path}' || echo 'MISSING:{path}'"
+        )
+
+    combined_command = " ; ".join(test_commands)
+
+    ssh_command = ["ssh"]
+    if socket is not None:
+        ssh_command.append(f"-S {socket}")
+    ssh_command.append(f"{username}@{hostname}")
+    ssh_command.append(combined_command)
+    ssh_command = " ".join(ssh_command)
+
+    logger.info("Validating mount directories exist on remote host", cmd=ssh_command)
+    completed_process = subprocess.run(
+        args=shlex.split(ssh_command),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+
+    if completed_process.returncode != 0:
+        error_msg = (
+            completed_process.stderr.decode("utf-8")
+            if completed_process.stderr
+            else "Unknown error"
+        )
+        logger.error(
+            "Error validating remote paths",
+            code=completed_process.returncode,
+            msg=error_msg,
+        )
+        raise RuntimeError(f"Failed to validate remote paths: {error_msg}")
+
+    # Parse output to find missing paths
+    output = completed_process.stdout.decode("utf-8")
+    missing_paths = []
+    for line in output.strip().split("\n"):
+        if line.startswith("MISSING:"):
+            missing_path = line.replace("MISSING:", "")
+            missing_paths.append(missing_path)
+
+    if missing_paths:
+        error_message = (
+            f"The following mount paths do not exist as directories on {username}@{hostname}:\n"
+            + "\n".join(f" - {path}" for path in missing_paths)
+            + "\n\nMount paths must be directories. Please create these directories on the cluster or update your configuration."
+        )
+        logger.error("Mount validation failed", missing_paths=missing_paths)
+        raise ValueError(error_message)