nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.67__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registries they were published to. It is provided for informational purposes only.
Files changed (27)
  1. nemo_evaluator_launcher/api/functional.py +55 -5
  2. nemo_evaluator_launcher/api/types.py +21 -14
  3. nemo_evaluator_launcher/cli/ls_task.py +280 -0
  4. nemo_evaluator_launcher/cli/ls_tasks.py +208 -55
  5. nemo_evaluator_launcher/cli/main.py +17 -2
  6. nemo_evaluator_launcher/cli/run.py +43 -52
  7. nemo_evaluator_launcher/common/container_metadata/__init__.py +61 -0
  8. nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py +530 -0
  9. nemo_evaluator_launcher/common/container_metadata/loading.py +1126 -0
  10. nemo_evaluator_launcher/common/container_metadata/registries.py +824 -0
  11. nemo_evaluator_launcher/common/container_metadata/utils.py +63 -0
  12. nemo_evaluator_launcher/common/helpers.py +44 -28
  13. nemo_evaluator_launcher/common/mapping.py +166 -177
  14. nemo_evaluator_launcher/common/printing_utils.py +18 -12
  15. nemo_evaluator_launcher/configs/deployment/nim.yaml +3 -1
  16. nemo_evaluator_launcher/executors/lepton/executor.py +26 -8
  17. nemo_evaluator_launcher/executors/local/executor.py +6 -2
  18. nemo_evaluator_launcher/executors/slurm/executor.py +270 -22
  19. nemo_evaluator_launcher/package_info.py +1 -1
  20. nemo_evaluator_launcher/resources/all_tasks_irs.yaml +17016 -0
  21. nemo_evaluator_launcher/resources/mapping.toml +62 -354
  22. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/METADATA +2 -1
  23. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/RECORD +27 -20
  24. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/WHEEL +0 -0
  25. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/entry_points.txt +0 -0
  26. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/licenses/LICENSE +0 -0
  27. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.67.dist-info}/top_level.txt +0 -0
@@ -34,7 +34,7 @@ from nemo_evaluator_launcher.common.execdb import (
  from nemo_evaluator_launcher.common.helpers import get_eval_factory_command
  from nemo_evaluator_launcher.common.logging_utils import logger
  from nemo_evaluator_launcher.common.mapping import (
- get_task_from_mapping,
+ get_task_definition_for_job,
  load_tasks_mapping,
  )
  from nemo_evaluator_launcher.common.printing_utils import red
@@ -293,8 +293,10 @@ class LeptonExecutor(BaseExecutor):
  return

  # Construct the full endpoint URL
- task_definition = get_task_from_mapping(
- task.name, tasks_mapping
+ task_definition = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
  )
  task_endpoint_type = task_definition["endpoint_type"]
  endpoint_path = cfg.deployment.endpoints[task_endpoint_type]
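
For illustration, the replacement helper is called with keyword arguments and an optional per-task container override; a minimal sketch of the call pattern, assuming a hypothetical task name and image (the helper's implementation in common/mapping.py is not part of this excerpt):

    # Sketch of the call pattern used by the executors in this release; the task
    # name and container image below are invented placeholders.
    from nemo_evaluator_launcher.common.mapping import (
        get_task_definition_for_job,
        load_tasks_mapping,
    )

    tasks_mapping = load_tasks_mapping()
    task_definition = get_task_definition_for_job(
        task_query="mmlu_pro",                    # hypothetical task name
        base_mapping=tasks_mapping,
        container="nvcr.io/example/eval:latest",  # optional per-task override, may be None
    )
    print(task_definition["endpoint_type"], task_definition["container"])

Passing task.get("container") through the lookup is what lets an individual task in a run pin its own evaluation image, as the SLURM executor hunks below also show.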
@@ -383,7 +385,11 @@ class LeptonExecutor(BaseExecutor):

  # Submit each evaluation task as a Lepton job
  for idx, task in enumerate(cfg.evaluation.tasks):
- task_definition = get_task_from_mapping(task.name, tasks_mapping)
+ task_definition = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
+ )

  # Create job ID and Lepton job name (max 36 chars)
  job_id = generate_job_id(invocation_id, idx)
@@ -889,9 +895,13 @@ def _dry_run_lepton(
  ) -> None:
  print("DRY RUN: Lepton job configurations prepared")
  try:
- # validate tasks
+ # validate tasks (container overrides are supported)
  for task in cfg.evaluation.tasks:
- get_task_from_mapping(task.name, tasks_mapping)
+ _ = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
+ )

  # nice-to-have checks (existing endpoint URL or endpoints mapping)
  if getattr(cfg.deployment, "type", None) == "none":
@@ -909,7 +919,11 @@ def _dry_run_lepton(
  else:
  endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
  for task in cfg.evaluation.tasks:
- td = get_task_from_mapping(task.name, tasks_mapping)
+ td = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
+ )
  etype = td.get("endpoint_type")
  if etype not in endpoints_cfg:
  raise ValueError(
@@ -928,7 +942,11 @@ def _dry_run_lepton(
  getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
  )
  for task in cfg.evaluation.tasks:
- td = get_task_from_mapping(task.name, tasks_mapping)
+ td = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
+ )
  required = td.get("required_env_vars", []) or []
  for var in required:
  # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic
@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
  )
  from nemo_evaluator_launcher.common.logging_utils import logger
  from nemo_evaluator_launcher.common.mapping import (
- get_task_from_mapping,
+ get_task_definition_for_job,
  load_tasks_mapping,
  )
  from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -123,7 +123,11 @@ class LocalExecutor(BaseExecutor):

  for idx, task in enumerate(cfg.evaluation.tasks):
  timestamp = get_timestamp_string()
- task_definition = get_task_from_mapping(task.name, tasks_mapping)
+ task_definition = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
+ )

  if cfg.deployment.type != "none":
  # container name
@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
  )
  from nemo_evaluator_launcher.common.logging_utils import logger
  from nemo_evaluator_launcher.common.mapping import (
- get_task_from_mapping,
+ get_task_definition_for_job,
  load_tasks_mapping,
  )
  from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -109,7 +109,11 @@ class SlurmExecutor(BaseExecutor):
  (local_task_subdir / "artifacts").mkdir()

  # resolve eval image and pass directly via task override
- task_definition = get_task_from_mapping(task.name, tasks_mapping)
+ task_definition = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
+ )
  eval_image = task_definition["container"]
  if "container" in task:
  eval_image = task["container"]
@@ -201,6 +205,22 @@ class SlurmExecutor(BaseExecutor):
  hostname=cfg.execution.hostname,
  socket=socket,
  )
+
+ if socket_or_none is None:
+ raise RuntimeError(
+ f"Failed to connect to the cluster {cfg.execution.hostname} as user {cfg.execution.username}. "
+ "Please check your SSH configuration."
+ )
+
+ # Validate that all mount paths exist on the remote host
+ mount_paths = _collect_mount_paths(cfg)
+ _validate_remote_paths_exist(
+ paths=mount_paths,
+ username=cfg.execution.username,
+ hostname=cfg.execution.hostname,
+ socket=socket_or_none,
+ )
+
  _make_remote_execution_output_dir(
  dirpath=cfg.execution.output_dir,
  username=cfg.execution.username,
@@ -388,10 +408,10 @@ class SlurmExecutor(BaseExecutor):
  )
  statuses = []
  for i, slurm_job_id in enumerate(slurm_job_ids):
- slurm_status = slurm_jobs_status[slurm_job_id]
+ slurm_status = slurm_jobs_status[slurm_job_id][0]
  if slurm_job_id in latest_slurm_job_ids:
  latest_slurm_job_id = latest_slurm_job_ids[slurm_job_id]
- slurm_status = latest_slurm_jobs_status[latest_slurm_job_id]
+ slurm_status = latest_slurm_jobs_status[latest_slurm_job_id][0]
  progress = progress_list[i]
  progress = progress if progress is not None else "unknown"
  execution_state = SlurmExecutor._map_slurm_state_to_execution_state(
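
The status dictionaries consumed here now hold a tuple rather than a bare state string (see the _query_* hunks further down), which is why the executor indexes [0]. A sketch of the new shape, with invented job IDs and states:

    # Illustrative only: the job IDs and states below are invented.
    # Each entry maps the originally submitted SLURM job ID to (state, current_job_id);
    # current_job_id differs when an auto-resume follow-up job has taken over.
    slurm_jobs_status = {
        "1234567": ("RUNNING", "1234567"),   # still running under its original ID
        "1234568": ("PENDING", "1240001"),   # follow-up job queued via a dependency
    }

    for job_id, (state, current_id) in slurm_jobs_status.items():
        print(job_id, state, current_id)     # the executor keeps only the state via [0]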
@@ -507,7 +527,11 @@ def _create_slurm_sbatch_script(
  """
  # get task from mapping, overrides, urls
  tasks_mapping = load_tasks_mapping()
- task_definition = get_task_from_mapping(task.name, tasks_mapping)
+ task_definition = get_task_definition_for_job(
+ task_query=task.name,
+ base_mapping=tasks_mapping,
+ container=task.get("container"),
+ )

  # TODO(public release): convert to template
  s = "#!/bin/bash\n"
@@ -531,7 +555,8 @@ def _create_slurm_sbatch_script(
  )
  s += "#SBATCH --job-name {}\n".format(job_name)
  s += "#SBATCH --exclusive\n"
- s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.out")
+ s += "#SBATCH --no-requeue\n" # We have our own auto-resume logic
+ s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.log")
  s += "\n"
  s += f'TASK_DIR="{str(remote_task_subdir)}"\n'
  s += "\n"
@@ -619,7 +644,7 @@ def _create_slurm_sbatch_script(
  s += deployment_srun_cmd

  # wait for the server to initialize
- health_path = cfg.deployment.get("health_check_path", "/health")
+ health_path = cfg.deployment.endpoints.get("health", "/health")
  # For multi-instance check all node IPs, for single instance check localhost
  if cfg.deployment.get("multiple_instances", False):
  ip_list = '"${NODES_IPS_ARRAY[@]}"'
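
The health-check path is now taken from the deployment's endpoints mapping instead of a dedicated health_check_path key; the configs/deployment/nim.yaml change listed above most likely adds such an entry, though its contents are not shown here. A minimal sketch of the lookup, assuming an invented endpoints mapping:

    # Minimal sketch, assuming a deployment config with an `endpoints` mapping;
    # the endpoint paths below are invented, only the lookup order mirrors the diff.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "deployment": {
                "endpoints": {
                    "chat": "/v1/chat/completions",
                    "health": "/v1/health/ready",
                }
            }
        }
    )

    health_path = cfg.deployment.endpoints.get("health", "/health")
    print(health_path)  # -> /v1/health/ready

The HAProxy hunks below keep an explicit proxy health_check_path override and only fall back to deployment.endpoints.health when it is absent.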
@@ -685,7 +710,7 @@ def _create_slurm_sbatch_script(

  s += "# evaluation client\n"
  s += "srun --mpi pmix --overlap "
- s += "--nodes 1 --ntasks 1 " # Client always runs on single node
+ s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
  s += "--container-image {} ".format(eval_image)
  evaluation_env_var_names = list(
  cfg.execution.get("env_vars", {}).get("evaluation", {})
@@ -696,7 +721,7 @@ def _create_slurm_sbatch_script(
  s += "--no-container-mount-home "

  s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
- s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.out")
+ s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.log")
  s += "bash -c '\n"
  s += eval_factory_command
  s += "'\n\n"
@@ -810,15 +835,15 @@ def _generate_auto_export_section(

  s += " # export\n"
  s += " srun --mpi pmix --overlap "
- s += "--nodes 1 --ntasks 1 " # Client always runs on single node
+ s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
  s += "--container-image {} ".format(export_image)
  if export_env:
  s += "--container-env {} ".format(",".join(export_env))
  if not cfg.execution.get("mounts", {}).get("mount_home", True):
  s += "--no-container-mount-home "

- s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts "
- s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.out")
+ s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts,{remote_task_subdir}/logs:{remote_task_subdir}/logs "
+ s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.log")
  s += " bash -c '\n"
  # FIXME(martas): would be good to install specific version
  s += " pip install nemo-evaluator-launcher[all]\n"
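
Besides pinning the export step to the first allocated node, the hunk above also bind-mounts the task's logs directory next to artifacts, presumably so exported results can include the run logs. An illustrative reconstruction of the resulting --container-mounts value, with a placeholder task directory:

    # Illustrative reconstruction of the --container-mounts string assembled above;
    # the remote task directory is a placeholder.
    from pathlib import PurePosixPath

    remote_task_subdir = PurePosixPath("/scratch/results/abc123.0.sample_task")
    mounts = (
        f"{remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts,"
        f"{remote_task_subdir}/logs:{remote_task_subdir}/logs"
    )
    print(mounts)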
@@ -984,8 +1009,121 @@ def _query_slurm_jobs_status(
  username: str,
  hostname: str,
  socket: str | None,
- ) -> Dict[str, str]:
- """Query SLURM for job statuses using sacct command.
+ ) -> Dict[str, tuple[str, str]]:
+ """Query SLURM for job statuses using squeue (for active jobs) and sacct (fallback).
+
+ This function first tries squeue which is more accurate for currently running jobs,
+ then falls back to sacct for completed/historical jobs that squeue doesn't show.
+ It also finds follow-up jobs (from autoresume) that depend on our known jobs.
+
+ Args:
+ slurm_job_ids: List of SLURM job IDs to query.
+ username: SSH username.
+ hostname: SSH hostname.
+ socket: control socket location or None
+
+ Returns:
+ Dict mapping from slurm_job_id to tuple of status, current_job_id.
+ """
+ if len(slurm_job_ids) == 0:
+ return {}
+
+ # First, try squeue for active jobs (more accurate for running jobs)
+ squeue_statuses = _query_squeue_for_jobs(slurm_job_ids, username, hostname, socket)
+
+ # For jobs not found in squeue, fall back to sacct
+ missing_jobs = [job_id for job_id in slurm_job_ids if job_id not in squeue_statuses]
+ sacct_statuses = {}
+
+ if missing_jobs:
+ sacct_statuses = _query_sacct_for_jobs(missing_jobs, username, hostname, socket)
+
+ # Combine results, preferring squeue data
+ combined_statuses = {**sacct_statuses, **squeue_statuses}
+
+ return combined_statuses
+
+
+ def _query_squeue_for_jobs(
+ slurm_job_ids: List[str],
+ username: str,
+ hostname: str,
+ socket: str | None,
+ ) -> Dict[str, tuple[str, str]]:
+ """Query SLURM for active job statuses using squeue command.
+
+ This function finds:
+ 1. Jobs that directly match our known job IDs
+ 2. Follow-up jobs that depend on our known job IDs (from autoresume mechanism)
+
+ For follow-up jobs, returns the status mapped to the original job ID, along with
+ the actual current SLURM job ID.
+
+ Args:
+ slurm_job_ids: List of SLURM job IDs to query.
+ username: SSH username.
+ hostname: SSH hostname.
+ socket: control socket location or None
+
+ Returns:
+ Dict mapping from original slurm_job_id to tuple of status, current_job_id.
+ """
+ if len(slurm_job_ids) == 0:
+ return {}
+
+ # Use squeue to get active jobs - more accurate than sacct for running jobs
+ squeue_command = "squeue -u {} -h -o '%i|%T|%E'".format(username)
+
+ ssh_command = ["ssh"]
+ if socket is not None:
+ ssh_command.append(f"-S {socket}")
+ ssh_command.append(f"{username}@{hostname}")
+ ssh_command.append(squeue_command)
+ ssh_command = " ".join(ssh_command)
+
+ completed_process = subprocess.run(
+ args=shlex.split(ssh_command),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+ squeue_statuses = {}
+ dependent_jobs = []
+ if completed_process.returncode == 0:
+ squeue_output = completed_process.stdout.decode("utf-8")
+ squeue_output_lines = squeue_output.strip().split("\n")
+
+ for line in squeue_output_lines:
+ if not line.strip():
+ continue
+ parts = line.split("|")
+ if len(parts) >= 3:
+ job_id = parts[0].strip()
+ status = parts[1].strip()
+ dependency = parts[2].strip()
+ # Extract base job ID (handle array jobs like 123456_0 -> 123456)
+ base_job_id = job_id.split("_")[0].split("[")[0]
+ if base_job_id in slurm_job_ids:
+ squeue_statuses[base_job_id] = status, base_job_id
+ elif dependency and dependency != "(null)":
+ dependent_jobs.append((base_job_id, status, dependency))
+
+ for dep_job_id, dep_status, dependency in dependent_jobs:
+ for known_job_id in slurm_job_ids:
+ if known_job_id in dependency and known_job_id not in squeue_statuses:
+ squeue_statuses[known_job_id] = dep_status, dep_job_id
+ break
+
+ return squeue_statuses
+
+
+ def _query_sacct_for_jobs(
+ slurm_job_ids: List[str],
+ username: str,
+ hostname: str,
+ socket: str | None,
+ ) -> Dict[str, tuple[str, str]]:
+ """Query SLURM for job statuses using sacct command (for completed/historical jobs).

  Args:
  slurm_job_ids: List of SLURM job IDs to query.
@@ -994,10 +1132,11 @@ def _query_slurm_jobs_status(
  socket: control socket location or None

  Returns:
- Dict mapping from slurm_job_id to returned slurm status.
+ Dict mapping from slurm_job_id to tuple of status, job_id.
  """
  if len(slurm_job_ids) == 0:
  return {}
+
  sacct_command = "sacct -j {} --format='JobID,State%32' --noheader -P".format(
  ",".join(slurm_job_ids)
  )
@@ -1024,7 +1163,7 @@ def _query_slurm_jobs_status(
  slurm_jobs_status = {}
  for slurm_job_id in slurm_job_ids:
  slurm_job_status = _parse_slurm_job_status(slurm_job_id, sacct_output_lines)
- slurm_jobs_status[slurm_job_id] = slurm_job_status
+ slurm_jobs_status[slurm_job_id] = slurm_job_status, slurm_job_id
  return slurm_jobs_status

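
Taken together, the hunks above query squeue -u <user> -h -o '%i|%T|%E' over SSH for live jobs, fall back to sacct for jobs that have already left the queue, and use the dependency column to attribute auto-resume follow-up jobs (cf. the #SBATCH --no-requeue comment earlier) back to the originally tracked job ID. A small self-contained sketch of that parsing, with invented job IDs, states, and dependency strings:

    # Illustrative parsing of squeue output in the '%i|%T|%E' format used above.
    # Job IDs, states, and the dependency string are invented for the example.
    known_job_ids = ["1234567", "1234568"]

    squeue_lines = [
        "1234567|RUNNING|(null)",                            # directly tracked job
        "1240001|PENDING|afternotok:1234568(unfulfilled)",   # hypothetical auto-resume follow-up
    ]

    statuses: dict[str, tuple[str, str]] = {}
    dependent = []
    for line in squeue_lines:
        job_id, state, dependency = (p.strip() for p in line.split("|"))
        base_id = job_id.split("_")[0].split("[")[0]  # collapse array-job suffixes
        if base_id in known_job_ids:
            statuses[base_id] = (state, base_id)
        elif dependency and dependency != "(null)":
            dependent.append((base_id, state, dependency))

    for dep_id, dep_state, dependency in dependent:
        for known in known_job_ids:
            if known in dependency and known not in statuses:
                statuses[known] = (dep_state, dep_id)
                break

    print(statuses)
    # {'1234567': ('RUNNING', '1234567'), '1234568': ('PENDING', '1240001')}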
@@ -1239,9 +1378,11 @@ def _generate_haproxy_config_with_placeholders(cfg):
  for i in range(num_nodes):
  nodes.append({"ip": f"{{IP_{i}}}", "port": cfg.deployment.port})

- # Get health check parameters from execution config
+ # Get health check parameters - prefer proxy config, fallback to deployment.endpoints.health
  proxy_config = cfg.execution.get("proxy", {}).get("config", {})
- health_check_path = proxy_config.get("health_check_path", "/health")
+ health_check_path = proxy_config.get(
+ "health_check_path", cfg.deployment.endpoints.get("health", "/health")
+ )
  health_check_status = proxy_config.get("health_check_status", 200)
  haproxy_port = proxy_config.get("haproxy_port", 5009)

@@ -1276,7 +1417,7 @@ def _generate_haproxy_config(cfg, nodes_ips):
  )

  # Get health check parameters from deployment config
- health_check_path = cfg.deployment.get("health_check_path", "/health")
+ health_check_path = cfg.deployment.endpoints.get("health", "/health")
  health_check_status = cfg.deployment.get("health_check_status", 200)
  haproxy_port = cfg.deployment.get("haproxy_port", 5009)

@@ -1336,7 +1477,7 @@ def _generate_deployment_srun_command(
  s += "--container-mounts {} ".format(",".join(deployment_mounts_list))
  if not cfg.execution.get("mounts", {}).get("mount_home", True):
  s += "--no-container-mount-home "
- s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.out")
+ s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.log")

  deployment_env_var_names = list(
  cfg.execution.get("env_vars", {}).get("deployment", {})
@@ -1436,10 +1577,10 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
  s += "done\n"
  s += "\n"
  s += "srun --mpi pmix --overlap "
- s += "--nodes 1 --ntasks 1 "
+ s += "--nodelist ${nodes_array[0]} --nodes 1 --ntasks 1 "
  s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
  s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
- s += f"--output {remote_task_subdir}/logs/proxy-%A.out "
+ s += f"--output {remote_task_subdir}/logs/proxy-%A.log "
  s += "haproxy -f /usr/local/etc/haproxy/haproxy.cfg &\n"
  s += "PROXY_PID=$! # capture the PID of the proxy background srun process\n"
  s += 'echo "Proxy started with PID: $PROXY_PID"\n\n'
@@ -1454,3 +1595,110 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
  s += "\n"

  return s
+
+
+ def _collect_mount_paths(cfg: DictConfig) -> List[str]:
+ """Collect all mount source paths from the configuration.
+
+ Args:
+ cfg: The configuration object for the evaluation run.
+
+ Returns:
+ List of source paths that need to be mounted.
+ """
+ mount_paths = []
+
+ # Deployment mounts
+ if cfg.deployment.type != "none":
+ if checkpoint_path := cfg.deployment.get("checkpoint_path"):
+ mount_paths.append(checkpoint_path)
+ if cache_path := cfg.deployment.get("cache_path"):
+ mount_paths.append(cache_path)
+ for source_mnt in cfg.execution.get("mounts", {}).get("deployment", {}).keys():
+ mount_paths.append(source_mnt)
+
+ # Evaluation mounts
+ for source_mnt in cfg.execution.get("mounts", {}).get("evaluation", {}).keys():
+ mount_paths.append(source_mnt)
+
+ return mount_paths
+
+
+ def _validate_remote_paths_exist(
+ paths: List[str],
+ username: str,
+ hostname: str,
+ socket: str | None,
+ ) -> None:
+ """Validate that all specified paths exist as directories on the remote host.
+
+ Args:
+ paths: List of directory paths to validate.
+ username: SSH username.
+ hostname: SSH hostname.
+ socket: control socket location or None
+
+ Raises:
+ ValueError: If any paths do not exist as directories on the remote host.
+ """
+ if not paths:
+ return
+
+ # Remove duplicates while preserving order
+ unique_paths = list(dict.fromkeys(paths))
+
+ # Build a single SSH command to check all paths at once
+ test_commands = []
+ for path in unique_paths:
+ # Use test -d to check if directory exists
+ # Escape single quotes in path using POSIX-safe method: ' becomes '"'"'
+ escaped_path = path.replace("'", "'\"'\"'")
+ test_commands.append(
+ f"test -d '{escaped_path}' && echo 'EXISTS:{path}' || echo 'MISSING:{path}'"
+ )
+
+ combined_command = " ; ".join(test_commands)
+
+ ssh_command = ["ssh"]
+ if socket is not None:
+ ssh_command.append(f"-S {socket}")
+ ssh_command.append(f"{username}@{hostname}")
+ ssh_command.append(combined_command)
+ ssh_command = " ".join(ssh_command)
+
+ logger.info("Validating mount directories exist on remote host", cmd=ssh_command)
+ completed_process = subprocess.run(
+ args=shlex.split(ssh_command),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+ if completed_process.returncode != 0:
+ error_msg = (
+ completed_process.stderr.decode("utf-8")
+ if completed_process.stderr
+ else "Unknown error"
+ )
+ logger.error(
+ "Error validating remote paths",
+ code=completed_process.returncode,
+ msg=error_msg,
+ )
+ raise RuntimeError(f"Failed to validate remote paths: {error_msg}")
+
+ # Parse output to find missing paths
+ output = completed_process.stdout.decode("utf-8")
+ missing_paths = []
+ for line in output.strip().split("\n"):
+ if line.startswith("MISSING:"):
+ missing_path = line.replace("MISSING:", "")
+ missing_paths.append(missing_path)
+
+ if missing_paths:
+ error_message = (
+ f"The following mount paths do not exist as directories on {username}@{hostname}:\n"
+ + "\n".join(f" - {path}" for path in missing_paths)
+ + "\n\nMount paths must be directories. Please create these directories on the cluster or update your configuration."
+ )
+ logger.error("Mount validation failed", missing_paths=missing_paths)
+ raise ValueError(error_message)
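
The new _validate_remote_paths_exist helper batches all mount checks into a single SSH round trip: each path becomes a test -d ... && echo EXISTS:... || echo MISSING:... clause, the clauses are joined with ';', and MISSING: lines are collected into the error message. A minimal sketch of that assembly and parsing, with placeholder paths and a hypothetical remote response:

    # Sketch of the batched remote check mirrored from _validate_remote_paths_exist above.
    # The mount paths and the remote output are placeholders.
    paths = ["/lustre/checkpoints/llama", "/lustre/eval-cache"]

    clauses = []
    for path in paths:
        escaped = path.replace("'", "'\"'\"'")  # POSIX-safe single-quote escaping
        clauses.append(
            f"test -d '{escaped}' && echo 'EXISTS:{path}' || echo 'MISSING:{path}'"
        )
    remote_command = " ; ".join(clauses)

    # A hypothetical response from the cluster; one directory is missing.
    remote_output = "EXISTS:/lustre/checkpoints/llama\nMISSING:/lustre/eval-cache"
    missing = [
        line.removeprefix("MISSING:")
        for line in remote_output.splitlines()
        if line.startswith("MISSING:")
    ]
    print(missing)  # ['/lustre/eval-cache']

Because the check runs during executor setup (see the earlier SlurmExecutor hunk), missing directories surface before any sbatch submission rather than as container mount errors mid-run.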
@@ -16,7 +16,7 @@
  # Below is the _next_ version that will be published, not the currently published one.
  MAJOR = 0
  MINOR = 1
- PATCH = 41
+ PATCH = 67
  PRE_RELEASE = ""

  # Use the following formatting: (major, minor, patch, pre-release)