nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (25)
  1. nemo_evaluator_launcher/api/functional.py +55 -5
  2. nemo_evaluator_launcher/cli/ls_task.py +280 -0
  3. nemo_evaluator_launcher/cli/ls_tasks.py +208 -55
  4. nemo_evaluator_launcher/cli/main.py +17 -2
  5. nemo_evaluator_launcher/cli/run.py +41 -1
  6. nemo_evaluator_launcher/common/container_metadata/__init__.py +61 -0
  7. nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py +530 -0
  8. nemo_evaluator_launcher/common/container_metadata/loading.py +1126 -0
  9. nemo_evaluator_launcher/common/container_metadata/registries.py +824 -0
  10. nemo_evaluator_launcher/common/container_metadata/utils.py +63 -0
  11. nemo_evaluator_launcher/common/helpers.py +44 -28
  12. nemo_evaluator_launcher/common/mapping.py +341 -155
  13. nemo_evaluator_launcher/common/printing_utils.py +18 -12
  14. nemo_evaluator_launcher/executors/lepton/executor.py +26 -8
  15. nemo_evaluator_launcher/executors/local/executor.py +6 -2
  16. nemo_evaluator_launcher/executors/slurm/executor.py +141 -9
  17. nemo_evaluator_launcher/package_info.py +1 -1
  18. nemo_evaluator_launcher/resources/all_tasks_irs.yaml +17016 -0
  19. nemo_evaluator_launcher/resources/mapping.toml +62 -354
  20. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/METADATA +2 -1
  21. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/RECORD +25 -18
  22. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/WHEEL +0 -0
  23. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/entry_points.txt +0 -0
  24. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/licenses/LICENSE +0 -0
  25. {nemo_evaluator_launcher-0.1.41.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/top_level.txt +0 -0
@@ -34,7 +34,7 @@ from nemo_evaluator_launcher.common.execdb import (
 from nemo_evaluator_launcher.common.helpers import get_eval_factory_command
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-    get_task_from_mapping,
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import red
@@ -293,8 +293,10 @@ class LeptonExecutor(BaseExecutor):
             return

         # Construct the full endpoint URL
-        task_definition = get_task_from_mapping(
-            task.name, tasks_mapping
+        task_definition = get_task_definition_for_job(
+            task_query=task.name,
+            base_mapping=tasks_mapping,
+            container=task.get("container"),
         )
         task_endpoint_type = task_definition["endpoint_type"]
         endpoint_path = cfg.deployment.endpoints[task_endpoint_type]
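Every call site in this diff replaces the positional get_task_from_mapping(task.name, tasks_mapping) with the keyword-style call shown above. A minimal usage sketch, assuming only the signature visible in these hunks; the task name is hypothetical, and container is the optional per-task override that the executors read via task.get("container"):

from nemo_evaluator_launcher.common.mapping import (
    get_task_definition_for_job,
    load_tasks_mapping,
)

tasks_mapping = load_tasks_mapping()

# container=None presumably falls back to the container recorded in the mapping,
# while a non-None value carries a per-task image override ("mmlu_pro" is hypothetical).
task_definition = get_task_definition_for_job(
    task_query="mmlu_pro",
    base_mapping=tasks_mapping,
    container=None,
)
print(task_definition["endpoint_type"], task_definition["container"])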
@@ -383,7 +385,11 @@ class LeptonExecutor(BaseExecutor):

         # Submit each evaluation task as a Lepton job
         for idx, task in enumerate(cfg.evaluation.tasks):
-            task_definition = get_task_from_mapping(task.name, tasks_mapping)
+            task_definition = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )

             # Create job ID and Lepton job name (max 36 chars)
             job_id = generate_job_id(invocation_id, idx)
@@ -889,9 +895,13 @@ def _dry_run_lepton(
 ) -> None:
     print("DRY RUN: Lepton job configurations prepared")
     try:
-        # validate tasks
+        # validate tasks (container overrides are supported)
         for task in cfg.evaluation.tasks:
-            get_task_from_mapping(task.name, tasks_mapping)
+            _ = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )

         # nice-to-have checks (existing endpoint URL or endpoints mapping)
         if getattr(cfg.deployment, "type", None) == "none":
@@ -909,7 +919,11 @@ def _dry_run_lepton(
         else:
             endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
             for task in cfg.evaluation.tasks:
-                td = get_task_from_mapping(task.name, tasks_mapping)
+                td = get_task_definition_for_job(
+                    task_query=task.name,
+                    base_mapping=tasks_mapping,
+                    container=task.get("container"),
+                )
                 etype = td.get("endpoint_type")
                 if etype not in endpoints_cfg:
                     raise ValueError(
@@ -928,7 +942,11 @@ def _dry_run_lepton(
             getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
         )
         for task in cfg.evaluation.tasks:
-            td = get_task_from_mapping(task.name, tasks_mapping)
+            td = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )
             required = td.get("required_env_vars", []) or []
             for var in required:
                 # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic
@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
 )
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-    get_task_from_mapping,
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -123,7 +123,11 @@ class LocalExecutor(BaseExecutor):

         for idx, task in enumerate(cfg.evaluation.tasks):
             timestamp = get_timestamp_string()
-            task_definition = get_task_from_mapping(task.name, tasks_mapping)
+            task_definition = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )

             if cfg.deployment.type != "none":
                 # container name
@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
 )
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-    get_task_from_mapping,
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -109,7 +109,11 @@ class SlurmExecutor(BaseExecutor):
             (local_task_subdir / "artifacts").mkdir()

             # resolve eval image and pass directly via task override
-            task_definition = get_task_from_mapping(task.name, tasks_mapping)
+            task_definition = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )
             eval_image = task_definition["container"]
             if "container" in task:
                 eval_image = task["container"]
@@ -201,6 +205,22 @@ class SlurmExecutor(BaseExecutor):
             hostname=cfg.execution.hostname,
             socket=socket,
         )
+
+        if socket_or_none is None:
+            raise RuntimeError(
+                f"Failed to connect to the cluster {cfg.execution.hostname} as user {cfg.execution.username}. "
+                "Please check your SSH configuration."
+            )
+
+        # Validate that all mount paths exist on the remote host
+        mount_paths = _collect_mount_paths(cfg)
+        _validate_remote_paths_exist(
+            paths=mount_paths,
+            username=cfg.execution.username,
+            hostname=cfg.execution.hostname,
+            socket=socket_or_none,
+        )
+
         _make_remote_execution_output_dir(
             dirpath=cfg.execution.output_dir,
             username=cfg.execution.username,
@@ -507,7 +527,11 @@ def _create_slurm_sbatch_script(
     """
     # get task from mapping, overrides, urls
     tasks_mapping = load_tasks_mapping()
-    task_definition = get_task_from_mapping(task.name, tasks_mapping)
+    task_definition = get_task_definition_for_job(
+        task_query=task.name,
+        base_mapping=tasks_mapping,
+        container=task.get("container"),
+    )

     # TODO(public release): convert to template
     s = "#!/bin/bash\n"
@@ -531,7 +555,8 @@ def _create_slurm_sbatch_script(
     )
     s += "#SBATCH --job-name {}\n".format(job_name)
     s += "#SBATCH --exclusive\n"
-    s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.out")
+    s += "#SBATCH --no-requeue\n"  # We have our own auto-resume logic
+    s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.log")
     s += "\n"
     s += f'TASK_DIR="{str(remote_task_subdir)}"\n'
     s += "\n"
@@ -696,7 +721,7 @@ def _create_slurm_sbatch_script(
         s += "--no-container-mount-home "

     s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
-    s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.out")
+    s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.log")
     s += "bash -c '\n"
     s += eval_factory_command
     s += "'\n\n"
@@ -817,8 +842,8 @@ def _generate_auto_export_section(
     if not cfg.execution.get("mounts", {}).get("mount_home", True):
         s += "--no-container-mount-home "

-    s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts "
-    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.out")
+    s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts,{remote_task_subdir}/logs:{remote_task_subdir}/logs "
+    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.log")
     s += " bash -c '\n"
     # FIXME(martas): would be good to install specific version
     s += " pip install nemo-evaluator-launcher[all]\n"
@@ -1336,7 +1361,7 @@ def _generate_deployment_srun_command(
     s += "--container-mounts {} ".format(",".join(deployment_mounts_list))
     if not cfg.execution.get("mounts", {}).get("mount_home", True):
         s += "--no-container-mount-home "
-    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.out")
+    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.log")

     deployment_env_var_names = list(
         cfg.execution.get("env_vars", {}).get("deployment", {})
@@ -1439,7 +1464,7 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
     s += "--nodes 1 --ntasks 1 "
     s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
     s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
-    s += f"--output {remote_task_subdir}/logs/proxy-%A.out "
+    s += f"--output {remote_task_subdir}/logs/proxy-%A.log "
     s += "haproxy -f /usr/local/etc/haproxy/haproxy.cfg &\n"
     s += "PROXY_PID=$! # capture the PID of the proxy background srun process\n"
     s += 'echo "Proxy started with PID: $PROXY_PID"\n\n'
@@ -1454,3 +1479,110 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
     s += "\n"

     return s
+
+
+def _collect_mount_paths(cfg: DictConfig) -> List[str]:
+    """Collect all mount source paths from the configuration.
+
+    Args:
+        cfg: The configuration object for the evaluation run.
+
+    Returns:
+        List of source paths that need to be mounted.
+    """
+    mount_paths = []
+
+    # Deployment mounts
+    if cfg.deployment.type != "none":
+        if checkpoint_path := cfg.deployment.get("checkpoint_path"):
+            mount_paths.append(checkpoint_path)
+        if cache_path := cfg.deployment.get("cache_path"):
+            mount_paths.append(cache_path)
+        for source_mnt in cfg.execution.get("mounts", {}).get("deployment", {}).keys():
+            mount_paths.append(source_mnt)
+
+    # Evaluation mounts
+    for source_mnt in cfg.execution.get("mounts", {}).get("evaluation", {}).keys():
+        mount_paths.append(source_mnt)
+
+    return mount_paths
+
+
+def _validate_remote_paths_exist(
+    paths: List[str],
+    username: str,
+    hostname: str,
+    socket: str | None,
+) -> None:
+    """Validate that all specified paths exist as directories on the remote host.
+
+    Args:
+        paths: List of directory paths to validate.
+        username: SSH username.
+        hostname: SSH hostname.
+        socket: Control socket location, or None.
+
+    Raises:
+        ValueError: If any paths do not exist as directories on the remote host.
+    """
+    if not paths:
+        return
+
+    # Remove duplicates while preserving order
+    unique_paths = list(dict.fromkeys(paths))
+
+    # Build a single SSH command to check all paths at once
+    test_commands = []
+    for path in unique_paths:
+        # Use test -d to check if directory exists
+        # Escape single quotes in path using POSIX-safe method: ' becomes '"'"'
+        escaped_path = path.replace("'", "'\"'\"'")
+        test_commands.append(
+            f"test -d '{escaped_path}' && echo 'EXISTS:{path}' || echo 'MISSING:{path}'"
+        )
+
+    combined_command = " ; ".join(test_commands)
+
+    ssh_command = ["ssh"]
+    if socket is not None:
+        ssh_command.append(f"-S {socket}")
+    ssh_command.append(f"{username}@{hostname}")
+    ssh_command.append(combined_command)
+    ssh_command = " ".join(ssh_command)
+
+    logger.info("Validating mount directories exist on remote host", cmd=ssh_command)
+    completed_process = subprocess.run(
+        args=shlex.split(ssh_command),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+
+    if completed_process.returncode != 0:
+        error_msg = (
+            completed_process.stderr.decode("utf-8")
+            if completed_process.stderr
+            else "Unknown error"
+        )
+        logger.error(
+            "Error validating remote paths",
+            code=completed_process.returncode,
+            msg=error_msg,
+        )
+        raise RuntimeError(f"Failed to validate remote paths: {error_msg}")
+
+    # Parse output to find missing paths
+    output = completed_process.stdout.decode("utf-8")
+    missing_paths = []
+    for line in output.strip().split("\n"):
+        if line.startswith("MISSING:"):
+            missing_path = line.replace("MISSING:", "")
+            missing_paths.append(missing_path)
+
+    if missing_paths:
+        error_message = (
+            f"The following mount paths do not exist as directories on {username}@{hostname}:\n"
+            + "\n".join(f" - {path}" for path in missing_paths)
+            + "\n\nMount paths must be directories. Please create these directories on the cluster or update your configuration."
+        )
+        logger.error("Mount validation failed", missing_paths=missing_paths)
+        raise ValueError(error_message)
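As a side note on the quoting used in _validate_remote_paths_exist, here is a standalone sketch (hypothetical path, socket, and host, not part of the package) of how one escaped path ends up inside the combined remote test command:

# Hypothetical mount path containing a single quote.
path = "/lustre/datasets/llm's-eval"

# POSIX-safe escaping: close the quote, emit a double-quoted ', then reopen the quote.
escaped = path.replace("'", "'\"'\"'")
check = f"test -d '{escaped}' && echo 'EXISTS:{path}' || echo 'MISSING:{path}'"

# With an SSH control socket, the final invocation looks roughly like:
ssh_command = " ".join(["ssh", "-S /tmp/nemo-ctl.sock", "alice@login.example.com", check])
print(ssh_command)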
@@ -16,7 +16,7 @@
 # Below is the _next_ version that will be published, not the currently published one.
 MAJOR = 0
 MINOR = 1
-PATCH = 41
+PATCH = 56
 PRE_RELEASE = ""

 # Use the following formatting: (major, minor, patch, pre-release)
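The hunk ends before the lines that assemble the version, so the following is only a rough sketch under the assumption that the tuple feeds a dotted version string; the actual assembly code in package_info.py is not shown in this diff:

# Assumption: illustrative only; the real assembly lies outside this hunk.
MAJOR, MINOR, PATCH, PRE_RELEASE = 0, 1, 56, ""

VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
__version__ = ".".join(map(str, VERSION[:3])) + str(VERSION[3])
print(__version__)  # 0.1.56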