PyPI - nemo-evaluator-launcher - Versions diffs - 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl - Mend

nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

nemo_evaluator_launcher/executors/lepton/executor.py CHANGED Viewed

@@ -34,7 +34,7 @@ from nemo_evaluator_launcher.common.execdb import (
 from nemo_evaluator_launcher.common.helpers import get_eval_factory_command
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-    get_task_from_mapping,
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import red
@@ -293,8 +293,10 @@ class LeptonExecutor(BaseExecutor):
                             return
                         # Construct the full endpoint URL
-                        task_definition = get_task_from_mapping(
-                            task.name, tasks_mapping
+                        task_definition = get_task_definition_for_job(
+                            task_query=task.name,
+                            base_mapping=tasks_mapping,
+                            container=task.get("container"),
                         )
                         task_endpoint_type = task_definition["endpoint_type"]
                         endpoint_path = cfg.deployment.endpoints[task_endpoint_type]
@@ -383,7 +385,11 @@ class LeptonExecutor(BaseExecutor):
             # Submit each evaluation task as a Lepton job
             for idx, task in enumerate(cfg.evaluation.tasks):
-                task_definition = get_task_from_mapping(task.name, tasks_mapping)
+                task_definition = get_task_definition_for_job(
+                    task_query=task.name,
+                    base_mapping=tasks_mapping,
+                    container=task.get("container"),
+                )
                 # Create job ID and Lepton job name (max 36 chars)
                 job_id = generate_job_id(invocation_id, idx)
@@ -889,9 +895,13 @@ def _dry_run_lepton(
 ) -> None:
     print("DRY RUN: Lepton job configurations prepared")
     try:
-        # validate tasks
+        # validate tasks (container overrides are supported)
         for task in cfg.evaluation.tasks:
-            get_task_from_mapping(task.name, tasks_mapping)
+            _ = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )
         # nice-to-have checks (existing endpoint URL or endpoints mapping)
         if getattr(cfg.deployment, "type", None) == "none":
@@ -909,7 +919,11 @@ def _dry_run_lepton(
         else:
             endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
             for task in cfg.evaluation.tasks:
-                td = get_task_from_mapping(task.name, tasks_mapping)
+                td = get_task_definition_for_job(
+                    task_query=task.name,
+                    base_mapping=tasks_mapping,
+                    container=task.get("container"),
+                )
                 etype = td.get("endpoint_type")
                 if etype not in endpoints_cfg:
                     raise ValueError(
@@ -928,7 +942,11 @@ def _dry_run_lepton(
             getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
         )
         for task in cfg.evaluation.tasks:
-            td = get_task_from_mapping(task.name, tasks_mapping)
+            td = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )
             required = td.get("required_env_vars", []) or []
             for var in required:
                 # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic

nemo_evaluator_launcher/executors/local/executor.py CHANGED Viewed

@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
 )
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-    get_task_from_mapping,
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -123,7 +123,11 @@ class LocalExecutor(BaseExecutor):
         for idx, task in enumerate(cfg.evaluation.tasks):
             timestamp = get_timestamp_string()
-            task_definition = get_task_from_mapping(task.name, tasks_mapping)
+            task_definition = get_task_definition_for_job(
+                task_query=task.name,
+                base_mapping=tasks_mapping,
+                container=task.get("container"),
+            )
             if cfg.deployment.type != "none":
                 # container name

nemo_evaluator_launcher/executors/slurm/executor.py CHANGED Viewed

@@ -49,7 +49,7 @@ from nemo_evaluator_launcher.common.helpers import (
 )
 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
-    get_task_from_mapping,
+    get_task_definition_for_job,
     load_tasks_mapping,
 )
 from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
@@ -109,7 +109,11 @@ class SlurmExecutor(BaseExecutor):
                 (local_task_subdir / "artifacts").mkdir()
                 # resolve eval image and pass directly via task override
-                task_definition = get_task_from_mapping(task.name, tasks_mapping)
+                task_definition = get_task_definition_for_job(
+                    task_query=task.name,
+                    base_mapping=tasks_mapping,
+                    container=task.get("container"),
+                )
                 eval_image = task_definition["container"]
                 if "container" in task:
                     eval_image = task["container"]
@@ -201,6 +205,22 @@ class SlurmExecutor(BaseExecutor):
                 hostname=cfg.execution.hostname,
                 socket=socket,
             )
+            if socket_or_none is None:
+                raise RuntimeError(
+                    f"Failed to connect to the cluster {cfg.execution.hostname} as user {cfg.execution.username}. "
+                    "Please check your SSH configuration."
+                )
+            # Validate that all mount paths exist on the remote host
+            mount_paths = _collect_mount_paths(cfg)
+            _validate_remote_paths_exist(
+                paths=mount_paths,
+                username=cfg.execution.username,
+                hostname=cfg.execution.hostname,
+                socket=socket_or_none,
+            )
             _make_remote_execution_output_dir(
                 dirpath=cfg.execution.output_dir,
                 username=cfg.execution.username,
@@ -507,7 +527,11 @@ def _create_slurm_sbatch_script(
     """
     # get task from mapping, overrides, urls
     tasks_mapping = load_tasks_mapping()
-    task_definition = get_task_from_mapping(task.name, tasks_mapping)
+    task_definition = get_task_definition_for_job(
+        task_query=task.name,
+        base_mapping=tasks_mapping,
+        container=task.get("container"),
+    )
     # TODO(public release): convert to template
     s = "#!/bin/bash\n"
@@ -531,7 +555,8 @@ def _create_slurm_sbatch_script(
     )
     s += "#SBATCH --job-name {}\n".format(job_name)
     s += "#SBATCH --exclusive\n"
-    s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.out")
+    s += "#SBATCH --no-requeue\n"  # We have our own auto-resume logic
+    s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.log")
     s += "\n"
     s += f'TASK_DIR="{str(remote_task_subdir)}"\n'
     s += "\n"
@@ -696,7 +721,7 @@ def _create_slurm_sbatch_script(
         s += "--no-container-mount-home "
     s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
-    s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.out")
+    s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.log")
     s += "bash -c '\n"
     s += eval_factory_command
     s += "'\n\n"
@@ -817,8 +842,8 @@ def _generate_auto_export_section(
     if not cfg.execution.get("mounts", {}).get("mount_home", True):
         s += "--no-container-mount-home "
-    s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts "
-    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.out")
+    s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts,{remote_task_subdir}/logs:{remote_task_subdir}/logs "
+    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.log")
     s += "    bash -c '\n"
     # FIXME(martas): would be good to install specific version
     s += "        pip install nemo-evaluator-launcher[all]\n"
@@ -1336,7 +1361,7 @@ def _generate_deployment_srun_command(
         s += "--container-mounts {} ".format(",".join(deployment_mounts_list))
     if not cfg.execution.get("mounts", {}).get("mount_home", True):
         s += "--no-container-mount-home "
-    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.out")
+    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.log")
     deployment_env_var_names = list(
         cfg.execution.get("env_vars", {}).get("deployment", {})
@@ -1439,7 +1464,7 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
     s += "--nodes 1 --ntasks 1 "
     s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
     s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
-    s += f"--output {remote_task_subdir}/logs/proxy-%A.out "
+    s += f"--output {remote_task_subdir}/logs/proxy-%A.log "
     s += "haproxy -f /usr/local/etc/haproxy/haproxy.cfg &\n"
     s += "PROXY_PID=$!  # capture the PID of the proxy background srun process\n"
     s += 'echo "Proxy started with PID: $PROXY_PID"\n\n'
@@ -1454,3 +1479,110 @@ def _generate_haproxy_srun_command(cfg, remote_task_subdir):
     s += "\n"
     return s
+def _collect_mount_paths(cfg: DictConfig) -> List[str]:
+    """Gather every host-side (source) path that the run will mount.
+    Deployment-related paths (checkpoint, cache and the deployment mount
+    sources) are only included when a deployment is configured, i.e. when
+    ``cfg.deployment.type`` is not ``"none"``. Evaluation mount sources are
+    always included.
+    Args:
+        cfg: The configuration object for the evaluation run.
+    Returns:
+        Source paths in the order: checkpoint path, cache path, deployment
+        mount sources, evaluation mount sources. Duplicates are not removed.
+    """
+    sources = []
+    if cfg.deployment.type != "none":
+        # Optional single-path settings; unset or empty values are skipped.
+        for option in ("checkpoint_path", "cache_path"):
+            configured = cfg.deployment.get(option)
+            if configured:
+                sources.append(configured)
+        # Keys of the mount mapping are the source paths.
+        sources.extend(
+            cfg.execution.get("mounts", {}).get("deployment", {}).keys()
+        )
+    sources.extend(cfg.execution.get("mounts", {}).get("evaluation", {}).keys())
+    return sources
+def _validate_remote_paths_exist(
+    paths: List[str],
+    username: str,
+    hostname: str,
+    socket: str | None,
+) -> None:
+    """Validate that all specified paths exist as directories on the remote host.
+    All paths are checked with a single SSH invocation: for each unique path
+    the remote shell runs ``test -d`` and prints an ``EXISTS:<path>`` or
+    ``MISSING:<path>`` marker line, which is then parsed locally.
+    Args:
+        paths: List of directory paths to validate.
+        username: SSH username.
+        hostname: SSH hostname.
+        socket: control socket location or None
+    Raises:
+        RuntimeError: If the SSH command itself exits with a non-zero status.
+        ValueError: If any paths do not exist as directories on the remote host.
+    """
+    if not paths:
+        return
+    # Remove duplicates while preserving order
+    unique_paths = list(dict.fromkeys(paths))
+    # Build a single remote command that checks all paths at once
+    test_commands = []
+    for path in unique_paths:
+        # Quote both the tested path and the marker text for the remote shell,
+        # so spaces, quotes and metacharacters in a path are taken literally.
+        quoted_path = shlex.quote(str(path))
+        exists_marker = shlex.quote(f"EXISTS:{path}")
+        missing_marker = shlex.quote(f"MISSING:{path}")
+        # printf '%s\n' prints the marker verbatim (echo may interpret
+        # backslashes or leading dashes depending on the remote shell).
+        test_commands.append(
+            f"test -d {quoted_path} && printf '%s\\n' {exists_marker} "
+            f"|| printf '%s\\n' {missing_marker}"
+        )
+    combined_command = " ; ".join(test_commands)
+    # Build argv directly. Joining into one string and re-splitting it with
+    # shlex.split would strip the quoting above before ssh forwards the
+    # command to the remote shell.
+    ssh_command = ["ssh"]
+    if socket is not None:
+        ssh_command.extend(["-S", str(socket)])
+    ssh_command.append(f"{username}@{hostname}")
+    ssh_command.append(combined_command)
+    logger.info(
+        "Validating mount directories exist on remote host",
+        cmd=shlex.join(ssh_command),
+    )
+    completed_process = subprocess.run(
+        args=ssh_command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    if completed_process.returncode != 0:
+        error_msg = (
+            completed_process.stderr.decode("utf-8")
+            if completed_process.stderr
+            else "Unknown error"
+        )
+        logger.error(
+            "Error validating remote paths",
+            code=completed_process.returncode,
+            msg=error_msg,
+        )
+        raise RuntimeError(f"Failed to validate remote paths: {error_msg}")
+    # Parse output to find missing paths
+    output = completed_process.stdout.decode("utf-8")
+    missing_prefix = "MISSING:"
+    # Strip only the leading marker; a path may itself contain "MISSING:".
+    missing_paths = [
+        line[len(missing_prefix):]
+        for line in output.splitlines()
+        if line.startswith(missing_prefix)
+    ]
+    if missing_paths:
+        error_message = (
+            f"The following mount paths do not exist as directories on {username}@{hostname}:\n"
+            + "\n".join(f"  - {path}" for path in missing_paths)
+            + "\n\nMount paths must be directories. Please create these directories on the cluster or update your configuration."
+        )
+        logger.error("Mount validation failed", missing_paths=missing_paths)
+        raise ValueError(error_message)

nemo_evaluator_launcher/package_info.py CHANGED Viewed

@@ -16,7 +16,7 @@
 # Below is the _next_ version that will be published, not the currently published one.
 MAJOR = 0
 MINOR = 1
-PATCH = 41
+PATCH = 56
 PRE_RELEASE = ""
 # Use the following formatting: (major, minor, patch, pre-release)

nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl

nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl