nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl
- nemo_evaluator_launcher/__init__.py +15 -1
- nemo_evaluator_launcher/api/functional.py +188 -27
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/export.py +131 -12
- nemo_evaluator_launcher/cli/info.py +477 -82
- nemo_evaluator_launcher/cli/kill.py +5 -3
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_runs.py +31 -10
- nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
- nemo_evaluator_launcher/cli/main.py +101 -5
- nemo_evaluator_launcher/cli/run.py +153 -30
- nemo_evaluator_launcher/cli/status.py +49 -5
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/execdb.py +121 -27
- nemo_evaluator_launcher/common/helpers.py +213 -33
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/printing_utils.py +100 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
- nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- nemo_evaluator_launcher/executors/base.py +54 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
- nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
- nemo_evaluator_launcher/executors/local/executor.py +492 -56
- nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
- nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +30 -16
- nemo_evaluator_launcher/exporters/mlflow.py +245 -74
- nemo_evaluator_launcher/exporters/utils.py +139 -184
- nemo_evaluator_launcher/exporters/wandb.py +157 -43
- nemo_evaluator_launcher/package_info.py +6 -3
- nemo_evaluator_launcher/resources/mapping.toml +56 -15
- nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
- nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/executors/slurm/executor.py

@@ -30,6 +30,7 @@ from pathlib import Path
 from typing import Dict, List, Optional

 import yaml
+from jinja2 import Environment, FileSystemLoader
 from omegaconf import DictConfig, OmegaConf

 from nemo_evaluator_launcher.common.execdb import (
@@ -39,17 +40,19 @@ from nemo_evaluator_launcher.common.execdb import (
     generate_job_id,
 )
 from nemo_evaluator_launcher.common.helpers import (
+    CmdAndReadableComment,
+    _str_to_echo_command,
     get_api_key_name,
-    get_endpoint_url,
     get_eval_factory_command,
     get_eval_factory_dataset_size_from_run_config,
-    get_health_url,
     get_timestamp_string,
 )
+from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
     get_task_from_mapping,
     load_tasks_mapping,
 )
+from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
 from nemo_evaluator_launcher.executors.base import (
     BaseExecutor,
     ExecutionState,
@@ -93,6 +96,7 @@ class SlurmExecutor(BaseExecutor):
         tasks_mapping = load_tasks_mapping()
         eval_images: list[str] = []

+        is_potentially_unsafe = False
         for idx, task in enumerate(cfg.evaluation.tasks):
             # calculate job_id
             job_id = f"{invocation_id}.{idx}"
@@ -113,7 +117,7 @@ class SlurmExecutor(BaseExecutor):
             eval_images.append(eval_image)

             # generate and write down sbatch script
-
+            sbatch_script_content_struct = _create_slurm_sbatch_script(
                 cfg=cfg,
                 task=task,
                 eval_image=eval_image,
@@ -121,6 +125,32 @@
                 invocation_id=invocation_id,
                 job_id=job_id,
             )
+
+            # Create proxy config file with placeholder IPs for multi-instance deployments
+            if cfg.deployment.get("multiple_instances", False):
+                proxy_type = cfg.execution.get("proxy", {}).get("type", "haproxy")
+                if proxy_type == "haproxy":
+                    proxy_config = _generate_haproxy_config_with_placeholders(cfg)
+                else:
+                    raise ValueError(
+                        f"Unsupported proxy type: {proxy_type}. Currently only 'haproxy' is supported."
+                    )
+
+                # Save both template and working config
+                proxy_template_path = local_task_subdir / "proxy.cfg.template"
+                proxy_config_path = local_task_subdir / "proxy.cfg"
+                with open(proxy_template_path, "w") as f:
+                    f.write(proxy_config)
+                with open(proxy_config_path, "w") as f:
+                    f.write(proxy_config)
+
+            sbatch_script_content_str = sbatch_script_content_struct.cmd
+
+            # We accumulate if any task contains unsafe commands
+            is_potentially_unsafe = (
+                is_potentially_unsafe
+                or sbatch_script_content_struct.is_potentially_unsafe
+            )
             local_runsub_path = local_task_subdir / "run.sub"
             remote_runsub_path = remote_task_subdir / "run.sub"
             with open(local_runsub_path, "w") as f:
@@ -130,15 +160,41 @@
             remote_runsub_paths.append(remote_runsub_path)

         if dry_run:
-            print("\n\n=============================================\n\n")
-            print("DRY RUN: SLURM scripts prepared")
+            print(bold("\n\n=============================================\n\n"))
+            print(bold(cyan("DRY RUN: SLURM scripts prepared")))
             for idx, local_runsub_path in enumerate(local_runsub_paths):
-                print(f"\n\n
+                print(cyan(f"\n\n=========== Task {idx} =====================\n\n"))
                 with open(local_runsub_path, "r") as f:
-                    print(f.read())
-            print("
+                    print(grey(f.read()))
+            print(bold("To submit jobs") + ", run the executor without --dry-run")
+            if is_potentially_unsafe:
+                print(
+                    red(
+                        "\nFound `pre_cmd` (evaluation or deployment) which carries security risk. When running without --dry-run "
+                        "make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1"
+                    )
+                )
+
             return invocation_id

+        if is_potentially_unsafe:
+            if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
+                logger.warning(
+                    "Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is set, proceeding with caution."
+                )
+
+            else:
+                logger.error(
+                    "Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is not set. This might carry security risk and unstable environments. "
+                    "To continue, make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1.",
+                )
+                raise AttributeError(
+                    "Untrusted command found in config, make sure you trust and "
+                    "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
+                )
+
         socket = str(Path(tmpdirname) / "socket")
         socket_or_none = _open_master_connection(
             username=cfg.execution.username,
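
Note: the gate above aborts submission whenever any task or deployment carries a `pre_cmd`, unless the caller has opted in. A minimal sketch of the opt-in from the calling side, assuming you trust every `pre_cmd` in your config (the variable name comes from the diff; the snippet itself is illustrative):

    import os

    # Opt in before invoking the executor; without this, submission raises
    # AttributeError when a pre_cmd is present (see the hunk above).
    os.environ["NEMO_EVALUATOR_TRUST_PRE_CMD"] = "1"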
@@ -174,10 +230,11 @@
         for idx, (slurm_job_id, remote_runsub_path) in enumerate(
             zip(slurm_job_ids, remote_runsub_paths)
         ):
+            job_id = generate_job_id(invocation_id, idx)
             db.write_job(
                 job=JobData(
                     invocation_id=invocation_id,
-                    job_id=
+                    job_id=job_id,
                     timestamp=time.time(),
                     executor="slurm",
                     data={
@@ -204,8 +261,8 @@
         """
         db = ExecutionDB()

-        # If id looks like an invocation_id
-        if
+        # If id looks like an invocation_id
+        if "." not in id:
             jobs = db.get_jobs(id)
             if not jobs:
                 return []
@@ -388,7 +445,7 @@
         """Kill a SLURM job.

         Args:
-            job_id: The job ID to kill.
+            job_id: The job ID (e.g., abc123.0) to kill.
         """
         db = ExecutionDB()
         job_data = db.get_job(job_id)
@@ -401,26 +458,31 @@
                 f"Job {job_id} is not a slurm job (executor: {job_data.executor})"
             )

-
-
-        result = _kill_slurm_job(
+        # OPTIMIZATION: Query status AND kill in ONE SSH call
+        slurm_status, result = _kill_slurm_job(
             slurm_job_ids=[job_data.data.get("slurm_job_id")],
             username=job_data.data.get("username"),
             hostname=job_data.data.get("hostname"),
             socket=job_data.data.get("socket"),
         )

+        # Mark job as killed in database if kill succeeded
         if result.returncode == 0:
-            killed_something = True
-
-        # Mark job as killed in database if we killed something
-        if killed_something:
             job_data.data["killed"] = True
             db.write_job(job_data)
         else:
-
-
+            # Use the pre-fetched status for better error message
+            current_status = None
+            if slurm_status:
+                current_status = SlurmExecutor._map_slurm_state_to_execution_state(
+                    slurm_status
+                )
+            error_msg = SlurmExecutor.get_kill_failure_message(
+                job_id,
+                f"slurm_job_id: {job_data.data.get('slurm_job_id')}",
+                current_status,
             )
+            raise RuntimeError(error_msg)


 def _create_slurm_sbatch_script(
@@ -430,7 +492,7 @@ def _create_slurm_sbatch_script(
     remote_task_subdir: Path,
     invocation_id: str,
     job_id: str,
-) ->
+) -> CmdAndReadableComment:
     """Generate the contents of a SLURM sbatch script for a given evaluation task.

     Args:
@@ -446,7 +508,6 @@ def _create_slurm_sbatch_script(
     # get task from mapping, overrides, urls
     tasks_mapping = load_tasks_mapping()
     task_definition = get_task_from_mapping(task.name, tasks_mapping)
-    health_url = get_health_url(cfg, get_endpoint_url(cfg, task, task_definition))

     # TODO(public release): convert to template
     s = "#!/bin/bash\n"
@@ -461,6 +522,8 @@
     s += "#SBATCH --gpus-per-node {}\n".format(cfg.execution.gpus_per_node)
     if hasattr(cfg.execution, "gres"):
         s += "#SBATCH --gres {}\n".format(cfg.execution.gres)
+    if cfg.execution.get("sbatch_comment"):
+        s += "#SBATCH --comment='{}'\n".format(cfg.execution.sbatch_comment)
     job_name = "{account}-{subproject}.{details}".format(
         account=cfg.execution.account,
         subproject=cfg.execution.subproject,
@@ -470,6 +533,8 @@
     s += "#SBATCH --exclusive\n"
     s += "#SBATCH --output {}\n".format(remote_task_subdir / "logs" / "slurm-%A.out")
     s += "\n"
+    s += f'TASK_DIR="{str(remote_task_subdir)}"\n'
+    s += "\n"

     # collect all env vars
     env_vars = copy.deepcopy(dict(cfg.evaluation.get("env_vars", {})))
@@ -484,8 +549,11 @@
         if os.getenv(env_var) is None:
             raise ValueError(f"Trying to pass an unset environment variable {env_var}.")

-    # check if required env vars are defined:
+    # check if required env vars are defined (excluding NEMO_EVALUATOR_DATASET_DIR which is handled separately):
     for required_env_var in task_definition.get("required_env_vars", []):
+        # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic below
+        if required_env_var == "NEMO_EVALUATOR_DATASET_DIR":
+            continue
         if required_env_var not in env_vars.keys():
             raise ValueError(
                 f"{task.name} task requires environment variable {required_env_var}."
@@ -531,6 +599,7 @@

     # prepare deployment mounts
     deployment_mounts_list = []
+    deployment_is_unsafe = False
     if cfg.deployment.type != "none":
         if checkpoint_path := cfg.deployment.get("checkpoint_path"):
             deployment_mounts_list.append(f"{checkpoint_path}:/checkpoint:ro")
@@ -542,36 +611,33 @@
             deployment_mounts_list.append(f"{source_mnt}:{target_mnt}")

     # add deployment srun command
-
-
-
-    if deployment_mounts_list:
-        s += "--container-mounts {} ".format(",".join(deployment_mounts_list))
-    if not cfg.execution.get("mounts", {}).get("mount_home", True):
-        s += "--no-container-mount-home "
-    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A.out")
-    deployment_env_var_names = list(
-        cfg.execution.get("env_vars", {}).get("deployment", {})
-    )
-    if cfg.deployment.get("env_vars"):
-        warnings.warn(
-            "cfg.deployment.env_vars will be deprecated in future versions. "
-            "Use cfg.execution.env_vars.deployment instead.",
-            category=DeprecationWarning,
-            stacklevel=2,
+    deployment_srun_cmd, deployment_is_unsafe, deployment_debug = (
+        _generate_deployment_srun_command(
+            cfg, deployment_mounts_list, remote_task_subdir
         )
-        deployment_env_var_names.extend(list(cfg.deployment["env_vars"]))
-    if deployment_env_var_names:
-        s += f"--container-env {','.join(deployment_env_var_names)} "
-    s += "{} &\n\n".format(cfg.deployment.command)  # run asynchronously
-    s += (
-        "SERVER_PID=$! # capture the PID of the server background srun process\n\n"
     )
+    s += deployment_srun_cmd

     # wait for the server to initialize
-
+    health_path = cfg.deployment.get("health_check_path", "/health")
+    # For multi-instance check all node IPs, for single instance check localhost
+    if cfg.deployment.get("multiple_instances", False):
+        ip_list = '"${NODES_IPS_ARRAY[@]}"'
+    else:
+        ip_list = '"127.0.0.1"'
+    s += _get_wait_for_server_handler(
+        ip_list,
+        cfg.deployment.port,
+        health_path,
+        "server",
+        check_pid=True,
+    )
     s += "\n\n"

+    # add proxy load balancer for multi-instance deployments
+    if cfg.deployment.get("multiple_instances", False):
+        s += _get_proxy_server_srun_command(cfg, remote_task_subdir)
+
     # prepare evaluation mounts
     evaluation_mounts_list = [
         "{}:/results".format(remote_task_subdir / "artifacts"),
@@ -581,9 +647,45 @@
     ):
         evaluation_mounts_list.append(f"{source_mnt}:{target_mnt}")

+    # Handle dataset directory mounting if NEMO_EVALUATOR_DATASET_DIR is required
+    if "NEMO_EVALUATOR_DATASET_DIR" in task_definition.get("required_env_vars", []):
+        # Get dataset directory from task config
+        if "dataset_dir" in task:
+            dataset_mount_host = task["dataset_dir"]
+        else:
+            raise ValueError(
+                f"{task.name} task requires a dataset_dir to be specified. "
+                f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
+            )
+        # Get container mount path (default to /datasets if not specified)
+        dataset_mount_container = task.get("dataset_mount_path", "/datasets")
+        # Add dataset mount to evaluation mounts list
+        evaluation_mounts_list.append(f"{dataset_mount_host}:{dataset_mount_container}")
+        # Export NEMO_EVALUATOR_DATASET_DIR environment variable
+        s += f"export NEMO_EVALUATOR_DATASET_DIR={dataset_mount_container}\n\n"
+
+    eval_factory_command_struct = get_eval_factory_command(
+        cfg,
+        task,
+        task_definition,
+    )
+
+    eval_factory_command = eval_factory_command_struct.cmd
+    # The debug comment for placing into the script and easy debug. Reason
+    # (see `CmdAndReadableComment`) is the current way of passing the command
+    # is base64-encoded config `echo`-ed into file.
+    # TODO(agronskiy): cleaner way is to encode everything with base64, not
+    # some parts (like ef_config.yaml) and just output as logs somewhere.
+    eval_factory_command_debug_comment = eval_factory_command_struct.debug
+
     # add evaluation srun command
+    s += "# Debug contents of the eval factory command's config\n"
+    s += eval_factory_command_debug_comment
+    s += "\n\n"
+
     s += "# evaluation client\n"
     s += "srun --mpi pmix --overlap "
+    s += "--nodes 1 --ntasks 1 "  # Client always runs on single node
     s += "--container-image {} ".format(eval_image)
     evaluation_env_var_names = list(
         cfg.execution.get("env_vars", {}).get("evaluation", {})
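
Note: tasks whose mapping lists NEMO_EVALUATOR_DATASET_DIR under required_env_vars must now declare a dataset_dir; dataset_mount_path is optional and defaults to /datasets. A hedged sketch of such a task entry (the task name and host path are placeholders, not values from this diff):

    from omegaconf import OmegaConf

    # Illustrative task entry; "my_task" and the host path are hypothetical.
    task = OmegaConf.create(
        {
            "name": "my_task",
            "dataset_dir": "/path/to/your/dataset",  # host directory to mount
            "dataset_mount_path": "/datasets",  # optional; /datasets by default
        }
    )
    # Mirrors the mount string the hunk above appends to evaluation_mounts_list:
    print(f'{task["dataset_dir"]}:{task.get("dataset_mount_path", "/datasets")}')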
@@ -592,43 +694,139 @@
     s += "--container-env {} ".format(",".join(evaluation_env_var_names))
     if not cfg.execution.get("mounts", {}).get("mount_home", True):
         s += "--no-container-mount-home "
+
     s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
     s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.out")
-    s += "bash -c '"
-    s +=
+    s += "bash -c '\n"
+    s += eval_factory_command
     s += "'\n\n"

     # terminate the server after all evaluation clients finish
     if cfg.deployment.type != "none":
-        s += "kill $SERVER_PID # terminate the server to finish gracefully\n
+        s += "kill $SERVER_PID # terminate the server to finish gracefully\n"
+        if cfg.deployment.get("multiple_instances", False):
+            s += "kill $PROXY_PID # terminate proxy to finish gracefully\n"
+        s += "\n"

     # auto-export
-
-
+    ae_cfg = cfg.execution.get("auto_export")
+    destinations: list = []
+    if isinstance(ae_cfg, list):
+        destinations = list(ae_cfg)
+    elif isinstance(ae_cfg, dict) or isinstance(ae_cfg, DictConfig):
+        destinations = list(ae_cfg.get("destinations", []) or [])
+
+    if destinations:
+        export_env = dict(cfg.execution.get("env_vars", {}).get("export", {}) or {})
+        s += _generate_auto_export_section(
+            cfg, job_id, destinations, export_env, remote_task_subdir
+        )

-
+    debug_str = "\n".join(["# " + line for line in s.splitlines()])
+
+    # Combine unsafe flags from both deployment and evaluation
+    is_potentially_unsafe = (
+        eval_factory_command_struct.is_potentially_unsafe or deployment_is_unsafe
+    )
+
+    return CmdAndReadableComment(
+        cmd=s,
+        debug=debug_str,
+        is_potentially_unsafe=is_potentially_unsafe,
+    )


 def _generate_auto_export_section(
     cfg: DictConfig,
-    job_id: str,
+    job_id: str,
+    destinations: list,
+    export_env: dict,
+    remote_task_subdir: Path,
+    export_image: str = "python:3.12.7-slim",
 ) -> str:
     """Generate simple auto-export section for sbatch script."""
-    auto_export_config = cfg.execution.get("auto_export", {})
-    destinations = auto_export_config.get("destinations", [])
-
     if not destinations:
         return ""

-    s = "\n#
+    s = "\n# Auto-export on success\n"
     s += "EVAL_EXIT_CODE=$?\n"
     s += "if [ $EVAL_EXIT_CODE -eq 0 ]; then\n"
     s += " echo 'Evaluation completed successfully. Starting auto-export...'\n"
+    s += f' cd "{remote_task_subdir}/artifacts"\n'

-    for
-
-
+    # Work with DictConfig; convert only for YAML at the end
+    exec_type = (
+        cfg.execution.type
+        if hasattr(cfg.execution, "type")
+        else cfg.execution.get("type", "slurm")
+    )
+    eval_tasks = (
+        list(cfg.evaluation.tasks)
+        if hasattr(cfg, "evaluation") and hasattr(cfg.evaluation, "tasks")
+        else list((cfg.get("evaluation", {}) or {}).get("tasks", []) or [])
+    )
+    export_block = cfg.get("export", {}) or {}
+
+    payload = {
+        "execution": {
+            "auto_export": {
+                "destinations": list(destinations),
+                **({"env_vars": dict(export_env)} if export_env else {}),
+            },
+            "type": exec_type,
+        },
+        "evaluation": {"tasks": eval_tasks},
+    }
+    if export_block:
+        # Convert just this block to plain for YAML
+        payload["export"] = (
+            OmegaConf.to_object(export_block)
+            if OmegaConf.is_config(export_block)
+            else dict(export_block)
+        )

+    # Final YAML (single conversion at the end)
+    payload_clean = OmegaConf.to_container(OmegaConf.create(payload), resolve=True)
+    yaml_str = yaml.safe_dump(payload_clean, sort_keys=False)
+    s += " cat > export_config.yml << 'EOF'\n"
+    s += yaml_str
+    s += "EOF\n"
+
+    # write launcher config as config.yml for exporters (no core command)
+    submitted_yaml = yaml.safe_dump(
+        OmegaConf.to_container(cfg, resolve=True), sort_keys=False
+    )
+    s += " cat > config.yml << 'EOF'\n"
+    s += submitted_yaml
+    s += "EOF\n"
+
+    # Export host only env before running auto export
+    for k, v in (export_env or {}).items():
+        if isinstance(v, str) and re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", v):
+            s += f' export {k}="${{{v}}}"\n'
+        else:
+            esc = str(v).replace('"', '\\"')
+            s += f' export {k}="{esc}"\n'
+
+    s += " # export\n"
+    s += " srun --mpi pmix --overlap "
+    s += "--nodes 1 --ntasks 1 "  # Client always runs on single node
+    s += "--container-image {} ".format(export_image)
+    if export_env:
+        s += "--container-env {} ".format(",".join(export_env))
+    if not cfg.execution.get("mounts", {}).get("mount_home", True):
+        s += "--no-container-mount-home "
+
+    s += f"--container-mounts {remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts "
+    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.out")
+    s += " bash -c '\n"
+    # FIXME(martas): would be good to install specific version
+    s += " pip install nemo-evaluator-launcher[all]\n"
+    s += f" cd {remote_task_subdir}/artifacts\n"
+    for dest in destinations:
+        s += f' echo "Exporting to {dest}..."\n'
+        s += f' nemo-evaluator-launcher export {job_id} --dest {dest} || echo "Export to {dest} failed"\n'
+    s += "'\n"
     s += " echo 'Auto-export completed.'\n"
     s += "else\n"
     s += " echo 'Evaluation failed with exit code $EVAL_EXIT_CODE. Skipping auto-export.'\n"
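
Note: the generated script heredocs an export_config.yml whose shape mirrors the `payload` dict above. A minimal sketch of that shape, assuming hypothetical destinations (the wandb and mlflow exporters do ship with the package per the file list; the env var mapping is illustrative):

    import yaml

    # Illustrative payload; values are placeholders, the structure follows the
    # `payload` dict assembled in _generate_auto_export_section above.
    payload = {
        "execution": {
            "auto_export": {
                "destinations": ["wandb", "mlflow"],
                "env_vars": {"WANDB_API_KEY": "WANDB_API_KEY"},
            },
            "type": "slurm",
        },
        "evaluation": {"tasks": [{"name": "my_task"}]},
    }
    print(yaml.safe_dump(payload, sort_keys=False))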
@@ -643,9 +841,12 @@ def _open_master_connection(
     socket: str,
 ) -> str | None:
     ssh_command = f"ssh -MNf -S {socket} {username}@{hostname}"
+    logger.info("Opening master connection", cmd=ssh_command)
     completed_process = subprocess.run(args=shlex.split(ssh_command))
     if completed_process.returncode == 0:
+        logger.info("Opened master connection successfully", cmd=ssh_command)
         return socket
+    logger.error("Failed to open master connection", code=completed_process.returncode)
     return None

@@ -657,9 +858,7 @@ def _close_master_connection(
     if socket is None:
         return
     ssh_command = f"ssh -O exit -S {socket} {username}@{hostname}"
-    completed_process = subprocess.run(
-        args=shlex.split(ssh_command), capture_output=True
-    )
+    completed_process = subprocess.run(args=shlex.split(ssh_command))
     if completed_process.returncode != 0:
         raise RuntimeError(
             "failed to close the master connection\n{}".format(
@@ -681,12 +880,23 @@ def _make_remote_execution_output_dir(
     ssh_command.append(f"{username}@{hostname}")
     ssh_command.append(mkdir_command)
     ssh_command = " ".join(ssh_command)
-
+    logger.info("Creating remote dir", cmd=ssh_command)
+    completed_process = subprocess.run(
+        args=shlex.split(ssh_command), stderr=subprocess.PIPE
+    )
     if completed_process.returncode != 0:
+        error_msg = (
+            completed_process.stderr.decode("utf-8")
+            if completed_process.stderr
+            else "Unknown error"
+        )
+        logger.error(
+            "Error creating remote dir",
+            code=completed_process.returncode,
+            msg=error_msg,
+        )
         raise RuntimeError(
-            "failed to make a remote execution output dir\n{}".format(
-                completed_process.stderr.decode("utf-8")
-            )
+            "failed to make a remote execution output dir\n{}".format(error_msg)
         )

@@ -712,14 +922,25 @@ def _rsync_upload_rundirs(
     remote_destination_str = f"{username}@{hostname}:{remote_target}"
     local_sources_str = " ".join(map(str, local_sources))
     rsync_upload_command = f"rsync -qcaz {local_sources_str} {remote_destination_str}"
-
+    logger.info("Rsyncing to remote dir", cmd=rsync_upload_command)
+    completed_process = subprocess.run(
+        args=shlex.split(rsync_upload_command),
+        stderr=subprocess.PIPE,
+    )
     if completed_process.returncode != 0:
-
-        "
-
-
+        error_msg = (
+            completed_process.stderr.decode("utf-8")
+            if completed_process.stderr
+            else "Unknown error"
         )

+        logger.error(
+            "Error rsyncing to remote dir",
+            code=completed_process.returncode,
+            msg=error_msg,
+        )
+        raise RuntimeError("failed to upload local sources\n{}".format(error_msg))
+

 def _sbatch_remote_runsubs(
     remote_runsub_paths: List[Path],
@@ -739,19 +960,22 @@ def _sbatch_remote_runsubs(
     ssh_command.append(f"{username}@{hostname}")
     ssh_command.append(sbatch_commands)
     ssh_command = " ".join(ssh_command)
-
+    logger.info("Running sbatch", cmd=ssh_command)
     completed_process = subprocess.run(
-        args=shlex.split(ssh_command),
+        args=shlex.split(ssh_command),
+        # NOTE(agronskiy): look out for hangs and deadlocks
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
     )
     if completed_process.returncode != 0:
+        error_msg = completed_process.stderr.decode("utf-8")
         raise RuntimeError(
-            "failed to submit sbatch scripts for execution\n{}".format(
-                completed_process.stderr.decode("utf-8")
-            )
+            "failed to submit sbatch scripts for execution\n{}".format(error_msg)
         )

     sbatch_output = completed_process.stdout.decode("utf-8")
     slurm_job_ids = re.findall(r"(?<=Submitted batch job )\d+", sbatch_output)
+    logger.info("Started sbatch successfully", slurm_job_ids=slurm_job_ids)
     return slurm_job_ids

@@ -784,7 +1008,10 @@ def _query_slurm_jobs_status(
     ssh_command.append(sacct_command)
     ssh_command = " ".join(ssh_command)
     completed_process = subprocess.run(
-        args=shlex.split(ssh_command),
+        args=shlex.split(ssh_command),
+        # NOTE(agronskiy): look out for hangs and deadlocks
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
     )
     if completed_process.returncode != 0:
         raise RuntimeError(
@@ -803,34 +1030,50 @@

 def _kill_slurm_job(
     slurm_job_ids: List[str], username: str, hostname: str, socket: str | None
-) -> None:
-    """Kill a SLURM job.
+) -> tuple[str | None, subprocess.CompletedProcess]:
+    """Kill a SLURM job, querying status first in one SSH call for efficiency.

     Args:
         slurm_job_ids: List of SLURM job IDs to kill.
         username: SSH username.
         hostname: SSH hostname.
         socket: control socket location or None
+
+    Returns:
+        Tuple of (status_string, completed_process) where status_string is the SLURM status or None
     """
     if len(slurm_job_ids) == 0:
-        return
-
+        return None, subprocess.CompletedProcess(args=[], returncode=0)
+
+    jobs_str = ",".join(slurm_job_ids)
+    # Combine both commands in one SSH call: query THEN kill
+    combined_command = (
+        f"sacct -j {jobs_str} --format='JobID,State%32' --noheader -P 2>/dev/null; "
+        f"scancel {jobs_str}"
+    )
+
     ssh_command = ["ssh"]
     if socket is not None:
         ssh_command.append(f"-S {socket}")
     ssh_command.append(f"{username}@{hostname}")
-    ssh_command.append(
+    ssh_command.append(combined_command)
     ssh_command = " ".join(ssh_command)
+
     completed_process = subprocess.run(
-        args=shlex.split(ssh_command),
+        args=shlex.split(ssh_command),
+        # NOTE(agronskiy): look out for hangs and deadlocks
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
     )
-
-
-
-
-
-
-
+
+    # Parse the sacct output (before scancel runs)
+    sacct_output = completed_process.stdout.decode("utf-8")
+    sacct_output_lines = sacct_output.strip().split("\n")
+    slurm_status = None
+    if sacct_output_lines and len(slurm_job_ids) == 1:
+        slurm_status = _parse_slurm_job_status(slurm_job_ids[0], sacct_output_lines)
+
+    return slurm_status, completed_process


 def _parse_slurm_job_status(slurm_job_id: str, sacct_output_lines: List[str]) -> str:
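
Note: `sacct -P --noheader` emits pipe-separated rows such as `123456|RUNNING`, plus step rows like `123456.batch|RUNNING`. `_parse_slurm_job_status` itself is not shown in this diff; the sketch below only illustrates parsing that output format, under that assumption:

    # Hedged sketch: pick the State column for the parent job id from
    # `sacct -P --noheader --format='JobID,State%32'` rows.
    def parse_state(slurm_job_id: str, lines: list[str]) -> str | None:
        for line in lines:
            parts = line.split("|")
            if len(parts) >= 2 and parts[0] == slurm_job_id:
                return parts[1].strip()
        return None

    assert parse_state("123456", ["123456|RUNNING", "123456.batch|RUNNING"]) == "RUNNING"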
@@ -898,7 +1141,10 @@ def _read_files_from_remote(
     ssh_command.append(cat_commands)
     ssh_command = " ".join(ssh_command)
     completed_process = subprocess.run(
-        args=shlex.split(ssh_command),
+        args=shlex.split(ssh_command),
+        # NOTE(agronskiy): look out for hangs and deadlocks
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
     )
     if completed_process.returncode != 0:
         raise RuntimeError(
@@ -975,9 +1221,236 @@ sbatch --dependency=afternotok:$SLURM_JOB_ID $_this_script $SLURM_JOB_ID
 """.strip()


-
-
-#
-
+def _generate_haproxy_config_with_placeholders(cfg):
+    """Generate HAProxy configuration with placeholder IPs using Jinja template."""
+    # Set up Jinja environment
+    template_dir = Path(__file__).parent
+    template_path = template_dir / "proxy.cfg.template"
+
+    if not template_path.exists():
+        raise FileNotFoundError(f"Proxy template not found: {template_path}")
+
+    env = Environment(loader=FileSystemLoader(template_dir))
+    template = env.get_template("proxy.cfg.template")
+
+    # Prepare template data with placeholder IPs - use actual number of nodes
+    num_nodes = cfg.execution.num_nodes
+    nodes = []
+    for i in range(num_nodes):
+        nodes.append({"ip": f"{{IP_{i}}}", "port": cfg.deployment.port})
+
+    # Get health check parameters from execution config
+    proxy_config = cfg.execution.get("proxy", {}).get("config", {})
+    health_check_path = proxy_config.get("health_check_path", "/health")
+    health_check_status = proxy_config.get("health_check_status", 200)
+    haproxy_port = proxy_config.get("haproxy_port", 5009)
+
+    # Render template
+    config = template.render(
+        haproxy_port=haproxy_port,
+        health_check_path=health_check_path,
+        health_check_status=health_check_status,
+        nodes=nodes,
+    )
+
+    return config
+
+
+def _generate_haproxy_config(cfg, nodes_ips):
+    """Generate HAProxy configuration using Jinja template."""
+    # Set up Jinja environment
+    template_dir = Path(__file__).parent
+    template_path = template_dir / "proxy.cfg.template"
+
+    if not template_path.exists():
+        raise FileNotFoundError(f"Proxy template not found: {template_path}")
+
+    env = Environment(loader=FileSystemLoader(template_dir))
+    template = env.get_template("proxy.cfg.template")
+
+    # Prepare template data
+    nodes = []
+    for i, ip in enumerate(nodes_ips, 1):
+        nodes.append(
+            {"ip": ip, "port": cfg.deployment.port}  # All nodes use the same port
+        )
+
+    # Get health check parameters from deployment config
+    health_check_path = cfg.deployment.get("health_check_path", "/health")
+    health_check_status = cfg.deployment.get("health_check_status", 200)
+    haproxy_port = cfg.deployment.get("haproxy_port", 5009)
+
+    # Render template
+    config = template.render(
+        haproxy_port=haproxy_port,
+        health_check_path=health_check_path,
+        health_check_status=health_check_status,
+        nodes=nodes,
+    )
+
+    return config
+
+
+def _generate_deployment_srun_command(
+    cfg, deployment_mounts_list, remote_task_subdir, instance_id: int = 0
+):
+    """Generate the deployment srun command with proper node/ntask configuration.
+
+    Returns:
+        tuple: (script_string, is_potentially_unsafe, debug_comment)
+    """
+    s = ""
+    debug_comment = ""
+    is_potentially_unsafe = False
+
+    s += "# deployment server\n"
+
+    # Extract pre_cmd for later use inside container
+    pre_cmd: str = cfg.deployment.get("pre_cmd") or ""
+    if pre_cmd:
+        is_potentially_unsafe = True
+        create_pre_script_cmd = _str_to_echo_command(
+            pre_cmd, filename="deployment_pre_cmd.sh"
+        )
+        debug_comment += create_pre_script_cmd.debug + "\n\n"
+
+    s += "# Get node IPs\n"
+    s += "nodes=( $(scontrol show hostnames $SLURM_JOB_NODELIST) )\n"
+    s += 'nodes_array=("${nodes[@]}") # Ensure nodes are stored properly\n'
+    s += 'export NODES_IPS_ARRAY=($(for node in "${nodes_array[@]}"; do srun --nodelist=$node --ntasks=1 --nodes=1 hostname --ip-address; done))\n'
+    s += 'echo "Node IPs: ${NODES_IPS_ARRAY[@]}"\n'
+    s += "# Export MASTER_IP as the first node IP\n"
+    s += "export MASTER_IP=${NODES_IPS_ARRAY[0]}\n"
+    s += 'echo "MASTER_IP: $MASTER_IP"\n'
+
+    # Add debug comment for deployment pre_cmd before srun command
+    if debug_comment:
+        s += "# Debug contents of deployment pre_cmd\n"
+        s += debug_comment
+        s += "\n"
+
+    s += "srun --mpi pmix --overlap "
+    s += f"--nodes {cfg.execution.num_nodes} --ntasks {cfg.execution.get('deployment', {}).get('n_tasks', 1)} "
+    s += "--container-image {} ".format(cfg.deployment.image)
+    if deployment_mounts_list:
+        s += "--container-mounts {} ".format(",".join(deployment_mounts_list))
+    if not cfg.execution.get("mounts", {}).get("mount_home", True):
+        s += "--no-container-mount-home "
+    s += "--output {} ".format(remote_task_subdir / "logs" / "server-%A-%t.out")
+
+    deployment_env_var_names = list(
+        cfg.execution.get("env_vars", {}).get("deployment", {})
+    )
+    if cfg.deployment.get("env_vars"):
+        warnings.warn(
+            "cfg.deployment.env_vars will be deprecated in future versions. "
+            "Use cfg.execution.env_vars.deployment instead.",
+            category=DeprecationWarning,
+            stacklevel=2,
+        )
+        deployment_env_var_names.extend(list(cfg.deployment["env_vars"]))
+
+    # Always add MASTER_IP to the environment variables
+    if "MASTER_IP" not in deployment_env_var_names:
+        deployment_env_var_names.append("MASTER_IP")
+
+    if deployment_env_var_names:
+        s += f"--container-env {','.join(deployment_env_var_names)} "
+
+    # Wrap deployment command to execute pre_cmd inside container if needed
+    if pre_cmd:
+        # Create a wrapper command that runs inside the container:
+        # 1. Create deployment_pre_cmd.sh file
+        # 2. Source it
+        # 3. Execute the original deployment command
+        create_pre_script_cmd = _str_to_echo_command(
+            pre_cmd, filename="deployment_pre_cmd.sh"
+        )
+        # Escape single quotes in the deployment command for bash -c
+        escaped_deployment_cmd = cfg.deployment.command.replace("'", "'\"'\"'")
+        wrapped_command = (
+            f"bash -c '{create_pre_script_cmd.cmd} && "
+            f"source deployment_pre_cmd.sh && "
+            f"{escaped_deployment_cmd}'"
+        )
+        s += "{} &\n\n".format(wrapped_command)
+    else:
+        s += "{} &\n\n".format(cfg.deployment.command)  # run asynchronously
+
+    s += "SERVER_PID=$! # capture the PID of the server background srun process\n\n"
+
+    return s, is_potentially_unsafe, debug_comment
+
+
+def _get_wait_for_server_handler(
+    ip_list: str,
+    port: int,
+    health_check_path: str,
+    service_name: str = "server",
+    check_pid: bool = False,
+):
+    """Generate wait for server handler that takes a list of IPs."""
+    pid_check = ""
+    if check_pid:
+        pid_check = 'kill -0 "$SERVER_PID" 2>/dev/null || { echo "Server process $SERVER_PID died"; exit 1; }'
+
+    handler = f"""date
+# wait for the {service_name} to initialize
+for ip in {ip_list}; do
+    echo "Waiting for {service_name} on $ip..."
+    while [[ "$(curl -s -o /dev/null -w "%{{http_code}}" http://$ip:{port}{health_check_path})" != "200" ]]; do
+        {pid_check}
+        sleep 5
+    done
+    echo "{service_name} ready on $ip!"
+done
 date
 """.strip()
+
+    return handler
+
+
+def _get_proxy_server_srun_command(cfg, remote_task_subdir):
+    """Generate proxy server srun command based on proxy type."""
+    proxy_type = cfg.execution.get("proxy", {}).get("type", "haproxy")
+
+    if proxy_type == "haproxy":
+        return _generate_haproxy_srun_command(cfg, remote_task_subdir)
+    else:
+        raise ValueError(
+            f"Unsupported proxy type: {proxy_type}. Currently only 'haproxy' is supported."
+        )
+
+
+def _generate_haproxy_srun_command(cfg, remote_task_subdir):
+    """Generate HAProxy-specific srun command using template-based config."""
+    s = ""
+    s += "# Proxy load balancer\n"
+    s += "# Copy template to config file (important for restarts)\n"
+    s += f"cp {remote_task_subdir}/proxy.cfg.template {remote_task_subdir}/proxy.cfg\n"
+    s += "# Replace placeholder IPs with actual node IPs\n"
+    s += f"proxy_config_file={remote_task_subdir}/proxy.cfg\n"
+    s += 'for i in "${!NODES_IPS_ARRAY[@]}"; do\n'
+    s += '    ip="${NODES_IPS_ARRAY[$i]}"\n'
+    s += '    sed -i "s/{IP_$i}/$ip/g" "$proxy_config_file"\n'
+    s += "done\n"
+    s += "\n"
+    s += "srun --mpi pmix --overlap "
+    s += "--nodes 1 --ntasks 1 "
+    s += f"--container-image {cfg.execution.get('proxy', {}).get('image', 'haproxy:latest')} "
+    s += f"--container-mounts {remote_task_subdir}/proxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro "
+    s += f"--output {remote_task_subdir}/logs/proxy-%A.out "
+    s += "haproxy -f /usr/local/etc/haproxy/haproxy.cfg &\n"
+    s += "PROXY_PID=$! # capture the PID of the proxy background srun process\n"
+    s += 'echo "Proxy started with PID: $PROXY_PID"\n\n'
+
+    # Wait for proxy to be ready on localhost
+    proxy_config = cfg.execution.get("proxy", {}).get("config", {})
+    haproxy_port = proxy_config.get("haproxy_port", 5009)
+    health_path = proxy_config.get("health_check_path", "/health")
+    s += _get_wait_for_server_handler(
+        "127.0.0.1", haproxy_port, health_path, "Proxy", check_pid=False
+    )
+    s += "\n"
+
+    return s
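
Note: the packaged proxy.cfg.template (new in this release, +26 lines per the file list) is not reproduced in this diff. The sketch below is a stand-in that exercises the same variables the renderers pass (haproxy_port, health_check_path, health_check_status, nodes) and the {IP_i} placeholders that the generated sbatch script later rewrites with sed; the HAProxy directives are illustrative, not the shipped template:

    from jinja2 import Template

    # Stand-in template, NOT the packaged proxy.cfg.template.
    template = Template(
        "frontend llm_frontend\n"
        "    bind *:{{ haproxy_port }}\n"
        "    default_backend llm_backend\n"
        "\n"
        "backend llm_backend\n"
        "    option httpchk GET {{ health_check_path }}\n"
        "    http-check expect status {{ health_check_status }}\n"
        "{% for node in nodes %}"
        "    server node{{ loop.index0 }} {{ node.ip }}:{{ node.port }} check\n"
        "{% endfor %}"
    )
    print(
        template.render(
            haproxy_port=5009,
            health_check_path="/health",
            health_check_status=200,
            nodes=[{"ip": "{IP_0}", "port": 8000}, {"ip": "{IP_1}", "port": 8000}],
        )
    )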