nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/executors/lepton/deployment_helpers.py

@@ -19,6 +19,7 @@ Handles Lepton endpoint creation, management, and health checks.
 """
 
 import json
+import shlex
 import subprocess
 import time
 from pathlib import Path
@@ -27,6 +28,7 @@ from typing import Any, Dict, Optional
 # Import lepton dependencies
 from omegaconf import DictConfig
 
+from nemo_evaluator_launcher.common.helpers import _str_to_echo_command
 from nemo_evaluator_launcher.common.logging_utils import logger
 
 
@@ -235,6 +237,8 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, Any]:
     Returns:
         Container specification for Lepton.
     """
+    # Extract pre_cmd from deployment_cfg
+    pre_cmd: str = deployment_cfg.get("pre_cmd") or ""
     container_spec = {
         "image": deployment_cfg.image,
         "ports": [{"container_port": deployment_cfg.port}],
@@ -258,6 +262,18 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, Any]:
         if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
             command_parts.extend(deployment_cfg.extra_args.split())
 
+        # Wrap with pre_cmd if provided
+        if pre_cmd:
+            create_pre_script_cmd = _str_to_echo_command(
+                pre_cmd, filename="deployment_pre_cmd.sh"
+            )
+            original_cmd = " ".join(shlex.quote(str(c)) for c in command_parts)
+            command_parts = [
+                "/bin/bash",
+                "-c",
+                f"{create_pre_script_cmd.cmd} && source deployment_pre_cmd.sh && exec {original_cmd}",
+            ]
+
         container_spec["command"] = command_parts
 
     elif deployment_cfg.type == "sglang":
@@ -278,12 +294,31 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, Any]:
         if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
             command_parts.extend(deployment_cfg.extra_args.split())
 
+        # Wrap with pre_cmd if provided
+        if pre_cmd:
+            create_pre_script_cmd = _str_to_echo_command(
+                pre_cmd, filename="deployment_pre_cmd.sh"
+            )
+            original_cmd = " ".join(shlex.quote(str(c)) for c in command_parts)
+            command_parts = [
+                "/bin/bash",
+                "-c",
+                f"{create_pre_script_cmd.cmd} && source deployment_pre_cmd.sh && exec {original_cmd}",
+            ]
+
         container_spec["command"] = command_parts
 
     elif deployment_cfg.type == "nim":
         # NIM containers use their default entrypoint - no custom command needed
         # Configuration is handled via environment variables
-        pass
+        # pre_cmd is not supported for NIM deployments
+        if pre_cmd:
+            logger.error(
+                "pre_cmd is not supported for NIM deployments",
+                deployment_type="nim",
+                pre_cmd=pre_cmd,
+            )
+            raise ValueError("pre_cmd is not supported for NIM deployments")
 
     return container_spec
 
@@ -428,14 +463,34 @@ def create_lepton_endpoint(cfg: DictConfig, endpoint_name: str) -> bool:
             print(f"✅ Successfully created Lepton endpoint: {endpoint_name}")
             return True
         else:
-            print(f"❌ Failed to create Lepton endpoint: {result.stderr}")
+            error_msg = result.stderr.strip() if result.stderr else ""
+            output_msg = result.stdout.strip() if result.stdout else ""
+            print(
+                f"✗ Failed to create Lepton endpoint | Endpoint: {endpoint_name} | Return code: {result.returncode}"
+            )
+            if error_msg:
+                print(f" stderr: {error_msg}")
+            if output_msg:
+                print(f" stdout: {output_msg}")
             return False
 
-    except subprocess.TimeoutExpired:
-        print(f"❌ Timeout creating Lepton endpoint: {endpoint_name}")
+    except subprocess.TimeoutExpired as e:
+        print(
+            f"✗ Timeout creating Lepton endpoint | Endpoint: {endpoint_name} | Timeout: 300s"
+        )
+        if hasattr(e, "stderr") and e.stderr:
+            print(f" stderr: {e.stderr}")
+        if hasattr(e, "stdout") and e.stdout:
+            print(f" stdout: {e.stdout}")
         return False
     except subprocess.CalledProcessError as e:
-        print(f"❌ Error creating Lepton endpoint: {e}")
+        print(
+            f"✗ Error creating Lepton endpoint | Endpoint: {endpoint_name} | Error: {e}"
+        )
+        if hasattr(e, "stderr") and e.stderr:
+            print(f" stderr: {e.stderr}")
+        if hasattr(e, "stdout") and e.stdout:
+            print(f" stdout: {e.stdout}")
        return False
     finally:
         # Clean up temporary file
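For reference, a minimal sketch of how the pre_cmd wrapping added to _create_inference_container_spec composes the final container command. make_pre_script_cmd below is a hypothetical stand-in for the launcher's _str_to_echo_command helper (its implementation is not shown in this diff), and the server command and pre_cmd values are made-up examples:

import base64
import shlex

def make_pre_script_cmd(pre_cmd: str, filename: str) -> str:
    # Hypothetical stand-in: encode the user-provided pre_cmd so arbitrary quoting
    # survives the shell, and emit a command that materializes it as a script file.
    encoded = base64.b64encode(pre_cmd.encode()).decode()
    return f"echo {encoded} | base64 -d > {shlex.quote(filename)}"

server_cmd = ["python3", "-m", "my_inference_server", "--port", "8000"]  # example only
pre_cmd = "pip install some-extra-dependency"                            # example only

create_script = make_pre_script_cmd(pre_cmd, "deployment_pre_cmd.sh")
original_cmd = " ".join(shlex.quote(c) for c in server_cmd)
# Same shape as the wrapped command_parts in the hunks above:
wrapped = ["/bin/bash", "-c", f"{create_script} && source deployment_pre_cmd.sh && exec {original_cmd}"]
print(wrapped)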
nemo_evaluator_launcher/executors/lepton/executor.py

@@ -18,6 +18,7 @@
 Handles deployment and evaluation using Lepton endpoints with NIM containers.
 """
 
+import os
 import time
 from pathlib import Path
 from typing import List
@@ -36,6 +37,7 @@ from nemo_evaluator_launcher.common.mapping import (
     get_task_from_mapping,
     load_tasks_mapping,
 )
+from nemo_evaluator_launcher.common.printing_utils import red
 from nemo_evaluator_launcher.executors.base import (
     BaseExecutor,
     ExecutionState,
@@ -78,9 +80,75 @@ class LeptonExecutor(BaseExecutor):
                 "LeptonExecutor supports deployment types: 'vllm', 'sglang', 'nim', 'none'"
             )
 
+        # Load tasks mapping
+        tasks_mapping = load_tasks_mapping()
+        job_ids = []
+        lepton_job_names = []
+        endpoint_names = []  # Track multiple endpoints
+        db = ExecutionDB()
+
         # Generate invocation ID
         invocation_id = generate_invocation_id()
 
+        # TODO(agronskiy): the structure of this executor differs from others,
+        # so the best place to check for unsafe commands yelids a bit of duplication.
+        # We can't use the get_eval_factory_command here because the port is not yet
+        # populated.
+        # Refactor the whole thing.
+        is_potentially_unsafe = False
+        for idx, task in enumerate(cfg.evaluation.tasks):
+            pre_cmd: str = task.get("pre_cmd") or cfg.evaluation.get("pre_cmd") or ""
+            if pre_cmd:
+                is_potentially_unsafe = True
+                break
+
+        # Check for deployment pre_cmd
+        deployment_pre_cmd: str = cfg.deployment.get("pre_cmd") or ""
+        if deployment_pre_cmd:
+            is_potentially_unsafe = True
+
+        # DRY-RUN mode
+        if dry_run:
+            output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            # Validate configuration
+            _dry_run_lepton(cfg, tasks_mapping, invocation_id=invocation_id)
+
+            if cfg.deployment.type == "none":
+                print("Using existing endpoint (deployment: none)")
+                print("using shared endpoint")
+            else:
+                print(f"with endpoint type '{cfg.deployment.type}'")
+
+            if is_potentially_unsafe:
+                print(
+                    red(
+                        "\nFound `pre_cmd` (evaluation or deployment) which carries security risk. When running without --dry-run "
+                        "make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1"
+                    )
+                )
+
+            return invocation_id
+
+        if is_potentially_unsafe:
+            if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
+                logger.warning(
+                    "Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is set, proceeding with caution."
+                )
+
+            else:
+                logger.error(
+                    "Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is not set. This might carry security risk and unstable environments. "
+                    "To continue, make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1.",
+                )
+                raise AttributeError(
+                    "Untrusted command found in config, make sure you trust and "
+                    "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
+                )
+
         # For deployment: none, we use the existing endpoint for all tasks
         if cfg.deployment.type == "none":
             print("📌 Using existing endpoint (deployment: none)")
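A minimal sketch of the NEMO_EVALUATOR_TRUST_PRE_CMD gate added above, assuming only what the hunk shows (the function name and boolean argument here are illustrative, not the launcher's API):

import os

def ensure_pre_cmd_trusted(has_pre_cmd: bool) -> None:
    # Illustrative gate: refuse to run an untrusted pre_cmd unless explicitly opted in.
    if not has_pre_cmd:
        return
    if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
        return  # user explicitly opted in
    raise AttributeError(
        "Untrusted command found in config, make sure you trust and "
        "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
    )

# Opt in (only when the pre_cmd content is trusted):
# os.environ["NEMO_EVALUATOR_TRUST_PRE_CMD"] = "1"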
@@ -88,13 +156,6 @@ class LeptonExecutor(BaseExecutor):
             print(f"✅ Using shared endpoint: {shared_endpoint_url}")
 
         try:
-            # Load tasks mapping
-            tasks_mapping = load_tasks_mapping()
-            job_ids = []
-            lepton_job_names = []
-            endpoint_names = []  # Track multiple endpoints
-            db = ExecutionDB()
-
             # Create local directory for outputs
             output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
             output_dir.mkdir(parents=True, exist_ok=True)
@@ -139,8 +200,13 @@ class LeptonExecutor(BaseExecutor):
                 task_index = str(idx)
                 endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
 
-                # Ensure we don't exceed 36 character limit
                 if len(endpoint_name) > 36:
+                    logger.info(
+                        "Lepton endpoint name will be deployed under name {task_name}",
+                        task_name=task.name,
+                        original=endpoint_name,
+                        limit=36,
+                    )
                     # Truncate task name further if needed
                     max_task_len = (
                         36
@@ -151,7 +217,19 @@ class LeptonExecutor(BaseExecutor):
                     )  # 3 hyphens
                     short_task_name = sanitized_task_name[:max_task_len]
                     endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
+                    logger.info(
+                        "Lepton endpoint name is auto-generated",
+                        task_name=task.name,
+                        original=endpoint_name,
+                        truncated=endpoint_name,
+                        limit=36,
+                    )
 
+                logger.info(
+                    "Lepton endpoint name (auto-generated)",
+                    task_name=task.name,
+                    endpoint_name=endpoint_name,
+                )
                 endpoint_names.append(endpoint_name)
                 endpoint_creation_tasks.append((idx, task, endpoint_name))
 
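Illustrative sketch of the 36-character endpoint-name budget applied above; the exact expression for max_task_len is elided in the diff, so the arithmetic here is an assumption based on the visible components and the "3 hyphens" comment (all values are examples):

deployment_type = "vllm"
sanitized_task_name = "mmlu-pro-instruct-long-task-name"
task_index = "0"
short_invocation = "a1b2c3"

endpoint_name = f"{deployment_type}-{sanitized_task_name}-{task_index}-{short_invocation}"
if len(endpoint_name) > 36:
    # Assumed budget: 36 minus the other components and the three joining hyphens.
    max_task_len = 36 - len(deployment_type) - len(task_index) - len(short_invocation) - 3
    short_task_name = sanitized_task_name[:max_task_len]
    endpoint_name = f"{deployment_type}-{short_task_name}-{task_index}-{short_invocation}"
print(endpoint_name, len(endpoint_name))  # fits within the 36-character limit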
@@ -298,20 +376,6 @@ class LeptonExecutor(BaseExecutor):
                     f"✅ All {len(cfg.evaluation.tasks)} endpoints created successfully!"
                 )
 
-            if dry_run:
-                print("🔍 DRY RUN: Lepton job configurations prepared")
-                print(f" - Tasks: {len(cfg.evaluation.tasks)}")
-                for idx, task in enumerate(cfg.evaluation.tasks):
-                    if cfg.deployment.type == "none":
-                        print(f" - Task {idx}: {task.name} using shared endpoint")
-                    else:
-                        print(
-                            f" - Task {idx}: {task.name} with endpoint {endpoint_names[idx]}"
-                        )
-                print(f" - Output directory: {output_dir}")
-                print("\nTo submit jobs, run the executor without --dry-run")
-                return invocation_id
-
             # ================================================================
             # JOB SUBMISSION (Sequential, as before)
             # ================================================================
@@ -334,8 +398,18 @@ class LeptonExecutor(BaseExecutor):
                 max_base_length = 36 - 1 - len(suffix)  # -1 for the hyphen
                 if len(base_job_name) > max_base_length:
                     base_job_name = base_job_name[:max_base_length]
+                    logger.info(
+                        "Lepton job auto-generated name",
+                        task_name=task.name,
+                        job_name=f"{base_job_name}-{suffix}",
+                    )
 
                 lepton_job_name = f"{base_job_name}-{suffix}"
+                logger.info(
+                    "Lepton job name (auto-generated)",
+                    task_name=task.name,
+                    job_name=lepton_job_name,
+                )
                 job_ids.append(job_id)
                 lepton_job_names.append(lepton_job_name)
 
@@ -377,7 +451,12 @@ class LeptonExecutor(BaseExecutor):
                     cfg.target.api_endpoint.url = full_endpoint_url
 
                     # Generate command with the correct endpoint URL
-                    eval_command = get_eval_factory_command(cfg, task, task_definition)
+                    eval_command_struct = get_eval_factory_command(
+                        cfg, task, task_definition
+                    )
+                    eval_command = eval_command_struct.cmd
+                    # Debug string for explainability of some base64-parts of the command
+                    eval_command_debug_comment = eval_command_struct.debug
 
                 finally:
                     # Restore original URL and struct mode
@@ -402,6 +481,7 @@ class LeptonExecutor(BaseExecutor):
                     task_name=task.name,
                     invocation_id=invocation_id,
                     eval_command=eval_command,  # Pass the fixed command
+                    eval_command_debug_comment=eval_command_debug_comment,
                 )
 
                 # Prepare job command to run the launch script
@@ -456,6 +536,33 @@ class LeptonExecutor(BaseExecutor):
 
                     job_mounts.append(mount_dict)
 
+                # Handle dataset directory mounting if NEMO_EVALUATOR_DATASET_DIR is required
+                if "NEMO_EVALUATOR_DATASET_DIR" in task_definition.get(
+                    "required_env_vars", []
+                ):
+                    # Get dataset directory from task config
+                    if "dataset_dir" in task:
+                        dataset_mount_host = task["dataset_dir"]
+                    else:
+                        raise ValueError(
+                            f"{task.name} task requires a dataset_dir to be specified. "
+                            f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
+                        )
+                    # Get container mount path (default to /datasets if not specified)
+                    dataset_mount_container = task.get(
+                        "dataset_mount_path", "/datasets"
+                    )
+                    # Add dataset mount to job mounts
+                    # Lepton mount format: {"path": "/path/in/container", "mount_from": {"path": "/host/path"}}
+                    job_mounts.append(
+                        {
+                            "path": dataset_mount_container,
+                            "mount_from": {"path": dataset_mount_host},
+                        }
+                    )
+                    # Add NEMO_EVALUATOR_DATASET_DIR environment variable
+                    job_env_vars["NEMO_EVALUATOR_DATASET_DIR"] = dataset_mount_container
+
                 print(
                     f" - Storage: {len(job_mounts)} mount(s) with evaluation ID isolation"
                 )
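For a task that needs a local dataset, the hunk above produces a mount entry and environment variable of this shape (the task entry and paths below are made-up examples):

task = {"name": "my-task", "dataset_dir": "/data/my_dataset"}  # hypothetical task config entry

dataset_mount_host = task["dataset_dir"]
dataset_mount_container = task.get("dataset_mount_path", "/datasets")

mount = {
    "path": dataset_mount_container,             # path inside the job container
    "mount_from": {"path": dataset_mount_host},  # host-side dataset location
}
env = {"NEMO_EVALUATOR_DATASET_DIR": dataset_mount_container}
print(mount, env)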
@@ -482,7 +589,8 @@ class LeptonExecutor(BaseExecutor):
 
                 if not job_success:
                     raise RuntimeError(
-                        f"Failed to submit Lepton job for task: {task.name}. Error: {error_msg}"
+                        f"Failed to submit Lepton job | Task: {task.name} | Job ID: {job_id} | "
+                        f"Lepton job name: {lepton_job_name} | Error: {error_msg}"
                     )
 
                 # Store job metadata in database (with task-specific endpoint info)
@@ -504,8 +612,6 @@ class LeptonExecutor(BaseExecutor):
                     )
                 )
 
-                print(f"✅ Task {task.name}: Submitted evaluation job {job_id}")
-
             # Jobs submitted successfully - return immediately (non-blocking)
             print(
                 f"\n✅ Successfully submitted {len(lepton_job_names)} evaluation jobs to Lepton"
@@ -536,9 +642,8 @@ class LeptonExecutor(BaseExecutor):
 
             return invocation_id
 
-        except Exception as e:
+        except Exception:
             # Clean up any created endpoints on failure
-            print(f"❌ Error during evaluation: {e}")
             if cfg.deployment.type != "none" and "endpoint_names" in locals():
                 for endpoint_name in endpoint_names:
                     if endpoint_name:
@@ -559,7 +664,7 @@ class LeptonExecutor(BaseExecutor):
         db = ExecutionDB()
 
         # If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
-        if len(id) == 8 and "." not in id:
+        if "." not in id:
             return _get_statuses_for_invocation_id(id=id, db=db)
         # Otherwise, treat as job_id
         job_data = db.get_job(id)
@@ -577,7 +682,7 @@ class LeptonExecutor(BaseExecutor):
             job_state = lepton_status.get("state", "Unknown")
 
             # Map Lepton job states to our execution states
-            if job_state == "Succeeded":
+            if job_state in ["Succeeded", "Completed"]:
                 state = ExecutionState.SUCCESS
             elif job_state in ["Running", "Pending", "Starting"]:
                 state = ExecutionState.RUNNING
@@ -624,76 +729,14 @@ class LeptonExecutor(BaseExecutor):
     def kill_job(job_id: str) -> None:
         """Kill Lepton evaluation jobs and clean up endpoints.
 
-        For invocation IDs, this will kill all jobs and clean up all
-        dedicated endpoints created for the invocation.
-
         Args:
-            job_id: The job ID or invocation ID to kill.
+            job_id: The job ID to kill.
 
         Raises:
             ValueError: If job is not found or invalid.
             RuntimeError: If job cannot be killed.
         """
         db = ExecutionDB()
-
-        # If it looks like an invocation_id, kill all jobs for that invocation
-        if len(job_id) == 8 and "." not in job_id:
-            jobs = db.get_jobs(job_id)
-            if not jobs:
-                raise ValueError(f"No jobs found for invocation {job_id}")
-
-            endpoint_names = (
-                set()
-            )  # Use set to avoid duplicates (though each should be unique)
-            lepton_job_names = []
-
-            # Collect all Lepton jobs and endpoint info
-            for curr_job_data in jobs.values():
-                if curr_job_data.executor != "lepton":
-                    continue
-
-                # Collect endpoint name for this job (each task may have its own)
-                endpoint_name = curr_job_data.data.get("endpoint_name")
-                if endpoint_name:
-                    endpoint_names.add(endpoint_name)
-
-                lepton_job_name = curr_job_data.data.get("lepton_job_name")
-                if lepton_job_name:
-                    lepton_job_names.append(lepton_job_name)
-
-                # Mark job as killed in database
-                curr_job_data.data["status"] = "killed"
-                curr_job_data.data["killed_time"] = time.time()
-                db.write_job(curr_job_data)
-
-            print(
-                f"🛑 Killing {len(lepton_job_names)} Lepton jobs for invocation {job_id}"
-            )
-
-            # Cancel all Lepton jobs
-            for lepton_job_name in lepton_job_names:
-                success = delete_lepton_job(lepton_job_name)
-                if success:
-                    print(f"✅ Cancelled Lepton job: {lepton_job_name}")
-                else:
-                    print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
-
-            # Clean up all dedicated endpoints
-            if endpoint_names:
-                print(f"🧹 Cleaning up {len(endpoint_names)} dedicated endpoints")
-                for endpoint_name in endpoint_names:
-                    success = delete_lepton_endpoint(endpoint_name)
-                    if success:
-                        print(f"✅ Cleaned up endpoint: {endpoint_name}")
-                    else:
-                        print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
-            else:
-                print("📌 No dedicated endpoints to clean up (using shared endpoint)")
-
-            print(f"🛑 Killed all resources for invocation {job_id}")
-            return
-
-        # Otherwise, treat as individual job_id
         job_data = db.get_job(job_id)
         if job_data is None:
             raise ValueError(f"Job {job_id} not found")
@@ -705,17 +748,25 @@ class LeptonExecutor(BaseExecutor):
 
         # Cancel the specific Lepton job
         lepton_job_name = job_data.data.get("lepton_job_name")
+
         if lepton_job_name:
-            success = delete_lepton_job(lepton_job_name)
-            if success:
+            cancel_success = delete_lepton_job(lepton_job_name)
+            if cancel_success:
                 print(f"✅ Cancelled Lepton job: {lepton_job_name}")
+                # Mark job as killed in database
+                job_data.data["status"] = "killed"
+                job_data.data["killed_time"] = time.time()
+                db.write_job(job_data)
             else:
-                print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
-
-        # Mark job as killed in database
-        job_data.data["status"] = "killed"
-        job_data.data["killed_time"] = time.time()
-        db.write_job(job_data)
+                # Use common helper to get informative error message based on job status
+                status_list = LeptonExecutor.get_status(job_id)
+                current_status = status_list[0].state if status_list else None
+                error_msg = LeptonExecutor.get_kill_failure_message(
+                    job_id, f"lepton_job: {lepton_job_name}", current_status
+                )
+                raise RuntimeError(error_msg)
+        else:
+            raise ValueError(f"No Lepton job name found for job {job_id}")
 
         print(f"🛑 Killed Lepton job {job_id}")
 
@@ -761,6 +812,7 @@ def _create_evaluation_launch_script(
     task_name: str,
     invocation_id: str,
     eval_command: str,
+    eval_command_debug_comment: str,
 ) -> str:
     """Create bash script for running evaluation in Lepton job container.
 
@@ -774,6 +826,7 @@ def _create_evaluation_launch_script(
         task_name: Name of the evaluation task.
         invocation_id: Unique invocation identifier.
         eval_command: The evaluation command with correct endpoint URL.
+        eval_command_debug_comment: The debug comment for placing into the script and easy debug
 
     Returns:
         String containing the bash launch script.
@@ -806,6 +859,8 @@ echo "Invocation ID: {invocation_id}"
 echo "Endpoint URL: {endpoint_url}"
 echo "Command: {eval_command_modified}"
 
+{eval_command_debug_comment}
+
 # Execute the evaluation with proper error handling
 set +e
 {eval_command_modified}
@@ -829,6 +884,90 @@ exit 0
     return script
 
 
+def _dry_run_lepton(
+    cfg: DictConfig, tasks_mapping: dict, invocation_id: str | None = None
+) -> None:
+    print("DRY RUN: Lepton job configurations prepared")
+    try:
+        # validate tasks
+        for task in cfg.evaluation.tasks:
+            get_task_from_mapping(task.name, tasks_mapping)
+
+        # nice-to-have checks (existing endpoint URL or endpoints mapping)
+        if getattr(cfg.deployment, "type", None) == "none":
+            tgt = getattr(cfg, "target", {})
+            api = (
+                tgt.get("api_endpoint")
+                if isinstance(tgt, dict)
+                else getattr(tgt, "api_endpoint", None)
+            ) or {}
+            url = api.get("url") if isinstance(api, dict) else getattr(api, "url", None)
+            if not url or not str(url).strip():
+                raise ValueError(
+                    "target.api_endpoint.url must be set when deployment.type == 'none'"
+                )
+        else:
+            endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
+            for task in cfg.evaluation.tasks:
+                td = get_task_from_mapping(task.name, tasks_mapping)
+                etype = td.get("endpoint_type")
+                if etype not in endpoints_cfg:
+                    raise ValueError(
+                        f"deployment.endpoints missing path for endpoint_type '{etype}' (task '{task.name}')"
+                    )
+                path = endpoints_cfg.get(etype)
+                if not isinstance(path, str) or not path.startswith("/"):
+                    raise ValueError(
+                        f"deployment.endpoints['{etype}'] must be a non-empty path starting with '/'"
+                    )
+
+        # lepton env var presence (reference-level)
+        tasks_cfg = getattr(cfg.execution, "lepton_platform", {}).get("tasks", {}) or {}
+        lepton_env_vars = tasks_cfg.get("env_vars", {}) or {}
+        api_key_name = getattr(
+            getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
+        )
+        for task in cfg.evaluation.tasks:
+            td = get_task_from_mapping(task.name, tasks_mapping)
+            required = td.get("required_env_vars", []) or []
+            for var in required:
+                # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic
+                if var == "NEMO_EVALUATOR_DATASET_DIR":
+                    if "dataset_dir" not in task:
+                        raise ValueError(
+                            f"Task '{task.name}' requires dataset_dir to be specified. "
+                            f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
+                        )
+                    continue
+                if var == "API_KEY":
+                    if not (("API_KEY" in lepton_env_vars) or bool(api_key_name)):
+                        raise ValueError(
+                            f"Task '{task.name}' requires API_KEY: set execution.lepton_platform.tasks.env_vars.API_KEY "
+                            "or target.api_endpoint.api_key_name"
+                        )
+                else:
+                    if var not in lepton_env_vars:
+                        raise ValueError(
+                            f"Task '{task.name}' requires {var}: set it under execution.lepton_platform.tasks.env_vars"
+                        )
+
+        # success (use realized output directory if invocation_id is available)
+        preview_output_dir = (
+            Path(cfg.execution.output_dir).absolute() / invocation_id
+            if invocation_id
+            else Path(cfg.execution.output_dir).absolute() / "<invocation_id>"
+        )
+        print(f" - Tasks: {len(cfg.evaluation.tasks)}")
+        for idx, task in enumerate(cfg.evaluation.tasks):
+            print(f" - Task {idx}: {task.name}")
+        print(f" - Output directory: {preview_output_dir}")
+        print("\nTo run evaluation, execute run command without --dry-run")
+    except Exception as e:
+        print(f"❌ Configuration invalid: {e}")
+        logger.error("Lepton dry-run validation failed", error=str(e))
+        return
+
+
 def _get_statuses_for_invocation_id(id: str, db: ExecutionDB) -> List[ExecutionStatus]:
     """Helper method that returns statuses if id is the invocation id"""
     jobs = db.get_jobs(id)
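The endpoints mapping validated in _dry_run_lepton above is a simple endpoint_type-to-URL-path dictionary; an illustrative shape is shown below (the types and paths are examples, not the launcher's defaults):

endpoints_cfg = {
    "chat": "/v1/chat/completions",   # example endpoint_type and path
    "completions": "/v1/completions",
}
# Each value must be a non-empty string starting with "/", as checked above.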
nemo_evaluator_launcher/executors/lepton/job_helpers.py

@@ -23,13 +23,6 @@ import subprocess
 import time
 from typing import Any, List, Union
 
-from leptonai.api.v1.types.affinity import LeptonResourceAffinity
-from leptonai.api.v1.types.common import LeptonVisibility, Metadata
-from leptonai.api.v1.types.deployment import EnvVar, LeptonContainer, Mount
-from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
-
-# Import lepton dependencies
-from leptonai.api.v2.client import APIClient
 from omegaconf import DictConfig
 
 from nemo_evaluator_launcher.common.logging_utils import logger
@@ -92,6 +85,18 @@ def _create_lepton_job_api(
 ) -> tuple[bool, str]:
     """Create Lepton job using API client (preferred method)."""
     try:
+        # Import leptonai dependencies locally
+        from leptonai.api.v1.types.affinity import LeptonResourceAffinity
+        from leptonai.api.v1.types.common import LeptonVisibility, Metadata
+        from leptonai.api.v1.types.deployment import (
+            EnvValue,
+            EnvVar,
+            LeptonContainer,
+            Mount,
+        )
+        from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
+        from leptonai.api.v2.client import APIClient
+
         client = APIClient()
 
         # Prepare environment variables (support both direct values and secret references)
@@ -99,12 +104,8 @@ def _create_lepton_job_api(
         if env_vars:
             for key, value in env_vars.items():
                 # Handle both regular dicts and OmegaConf objects
-                from omegaconf import DictConfig
-
                 if isinstance(value, (dict, DictConfig)) and "value_from" in value:
                     # Secret reference: {value_from: {secret_name_ref: "secret_name"}}
-                    from leptonai.api.v1.types.deployment import EnvValue
-
                     # Convert OmegaConf to dict if needed
                     value_dict = dict(value) if isinstance(value, DictConfig) else value
                     env_var = EnvVar(
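An illustrative env_vars mapping in the shape handled above, with one secret reference and one direct value (the variable and secret names are made-up examples):

env_vars = {
    "HF_TOKEN": {"value_from": {"secret_name_ref": "my-hf-token-secret"}},  # secret reference
    "EXTRA_FLAG": "1",                                                      # direct value
}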
@@ -203,6 +204,9 @@ def get_lepton_job_status(job_name_or_id: str) -> dict[Any, Any] | None:
 def _get_lepton_job_status_api(job_name_or_id: str) -> dict[Any, Any] | None:
     """Get job status using API client (preferred method)."""
     try:
+        # Import leptonai dependencies locally
+        from leptonai.api.v2.client import APIClient
+
         client = APIClient()
 
         # Try to get job by ID first, then by name