nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/common/helpers.py
@@ -16,29 +16,90 @@
  import base64
  import copy
  import datetime
+ from dataclasses import dataclass
  from typing import Optional

  import yaml
  from omegaconf import DictConfig, OmegaConf

+ from nemo_evaluator_launcher.cli.version import get_versions
  from nemo_evaluator_launcher.common.logging_utils import logger


- def _yaml_to_echo_command(yaml_str: str, filename: str = "config_ef.yaml") -> str:
-     yaml_str_b64 = base64.b64encode(yaml_str.encode("utf-8")).decode("utf-8")
-     return f'echo "{yaml_str_b64}" | base64 -d > {filename}'
+ @dataclass(frozen=True)
+ class CmdAndReadableComment:
+     """See the comment to `_yaml_to_echo_command`."""
+
+     # Actual command. Might include hard-to-debug elements such as base64-encoded
+     # configs.
+     cmd: str
+     # A debuggale readable comment that can be passed along for accompanying
+     # the actual command
+     debug: str
+     # Whether the content might be potentially unsafe. This is a flag useful for
+     # downstream callers who want to raise exceptions e.g. when a script was
+     # saved that would execute this command.
+     is_potentially_unsafe: bool = False
+
+
+ def _str_to_echo_command(str_to_save: str, filename: str) -> CmdAndReadableComment:
+     """Create a safe (see below) echo command saving a string to file.
+
+     Safety in this context means the ability to pass such echo command through the
+     `bash -c '...'` boundaries for example.
+
+     Naturally, enconding with base64 creates debuggability issues. For that, the second
+     output of the function is the string with bash comment signs prepended.
+     """
+     str_to_save_b64 = base64.b64encode(str_to_save.encode("utf-8")).decode("utf-8")
+     debug_str = "\n".join(
+         [f"# Contents of {filename}"] + ["# " + s for s in str_to_save.splitlines()]
+     )
+     return CmdAndReadableComment(
+         cmd=f'echo "{str_to_save_b64}" | base64 -d > {filename}', debug=debug_str
+     )
+
+
+ def _set_nested_optionally_overriding(
+     d: dict, keys: list[str], val: object, *, override_if_exists: bool = False
+ ):
+     """Sets d[...keys....] = value, creating keys all the way"""
+     temp = d
+     for key in keys[:-1]:
+         temp = temp.setdefault(key, {})
+     if override_if_exists or keys[-1] not in temp:
+         temp[keys[-1]] = val


  def get_eval_factory_config(
-     cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
+     cfg: DictConfig,
+     user_task_config: DictConfig,
  ) -> dict:
      """Extract config fields for eval factory.

      This function extracts the config field similar to how overrides are handled.
+
+     Overrides will start to be deprecated (or not, but at least a warning will be logged).
      """
+
+     if cfg.evaluation.get("overrides") or user_task_config.get("overrides"):
+         # TODO(agronskiy): start removing overrides, test `test_start_deprecating_overrides`
+         # will start failing soon.
+         logger.warning(
+             "We are deprecating using old-style dot-delimited overrides "
+             "in favour of `nemo_evaluator_config` field. Please check "
+             "the documentation."
+         )
+
+     logger.debug("Getting nemo evaluator merged config")
      # Extract config fields similar to overrides - convert to basic Python types first
-     cfg_config = cfg.evaluation.get("config", {})
-     user_config = user_task_config.get("config", {})
+     # Support both new and old format for backward compatibility
+     cfg_config = cfg.evaluation.get("nemo_evaluator_config") or cfg.evaluation.get(
+         "config", {}
+     )
+     user_config = user_task_config.get("nemo_evaluator_config") or user_task_config.get(
+         "config", {}
+     )

      # Convert OmegaConf objects to basic Python types
      if cfg_config:

@@ -47,17 +108,115 @@ def get_eval_factory_config(
          user_config = OmegaConf.to_container(user_config, resolve=True)

      # Merge the configs
-     config_fields = copy.deepcopy(cfg_config or {})
-     config_fields.update(user_config or {})
+     merged_nemo_evaluator_config: dict = OmegaConf.to_container(
+         OmegaConf.merge(cfg_config, user_config)
+     )
+
+     logger.debug(
+         "Merged nemo evaluator config, not final",
+         source_global_cfg=cfg_config,
+         source_task_config=user_config,
+         result=merged_nemo_evaluator_config,
+     )

-     return config_fields
+     return merged_nemo_evaluator_config


  def get_eval_factory_command(
-     cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
- ) -> str:
-     config_fields = get_eval_factory_config(cfg, user_task_config, task_definition)
+     cfg: DictConfig,
+     user_task_config: DictConfig,
+     task_definition: dict,
+ ) -> CmdAndReadableComment:
+     # This gets the eval_factory_config merged from both top-level and task-level.
+     merged_nemo_evaluator_config = get_eval_factory_config(
+         cfg,
+         user_task_config,
+     )
+
+     # We now prepare the config to be passed to `nemo-evaluator` command.
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "url"],
+         get_endpoint_url(
+             cfg,
+             merged_nemo_evaluator_config=merged_nemo_evaluator_config,
+             endpoint_type=task_definition["endpoint_type"],
+         ),
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "model_id"],
+         get_served_model_name(cfg),
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "type"],
+         task_definition["endpoint_type"],
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["config", "type"],
+         task_definition["task"],
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["config", "output_dir"],
+         "/results",
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["target", "api_endpoint", "api_key"],
+         "API_KEY",
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         [
+             "metadata",
+             "launcher_resolved_config",
+         ],
+         OmegaConf.to_container(cfg, resolve=True),
+     )
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["metadata", "versioning"],
+         get_versions(),
+     )
+
+     # Now get the pre_cmd either from `evaluation.pre_cmd` or task-level pre_cmd. Note the
+     # order -- task level wins.
+     pre_cmd: str = (
+         user_task_config.get("pre_cmd") or cfg.evaluation.get("pre_cmd") or ""
+     )
+
+     is_potentially_unsafe = False
+     if pre_cmd:
+         logger.warning(
+             "Found non-empty pre_cmd that might be a security risk if executed. "
+             "Setting `is_potentially_unsafe` to `True`",
+             pre_cmd=pre_cmd,
+         )
+         is_potentially_unsafe = True
+     _set_nested_optionally_overriding(
+         merged_nemo_evaluator_config,
+         ["metadata", "pre_cmd"],
+         pre_cmd,
+     )
+
+     create_pre_script_cmd = _str_to_echo_command(pre_cmd, filename="pre_cmd.sh")
+
+     create_yaml_cmd = _str_to_echo_command(
+         yaml.safe_dump(merged_nemo_evaluator_config), "config_ef.yaml"
+     )

+     # NOTE: we use `source` to allow tricks like exports etc (if needed) -- it runs in the same
+     # shell as the command.
+     eval_command = (
+         "cmd=$(command -v nemo-evaluator >/dev/null 2>&1 && echo nemo-evaluator || echo eval-factory) "
+         + "&& source pre_cmd.sh "
+         + "&& $cmd run_eval --run_config config_ef.yaml"
+     )
+
+     # NOTE: see note and test about deprecating that.
      overrides = copy.deepcopy(dict(cfg.evaluation.get("overrides", {})))
      overrides.update(dict(user_task_config.get("overrides", {})))
      # NOTE(dfridman): Temporary fix to make sure that the overrides arg is not split into multiple lines.

@@ -66,32 +225,46 @@ def get_eval_factory_command(
          k: (v.strip("\n") if isinstance(v, str) else v) for k, v in overrides.items()
      }
      overrides_str = ",".join([f"{k}={v}" for k, v in overrides.items()])
-     model_url = get_endpoint_url(cfg, user_task_config, task_definition)
-
-     model_id = get_served_model_name(cfg)
-     model_type = task_definition["endpoint_type"]
-     eval_type = task_definition["task"]
-
-     create_file_cmd = _yaml_to_echo_command(
-         yaml.safe_dump(config_fields), "config_ef.yaml"
+     if overrides_str:
+         eval_command = f"{eval_command} --overrides {overrides_str}"
+
+     # We return both the command and the debugging base64-decoded strings, useful
+     # for exposing when building scripts.
+     return CmdAndReadableComment(
+         cmd=create_pre_script_cmd.cmd
+         + " && "
+         + create_yaml_cmd.cmd
+         + " && "
+         + eval_command,
+         debug=create_pre_script_cmd.debug + "\n\n" + create_yaml_cmd.debug,
+         is_potentially_unsafe=is_potentially_unsafe,
      )
-     nv_eval_command = f"""nv_eval run_eval --model_id {model_id} --model_type {model_type} --eval_type {eval_type} --model_url {model_url} --api_key_name API_KEY --output_dir /results --run_config config_ef.yaml"""
-
-     if overrides:
-         nv_eval_command = f"{nv_eval_command} --overrides {overrides_str}"
-
-     return create_file_cmd + " && " + "cat config_ef.yaml && " + nv_eval_command


  def get_endpoint_url(
-     cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
+     cfg: DictConfig,
+     merged_nemo_evaluator_config: dict,
+     endpoint_type: str,
  ) -> str:
      def apply_url_override(url: str) -> str:
          """Apply user URL override if provided."""
-         override_url = user_task_config.get("overrides", {}).get(
-             "config.target.api_endpoint.url"
+         nemo_evaluator_config_url = (
+             merged_nemo_evaluator_config.get("target", {})
+             .get("api_endpoint", {})
+             .get("url", None)
          )
-         return override_url if override_url is not None else url
+
+         if nemo_evaluator_config_url:
+             return nemo_evaluator_config_url
+
+         # Being deprecated, see `get_eval_factory_config` message.
+         overrides_old_style_url = merged_nemo_evaluator_config.get("overrides", {}).get(
+             "target.api_endpoint.url", None
+         )
+         if overrides_old_style_url:
+             return overrides_old_style_url
+
+         return url

      if cfg.deployment.type == "none":
          # For deployment: none, use target URL regardless of executor type

@@ -113,9 +286,16 @@ def get_endpoint_url(

      else:
          # Local executor - use localhost
-         task_endpoint_type = task_definition["endpoint_type"]
-         endpoint_uri = cfg.deployment.endpoints[task_endpoint_type]
-         endpoint_url = f"http://127.0.0.1:{cfg.deployment.port}{endpoint_uri}"
+         endpoint_uri = cfg.deployment.endpoints[endpoint_type]
+
+         # Use HAProxy port if multiple_instances is enabled
+         if cfg.deployment.get("multiple_instances", False):
+             proxy_config = cfg.execution.get("proxy", {}).get("config", {})
+             port = proxy_config.get("haproxy_port", 5009)
+         else:
+             port = cfg.deployment.port
+
+         endpoint_url = f"http://127.0.0.1:{port}{endpoint_uri}"
      return endpoint_url

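The `_str_to_echo_command` change above is the mechanism that lets the launcher ship the run config and an optional `pre_cmd` script through nested `bash -c '...'` layers without quoting problems: the payload is base64-encoded into an `echo ... | base64 -d > file` command, and a `#`-commented copy is kept alongside for debugging. A minimal standalone sketch of that pattern (a simplified stand-in, not the launcher's own helper):

```python
import base64


def str_to_echo_command(payload: str, filename: str) -> str:
    # Base64-encode the payload so it survives nested shell quoting, then have
    # the receiving shell decode it back into a file.
    b64 = base64.b64encode(payload.encode("utf-8")).decode("utf-8")
    return f'echo "{b64}" | base64 -d > {filename}'


config_yaml = "config:\n  output_dir: /results\n"
print(str_to_echo_command(config_yaml, "config_ef.yaml"))
# echo "Y29uZmlnOgog..." | base64 -d > config_ef.yaml
```

Chaining this with `&& source pre_cmd.sh && $cmd run_eval --run_config config_ef.yaml`, as the new `get_eval_factory_command` does, keeps the whole job submission a single shell string.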
nemo_evaluator_launcher/common/logging_utils.py
@@ -61,8 +61,9 @@ import structlog
  # both are unset, default would be used.
  _LOG_LEVEL_ENV_VAR = "NEMO_EVALUATOR_LOG_LEVEL"
  _DEFAULT_LOG_LEVEL = "WARNING"
- _SENSITIVE_KEY_SUBSTRINGS = {
-     # Keep minimal, broad substrings (normalized: lowercased, no spaces/_/-)
+ _SENSITIVE_KEY_SUBSTRINGS_NORMALIZED = {
+     # Keep minimal, broad substrings
+     # NOTE: normalized: lowercased, no spaces/_/-
      "authorization", # covers proxy-authorization, etc.
      "apikey", # covers api_key, api-key, x-api-key, nvidia_api_key, ...
      "accesskey", # covers access_key / access-key

@@ -73,6 +74,10 @@ _SENSITIVE_KEY_SUBSTRINGS = {
      "pwd", # common shorthand
      "passwd", # common variant
  }
+ _ALLOWLISTED_KEYS_SUBSTRINGS = {
+     # NOTE: non-normalized (for allowlisting we want more control)
+     "_tokens", # This likely would allow us to not redact useful stuff like `limit_tokens`, `max_new_tokens`
+ }


  def _mask(val: object) -> str:

@@ -91,8 +96,11 @@ def _normalize(name: object) -> str:


  def _is_sensitive_key(key: object) -> bool:
-     k = _normalize(key)
-     return any(substr in k for substr in _SENSITIVE_KEY_SUBSTRINGS)
+     k_norm = _normalize(key)
+     k_non_norm = str(key)
+     return any(
+         substr in k_norm for substr in _SENSITIVE_KEY_SUBSTRINGS_NORMALIZED
+     ) and not any(substr in k_non_norm for substr in _ALLOWLISTED_KEYS_SUBSTRINGS)


  def _redact_mapping(m: dict) -> dict:

@@ -263,6 +271,9 @@ def _configure_structlog() -> None:
          structlog.processors.UnicodeDecoder(),
      ]

+     # Check if stderr is a TTY to determine if colors should be enabled
+     colors_enabled = sys.stderr.isatty()
+
      logging.config.dictConfig(
          {
              "version": 1,

@@ -273,7 +284,7 @@ def _configure_structlog() -> None:
                  "()": "structlog.stdlib.ProcessorFormatter",
                  "processors": [
                      *shared_processors,
-                     MainConsoleRenderer(colors=True),
+                     MainConsoleRenderer(colors=colors_enabled),
                  ],
              },
              # Formatter for plain file output
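The redaction change splits key matching into a normalized blocklist and a raw-string allowlist, so that `max_new_tokens`-style keys are no longer masked while credentials still are. A simplified re-statement of the predicate (the substring set below is only what is visible in the hunk plus `token`, which the `_tokens` allowlist comment implies is in the full set; this is not the module's exact code):

```python
SENSITIVE_NORMALIZED = {"authorization", "apikey", "accesskey", "token", "pwd", "passwd"}
ALLOWLISTED_RAW = {"_tokens"}


def normalize(name: object) -> str:
    # Lowercase and drop separators so "X-Api-Key" and "api_key" both become "apikey".
    return "".join(ch for ch in str(name).lower() if ch not in " _-")


def is_sensitive_key(key: object) -> bool:
    k_norm, k_raw = normalize(key), str(key)
    return any(s in k_norm for s in SENSITIVE_NORMALIZED) and not any(
        s in k_raw for s in ALLOWLISTED_RAW
    )


print(is_sensitive_key("NVIDIA_API_KEY"))  # True  -> value is masked in logs
print(is_sensitive_key("hf_token"))        # True  -> "token" matches, "_tokens" does not
print(is_sensitive_key("max_new_tokens"))  # False -> allowlisted via "_tokens"
```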
nemo_evaluator_launcher/common/printing_utils.py (new file)
@@ -0,0 +1,100 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ """Printing utils for more structured or visually appealing prints.
+
+ NOTE: use printing only for main application output that matters. For logging,
+ see `logging_utils.py`.
+
+ USAGE:
+ ```
+ from nemo_evaluator_launcher.common.printing_utils import red, bold
+ print(bold(red("some red bold")))
+ ```
+
+
+ """
+
+ import os
+ import sys
+
+ # If this env var is set, it will override a more standard "LOG_LEVEL". If
+ # both are unset, default would be used.
+ _DISABLE_COLOR_ENV_VAR = "NEMO_EVALUATOR_DISABLE_COLOR"
+
+
+ def _is_color_disabled():
+     # Check environment variable first
+     env_var = os.environ.get(_DISABLE_COLOR_ENV_VAR, "0").lower()
+
+     if "1" in env_var or "yes" in env_var or "y" in env_var or "true" in env_var:
+         return True
+
+     # If not explicitly disabled, check if stdout is a TTY
+     # Colors are disabled if output is not a TTY
+     if not sys.stdout.isatty():
+         return True
+
+     return False
+
+
+ _CODES: dict[str, str] = dict(
+     green="\033[32m",
+     red="\033[31m",
+     red_bg="\033[41m", # red background
+     cyan="\033[36m",
+     yellow="\033[33m",
+     magenta="\033[35m",
+     grey="\033[90m",
+     bold="\033[1m",
+     reset="\033[0m",
+ )
+
+ # If the colors are disabled, we null-out all the codes.
+ if _is_color_disabled():
+     for c in _CODES.keys():
+         _CODES[c] = ""
+
+
+ def green(s: str) -> str:
+     return _CODES["green"] + s + _CODES["reset"]
+
+
+ def red(s: str) -> str:
+     return _CODES["red"] + s + _CODES["reset"]
+
+
+ def red_bg(s: str) -> str:
+     return _CODES["red_bg"] + s + _CODES["reset"]
+
+
+ def cyan(s: str) -> str:
+     return _CODES["cyan"] + s + _CODES["reset"]
+
+
+ def yellow(s: str) -> str:
+     return _CODES["yellow"] + s + _CODES["reset"]
+
+
+ def magenta(s: str) -> str:
+     return _CODES["magenta"] + s + _CODES["reset"]
+
+
+ def grey(s: str) -> str:
+     return _CODES["grey"] + s + _CODES["reset"]
+
+
+ def bold(s: str) -> str:
+     return _CODES["bold"] + s + _CODES["reset"]
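Because `_CODES` is resolved once at import time, colors are disabled either when stdout is not a TTY (piped output) or when the environment variable is set before the module is imported. A quick illustration of the env-var path (hypothetical session):

```python
import os

# Must be set before printing_utils is imported, since _CODES is nulled out
# at import time based on _is_color_disabled().
os.environ["NEMO_EVALUATOR_DISABLE_COLOR"] = "1"

from nemo_evaluator_launcher.common.printing_utils import bold, red

print(bold(red("2 tasks failed")))  # plain text, no ANSI escape sequences
```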
nemo_evaluator_launcher/configs/deployment/generic.yaml (new file)
@@ -0,0 +1,33 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # Generic server deployment configuration template
+ #
+ type: generic
+ image: ??? # Docker image to use for deployment
+ command: ??? # Command to run the server
+
+ # Server configuration
+ port: 8000
+ served_model_name: ??? # Name of the served model (used in command templates and evaluation)
+ extra_args: "" # Additional command line arguments
+ env_vars: {} # Environment variables as {name: value} dict
+ checkpoint_path: null # Path to model checkpoint
+
+ # API endpoints (customize based on your server)
+ endpoints:
+   chat: /v1/chat/completions
+   completions: /v1/completions
+   health: /health
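The `???` values are OmegaConf's mandatory-value marker: the template loads, but reading such a field before it has been overridden raises an error, which is how the generic deployment forces `image`, `command`, and `served_model_name` to be supplied. A small standalone illustration (plain OmegaConf, not the launcher's config-loading code):

```python
from omegaconf import OmegaConf
from omegaconf.errors import MissingMandatoryValue

# Stand-in for the fields above; '???' marks values the user must provide.
cfg = OmegaConf.create({"type": "generic", "image": "???", "command": "???", "port": 8000})

try:
    _ = cfg.image
except MissingMandatoryValue:
    print("deployment.image is required and has no default")

cfg.image = "my-registry/my-inference-server:latest"  # hypothetical value
print(cfg.image)
```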
nemo_evaluator_launcher/configs/deployment/sglang.yaml
@@ -19,6 +19,7 @@ checkpoint_path: ???
  served_model_name: ???
  port: 8000
  tensor_parallel_size: 8
+ pipeline_parallel_size: 1
  data_parallel_size: 1
  extra_args: ""
  env_vars: {} # {name: value} dict

@@ -33,6 +34,7 @@ command: python3 -m sglang.launch_server
    --host 0.0.0.0
    --port ${deployment.port}
    --served-model-name ${deployment.served_model_name}
-   --tp ${deployment.tensor_parallel_size}
-   --dp ${deployment.data_parallel_size}
+   --tp-size ${deployment.tensor_parallel_size}
+   --dp-size ${deployment.data_parallel_size}
+   --pp-size ${deployment.pipeline_parallel_size}
    ${deployment.extra_args}
nemo_evaluator_launcher/configs/deployment/trtllm.yaml (new file)
@@ -0,0 +1,23 @@
+ type: trtllm
+ image: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
+ checkpoint_path: ???
+ served_model_name: ???
+ port: 8000
+ tensor_parallel_size: 8
+ pipeline_parallel_size: 1
+ extra_args: ""
+
+ endpoints:
+   chat: /v1/chat/completions
+   completions: /v1/completions
+   health: /health
+
+ command: mpirun --allow-run-as-root --oversubscribe
+   trtllm-serve serve /checkpoint
+   --tp_size=${deployment.tensor_parallel_size}
+   --pp_size=${deployment.pipeline_parallel_size}
+   --host 0.0.0.0
+   --port ${deployment.port}
+   --backend pytorch
+   --trust_remote_code
+   ${deployment.extra_args}
nemo_evaluator_launcher/configs/deployment/vllm.yaml
@@ -21,6 +21,7 @@ port: 8000
  tensor_parallel_size: 8
  pipeline_parallel_size: 1
  data_parallel_size: 1
+ gpu_memory_utilization: 0.95
  extra_args: ""
  env_vars: {} # {name: value} dict


@@ -36,6 +37,5 @@ command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
    --port ${deployment.port}
    --trust-remote-code
    --served-model-name ${deployment.served_model_name}
-   --enforce-eager
-   --gpu-memory-utilization 0.95
+   --gpu-memory-utilization ${deployment.gpu_memory_utilization}
    ${deployment.extra_args}
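Moving the hard-coded `0.95` into `gpu_memory_utilization` makes it overridable like any other deployment field, and the command template picks it up through OmegaConf interpolation; `${oc.select:deployment.hf_model_handle,/checkpoint}` likewise falls back to `/checkpoint` when no HF handle is set. A rough illustration of how such a template resolves (standalone OmegaConf, not the launcher's resolution code):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "deployment": {
            "port": 8000,
            "served_model_name": "my-model",
            "gpu_memory_utilization": 0.95,
            "command": "vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}"
            " --port ${deployment.port}"
            " --served-model-name ${deployment.served_model_name}"
            " --gpu-memory-utilization ${deployment.gpu_memory_utilization}",
        }
    }
)

print(cfg.deployment.command)
# vllm serve /checkpoint --port 8000 --served-model-name my-model --gpu-memory-utilization 0.95
```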
nemo_evaluator_launcher/configs/execution/local.yaml
@@ -15,3 +15,5 @@
  #
  type: local
  output_dir: ???
+ extra_docker_args: ""
+ mode: sequential
nemo_evaluator_launcher/configs/execution/slurm/default.yaml
@@ -14,16 +14,23 @@
  # limitations under the License.
  #
  # Each slurm cluster has its own flavour, below we provide some defaults that might meet one's needs.
- hostname: ???
- username: ${oc.env:USER}
- account: ???
+ type: slurm # Executor is chosen based on this field
+ hostname: ??? # SLURM headnode (login) hostname (required)
+ username: ${oc.env:USER} # Defaults to $USER env var
+ account: ??? # SLURM account allocation (required)
+ output_dir: ??? # Absolute path accessible on compute nodes (required)
  partition: batch
  num_nodes: 1
  ntasks_per_node: 1
  gres: gpu:8
  walltime: 01:00:00
  subproject: nemo-evaluator-launcher
- output_dir: ???
+ sbatch_comment: null # Optional comment for SLURM job (translates to #SBATCH --comment='...')
+
+ # Deployment-specific SLURM configuration
+ deployment:
+   n_tasks: 1 # Number of tasks for deployment srun (default: 1, for multi-instance set to num_nodes)
+
  env_vars:
    deployment: {}
    evaluation: {}

@@ -31,3 +38,11 @@ mounts:
    deployment: {}
    evaluation: {}
  mount_home: true
+
+ proxy:
+   type: haproxy
+   image: haproxy:latest
+   config:
+     haproxy_port: 5009
+     health_check_path: /health
+     health_check_status: 200
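The new `proxy` block pairs with the `multiple_instances` handling added to `get_endpoint_url` further up: when several deployment instances run, evaluations are pointed at the HAProxy front end instead of a single server port. A sketch of how these defaults would feed into the local endpoint URL (key names mirror this config and the helper above; the exact consumption path is an assumption):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "deployment": {
            "multiple_instances": True,
            "port": 8000,
            "endpoints": {"chat": "/v1/chat/completions"},
        },
        "execution": {"proxy": {"type": "haproxy", "config": {"haproxy_port": 5009}}},
    }
)

if cfg.deployment.get("multiple_instances", False):
    port = cfg.execution.get("proxy", {}).get("config", {}).get("haproxy_port", 5009)
else:
    port = cfg.deployment.port

print(f"http://127.0.0.1:{port}{cfg.deployment.endpoints['chat']}")
# http://127.0.0.1:5009/v1/chat/completions
```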
nemo_evaluator_launcher/executors/base.py
@@ -21,10 +21,12 @@ Defines the abstract interface for all executor implementations and common statu
  from abc import ABC, abstractmethod
  from dataclasses import dataclass
  from enum import Enum
- from typing import Any, Optional
+ from typing import Any, Iterator, Optional, Tuple

  from omegaconf import DictConfig

+ from nemo_evaluator_launcher.common.logging_utils import logger
+

  class ExecutionState(Enum):
      """Enumeration of possible execution states."""

@@ -95,3 +97,54 @@ class BaseExecutor(ABC):
              NotImplementedError: If not implemented by a subclass.
          """
          raise NotImplementedError("Subclasses must implement this method")
+
+     @staticmethod
+     def get_kill_failure_message(
+         job_id: str, container_or_id: str, status: Optional[ExecutionState] = None
+     ) -> str:
+         """Generate an informative error message when kill fails based on job status.
+
+         Args:
+             job_id: The job ID that failed to kill.
+             container_or_id: Container name, SLURM job ID, or other identifier.
+             status: Optional execution state of the job.
+
+         Returns:
+             str: An informative error message with job status context.
+         """
+         if status == ExecutionState.SUCCESS:
+             return f"Could not find or kill job {job_id} ({container_or_id}) - job already completed successfully"
+         elif status == ExecutionState.FAILED:
+             return f"Could not find or kill job {job_id} ({container_or_id}) - job already failed"
+         elif status == ExecutionState.KILLED:
+             return f"Could not find or kill job {job_id} ({container_or_id}) - job was already killed"
+         # Generic error message
+         return f"Could not find or kill job {job_id} ({container_or_id})"
+
+     @staticmethod
+     def stream_logs(
+         id: str, executor_name: Optional[str] = None
+     ) -> Iterator[Tuple[str, str, str]]:
+         """Stream logs from a job or invocation group.
+
+         This is an optional method that executors can implement to provide log streaming.
+         If not implemented, it will log a warning and raise NotImplementedError.
+
+         Args:
+             id: Unique job identifier or invocation identifier.
+             executor_name: Optional executor name for warning messages. If not provided,
+                 will attempt to infer from the calling context.
+
+         Yields:
+             Tuple[str, str, str]: Tuples of (job_id, task_name, log_line) for each log line.
+                 Empty lines are yielded as empty strings.
+
+         Raises:
+             NotImplementedError: If the executor does not support log streaming.
+         """
+         executor_display_name = executor_name or "this executor"
+         logger.warning(
+             f"Log streaming is not yet implemented for executor '{executor_display_name}'. "
+             "Only 'local' executor currently supports log streaming."
+         )
+         raise NotImplementedError("This executor does not support log streaming")
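`stream_logs` is a new optional part of the executor contract: the base class only warns and raises, and concrete executors opt in by overriding it with the `(job_id, task_name, log_line)` tuple convention documented above. A hypothetical sketch of what an override could look like (file layout and names are assumptions, not taken from the package):

```python
from typing import Iterator, Optional, Tuple


class MyExecutor:  # in practice: class MyExecutor(BaseExecutor)
    @staticmethod
    def stream_logs(
        id: str, executor_name: Optional[str] = None
    ) -> Iterator[Tuple[str, str, str]]:
        # Assumed log location, for illustration only.
        log_path = f"/tmp/{id}/stdout.log"
        with open(log_path, "r", encoding="utf-8") as f:
            for line in f:
                yield (id, "my-task", line.rstrip("\n"))


# Callers can then consume any executor's stream uniformly:
# for job_id, task, line in MyExecutor.stream_logs("abcd1234.0"):
#     print(f"[{job_id}/{task}] {line}")
```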