PyPI - nemo-evaluator-launcher - Versions diffs - 0.1.19__tar.gz → 0.1.26__tar.gz - Mend

nemo-evaluator-launcher 0.1.19 (tar.gz) → 0.1.26 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nemo-evaluator-launcher
-Version: 0.1.19
+Version: 0.1.26
 Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
 Author: NVIDIA
 Author-email: nemo-toolkit@nvidia.com
@@ -478,7 +478,7 @@ Requires-Dist: mlflow>=2.8.0; extra == "mlflow"
 Provides-Extra: wandb
 Requires-Dist: wandb>=0.15.0; extra == "wandb"
 Provides-Extra: gsheets
-Requires-Dist: gsheets>=0.1.0; extra == "gsheets"
+Requires-Dist: gspread>=5.0.0; extra == "gsheets"
 Provides-Extra: exporters
 Requires-Dist: mlflow; extra == "exporters"
 Requires-Dist: wandb; extra == "exporters"

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/pyproject.toml RENAMED Viewed

@@ -40,7 +40,7 @@ repository = "https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-l
 [project.optional-dependencies]
 mlflow = ["mlflow>=2.8.0"]
 wandb = ["wandb>=0.15.0"]
-gsheets = ["gsheets>=0.1.0"]
+gsheets = ["gspread>=5.0.0"]
 exporters = ["mlflow", "wandb", "gsheets"]
 all = ["mlflow", "wandb", "gsheets"]

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/helpers.py RENAMED Viewed

@@ -57,13 +57,38 @@ def _yaml_to_echo_command(
     )
+def _set_nested_optionally_overriding(
+    d: dict, keys: list[str], val: object, *, override_if_exists: bool = False
+):
+    """Sets d[...keys....] = value, creating keys all the way"""
+    temp = d
+    for key in keys[:-1]:
+        temp = temp.setdefault(key, {})
+    if override_if_exists or keys[-1] not in temp:
+        temp[keys[-1]] = val
 def get_eval_factory_config(
-    cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
+    cfg: DictConfig,
+    user_task_config: DictConfig,
 ) -> dict:
     """Extract config fields for eval factory.
     This function extracts the config field similar to how overrides are handled.
+    Overrides will be start to be deprecated (or not, but at least a warning will be logged).
     """
+    if cfg.evaluation.get("overrides") or user_task_config.get("overrides"):
+        # TODO(agronskiy): start removing overrides, test `test_start_deprecating_overrides`
+        # will start failing soon.
+        logger.warning(
+            "We are deprecating using old-style dot-delimited overrides "
+            "in favour of `nemo_evaluator_config` field. Please check "
+            "the documentation."
+        )
+    logger.debug("Getting nemo evaluator merged config")
     # Extract config fields similar to overrides - convert to basic Python types first
     # Support both new and old format for backward compatibility
     cfg_config = cfg.evaluation.get("nemo_evaluator_config") or cfg.evaluation.get(
@@ -80,17 +105,73 @@ def get_eval_factory_config(
         user_config = OmegaConf.to_container(user_config, resolve=True)
     # Merge the configs
-    config_fields = copy.deepcopy(cfg_config or {})
-    config_fields.update(user_config or {})
+    merged_nemo_evaluator_config: dict = OmegaConf.to_container(
+        OmegaConf.merge(cfg_config, user_config)
+    )
-    return config_fields
+    logger.debug(
+        "Merged nemo evaluator config, not final",
+        source_global_cfg=cfg_config,
+        source_task_config=user_config,
+        result=merged_nemo_evaluator_config,
+    )
+    return merged_nemo_evaluator_config
 def get_eval_factory_command(
     cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
 ) -> CmdAndReadableComment:
-    config_fields = get_eval_factory_config(cfg, user_task_config, task_definition)
+    merged_nemo_evaluator_config = get_eval_factory_config(
+        cfg,
+        user_task_config,
+    )
+    # We now prepare the config to be passed to `nemo-evaluator` command.
+    _set_nested_optionally_overriding(
+        merged_nemo_evaluator_config,
+        ["target", "api_endpoint", "url"],
+        get_endpoint_url(
+            cfg,
+            merged_nemo_evaluator_config=merged_nemo_evaluator_config,
+            endpoint_type=task_definition["endpoint_type"],
+        ),
+    )
+    _set_nested_optionally_overriding(
+        merged_nemo_evaluator_config,
+        ["target", "api_endpoint", "model_id"],
+        get_served_model_name(cfg),
+    )
+    _set_nested_optionally_overriding(
+        merged_nemo_evaluator_config,
+        ["target", "api_endpoint", "type"],
+        task_definition["endpoint_type"],
+    )
+    _set_nested_optionally_overriding(
+        merged_nemo_evaluator_config,
+        ["config", "type"],
+        task_definition["task"],
+    )
+    _set_nested_optionally_overriding(
+        merged_nemo_evaluator_config,
+        ["config", "output_dir"],
+        "/results",
+    )
+    _set_nested_optionally_overriding(
+        merged_nemo_evaluator_config,
+        ["target", "api_endpoint", "api_key"],
+        "API_KEY",
+    )
+    create_file_cmd = _yaml_to_echo_command(
+        yaml.safe_dump(merged_nemo_evaluator_config), "config_ef.yaml"
+    )
+    eval_command = (
+        "cmd=$(command -v nemo-evaluator >/dev/null 2>&1 && echo nemo-evaluator || echo eval-factory) "
+        + "&& $cmd run_eval --run_config config_ef.yaml"
+    )
+    # NOTE: see note and test about deprecating that.
     overrides = copy.deepcopy(dict(cfg.evaluation.get("overrides", {})))
     overrides.update(dict(user_task_config.get("overrides", {})))
     # NOTE(dfridman): Temporary fix to make sure that the overrides arg is not split into multiple lines.
@@ -99,18 +180,7 @@ def get_eval_factory_command(
         k: (v.strip("\n") if isinstance(v, str) else v) for k, v in overrides.items()
     }
     overrides_str = ",".join([f"{k}={v}" for k, v in overrides.items()])
-    model_url = get_endpoint_url(cfg, user_task_config, task_definition)
-    model_id = get_served_model_name(cfg)
-    model_type = task_definition["endpoint_type"]
-    eval_type = task_definition["task"]
-    create_file_cmd = _yaml_to_echo_command(
-        yaml.safe_dump(config_fields), "config_ef.yaml"
-    )
-    eval_command = f"""cmd=$([[ $(command -v nemo-evaluator) ]] && echo 'nemo-evaluator' || echo 'eval-factory') && $cmd run_eval --model_id {model_id} --model_type {model_type} --eval_type {eval_type} --model_url {model_url} --api_key_name API_KEY --output_dir /results --run_config config_ef.yaml"""
-    if overrides:
+    if overrides_str:
         eval_command = f"{eval_command} --overrides {overrides_str}"
     # We return both the command and the debugging base64-decoded strings, useful
@@ -121,24 +191,29 @@ def get_eval_factory_command(
 def get_endpoint_url(
-    cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
+    cfg: DictConfig,
+    merged_nemo_evaluator_config: dict,
+    endpoint_type: str,
 ) -> str:
     def apply_url_override(url: str) -> str:
         """Apply user URL override if provided."""
-        nemo_evaluator_config_url = user_task_config.get(
-            "nemo_evaluator_config", {}
-        ).get("target.api_endpoint.url", None)
-        override_url = user_task_config.get("overrides", {}).get(
-            "config.target.api_endpoint.url", None
+        nemo_evaluator_config_url = (
+            merged_nemo_evaluator_config.get("target", {})
+            .get("api_endpoint", {})
+            .get("url", None)
         )
-        return (
-            override_url
-            if override_url is not None
-            else nemo_evaluator_config_url
-            if nemo_evaluator_config_url is not None
-            else url
+        if nemo_evaluator_config_url:
+            return nemo_evaluator_config_url
+        # Being deprecated, see `get_eval_factory_config` message.
+        overrides_old_style_url = merged_nemo_evaluator_config.get("overrides", {}).get(
+            "target.api_endpoint.url", None
         )
+        if overrides_old_style_url:
+            return overrides_old_style_url
+        return url
     if cfg.deployment.type == "none":
         # For deployment: none, use target URL regardless of executor type
@@ -160,8 +235,7 @@ def get_endpoint_url(
     else:
         # Local executor - use localhost
-        task_endpoint_type = task_definition["endpoint_type"]
-        endpoint_uri = cfg.deployment.endpoints[task_endpoint_type]
+        endpoint_uri = cfg.deployment.endpoints[endpoint_type]
         endpoint_url = f"http://127.0.0.1:{cfg.deployment.port}{endpoint_uri}"
         return endpoint_url

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/common/logging_utils.py RENAMED Viewed

@@ -61,8 +61,9 @@ import structlog
 # both are unset, default would be used.
 _LOG_LEVEL_ENV_VAR = "NEMO_EVALUATOR_LOG_LEVEL"
 _DEFAULT_LOG_LEVEL = "WARNING"
-_SENSITIVE_KEY_SUBSTRINGS = {
-    # Keep minimal, broad substrings (normalized: lowercased, no spaces/_/-)
+_SENSITIVE_KEY_SUBSTRINGS_NORMALIZED = {
+    # Keep minimal, broad substrings
+    # NOTE: normalized: lowercased, no spaces/_/-
     "authorization",  # covers proxy-authorization, etc.
     "apikey",  # covers api_key, api-key, x-api-key, nvidia_api_key, ...
     "accesskey",  # covers access_key / access-key
@@ -73,6 +74,10 @@ _SENSITIVE_KEY_SUBSTRINGS = {
     "pwd",  # common shorthand
     "passwd",  # common variant
 }
+_ALLOWLISTED_KEYS_SUBSTRINGS = {
+    # NOTE: non-normalized (for allowlisting we want more control)
+    "_tokens",  # This likely would allow us to not redact useful stuff like `limit_tokens`, `max_new_tokens`
+}
 def _mask(val: object) -> str:
@@ -91,8 +96,11 @@ def _normalize(name: object) -> str:
 def _is_sensitive_key(key: object) -> bool:
-    k = _normalize(key)
-    return any(substr in k for substr in _SENSITIVE_KEY_SUBSTRINGS)
+    k_norm = _normalize(key)
+    k_non_norm = str(key)
+    return any(
+        substr in k_norm for substr in _SENSITIVE_KEY_SUBSTRINGS_NORMALIZED
+    ) and not any(substr in k_non_norm for substr in _ALLOWLISTED_KEYS_SUBSTRINGS)
 def _redact_mapping(m: dict) -> dict:

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/lepton/executor.py RENAMED Viewed

@@ -610,7 +610,7 @@ class LeptonExecutor(BaseExecutor):
                 job_state = lepton_status.get("state", "Unknown")
                 # Map Lepton job states to our execution states
-                if job_state == "Succeeded":
+                if job_state in ["Succeeded", "Completed"]:
                     state = ExecutionState.SUCCESS
                 elif job_state in ["Running", "Pending", "Starting"]:
                     state = ExecutionState.RUNNING

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/executors/slurm/executor.py RENAMED Viewed

@@ -42,6 +42,7 @@ from nemo_evaluator_launcher.common.helpers import (
     get_api_key_name,
     get_endpoint_url,
     get_eval_factory_command,
+    get_eval_factory_config,
     get_eval_factory_dataset_size_from_run_config,
     get_health_url,
     get_timestamp_string,
@@ -453,7 +454,15 @@ def _create_slurm_sbatch_script(
     # get task from mapping, overrides, urls
     tasks_mapping = load_tasks_mapping()
     task_definition = get_task_from_mapping(task.name, tasks_mapping)
-    health_url = get_health_url(cfg, get_endpoint_url(cfg, task, task_definition))
+    # Create merged config for get_endpoint_url
+    merged_nemo_evaluator_config = get_eval_factory_config(cfg, task)
+    health_url = get_health_url(
+        cfg,
+        get_endpoint_url(
+            cfg, merged_nemo_evaluator_config, task_definition["endpoint_type"]
+        ),
+    )
     # TODO(public release): convert to template
     s = "#!/bin/bash\n"

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/package_info.py RENAMED Viewed

@@ -16,7 +16,7 @@
 # Below is the _next_ version that will be published, not the currently published one.
 MAJOR = 0
 MINOR = 1
-PATCH = 19
+PATCH = 26
 PRE_RELEASE = ""
 # Use the following formatting: (major, minor, patch, pre-release)

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher/resources/mapping.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 # NOTE(agronskiy): checked parity
 [lm-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.10"
 [lm-evaluation-harness.tasks.chat.ifeval]
 required_env_vars = []
@@ -124,7 +124,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [mtbench]
-container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/mtbench:25.10"
 [mtbench.tasks.chat.mtbench]
@@ -134,7 +134,7 @@ container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [ifbench]
-container = "nvcr.io/nvidia/eval-factory/ifbench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/ifbench:25.10"
 [ifbench.tasks.chat.ifbench]
 required_env_vars = []
@@ -142,7 +142,7 @@ required_env_vars = []
 ###############################################################################
 [simple_evals]
-container = "nvcr.io/nvidia/eval-factory/simple-evals:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/simple-evals:25.10"
 [simple_evals.tasks.chat.gpqa_diamond]
 required_env_vars = ["HF_TOKEN"]
@@ -213,7 +213,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [bigcode-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.10"
 [bigcode-evaluation-harness.tasks.chat.mbpp]
 required_env_vars = []
@@ -226,12 +226,12 @@ required_env_vars = []
 [bigcode-evaluation-harness.tasks.completions.humaneval]
 required_env_vars = []
-[bigcode-evaluation-harness.tasks.completions.humaneval_instruct]
+[bigcode-evaluation-harness.tasks.chat.humaneval_instruct]
 ###############################################################################
 [livecodebench]
-container = "nvcr.io/nvidia/eval-factory/livecodebench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/livecodebench:25.10"
 [livecodebench.tasks.chat.livecodebench_0724_0125]
 required_env_vars = []
@@ -242,7 +242,7 @@ required_env_vars = []
 ###############################################################################
 [scicode]
-container = "nvcr.io/nvidia/eval-factory/scicode:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/scicode:25.10"
 [scicode.tasks.chat.aa_scicode]
 required_env_vars = []
@@ -250,7 +250,7 @@ required_env_vars = []
 ###############################################################################
 [hle]
-container = "nvcr.io/nvidia/eval-factory/hle:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/hle:25.10"
 [hle.tasks.chat.hle]
 required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
@@ -258,7 +258,7 @@ required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
 ###############################################################################
 [bfcl]
-container = "nvcr.io/nvidia/eval-factory/bfcl:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/bfcl:25.10"
 [bfcl.tasks.chat.bfclv2_ast_prompting]
 required_env_vars = []
@@ -267,9 +267,20 @@ required_env_vars = []
 required_env_vars = []
+###############################################################################
+[profbench]
+container = "nvcr.io/nvidia/eval-factory/profbench:25.10"
+[profbench.tasks.chat.llm_judge]
+required_env_vars = []
+[profbench.tasks.chat.report_generation]
+required_env_vars = []
 ###############################################################################
 [vlmevalkit]
-container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.10"
 [vlmevalkit.tasks.vlm.ocrbench]
 required_env_vars = []
@@ -286,15 +297,40 @@ required_env_vars = ["OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
 ###############################################################################
 [garak]
-container = "nvcr.io/nvidia/eval-factory/garak:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/garak:25.10"
 [garak.tasks.chat.garak]
 required_env_vars = []
+###############################################################################
+# NOTE(wprazuch): to verify if the tasks need any env var setting
+[nemo_skills]
+container = "nvcr.io/nvidia/eval-factory/nemo_skills:25.10"
+[nemo_skills.tasks.chat.ns_aime2024]
+required_env_vars = ["JUDGE_API_KEY"]
+[nemo_skills.tasks.chat.ns_aime2025]
+required_env_vars = []
+[nemo_skills.tasks.chat.ns_bfcl_v3]
+required_env_vars = []
+[nemo_skills.tasks.chat.ns_gpqa]
+required_env_vars = ["HF_TOKEN"]
+[nemo_skills.tasks.chat.ns_hle]
+required_env_vars = []
+[nemo_skills.tasks.chat.ns_mmlu]
+required_env_vars = ["HF_TOKEN"]
+[nemo_skills.tasks.chat.ns_mmlu_pro]
+required_env_vars = ["HF_TOKEN"]
 ###############################################################################
 [safety-harness]
-container = "nvcr.io/nvidia/eval-factory/safety-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/safety-harness:25.10"
 [safety-harness.tasks.chat.aegis_v2]
 required_env_vars = ["HF_TOKEN"]
@@ -303,7 +339,7 @@ required_env_vars = ["HF_TOKEN"]
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [helm]
-container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/helm:25.10"
 [helm.tasks.chat.medcalc_bench]
@@ -339,6 +375,6 @@ container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [tooltalk]
-container = "nvcr.io/nvidia/eval-factory/tooltalk:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/tooltalk:25.10"
 [tooltalk.tasks.chat.tooltalk]

{nemo_evaluator_launcher-0.1.19 → nemo_evaluator_launcher-0.1.26}/src/nemo_evaluator_launcher.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nemo-evaluator-launcher
-Version: 0.1.19
+Version: 0.1.26
 Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
 Author: NVIDIA
 Author-email: nemo-toolkit@nvidia.com
@@ -478,7 +478,7 @@ Requires-Dist: mlflow>=2.8.0; extra == "mlflow"
 Provides-Extra: wandb
 Requires-Dist: wandb>=0.15.0; extra == "wandb"
 Provides-Extra: gsheets
-Requires-Dist: gsheets>=0.1.0; extra == "gsheets"
+Requires-Dist: gspread>=5.0.0; extra == "gsheets"
 Provides-Extra: exporters
 Requires-Dist: mlflow; extra == "exporters"
 Requires-Dist: wandb; extra == "exporters"