nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/exporters/wandb.py

@@ -19,7 +19,7 @@ import os
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 import yaml

@@ -38,6 +38,7 @@ from nemo_evaluator_launcher.exporters.registry import register_exporter
 from nemo_evaluator_launcher.exporters.utils import (
     extract_accuracy_metrics,
     extract_exporter_config,
+    get_artifact_root,
     get_available_artifacts,
     get_benchmark_info,
     get_task_name,
@@ -67,10 +68,41 @@ class WandBExporter(BaseExporter):
             "log_mode", "per_task"
         )  # Default per_task for immediate export

-        # Get metrics
-        metrics = extract_accuracy_metrics(
-            job_data, self.get_job_paths, wandb_config.get("log_metrics", [])
-        )
+        # Stage artifacts locally if remote_ssh (e.g., Slurm), so we can extract metrics
+        staged_base_dir = None
+        try:
+            paths = self.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                tmp_stage = Path(tempfile.mkdtemp(prefix="wandb_stage_"))
+                LocalExporter(
+                    {
+                        "output_dir": str(tmp_stage),
+                        "copy_logs": wandb_config.get("log_logs", False),
+                        "only_required": wandb_config.get("only_required", True),
+                    }
+                ).export_job(job_data)
+                staged_base_dir = (
+                    tmp_stage / job_data.invocation_id / job_data.job_id
+                )
+        except Exception as e:
+            logger.warning(f"W&B: staging failed for {job_data.job_id}: {e}")
+
+        # Metrics (prefer staged if available)
+        log_metrics = wandb_config.get("log_metrics", [])
+        if staged_base_dir and (staged_base_dir / "artifacts").exists():
+            metrics = extract_accuracy_metrics(
+                job_data,
+                lambda _: {
+                    "artifacts_dir": staged_base_dir / "artifacts",
+                    "storage_type": "local_filesystem",
+                },
+                log_metrics,
+            )
+        else:
+            metrics = extract_accuracy_metrics(
+                job_data, self.get_job_paths, log_metrics
+            )
+
         if not metrics:
             return ExportResult(
                 success=False, dest="wandb", message="No metrics found"
@@ -163,29 +195,92 @@ class WandBExporter(BaseExporter):
             return {"success": False, "error": f"W&B export failed: {str(e)}"}

     def _log_artifacts(
-        self, job_data: JobData, wandb_config: Dict[str, Any], artifact
+        self,
+        job_data: JobData,
+        wandb_config: Dict[str, Any],
+        artifact,
+        register_staging_dir=None,
     ) -> List[str]:
-        """Log evaluation artifacts to WandB using LocalExporter for transfer."""
+        """Log evaluation artifacts to WandB using LocalExporter for staging."""
         if not wandb_config.get("log_artifacts", True):
             return []
         try:
             temp_dir = tempfile.mkdtemp(prefix="wandb_artifacts_")
-            local_exporter = LocalExporter({"output_dir": temp_dir})
+            if callable(register_staging_dir):
+                register_staging_dir(temp_dir)
+            local_exporter = LocalExporter(
+                {
+                    "output_dir": temp_dir,
+                    "copy_logs": wandb_config.get(
+                        "log_logs", wandb_config.get("copy_logs", False)
+                    ),
+                    "only_required": wandb_config.get("only_required", True),
+                    "format": wandb_config.get("format"),
+                    "log_metrics": wandb_config.get("log_metrics", []),
+                    "output_filename": wandb_config.get("output_filename"),
+                }
+            )
             local_result = local_exporter.export_job(job_data)

             if not local_result.success:
                 logger.error(f"Failed to download artifacts: {local_result.message}")
                 return []

-            artifacts_dir = Path(local_result.dest) / "artifacts"
-            logged_names = []
-            task_name = get_task_name(job_data)
-            for fname in get_available_artifacts(artifacts_dir):
-                fpath = artifacts_dir / fname
-                if fpath.exists():
-                    artifact.add_file(str(fpath), name=f"{task_name}/{fname}")
-                    logged_names.append(fname)
-            shutil.rmtree(temp_dir)
+            base_dir = Path(local_result.dest)
+            artifacts_dir = base_dir / "artifacts"
+            logs_dir = base_dir / "logs"
+            logged_names: list[str] = []
+
+            artifact_root = get_artifact_root(job_data)  # "<harness>.<benchmark>"
+
+            # Add config file only when artifacts logging is enabled
+            if wandb_config.get("log_artifacts", True):
+                cfg_added = False
+                for fname in ("config.yml", "run_config.yml"):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        artifact.add_file(str(p), name=f"{artifact_root}/{fname}")
+                        logged_names.append(fname)
+                        cfg_added = True
+                        break
+                if not cfg_added:
+                    with tempfile.NamedTemporaryFile(
+                        "w", suffix=".yaml", delete=False
+                    ) as tmp_cfg:
+                        yaml.dump(
+                            job_data.config or {},
+                            tmp_cfg,
+                            default_flow_style=False,
+                            sort_keys=False,
+                        )
+                        cfg_path = tmp_cfg.name
+                    artifact.add_file(cfg_path, name=f"{artifact_root}/config.yaml")
+                    os.unlink(cfg_path)
+                    logged_names.append("config.yaml")
+
+            files_to_upload: list[Path] = []
+            if wandb_config.get("only_required", True):
+                for fname in get_available_artifacts(artifacts_dir):
+                    p = artifacts_dir / fname
+                    if p.exists():
+                        files_to_upload.append(p)
+            else:
+                for p in artifacts_dir.iterdir():
+                    if p.is_file():
+                        files_to_upload.append(p)
+
+            for fpath in files_to_upload:
+                rel = fpath.relative_to(artifacts_dir).as_posix()
+                artifact.add_file(str(fpath), name=f"{artifact_root}/artifacts/{rel}")
+                logged_names.append(rel)
+
+            if wandb_config.get("log_logs", False) and logs_dir.exists():
+                for p in logs_dir.rglob("*"):
+                    if p.is_file():
+                        rel = p.relative_to(logs_dir).as_posix()
+                        artifact.add_file(str(p), name=f"{artifact_root}/logs/{rel}")
+                        logged_names.append(f"logs/{rel}")
+
             return logged_names
         except Exception as e:
             logger.error(f"Error logging artifacts: {e}")
@@ -193,7 +288,7 @@ class WandBExporter(BaseExporter):

     def _check_existing_run(
         self, identifier: str, job_data: JobData, config: Dict[str, Any]
-    ) -> tuple[bool, str]:
+    ) -> tuple[bool, Optional[str]]:
         """Check if run exists based on webhook metadata then name patterns."""
         try:
             import wandb
@@ -204,7 +299,7 @@
             if not (entity and project):
                 return False, None

-            # # Check webhook metadata for run_id first
+            # Check webhook metadata for run_id first
             webhook_meta = job_data.data.get("webhook_metadata", {})
             if (
                 webhook_meta.get("webhook_source") == "wandb"
@@ -281,10 +376,14 @@
            run_args["resume"] = "allow"

        # Config metadata
+        exec_type = (job_data.config or {}).get("execution", {}).get(
+            "type"
+        ) or job_data.executor
        run_config = {
            "invocation_id": job_data.invocation_id,
-            "executor": job_data.executor,
+            "executor": exec_type,
        }
+
        if log_mode == "per_task":
            run_config["job_id"] = job_data.job_id
            run_config["harness"] = harness
@@ -306,6 +405,13 @@
        # Initialize
        run = wandb.init(**{k: v for k, v in run_args.items() if v is not None})

+        # Track staging dirs for this run
+        staging_dirs: List[str] = []
+
+        def register_staging_dir(path: str) -> None:
+            if path and os.path.isdir(path):
+                staging_dirs.append(path)
+
        # In multi_task, aggregate lists after init (no overwrite)
        if log_mode == "multi_task":
            try:
@@ -339,34 +445,42 @@
                    "harness": harness,
                },
            )
-            with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp_cfg:
-                yaml.dump(job_data.config or {}, tmp_cfg, default_flow_style=False)
-                cfg_path = tmp_cfg.name
-            artifact.add_file(cfg_path, name="config.yaml")
-            os.unlink(cfg_path)

-            logged_artifacts = self._log_artifacts(job_data, config, artifact)
-            run.log_artifact(artifact)
+            logged_artifacts = self._log_artifacts(
+                job_data, config, artifact, register_staging_dir=register_staging_dir
+            )

-            # charts for each logged metric
            try:
-                for k in metrics.keys():
-                    run.define_metric(k, summary="last")
-            except Exception:
-                pass
+                run.log_artifact(artifact)
+                # charts for each logged metric
+                try:
+                    for k in metrics.keys():
+                        run.define_metric(k, summary="last")
+                except Exception:
+                    pass

-            # Log metrics with per-task step
-            try:
-                step_idx = int(job_data.job_id.split(".")[-1])
-            except Exception:
-                step_idx = 0
-            run.log(metrics, step=step_idx)
+                # Log metrics with per-task step
+                try:
+                    step_idx = int(job_data.job_id.split(".")[-1])
+                except Exception:
+                    step_idx = 0
+                run.log(metrics, step=step_idx)

-            # metrics summary
-            try:
-                run.summary.update(metrics)
-            except Exception:
-                pass
+                # metrics summary
+                try:
+                    run.summary.update(metrics)
+                except Exception:
+                    pass
+            finally:
+                for d in staging_dirs:
+                    try:
+                        shutil.rmtree(d, ignore_errors=True)
+                    except Exception:
+                        pass
+                try:
+                    run.finish()
+                except Exception:
+                    pass


        return {
nemo_evaluator_launcher/package_info.py

@@ -13,10 +13,11 @@
 # limitations under the License.


+# Below is the _next_ version that will be published, not the currently published one.
 MAJOR = 0
 MINOR = 1
-PATCH = 0
-PRE_RELEASE = "rc6"
+PATCH = 41
+PRE_RELEASE = ""

 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -24,12 +25,14 @@ VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
 __shortversion__ = ".".join(map(str, VERSION[:3]))
 __version__ = ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:])

+# BEGIN(if-changed): check the pyproject.toml, too
 __package_name__ = "nemo_evaluator_launcher"
 __contact_names__ = "NVIDIA"
 __contact_emails__ = "nemo-toolkit@nvidia.com"
 __homepage__ = "https://github.com/NVIDIA-NeMo/Eval"
 __repository_url__ = "https://github.com/NVIDIA-NeMo/Eval"
-__download_url__ = "https://github.com/NVIDIA-NeMo/Eval/releases"
+__download_url__ = "https://github.com/NVIDIA-NeMo/Evaluator/releases"
 __description__ = "Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends"
 __license__ = "Apache2"
 __keywords__ = "deep learning, evaluations, machine learning, gpu, NLP, pytorch, torch"
+# END(if-changed)
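For reference, the formatting lines already in the file turn these constants into the version strings seen in the wheel names; a quick check with the old and new values from this diff:

# Version string composition as defined above (values taken from this diff).
VERSION = (0, 1, 41, "")    # new release
assert ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:]) == "0.1.41"

VERSION = (0, 1, 0, "rc6")  # previous pre-release
assert ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:]) == "0.1.0rc6"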
nemo_evaluator_launcher/resources/mapping.toml

@@ -1,6 +1,6 @@
 # NOTE(agronskiy): checked parity
 [lm-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.10"

 [lm-evaluation-harness.tasks.chat.ifeval]
 required_env_vars = []
@@ -79,6 +79,8 @@ required_env_vars = []

 [lm-evaluation-harness.tasks.chat.mmlu_redux_instruct]

+[lm-evaluation-harness.tasks.chat.mmlu_cot_0_shot_chat]
+
 [lm-evaluation-harness.tasks.completions.gsm8k]
 required_env_vars = []

@@ -124,7 +126,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [mtbench]
-container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/mtbench:25.10"

 [mtbench.tasks.chat.mtbench]

@@ -134,7 +136,7 @@ container = "nvcr.io/nvidia/eval-factory/mtbench:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [ifbench]
-container = "nvcr.io/nvidia/eval-factory/ifbench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/ifbench:25.10"

 [ifbench.tasks.chat.ifbench]
 required_env_vars = []
@@ -142,7 +144,7 @@ required_env_vars = []

 ###############################################################################
 [simple_evals]
-container = "nvcr.io/nvidia/eval-factory/simple-evals:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/simple-evals:25.10"

 [simple_evals.tasks.chat.gpqa_diamond]
 required_env_vars = ["HF_TOKEN"]
@@ -213,7 +215,7 @@ required_env_vars = []
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [bigcode-evaluation-harness]
-container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.10"

 [bigcode-evaluation-harness.tasks.chat.mbpp]
 required_env_vars = []
@@ -226,12 +228,12 @@ required_env_vars = []
 [bigcode-evaluation-harness.tasks.completions.humaneval]
 required_env_vars = []

-[bigcode-evaluation-harness.tasks.completions.humaneval_instruct]
+[bigcode-evaluation-harness.tasks.chat.humaneval_instruct]


 ###############################################################################
 [livecodebench]
-container = "nvcr.io/nvidia/eval-factory/livecodebench:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/livecodebench:25.10"

 [livecodebench.tasks.chat.livecodebench_0724_0125]
 required_env_vars = []
@@ -242,7 +244,7 @@ required_env_vars = []

 ###############################################################################
 [scicode]
-container = "nvcr.io/nvidia/eval-factory/scicode:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/scicode:25.10"

 [scicode.tasks.chat.aa_scicode]
 required_env_vars = []
@@ -250,7 +252,7 @@ required_env_vars = []

 ###############################################################################
 [hle]
-container = "nvcr.io/nvidia/eval-factory/hle:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/hle:25.10"

 [hle.tasks.chat.hle]
 required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]
@@ -258,7 +260,7 @@ required_env_vars = ["HF_TOKEN", "OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]

 ###############################################################################
 [bfcl]
-container = "nvcr.io/nvidia/eval-factory/bfcl:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/bfcl:25.10"

 [bfcl.tasks.chat.bfclv2_ast_prompting]
 required_env_vars = []
@@ -267,9 +269,20 @@ required_env_vars = []
 required_env_vars = []


+###############################################################################
+[profbench]
+container = "nvcr.io/nvidia/eval-factory/profbench:25.10"
+
+[profbench.tasks.chat.llm_judge]
+required_env_vars = []
+
+[profbench.tasks.chat.report_generation]
+required_env_vars = []
+
+
 ###############################################################################
 [vlmevalkit]
-container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/vlmevalkit:25.10"

 [vlmevalkit.tasks.vlm.ocrbench]
 required_env_vars = []
@@ -286,15 +299,43 @@ required_env_vars = ["OPENAI_CLIENT_ID", "OPENAI_CLIENT_SECRET"]

 ###############################################################################
 [garak]
-container = "nvcr.io/nvidia/eval-factory/garak:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/garak:25.10"

 [garak.tasks.chat.garak]
 required_env_vars = []

+###############################################################################
+# NOTE(wprazuch): to verify if the tasks need any env var setting
+[nemo_skills]
+container = "nvcr.io/nvidia/eval-factory/nemo_skills:25.10"
+
+[nemo_skills.tasks.chat.ns_aime2024]
+required_env_vars = ["JUDGE_API_KEY"]
+
+[nemo_skills.tasks.chat.ns_aime2025]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_bfcl_v3]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_gpqa]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_hle]
+required_env_vars = []
+
+[nemo_skills.tasks.chat.ns_mmlu]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_mmlu_pro]
+required_env_vars = ["HF_TOKEN"]
+
+[nemo_skills.tasks.chat.ns_aa_lcr]
+required_env_vars = ["JUDGE_API_KEY"]

 ###############################################################################
 [safety-harness]
-container = "nvcr.io/nvidia/eval-factory/safety-harness:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/safety-harness:25.10"

 [safety-harness.tasks.chat.aegis_v2]
 required_env_vars = ["HF_TOKEN"]
@@ -303,7 +344,7 @@ required_env_vars = ["HF_TOKEN"]
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [helm]
-container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/helm:25.10"

 [helm.tasks.chat.medcalc_bench]

@@ -339,6 +380,6 @@ container = "nvcr.io/nvidia/eval-factory/helm:25.08.1"
 ###############################################################################
 # NOTE(agronskiy): checked parity
 [tooltalk]
-container = "nvcr.io/nvidia/eval-factory/tooltalk:25.08.1"
+container = "nvcr.io/nvidia/eval-factory/tooltalk:25.10"

 [tooltalk.tasks.chat.tooltalk]
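A hedged sketch (not launcher API) of how the structure above can be inspected: each harness table carries a container image and nested tasks.<endpoint_type>.<task> tables with optional required_env_vars. Reading it with the standard-library tomllib (the file path is an assumption; inside the package it ships as resources/mapping.toml):

# Sketch: enumerate harness containers and per-task required env vars from mapping.toml.
import tomllib
from pathlib import Path

mapping = tomllib.loads(Path("mapping.toml").read_text())  # assumed local copy of the mapping

for harness, spec in mapping.items():
    container = spec.get("container", "")
    for endpoint_type, tasks in spec.get("tasks", {}).items():  # e.g. "chat", "completions", "vlm"
        for task, task_spec in tasks.items():
            env = (task_spec or {}).get("required_env_vars", [])
            print(f"{harness:32s} {endpoint_type:12s} {task:32s} env={env}  {container}")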