PyPI - nemo-evaluator-launcher - Versions diffs - 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl - Mend

nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

nemo_evaluator_launcher/cli/ls_tasks.py CHANGED Viewed

@@ -18,6 +18,13 @@ from dataclasses import dataclass
 from simple_parsing import field
+from nemo_evaluator_launcher.common.printing_utils import (
+    bold,
+    cyan,
+    grey,
+    magenta,
+)
 @dataclass
 class Cmd:
@@ -28,20 +35,101 @@ class Cmd:
         action="store_true",
         help="Print output as JSON instead of table format",
     )
+    from_container: str = field(
+        default="",
+        help="Load tasks from container image (e.g., nvcr.io/nvidia/eval-factory/simple-evals:25.10). "
+        "If provided, extracts framework.yml from container and lists tasks on-the-fly instead of using mapping.toml",
+    )
     def execute(self) -> None:
         # Import heavy dependencies only when needed
         import json
-        from nemo_evaluator_launcher.api.functional import get_tasks_list
+        if self.from_container:
+            # Load tasks from container
+            from nemo_evaluator_launcher.common.container_metadata import (
+                load_tasks_from_container,
+            )
+            try:
+                tasks = load_tasks_from_container(self.from_container)
+            except ValueError as e:
+                from nemo_evaluator_launcher.common.logging_utils import logger
+                logger.error(
+                    "Failed to load tasks from container",
+                    container=self.from_container,
+                    error=str(e),
+                )
+                return
+            except Exception as e:
+                from nemo_evaluator_launcher.common.logging_utils import logger
+                logger.error(
+                    "Failed to load tasks from container",
+                    container=self.from_container,
+                    error=str(e),
+                    exc_info=True,
+                )
+                return
+            if not tasks:
+                from nemo_evaluator_launcher.common.logging_utils import logger
+                logger.error(
+                    "No tasks found in container",
+                    container=self.from_container,
+                )
+                return
-        # TODO(dfridman): modify `get_tasks_list` to return a list of dicts in the first place
-        data = get_tasks_list()
-        headers = ["task", "endpoint_type", "harness", "container"]
+            # Convert TaskIntermediateRepresentation to format expected by get_tasks_list()
+            # Build data structure matching get_tasks_list() output format
+            data = []
+            for task in tasks:
+                # Extract endpoint types from defaults
+                endpoint_types = (
+                    task.defaults.get("target", {})
+                    .get("api_endpoint", {})
+                    .get("type", "chat")
+                )
+                if isinstance(endpoint_types, str):
+                    endpoint_types = [endpoint_types]
+                data.append(
+                    [
+                        task.name,  # task
+                        ",".join(endpoint_types)
+                        if isinstance(endpoint_types, list)
+                        else endpoint_types,  # endpoint_type
+                        task.harness,  # harness
+                        task.container,  # container
+                        getattr(task, "container_arch", "") or "",  # arch
+                        task.description,  # description
+                    ]
+                )
+        else:
+            # Default behavior: load from mapping.toml via get_tasks_list()
+            from nemo_evaluator_launcher.api.functional import get_tasks_list
+            # TODO(dfridman): modify `get_tasks_list` to return a list of dicts in the first place
+            data = get_tasks_list()
+        headers = [
+            "task",
+            "endpoint_type",
+            "harness",
+            "container",
+            "arch",
+            "description",
+        ]
         supported_benchmarks = []
         for task_data in data:
-            assert len(task_data) == len(headers)
-            supported_benchmarks.append(dict(zip(headers, task_data)))
+            if len(task_data) < len(headers):
+                raise ValueError(
+                    f"Invalid task row shape: expected at least {len(headers)} columns, got {len(task_data)}"
+                )
+            # Backwards/forwards compat: allow extra columns and ignore them.
+            supported_benchmarks.append(dict(zip(headers, task_data[: len(headers)])))
         if self.json:
             print(json.dumps({"tasks": supported_benchmarks}, indent=2))
@@ -49,11 +137,55 @@ class Cmd:
             self._print_table(supported_benchmarks)
     def _print_table(self, tasks: list[dict]) -> None:
-        """Print tasks grouped by harness and container in table format."""
+        """Print tasks grouped by harness and container in table format with colorized output."""
         if not tasks:
             print("No tasks found.")
             return
+        def _truncate(s: str, max_len: int) -> str:
+            s = s or ""
+            if max_len <= 0:
+                return ""
+            if len(s) <= max_len:
+                return s
+            if max_len <= 3:
+                return s[:max_len]
+            return s[: max_len - 3] + "..."
+        def _infer_arch(container: str, container_tasks: list[dict]) -> str:
+            # Prefer explicit arch from task IRs.
+            for t in container_tasks:
+                a = (t.get("arch") or "").strip()
+                if a:
+                    return a
+            # Heuristic fallback: look for common suffixes in tag.
+            c = (container or "").lower()
+            if "arm64" in c or "aarch64" in c:
+                return "arm"
+            if "amd64" in c or "x86_64" in c:
+                return "amd"
+            return "unknown"
+        def _infer_registry(container: str) -> str:
+            try:
+                from nemo_evaluator_launcher.common.container_metadata.utils import (
+                    parse_container_image,
+                )
+                registry_type, _registry_url, _repo, _ref = parse_container_image(
+                    container
+                )
+                return str(registry_type)
+            except Exception:
+                # Best-effort fallback for unknown formats.
+                c = (container or "").lower()
+                if "nvcr.io/" in c or c.startswith("nvcr.io"):
+                    return "nvcr"
+                if "gitlab" in c:
+                    return "gitlab"
+                return ""
         # Group tasks by harness and container
         grouped = defaultdict(lambda: defaultdict(list))
         for task in tasks:
@@ -70,67 +202,88 @@ class Cmd:
                 if j > 0:
                     print()  # Spacing between containers
-                # Prepare task table first to get column widths
-                task_headers = ["task", "endpoint_type"]
                 rows = []
                 for task in container_tasks:
-                    rows.append([task["task"], task["endpoint_type"]])
-                # Sort tasks alphabetically for better readability
-                rows.sort(key=lambda x: x[0])
-                # Calculate column widths with some padding
-                widths = [
-                    max(len(task_headers[i]), max(len(str(row[i])) for row in rows)) + 2
-                    for i in range(len(task_headers))
-                ]
-                # Calculate minimum table width based on task columns
-                min_table_width = sum(widths) + len(widths) + 1
+                    rows.append(
+                        {
+                            "task": str(task.get("task", "")),
+                            "endpoint": str(task.get("endpoint_type", "")),
+                            "description": str(task.get("description", "")),
+                        }
+                    )
+                rows.sort(key=lambda r: r["task"].lower())
                 # Calculate required width for header content
                 harness_line = f"harness: {harness}"
                 container_line = f"container: {container}"
+                arch_line = f"arch: {_infer_arch(container, container_tasks)}"
+                registry_line = f"registry: {_infer_registry(container)}"
                 header_content_width = (
-                    max(len(harness_line), len(container_line)) + 4
+                    max(
+                        len(harness_line),
+                        len(container_line),
+                        len(arch_line),
+                        len(registry_line),
+                    )
+                    + 4
                 )  # +4 for "| " and " |"
-                # Use the larger of the two widths
-                table_width = max(min_table_width, header_content_width)
+                # Limit separator width to prevent overflow on small terminals
+                # Use terminal width if available, otherwise cap at 120 characters
+                import shutil
-                # Print combined header with harness and container info
-                print("=" * table_width)
-                print(f"{harness_line}")
-                print(f"{container_line}")
+                try:
+                    terminal_width = shutil.get_terminal_size().columns
+                    separator_width = min(terminal_width - 2, 160)  # -2 safety margin
+                except Exception:
+                    # Fallback if terminal size can't be determined
+                    separator_width = 120
-                # Adjust column widths to fill the full table width
-                available_width = table_width
-                # Give more space to the first column (task names can be long)
-                adjusted_widths = [
-                    max(
-                        widths[0], available_width * 2 // 3
-                    ),  # 2/3 of available width for task
-                    0,  # Will be calculated as remainder
-                ]
-                adjusted_widths[1] = (
-                    available_width - adjusted_widths[0]
-                )  # Remainder for endpoint_type
+                separator_width = max(separator_width, min(header_content_width, 160))
+                # Table columns (keep compact and stable).
+                col_task = 36
+                col_endpoint = 14
+                sep = "  "
+                fixed = col_task + col_endpoint + len(sep) * 2
+                col_desc = max(20, separator_width - fixed)
+                # Print combined header with harness and container info - colorized
+                # Keys: magenta, Values: cyan (matching logging utils)
+                print(bold("=" * separator_width))
+                print(f"{magenta('harness:')} {cyan(str(harness))}")
+                print(f"{magenta('container:')} {cyan(str(container))}")
+                arch = _infer_arch(container, container_tasks)
+                registry = _infer_registry(container)
+                print(f"{magenta('arch:')} {cyan(str(arch))}")
+                if registry:
+                    print(f"{magenta('registry:')} {cyan(str(registry))}")
                 # Print task table header separator
-                print(" " * table_width)
-                header_row = f"{task_headers[0]:<{adjusted_widths[0]}}{task_headers[1]:<{adjusted_widths[1]}}"
-                print(header_row)
-                print("-" * table_width)
-                # Print task rows
-                for row in rows:
-                    data_row = f"{str(row[0]):<{adjusted_widths[0]}}{str(row[1]):<{adjusted_widths[1]}}"
-                    print(data_row)
-                print("-" * table_width)
-                # Show task count
+                print()
+                print(
+                    bold(
+                        f"{'task':<{col_task}}{sep}"
+                        f"{'endpoint':<{col_endpoint}}{sep}"
+                        f"{'description':<{col_desc}}"
+                    )
+                )
+                print(bold("-" * separator_width))
+                # Print task rows - use grey for task descriptions
+                for r in rows:
+                    line = (
+                        f"{_truncate(r['task'], col_task):<{col_task}}{sep}"
+                        f"{_truncate(r['endpoint'], col_endpoint):<{col_endpoint}}{sep}"
+                        f"{_truncate(r['description'], col_desc):<{col_desc}}"
+                    )
+                    print(grey(line))
+                print(bold("-" * separator_width))
+                # Show task count - grey for count text
                 task_count = len(rows)
-                print(f"  {task_count} task{'s' if task_count != 1 else ''} available")
-                print("=" * table_width)
+                task_word = "task" if task_count == 1 else "tasks"
+                print(f"  {grey(f'{task_count} {task_word} available')}")
+                print(bold("=" * separator_width))
                 print()

nemo_evaluator_launcher/cli/main.py CHANGED Viewed

@@ -24,6 +24,7 @@ import nemo_evaluator_launcher.cli.info as info
 import nemo_evaluator_launcher.cli.kill as kill
 import nemo_evaluator_launcher.cli.logs as logs
 import nemo_evaluator_launcher.cli.ls_runs as ls_runs
+import nemo_evaluator_launcher.cli.ls_task as ls_task
 import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
 import nemo_evaluator_launcher.cli.run as run
 import nemo_evaluator_launcher.cli.status as status
@@ -49,6 +50,7 @@ def is_verbose_enabled(args) -> bool:
         "tasks_alias",
         "tasks",
         "runs",
+        "task",
         "export",
     ]
     for subcmd in subcommands:
@@ -159,6 +161,14 @@ def create_parser() -> ArgumentParser:
     )
     ls_runs_parser.add_arguments(ls_runs.Cmd, dest="runs")
+    # ls task (task details)
+    ls_task_parser = ls_sub.add_parser(
+        "task",
+        help="Show task details",
+        description="Show detailed information about a specific task",
+    )
+    ls_task_parser.add_arguments(ls_task.Cmd, dest="task")
     # Export subcommand
     export_parser = subparsers.add_parser(
         "export",
@@ -220,12 +230,17 @@ def main() -> None:
         args.kill.execute()
     elif args.command == "ls":
         # Dispatch nested ls subcommands
-        if args.ls_command is None or args.ls_command == "tasks":
-            # Default to tasks when no subcommand specified
+        if args.ls_command == "tasks":
+            # When explicitly "ls tasks", use args.tasks (has correct from_container)
+            args.tasks.execute()
+        elif args.ls_command is None:
+            # When just "ls" (no subcommand), use args.tasks_alias
             if hasattr(args, "tasks_alias"):
                 args.tasks_alias.execute()
             else:
                 args.tasks.execute()
+        elif args.ls_command == "task":
+            args.task.execute()
         elif args.ls_command == "runs":
             args.runs.execute()
     elif args.command == "export":

nemo_evaluator_launcher/cli/run.py CHANGED Viewed

@@ -77,6 +77,15 @@ class Cmd:
         alias=["-n", "--dry-run"],
         metadata={"help": "Do not run the evaluation, just print the config."},
     )
+    tasks: list[str] = field(
+        default_factory=list,
+        action="append",
+        nargs="?",
+        alias=["-t"],
+        metadata={
+            "help": "Run only specific tasks from the config. Example: -t ifeval -t gsm8k"
+        },
+    )
     config_output: str | None = field(
         default=None,
         alias=["--config-output"],
@@ -85,12 +94,31 @@ class Cmd:
         },
     )
+    def _parse_requested_tasks(self) -> list[str]:
+        """Parse -t arguments into a list of task names.
+        Handles None values that can be appended when using nargs="?" with action="append".
+        """
+        requested_tasks = []
+        for task_arg in self.tasks:
+            # Skip None or empty values (can happen with nargs="?")
+            if not task_arg:
+                continue
+            task_name = task_arg.strip()
+            if task_name and task_name not in requested_tasks:
+                requested_tasks.append(task_name)
+        return requested_tasks
     def execute(self) -> None:
         # Import heavy dependencies only when needed
         import yaml
         from omegaconf import OmegaConf
-        from nemo_evaluator_launcher.api.functional import RunConfig, run_eval
+        from nemo_evaluator_launcher.api.functional import (
+            RunConfig,
+            filter_tasks,
+            run_eval,
+        )
         # Validate config_mode value
         if self.config_mode not in ["hydra", "raw"]:
@@ -104,6 +132,9 @@ class Cmd:
                 "--config-mode=raw requires --config to be specified. Raw mode loads config files directly."
             )
+        # Parse requested tasks if -t is specified
+        requested_tasks = self._parse_requested_tasks() if self.tasks else None
         # Load configuration either from Hydra or directly from a config file
         if self.config_mode == "raw" and self.config:
             # Validate that raw config loading is not used with other config options
@@ -147,6 +178,15 @@ class Cmd:
                 hydra_overrides=self.override,
             )
+        # Apply task filtering if -t is specified
+        if requested_tasks:
+            config = filter_tasks(config, requested_tasks)
+            logger.info(
+                "Running filtered tasks",
+                count=len(config.evaluation.tasks),
+                tasks=[t.name for t in config.evaluation.tasks],
+            )
         try:
             invocation_id = run_eval(config, self.dry_run)
         except Exception as e:

nemo_evaluator_launcher/common/container_metadata/__init__.py ADDED Viewed

@@ -0,0 +1,61 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Container metadata management: registries, intermediate representations, and loading."""
+from nemo_evaluator_launcher.common.container_metadata.intermediate_repr import (
+    HarnessIntermediateRepresentation,
+    TaskIntermediateRepresentation,
+    load_harnesses_and_tasks_from_tasks_file,
+    load_tasks_from_tasks_file,
+)
+from nemo_evaluator_launcher.common.container_metadata.registries import (
+    DockerRegistryHandler,
+    create_authenticator,
+)
+from nemo_evaluator_launcher.common.container_metadata.utils import (
+    parse_container_image,
+)
+__all__ = [
+    "DockerRegistryHandler",
+    "create_authenticator",
+    "HarnessIntermediateRepresentation",
+    "TaskIntermediateRepresentation",
+    "load_harnesses_and_tasks_from_tasks_file",
+    "load_tasks_from_tasks_file",
+    "parse_container_image",
+]
+# Optional imports:
+# `loading` pulls in `nemo_evaluator` (and deps like `pydantic`). Keep IR-only
+# workflows (e.g., docs autogen) usable without requiring the full stack.
+try:
+    from nemo_evaluator_launcher.common.container_metadata.loading import (  # noqa: F401
+        extract_framework_yml,
+        load_tasks_from_container,
+        parse_framework_to_irs,
+    )
+    __all__.extend(
+        [
+            "extract_framework_yml",
+            "load_tasks_from_container",
+            "parse_framework_to_irs",
+        ]
+    )
+except ModuleNotFoundError:
+    # Allow importing this package for IR-only workflows (docs autogen, etc.)
+    pass

nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl

nemo-evaluator-launcher 0.1.41__py3-none-any.whl → 0.1.56__py3-none-any.whl