PyPI - nemo-evaluator-launcher - Versions diffs - 0.1.19__py3-none-any.whl → 0.1.56__py3-none-any.whl - Mend

nemo-evaluator-launcher 0.1.19__py3-none-any.whl → 0.1.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

nemo_evaluator_launcher/cli/ls_tasks.py CHANGED Viewed

@@ -18,6 +18,13 @@ from dataclasses import dataclass
 from simple_parsing import field
+from nemo_evaluator_launcher.common.printing_utils import (
+    bold,
+    cyan,
+    grey,
+    magenta,
+)
 @dataclass
 class Cmd:
@@ -28,20 +35,101 @@ class Cmd:
         action="store_true",
         help="Print output as JSON instead of table format",
     )
+    from_container: str = field(
+        default="",
+        help="Load tasks from container image (e.g., nvcr.io/nvidia/eval-factory/simple-evals:25.10). "
+        "If provided, extracts framework.yml from container and lists tasks on-the-fly instead of using mapping.toml",
+    )
     def execute(self) -> None:
         # Import heavy dependencies only when needed
         import json
-        from nemo_evaluator_launcher.api.functional import get_tasks_list
+        if self.from_container:
+            # Load tasks from container
+            from nemo_evaluator_launcher.common.container_metadata import (
+                load_tasks_from_container,
+            )
+            try:
+                tasks = load_tasks_from_container(self.from_container)
+            except ValueError as e:
+                from nemo_evaluator_launcher.common.logging_utils import logger
+                logger.error(
+                    "Failed to load tasks from container",
+                    container=self.from_container,
+                    error=str(e),
+                )
+                return
+            except Exception as e:
+                from nemo_evaluator_launcher.common.logging_utils import logger
+                logger.error(
+                    "Failed to load tasks from container",
+                    container=self.from_container,
+                    error=str(e),
+                    exc_info=True,
+                )
+                return
+            if not tasks:
+                from nemo_evaluator_launcher.common.logging_utils import logger
+                logger.error(
+                    "No tasks found in container",
+                    container=self.from_container,
+                )
+                return
-        # TODO(dfridman): modify `get_tasks_list` to return a list of dicts in the first place
-        data = get_tasks_list()
-        headers = ["task", "endpoint_type", "harness", "container"]
+            # Convert TaskIntermediateRepresentation to format expected by get_tasks_list()
+            # Build data structure matching get_tasks_list() output format
+            data = []
+            for task in tasks:
+                # Extract endpoint types from defaults
+                endpoint_types = (
+                    task.defaults.get("target", {})
+                    .get("api_endpoint", {})
+                    .get("type", "chat")
+                )
+                if isinstance(endpoint_types, str):
+                    endpoint_types = [endpoint_types]
+                data.append(
+                    [
+                        task.name,  # task
+                        ",".join(endpoint_types)
+                        if isinstance(endpoint_types, list)
+                        else endpoint_types,  # endpoint_type
+                        task.harness,  # harness
+                        task.container,  # container
+                        getattr(task, "container_arch", "") or "",  # arch
+                        task.description,  # description
+                    ]
+                )
+        else:
+            # Default behavior: load from mapping.toml via get_tasks_list()
+            from nemo_evaluator_launcher.api.functional import get_tasks_list
+            # TODO(dfridman): modify `get_tasks_list` to return a list of dicts in the first place
+            data = get_tasks_list()
+        headers = [
+            "task",
+            "endpoint_type",
+            "harness",
+            "container",
+            "arch",
+            "description",
+        ]
         supported_benchmarks = []
         for task_data in data:
-            assert len(task_data) == len(headers)
-            supported_benchmarks.append(dict(zip(headers, task_data)))
+            if len(task_data) < len(headers):
+                raise ValueError(
+                    f"Invalid task row shape: expected at least {len(headers)} columns, got {len(task_data)}"
+                )
+            # Backwards/forwards compat: allow extra columns and ignore them.
+            supported_benchmarks.append(dict(zip(headers, task_data[: len(headers)])))
         if self.json:
             print(json.dumps({"tasks": supported_benchmarks}, indent=2))
@@ -49,11 +137,55 @@ class Cmd:
             self._print_table(supported_benchmarks)
     def _print_table(self, tasks: list[dict]) -> None:
-        """Print tasks grouped by harness and container in table format."""
+        """Print tasks grouped by harness and container in table format with colorized output."""
         if not tasks:
             print("No tasks found.")
             return
+        def _truncate(s: str, max_len: int) -> str:
+            s = s or ""
+            if max_len <= 0:
+                return ""
+            if len(s) <= max_len:
+                return s
+            if max_len <= 3:
+                return s[:max_len]
+            return s[: max_len - 3] + "..."
+        def _infer_arch(container: str, container_tasks: list[dict]) -> str:
+            # Prefer explicit arch from task IRs.
+            for t in container_tasks:
+                a = (t.get("arch") or "").strip()
+                if a:
+                    return a
+            # Heuristic fallback: look for common suffixes in tag.
+            c = (container or "").lower()
+            if "arm64" in c or "aarch64" in c:
+                return "arm"
+            if "amd64" in c or "x86_64" in c:
+                return "amd"
+            return "unknown"
+        def _infer_registry(container: str) -> str:
+            try:
+                from nemo_evaluator_launcher.common.container_metadata.utils import (
+                    parse_container_image,
+                )
+                registry_type, _registry_url, _repo, _ref = parse_container_image(
+                    container
+                )
+                return str(registry_type)
+            except Exception:
+                # Best-effort fallback for unknown formats.
+                c = (container or "").lower()
+                if "nvcr.io/" in c or c.startswith("nvcr.io"):
+                    return "nvcr"
+                if "gitlab" in c:
+                    return "gitlab"
+                return ""
         # Group tasks by harness and container
         grouped = defaultdict(lambda: defaultdict(list))
         for task in tasks:
@@ -70,67 +202,88 @@ class Cmd:
                 if j > 0:
                     print()  # Spacing between containers
-                # Prepare task table first to get column widths
-                task_headers = ["task", "endpoint_type"]
                 rows = []
                 for task in container_tasks:
-                    rows.append([task["task"], task["endpoint_type"]])
-                # Sort tasks alphabetically for better readability
-                rows.sort(key=lambda x: x[0])
-                # Calculate column widths with some padding
-                widths = [
-                    max(len(task_headers[i]), max(len(str(row[i])) for row in rows)) + 2
-                    for i in range(len(task_headers))
-                ]
-                # Calculate minimum table width based on task columns
-                min_table_width = sum(widths) + len(widths) + 1
+                    rows.append(
+                        {
+                            "task": str(task.get("task", "")),
+                            "endpoint": str(task.get("endpoint_type", "")),
+                            "description": str(task.get("description", "")),
+                        }
+                    )
+                rows.sort(key=lambda r: r["task"].lower())
                 # Calculate required width for header content
                 harness_line = f"harness: {harness}"
                 container_line = f"container: {container}"
+                arch_line = f"arch: {_infer_arch(container, container_tasks)}"
+                registry_line = f"registry: {_infer_registry(container)}"
                 header_content_width = (
-                    max(len(harness_line), len(container_line)) + 4
+                    max(
+                        len(harness_line),
+                        len(container_line),
+                        len(arch_line),
+                        len(registry_line),
+                    )
+                    + 4
                 )  # +4 for "| " and " |"
-                # Use the larger of the two widths
-                table_width = max(min_table_width, header_content_width)
+                # Limit separator width to prevent overflow on small terminals
+                # Use terminal width if available, otherwise cap at 120 characters
+                import shutil
-                # Print combined header with harness and container info
-                print("=" * table_width)
-                print(f"{harness_line}")
-                print(f"{container_line}")
+                try:
+                    terminal_width = shutil.get_terminal_size().columns
+                    separator_width = min(terminal_width - 2, 160)  # -2 safety margin
+                except Exception:
+                    # Fallback if terminal size can't be determined
+                    separator_width = 120
-                # Adjust column widths to fill the full table width
-                available_width = table_width
-                # Give more space to the first column (task names can be long)
-                adjusted_widths = [
-                    max(
-                        widths[0], available_width * 2 // 3
-                    ),  # 2/3 of available width for task
-                    0,  # Will be calculated as remainder
-                ]
-                adjusted_widths[1] = (
-                    available_width - adjusted_widths[0]
-                )  # Remainder for endpoint_type
+                separator_width = max(separator_width, min(header_content_width, 160))
+                # Table columns (keep compact and stable).
+                col_task = 36
+                col_endpoint = 14
+                sep = "  "
+                fixed = col_task + col_endpoint + len(sep) * 2
+                col_desc = max(20, separator_width - fixed)
+                # Print combined header with harness and container info - colorized
+                # Keys: magenta, Values: cyan (matching logging utils)
+                print(bold("=" * separator_width))
+                print(f"{magenta('harness:')} {cyan(str(harness))}")
+                print(f"{magenta('container:')} {cyan(str(container))}")
+                arch = _infer_arch(container, container_tasks)
+                registry = _infer_registry(container)
+                print(f"{magenta('arch:')} {cyan(str(arch))}")
+                if registry:
+                    print(f"{magenta('registry:')} {cyan(str(registry))}")
                 # Print task table header separator
-                print(" " * table_width)
-                header_row = f"{task_headers[0]:<{adjusted_widths[0]}}{task_headers[1]:<{adjusted_widths[1]}}"
-                print(header_row)
-                print("-" * table_width)
-                # Print task rows
-                for row in rows:
-                    data_row = f"{str(row[0]):<{adjusted_widths[0]}}{str(row[1]):<{adjusted_widths[1]}}"
-                    print(data_row)
-                print("-" * table_width)
-                # Show task count
+                print()
+                print(
+                    bold(
+                        f"{'task':<{col_task}}{sep}"
+                        f"{'endpoint':<{col_endpoint}}{sep}"
+                        f"{'description':<{col_desc}}"
+                    )
+                )
+                print(bold("-" * separator_width))
+                # Print task rows - use grey for task descriptions
+                for r in rows:
+                    line = (
+                        f"{_truncate(r['task'], col_task):<{col_task}}{sep}"
+                        f"{_truncate(r['endpoint'], col_endpoint):<{col_endpoint}}{sep}"
+                        f"{_truncate(r['description'], col_desc):<{col_desc}}"
+                    )
+                    print(grey(line))
+                print(bold("-" * separator_width))
+                # Show task count - grey for count text
                 task_count = len(rows)
-                print(f"  {task_count} task{'s' if task_count != 1 else ''} available")
-                print("=" * table_width)
+                task_word = "task" if task_count == 1 else "tasks"
+                print(f"  {grey(f'{task_count} {task_word} available')}")
+                print(bold("=" * separator_width))
                 print()

nemo_evaluator_launcher/cli/main.py CHANGED Viewed

@@ -22,7 +22,9 @@ from simple_parsing import ArgumentParser
 import nemo_evaluator_launcher.cli.export as export
 import nemo_evaluator_launcher.cli.info as info
 import nemo_evaluator_launcher.cli.kill as kill
+import nemo_evaluator_launcher.cli.logs as logs
 import nemo_evaluator_launcher.cli.ls_runs as ls_runs
+import nemo_evaluator_launcher.cli.ls_task as ls_task
 import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
 import nemo_evaluator_launcher.cli.run as run
 import nemo_evaluator_launcher.cli.status as status
@@ -42,11 +44,13 @@ def is_verbose_enabled(args) -> bool:
     subcommands = [
         "run",
         "status",
+        "logs",
         "info",
         "kill",
         "tasks_alias",
         "tasks",
         "runs",
+        "task",
         "export",
     ]
     for subcmd in subcommands:
@@ -106,6 +110,14 @@ def create_parser() -> ArgumentParser:
     )
     status_parser.add_arguments(status.Cmd, dest="status")
+    # Logs subcommand
+    logs_parser = subparsers.add_parser(
+        "logs",
+        help="Stream logs from evaluation jobs",
+        description="Stream logs from evaluation jobs by invocation ID or job ID",
+    )
+    logs_parser.add_arguments(logs.Cmd, dest="logs")
     # Kill subcommand
     kill_parser = subparsers.add_parser(
         "kill",
@@ -149,6 +161,14 @@ def create_parser() -> ArgumentParser:
     )
     ls_runs_parser.add_arguments(ls_runs.Cmd, dest="runs")
+    # ls task (task details)
+    ls_task_parser = ls_sub.add_parser(
+        "task",
+        help="Show task details",
+        description="Show detailed information about a specific task",
+    )
+    ls_task_parser.add_arguments(ls_task.Cmd, dest="task")
     # Export subcommand
     export_parser = subparsers.add_parser(
         "export",
@@ -204,16 +224,23 @@ def main() -> None:
         args.run.execute()
     elif args.command == "status":
         args.status.execute()
+    elif args.command == "logs":
+        args.logs.execute()
     elif args.command == "kill":
         args.kill.execute()
     elif args.command == "ls":
         # Dispatch nested ls subcommands
-        if args.ls_command is None or args.ls_command == "tasks":
-            # Default to tasks when no subcommand specified
+        if args.ls_command == "tasks":
+            # When explicitly "ls tasks", use args.tasks (has correct from_container)
+            args.tasks.execute()
+        elif args.ls_command is None:
+            # When just "ls" (no subcommand), use args.tasks_alias
             if hasattr(args, "tasks_alias"):
                 args.tasks_alias.execute()
             else:
                 args.tasks.execute()
+        elif args.ls_command == "task":
+            args.task.execute()
         elif args.ls_command == "runs":
             args.runs.execute()
     elif args.command == "export":

nemo_evaluator_launcher/cli/run.py CHANGED Viewed

@@ -16,6 +16,7 @@
 import pathlib
 import time
 from dataclasses import dataclass
+from typing import Literal
 from simple_parsing import field
@@ -26,6 +27,7 @@ from nemo_evaluator_launcher.common.printing_utils import (
     green,
     magenta,
     red,
+    yellow,
 )
@@ -33,6 +35,13 @@ from nemo_evaluator_launcher.common.printing_utils import (
 class Cmd:
     """Run command parameters"""
+    config: str | None = field(
+        default=None,
+        alias=["--config"],
+        metadata={
+            "help": "Full path to config file. Uses Hydra by default (--config-mode=hydra). Use --config-mode=raw to load directly (bypasses Hydra)."
+        },
+    )
     config_name: str = field(
         default="default",
         alias=["-c", "--config-name"],
@@ -47,11 +56,11 @@ class Cmd:
             "help": "Path to user config directory. If provided, searches here first, then falls back to internal configs."
         },
     )
-    run_config_file: str | None = field(
-        default=None,
-        alias=["-f", "--run-config-file"],
+    config_mode: Literal["hydra", "raw"] = field(
+        default="hydra",
+        alias=["--config-mode"],
         metadata={
-            "help": "Path to a run config file to load directly (bypasses Hydra config loading)."
+            "help": "Config loading mode: 'hydra' (default) uses Hydra config system, 'raw' loads config file directly bypassing Hydra."
         },
     )
     override: list[str] = field(
@@ -68,6 +77,15 @@ class Cmd:
         alias=["-n", "--dry-run"],
         metadata={"help": "Do not run the evaluation, just print the config."},
     )
+    tasks: list[str] = field(
+        default_factory=list,
+        action="append",
+        nargs="?",
+        alias=["-t"],
+        metadata={
+            "help": "Run only specific tasks from the config. Example: -t ifeval -t gsm8k"
+        },
+    )
     config_output: str | None = field(
         default=None,
         alias=["--config-output"],
@@ -76,35 +94,97 @@ class Cmd:
         },
     )
+    def _parse_requested_tasks(self) -> list[str]:
+        """Parse -t arguments into a list of task names.
+        Handles None values that can be appended when using nargs="?" with action="append".
+        """
+        requested_tasks = []
+        for task_arg in self.tasks:
+            # Skip None or empty values (can happen with nargs="?")
+            if not task_arg:
+                continue
+            task_name = task_arg.strip()
+            if task_name and task_name not in requested_tasks:
+                requested_tasks.append(task_name)
+        return requested_tasks
     def execute(self) -> None:
         # Import heavy dependencies only when needed
         import yaml
         from omegaconf import OmegaConf
-        from nemo_evaluator_launcher.api.functional import RunConfig, run_eval
+        from nemo_evaluator_launcher.api.functional import (
+            RunConfig,
+            filter_tasks,
+            run_eval,
+        )
+        # Validate config_mode value
+        if self.config_mode not in ["hydra", "raw"]:
+            raise ValueError(
+                f"Invalid --config-mode value: {self.config_mode}. Must be 'hydra' or 'raw'."
+            )
-        # Load configuration either from Hydra or from a run config file
-        if self.run_config_file:
-            # Validate that run config file is not used with other config options
+        # Validate that raw mode requires --config
+        if self.config_mode == "raw" and self.config is None:
+            raise ValueError(
+                "--config-mode=raw requires --config to be specified. Raw mode loads config files directly."
+            )
+        # Parse requested tasks if -t is specified
+        requested_tasks = self._parse_requested_tasks() if self.tasks else None
+        # Load configuration either from Hydra or directly from a config file
+        if self.config_mode == "raw" and self.config:
+            # Validate that raw config loading is not used with other config options
             if self.config_name != "default":
-                raise ValueError("Cannot use --run-config-file with --config-name")
+                raise ValueError(
+                    "Cannot use --config-mode=raw with --config-name. Raw mode only works with --config."
+                )
             if self.config_dir is not None:
-                raise ValueError("Cannot use --run-config-file with --config-dir")
+                raise ValueError(
+                    "Cannot use --config-mode=raw with --config-dir. Raw mode only works with --config."
+                )
             if self.override:
-                raise ValueError("Cannot use --run-config-file with --override")
+                raise ValueError(
+                    "Cannot use --config-mode=raw with --override. Raw mode only works with --config."
+                )
-            # Load from run config file
-            with open(self.run_config_file, "r") as f:
+            # Load from config file directly (bypass Hydra)
+            with open(self.config, "r") as f:
                 config_dict = yaml.safe_load(f)
             # Create RunConfig from the loaded data
             config = OmegaConf.create(config_dict)
         else:
+            # Handle --config parameter: split path into config_dir and config_name for Hydra
+            if self.config:
+                if self.config_name != "default":
+                    raise ValueError("Cannot use --config with --config-name")
+                if self.config_dir is not None:
+                    raise ValueError("Cannot use --config with --config-dir")
+                config_path = pathlib.Path(self.config)
+                config_dir = str(config_path.parent)
+                config_name = str(config_path.stem)
+            else:
+                config_dir = self.config_dir
+                config_name = self.config_name
             # Load the complete Hydra configuration
             config = RunConfig.from_hydra(
-                config_name=self.config_name,
+                config_dir=config_dir,
+                config_name=config_name,
                 hydra_overrides=self.override,
-                config_dir=self.config_dir,
+            )
+        # Apply task filtering if -t is specified
+        if requested_tasks:
+            config = filter_tasks(config, requested_tasks)
+            logger.info(
+                "Running filtered tasks",
+                count=len(config.evaluation.tasks),
+                tasks=[t.name for t in config.evaluation.tasks],
             )
         try:
@@ -150,7 +230,7 @@ class Cmd:
                 f.write("#\n")
                 f.write("# To rerun this exact configuration:\n")
                 f.write(
-                    f"# nemo-evaluator-launcher run --run-config-file {config_path}\n"
+                    f"# nemo-evaluator-launcher run --config {config_path} --config-mode=raw\n"
                 )
                 f.write("#\n")
                 f.write(config_yaml)
@@ -164,6 +244,10 @@ class Cmd:
                 bold(cyan("To check status: "))
                 + f"nemo-evaluator-launcher status {invocation_id}"
             )
+            print(
+                bold(cyan("To view job info: "))
+                + f"nemo-evaluator-launcher info {invocation_id}"
+            )
             print(
                 bold(cyan("To kill all jobs: "))
                 + f"nemo-evaluator-launcher kill {invocation_id}"
@@ -198,3 +282,17 @@ class Cmd:
                     )
                 )
             )
+        # Warn if both config_dir and config_name are provided (and config_name is not default)
+        if (
+            self.config is None
+            and self.config_dir is not None
+            and self.config_name != "default"
+        ):
+            joint_path = pathlib.Path(self.config_dir) / f"{self.config_name}.yaml"
+            print(
+                yellow(
+                    f"Warning: Using --config-dir and --config-name together is deprecated. "
+                    f"Please use --config {joint_path} instead."
+                )
+            )

nemo_evaluator_launcher/cli/version.py CHANGED Viewed

@@ -19,6 +19,29 @@ import importlib
 from dataclasses import dataclass
 from nemo_evaluator_launcher import __package_name__, __version__
+from nemo_evaluator_launcher.common.logging_utils import logger
+def get_versions() -> dict:
+    internal_module_name = "nemo_evaluator_launcher_internal"
+    res = {__package_name__: __version__}
+    # Check for internal package
+    try:
+        internal_module = importlib.import_module(internal_module_name)
+        # Try to get version from internal package
+        internal_version = getattr(internal_module, "__version__", None)
+        if internal_version:
+            res[internal_module_name] = internal_version
+        else:
+            res[internal_module_name] = "available (version unknown)"
+    except ImportError:
+        # Internal package not available - this is expected in many cases
+        pass
+    except Exception as e:
+        logger.error(f"nemo_evaluator_launcher_internal: error loading ({e})")
+        raise
+    return res
 @dataclass
@@ -27,26 +50,6 @@ class Cmd:
     def execute(self) -> None:
         """Execute the version command."""
-        print(f"{__package_name__}: {__version__}")
-        # Check for internal package
-        try:
-            internal_module = importlib.import_module(
-                "nemo_evaluator_launcher_internal"
-            )
-            # Try to get version from internal package
-            try:
-                internal_version = getattr(internal_module, "__version__", None)
-                if internal_version:
-                    print(f"nemo-evaluator-launcher-internal: {internal_version}")
-                else:
-                    print(
-                        "nemo-evaluator-launcher-internal: available (version unknown)"
-                    )
-            except Exception:
-                print("nemo-evaluator-launcher-internal: available (version unknown)")
-        except ImportError:
-            # Internal package not available - this is expected in many cases
-            pass
-        except Exception as e:
-            print(f"nemo-evaluator-launcher-internal: error loading ({e})")
+        res = get_versions()
+        for package, version in res.items():
+            print(f"{package}: {version}")

nemo-evaluator-launcher 0.1.19__py3-none-any.whl → 0.1.56__py3-none-any.whl

nemo-evaluator-launcher 0.1.19__py3-none-any.whl → 0.1.56__py3-none-any.whl