PyPI - nemo-evaluator-launcher - Versions diffs - 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl - Mend

nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

nemo_evaluator_launcher/__init__.py +15 -1
nemo_evaluator_launcher/api/functional.py +188 -27
nemo_evaluator_launcher/api/types.py +9 -0
nemo_evaluator_launcher/cli/export.py +131 -12
nemo_evaluator_launcher/cli/info.py +477 -82
nemo_evaluator_launcher/cli/kill.py +5 -3
nemo_evaluator_launcher/cli/logs.py +102 -0
nemo_evaluator_launcher/cli/ls_runs.py +31 -10
nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
nemo_evaluator_launcher/cli/main.py +101 -5
nemo_evaluator_launcher/cli/run.py +153 -30
nemo_evaluator_launcher/cli/status.py +49 -5
nemo_evaluator_launcher/cli/version.py +26 -23
nemo_evaluator_launcher/common/execdb.py +121 -27
nemo_evaluator_launcher/common/helpers.py +213 -33
nemo_evaluator_launcher/common/logging_utils.py +16 -5
nemo_evaluator_launcher/common/printing_utils.py +100 -0
nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
nemo_evaluator_launcher/executors/base.py +54 -1
nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
nemo_evaluator_launcher/executors/local/executor.py +492 -56
nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
nemo_evaluator_launcher/exporters/base.py +9 -0
nemo_evaluator_launcher/exporters/gsheets.py +27 -9
nemo_evaluator_launcher/exporters/local.py +30 -16
nemo_evaluator_launcher/exporters/mlflow.py +245 -74
nemo_evaluator_launcher/exporters/utils.py +139 -184
nemo_evaluator_launcher/exporters/wandb.py +157 -43
nemo_evaluator_launcher/package_info.py +6 -3
nemo_evaluator_launcher/resources/mapping.toml +56 -15
nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
{nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
{nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
{nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
{nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0

nemo_evaluator_launcher/cli/logs.py ADDED Viewed

@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Logs command for streaming logs from evaluation jobs."""
+import sys
+from dataclasses import dataclass
+from typing import Callable, Dict
+from simple_parsing import field
+import nemo_evaluator_launcher.common.printing_utils as pu
+from nemo_evaluator_launcher.api.functional import stream_logs
+from nemo_evaluator_launcher.common.execdb import ExecutionDB
+from nemo_evaluator_launcher.common.logging_utils import logger
+@dataclass
+class Cmd:
+    """CLI `logs` command: print log lines streamed from one or more evaluation jobs."""
+    ids: list[str] = field(
+        default_factory=list,
+        positional=True,
+        help="Invocation IDs or job IDs (e.g., '15b9f667' or '15b9f667.0'). Multiple IDs can be provided.",
+    )
+    def execute(self) -> None:
+        """Validate the requested IDs against the execution DB, then print their logs.
+        Exits with status 1 when no ID is given, when an ID is unknown, or when
+        streaming fails with an unexpected error. `ValueError` and Ctrl+C end the
+        stream silently.
+        """
+        if not self.ids:
+            logger.error("At least one ID is required")
+            sys.exit(1)
+        execution_db = ExecutionDB()
+        # Resolve every argument to concrete job IDs; abort on the first unknown one.
+        resolved_job_ids = []
+        for requested in self.ids:
+            if "." not in requested:
+                # No dot: an invocation ID, which expands to all of its jobs.
+                invocation_jobs = execution_db.get_jobs(requested)
+                if not invocation_jobs:
+                    logger.error(f"Invocation {requested} not found")
+                    sys.exit(1)
+                resolved_job_ids.extend(invocation_jobs.keys())
+                continue
+            # A dot marks a single job ID.
+            if execution_db.get_job(requested) is None:
+                logger.error(f"Job {requested} not found")
+                sys.exit(1)
+            resolved_job_ids.append(requested)
+        # Cycle through the palette so each job gets a colour by its position.
+        palette = [pu.red, pu.green, pu.yellow, pu.magenta, pu.cyan]
+        job_colors: Dict[str, Callable[[str], str]] = {
+            job_id: palette[position % len(palette)]
+            for position, job_id in enumerate(resolved_job_ids)
+        }
+        try:
+            for job_id, task_name, log_line in stream_logs(self.ids):
+                # Short prefix: first 6 chars of the invocation ID, plus ".<job number>" if present.
+                invocation_part, dot, job_part = job_id.partition(".")
+                if dot:
+                    short_prefix = f"{invocation_part[:6]}.{job_part}"
+                else:
+                    short_prefix = job_id[:6]
+                label = f"{short_prefix}:"
+                colorize = job_colors.get(job_id, pu.grey)
+                if not log_line:
+                    # Empty lines are echoed without a prefix.
+                    print()
+                    continue
+                print(f"{colorize(label)} {log_line}")
+        except (ValueError, KeyboardInterrupt):
+            # ValueError: executor doesn't support streaming (warning already logged
+            # by BaseExecutor.stream_logs). KeyboardInterrupt: clean exit on Ctrl+C.
+            pass
+        except Exception as e:
+            logger.error(f"Error streaming logs: {e}")
+            sys.exit(1)

nemo_evaluator_launcher/cli/ls_runs.py CHANGED Viewed

@@ -20,10 +20,7 @@ from typing import Optional
 from simple_parsing import field
-from nemo_evaluator_launcher.api.functional import (
-    get_invocation_benchmarks,
-    list_all_invocations_summary,
-)
+from nemo_evaluator_launcher.common.logging_utils import logger
 @dataclass
@@ -32,15 +29,25 @@ class Cmd:
     limit: Optional[int] = field(default=None, alias=["--limit"], help="Max rows")
     executor: Optional[str] = field(
-        default=None, alias=["--executor"], help="Filter by executor"
+        default=None,
+        alias=["--executor"],
+        help="Filter by executor",
     )
+    # TODO(agronskiy): think about if we can propagate a `--status` filter into here.
     since: Optional[str] = field(
         default=None,
         alias=["--since"],
-        help="Filter by ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00)",
+        help="Filter by either ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00) or "
+        "an interval into the past, e.g. `1d` or `3h`; formally `{N}[d|h]`.",
     )
     def execute(self) -> None:
+        # Import heavy dependencies only when needed
+        from nemo_evaluator_launcher.api.functional import (
+            get_invocation_benchmarks,
+            list_all_invocations_summary,
+        )
         rows = list_all_invocations_summary()
         if self.executor:
@@ -52,7 +59,22 @@ class Cmd:
         if self.since:
             try:
-                if "T" in self.since:
+                # Check if it's a relative time format like "1d" or "3h"
+                if self.since.lower().endswith("d") and len(self.since) > 1:
+                    days = int(self.since[:-1])
+                    if days < 0:
+                        raise ValueError("Days should be non-negative")
+                    since_ts = (
+                        _dt.datetime.now() - _dt.timedelta(days=days)
+                    ).timestamp()
+                elif self.since.lower().endswith("h") and len(self.since) > 1:
+                    hours = int(self.since[:-1])
+                    if hours < 0:
+                        raise ValueError("Hours should be non-negative")
+                    since_ts = (
+                        _dt.datetime.now() - _dt.timedelta(hours=hours)
+                    ).timestamp()
+                elif "T" in self.since:
                     since_ts = _dt.datetime.fromisoformat(self.since).timestamp()
                 else:
                     since_ts = _dt.datetime.fromisoformat(
@@ -60,9 +82,8 @@ class Cmd:
                     ).timestamp()
                 rows = [r for r in rows if (r.get("earliest_job_ts") or 0) >= since_ts]
             except Exception:
-                print(
-                    f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS",
-                    file=sys.stderr,
+                logger.fatal(
+                    f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or N[d|h] for N days|hours."
                 )
                 sys.exit(2)

nemo_evaluator_launcher/cli/ls_tasks.py CHANGED Viewed

@@ -13,17 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import json
+from collections import defaultdict
 from dataclasses import dataclass
-from nemo_evaluator_launcher.api.functional import get_tasks_list
+from simple_parsing import field
 @dataclass
 class Cmd:
     """List command configuration."""
+    json: bool = field(
+        default=False,
+        action="store_true",
+        help="Print output as JSON instead of table format",
+    )
     def execute(self) -> None:
+        # Import heavy dependencies only when needed
+        import json
+        from nemo_evaluator_launcher.api.functional import get_tasks_list
         # TODO(dfridman): modify `get_tasks_list` to return a list of dicts in the first place
         data = get_tasks_list()
         headers = ["task", "endpoint_type", "harness", "container"]
@@ -31,4 +42,95 @@ class Cmd:
         for task_data in data:
             assert len(task_data) == len(headers)
             supported_benchmarks.append(dict(zip(headers, task_data)))
-        print(json.dumps(supported_benchmarks, indent=2))
+        if self.json:
+            print(json.dumps({"tasks": supported_benchmarks}, indent=2))
+        else:
+            self._print_table(supported_benchmarks)
+    def _print_table(self, tasks: list[dict]) -> None:
+        """Print one plain-text table per (harness, container) pair found in `tasks`."""
+        if not tasks:
+            print("No tasks found.")
+            return
+        # Bucket tasks as harness -> container -> [task, ...]; dicts keep first-seen order.
+        by_harness = defaultdict(lambda: defaultdict(list))
+        for entry in tasks:
+            by_harness[entry["harness"]][entry["container"]].append(entry)
+        column_titles = ["task", "endpoint_type"]
+        for harness_pos, (harness, containers) in enumerate(by_harness.items()):
+            if harness_pos:
+                print()  # blank line between harnesses
+            for container_pos, (container, members) in enumerate(containers.items()):
+                if container_pos:
+                    print()  # blank line between containers of the same harness
+                # Two-column rows, ordered alphabetically by task name.
+                table_rows = sorted(
+                    ([member["task"], member["endpoint_type"]] for member in members),
+                    key=lambda cells: cells[0],
+                )
+                # Natural width of each column: longest cell or title, plus 2 of padding.
+                natural_widths = [
+                    max(len(title), max(len(str(cells[col])) for cells in table_rows)) + 2
+                    for col, title in enumerate(column_titles)
+                ]
+                # The table is as wide as the columns need, or wider if the header lines are longer.
+                harness_line = f"harness: {harness}"
+                container_line = f"container: {container}"
+                table_width = max(
+                    sum(natural_widths) + len(natural_widths) + 1,
+                    max(len(harness_line), len(container_line)) + 4,
+                )
+                # Task column takes at least two thirds of the width; endpoint_type gets the rest.
+                task_width = max(natural_widths[0], table_width * 2 // 3)
+                endpoint_width = table_width - task_width
+                double_rule = "=" * table_width
+                single_rule = "-" * table_width
+                print(double_rule)
+                print(harness_line)
+                print(container_line)
+                print(" " * table_width)  # spacer line made of spaces
+                print(f"{column_titles[0]:<{task_width}}{column_titles[1]:<{endpoint_width}}")
+                print(single_rule)
+                for cells in table_rows:
+                    print(f"{str(cells[0]):<{task_width}}{str(cells[1]):<{endpoint_width}}")
+                print(single_rule)
+                total = len(table_rows)
+                print(f"  {total} task{'s' if total != 1 else ''} available")
+                print(double_rule)
+                print()

nemo_evaluator_launcher/cli/main.py CHANGED Viewed

@@ -15,10 +15,14 @@
 #
 """Main CLI module using simple-parsing with subcommands."""
+import os
 from simple_parsing import ArgumentParser
 import nemo_evaluator_launcher.cli.export as export
+import nemo_evaluator_launcher.cli.info as info
 import nemo_evaluator_launcher.cli.kill as kill
+import nemo_evaluator_launcher.cli.logs as logs
 import nemo_evaluator_launcher.cli.ls_runs as ls_runs
 import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
 import nemo_evaluator_launcher.cli.run as run
@@ -29,6 +33,32 @@ from nemo_evaluator_launcher.common.logging_utils import logger
 VERSION_HELP = "Show version information"
+def is_verbose_enabled(args) -> bool:
+    """Return True when `verbose` is truthy on `args` or on any known subcommand attribute."""
+    # Top-level flag; a missing attribute counts as "not verbose".
+    if getattr(args, "verbose", False):
+        return True
+    # Attribute names on `args` that may hold a subcommand object with its own `verbose`.
+    subcommand_dests = (
+        "run",
+        "status",
+        "logs",
+        "info",
+        "kill",
+        "tasks_alias",
+        "tasks",
+        "runs",
+        "export",
+    )
+    return any(
+        getattr(getattr(args, dest, None), "verbose", False)
+        for dest in subcommand_dests
+    )
 def create_parser() -> ArgumentParser:
     """Create and configure the CLI argument parser with subcommands."""
     parser = ArgumentParser()
@@ -36,6 +66,14 @@ def create_parser() -> ArgumentParser:
     # Add --version flag at the top level
     parser.add_argument("--version", action="store_true", help=VERSION_HELP)
+    # Add --verbose/-v flag for debug logging
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
+    )
     subparsers = parser.add_subparsers(dest="command", required=False)
     # Version subcommand
@@ -50,26 +88,61 @@ def create_parser() -> ArgumentParser:
     run_parser = subparsers.add_parser(
         "run", help="Run evaluation", description="Run evaluation"
     )
+    run_parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
+    )
     run_parser.add_arguments(run.Cmd, dest="run")
     # Status subcommand
     status_parser = subparsers.add_parser(
         "status", help="Check job status", description="Check job status"
     )
+    status_parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
+    )
     status_parser.add_arguments(status.Cmd, dest="status")
+    # Logs subcommand
+    logs_parser = subparsers.add_parser(
+        "logs",
+        help="Stream logs from evaluation jobs",
+        description="Stream logs from evaluation jobs by invocation ID or job ID",
+    )
+    logs_parser.add_arguments(logs.Cmd, dest="logs")
     # Kill subcommand
     kill_parser = subparsers.add_parser(
         "kill",
         help="Kill a job or invocation",
         description="Kill a job (e.g., aefc4819.0) or entire invocation (e.g., aefc4819) by its ID",
     )
+    kill_parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
+    )
     kill_parser.add_arguments(kill.Cmd, dest="kill")
     # Ls subcommand (with nested subcommands)
     ls_parser = subparsers.add_parser(
         "ls", help="List resources", description="List tasks or runs"
     )
+    ls_parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
+    )
+    # Add arguments from `ls tasks` so that they work with `ls` as default alias
+    ls_parser.add_arguments(ls_tasks.Cmd, dest="tasks_alias")
     ls_sub = ls_parser.add_subparsers(dest="ls_command", required=False)
     # ls tasks (default)
@@ -92,8 +165,25 @@ def create_parser() -> ArgumentParser:
         help="Export evaluation results",
         description="Export evaluation results takes a List of invocation ids and a list of destinations(local, gitlab, wandb)",
     )
+    export_parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
+    )
     export_parser.add_arguments(export.ExportCmd, dest="export")
+    # Info subcommand
+    info_parser = subparsers.add_parser(
+        "info",
+        help="Display evaluation job information",
+        description="Info functionalities for nemo-evaluator-launcher",
+    )
+    info_parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Enable verbose logging"
+    )
+    info_parser.add_arguments(info.InfoCmd, dest="info")
     return parser
@@ -102,6 +192,10 @@ def main() -> None:
     parser = create_parser()
     args = parser.parse_args()
+    # Handle --verbose flag
+    if is_verbose_enabled(args):
+        os.environ["LOG_LEVEL"] = "DEBUG"
     # Handle --version flag
     if hasattr(args, "version") and args.version:
         version_cmd = version.Cmd()
@@ -120,22 +214,24 @@ def main() -> None:
         args.run.execute()
     elif args.command == "status":
         args.status.execute()
+    elif args.command == "logs":
+        args.logs.execute()
     elif args.command == "kill":
         args.kill.execute()
     elif args.command == "ls":
         # Dispatch nested ls subcommands
         if args.ls_command is None or args.ls_command == "tasks":
             # Default to tasks when no subcommand specified
-            if hasattr(args, "tasks"):
-                args.tasks.execute()
+            if hasattr(args, "tasks_alias"):
+                args.tasks_alias.execute()
             else:
-                # Create default tasks command if not specified
-                tasks_cmd = ls_tasks.Cmd()
-                tasks_cmd.execute()
+                args.tasks.execute()
         elif args.ls_command == "runs":
             args.runs.execute()
     elif args.command == "export":
         args.export.execute()
+    elif args.command == "info":
+        args.info.execute()
 if __name__ == "__main__":

nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl