nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,102 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Logs command for streaming logs from evaluation jobs."""
17
+
18
+ import sys
19
+ from dataclasses import dataclass
20
+ from typing import Callable, Dict
21
+
22
+ from simple_parsing import field
23
+
24
+ import nemo_evaluator_launcher.common.printing_utils as pu
25
+ from nemo_evaluator_launcher.api.functional import stream_logs
26
+ from nemo_evaluator_launcher.common.execdb import ExecutionDB
27
+ from nemo_evaluator_launcher.common.logging_utils import logger
28
+
29
+
30
@dataclass
class Cmd:
    """CLI arguments and entry point for the ``logs`` subcommand."""

    # Positional list of invocation IDs and/or job IDs whose logs are streamed.
    ids: list[str] = field(
        default_factory=list,
        positional=True,
        help="Invocation IDs or job IDs (e.g., '15b9f667' or '15b9f667.0'). Multiple IDs can be provided.",
    )

    def execute(self) -> None:
        """Validate the requested IDs, then print their logs as they stream in.

        An ID containing a "." is looked up as a single job; any other ID is
        looked up as an invocation and expanded to all of its jobs. Each
        non-empty log line is printed behind a short, colored
        ``<first 6 chars of invocation>.<job number>:`` prefix; empty log
        lines are printed as bare blank lines.

        Exits the process with status 1 when no ID was given, when an ID is
        not found in the execution database, or when streaming raises an
        unexpected exception. ``ValueError`` and ``KeyboardInterrupt`` end
        the stream silently.
        """
        if not self.ids:
            logger.error("At least one ID is required")
            sys.exit(1)

        execution_db = ExecutionDB()

        # Resolve every argument to concrete job IDs, failing fast on the
        # first one that the database does not know about.
        resolved_job_ids = []
        for ident in self.ids:
            if "." not in ident:
                # No dot: treat as an invocation ID and take all of its jobs.
                invocation_jobs = execution_db.get_jobs(ident)
                if not invocation_jobs:
                    logger.error(f"Invocation {ident} not found")
                    sys.exit(1)
                resolved_job_ids.extend(invocation_jobs.keys())
                continue

            # Dotted form: treat as a single job ID.
            if execution_db.get_job(ident) is None:
                logger.error(f"Job {ident} not found")
                sys.exit(1)
            resolved_job_ids.append(ident)

        # Assign colors round-robin so neighbouring jobs are distinguishable.
        palette = [pu.red, pu.green, pu.yellow, pu.magenta, pu.cyan]
        job_colors: Dict[str, Callable[[str], str]] = {
            job_id: palette[position % len(palette)]
            for position, job_id in enumerate(resolved_job_ids)
        }

        try:
            for job_id, _task_name, log_line in stream_logs(self.ids):
                # Shorten the invocation part to 6 chars, keep the job number.
                invocation_part, dot, job_number = job_id.partition(".")
                short_prefix = (
                    f"{invocation_part[:6]}.{job_number}" if dot else job_id[:6]
                )
                # Jobs that were not resolved above fall back to grey.
                colorize = job_colors.get(job_id, pu.grey)
                if not log_line:
                    # Empty lines are echoed without any prefix.
                    print()
                    continue
                print(f"{colorize(short_prefix + ':')} {log_line}")
        except (ValueError, KeyboardInterrupt):
            # ValueError: per the original note, raised when the executor does
            # not support streaming (the executor is expected to have logged a
            # warning already). KeyboardInterrupt: clean exit on Ctrl+C.
            pass
        except Exception as exc:
            logger.error(f"Error streaming logs: {exc}")
            sys.exit(1)
@@ -20,10 +20,7 @@ from typing import Optional
20
20
 
21
21
  from simple_parsing import field
22
22
 
23
- from nemo_evaluator_launcher.api.functional import (
24
- get_invocation_benchmarks,
25
- list_all_invocations_summary,
26
- )
23
+ from nemo_evaluator_launcher.common.logging_utils import logger
27
24
 
28
25
 
29
26
  @dataclass
@@ -32,15 +29,25 @@ class Cmd:
32
29
 
33
30
  limit: Optional[int] = field(default=None, alias=["--limit"], help="Max rows")
34
31
  executor: Optional[str] = field(
35
- default=None, alias=["--executor"], help="Filter by executor"
32
+ default=None,
33
+ alias=["--executor"],
34
+ help="Filter by executor",
36
35
  )
36
+ # TODO(agronskiy): think about if we can propagate a `--status` filter into here.
37
37
  since: Optional[str] = field(
38
38
  default=None,
39
39
  alias=["--since"],
40
- help="Filter by ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00)",
40
+ help="Filter by either ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00) or "
41
+ "an interval into the past, e.g. `1d` or `3h`; formally `{N}[d|h]`.",
41
42
  )
42
43
 
43
44
  def execute(self) -> None:
45
+ # Import heavy dependencies only when needed
46
+ from nemo_evaluator_launcher.api.functional import (
47
+ get_invocation_benchmarks,
48
+ list_all_invocations_summary,
49
+ )
50
+
44
51
  rows = list_all_invocations_summary()
45
52
 
46
53
  if self.executor:
@@ -52,7 +59,22 @@ class Cmd:
52
59
 
53
60
  if self.since:
54
61
  try:
55
- if "T" in self.since:
62
+ # Check if it's a relative time format like "1d" or "3h"
63
+ if self.since.lower().endswith("d") and len(self.since) > 1:
64
+ days = int(self.since[:-1])
65
+ if days < 0:
66
+ raise ValueError("Days should be non-negative")
67
+ since_ts = (
68
+ _dt.datetime.now() - _dt.timedelta(days=days)
69
+ ).timestamp()
70
+ elif self.since.lower().endswith("h") and len(self.since) > 1:
71
+ hours = int(self.since[:-1])
72
+ if hours < 0:
73
+ raise ValueError("Hours should be non-negative")
74
+ since_ts = (
75
+ _dt.datetime.now() - _dt.timedelta(hours=hours)
76
+ ).timestamp()
77
+ elif "T" in self.since:
56
78
  since_ts = _dt.datetime.fromisoformat(self.since).timestamp()
57
79
  else:
58
80
  since_ts = _dt.datetime.fromisoformat(
@@ -60,9 +82,8 @@ class Cmd:
60
82
  ).timestamp()
61
83
  rows = [r for r in rows if (r.get("earliest_job_ts") or 0) >= since_ts]
62
84
  except Exception:
63
- print(
64
- f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS",
65
- file=sys.stderr,
85
+ logger.fatal(
86
+ f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or N[d|h] for N days|hours."
66
87
  )
67
88
  sys.exit(2)
68
89
 
@@ -13,17 +13,28 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
  #
16
- import json
16
+ from collections import defaultdict
17
17
  from dataclasses import dataclass
18
18
 
19
- from nemo_evaluator_launcher.api.functional import get_tasks_list
19
+ from simple_parsing import field
20
20
 
21
21
 
22
22
  @dataclass
23
23
  class Cmd:
24
24
  """List command configuration."""
25
25
 
26
+ json: bool = field(
27
+ default=False,
28
+ action="store_true",
29
+ help="Print output as JSON instead of table format",
30
+ )
31
+
26
32
  def execute(self) -> None:
33
+ # Import heavy dependencies only when needed
34
+ import json
35
+
36
+ from nemo_evaluator_launcher.api.functional import get_tasks_list
37
+
27
38
  # TODO(dfridman): modify `get_tasks_list` to return a list of dicts in the first place
28
39
  data = get_tasks_list()
29
40
  headers = ["task", "endpoint_type", "harness", "container"]
@@ -31,4 +42,95 @@ class Cmd:
31
42
  for task_data in data:
32
43
  assert len(task_data) == len(headers)
33
44
  supported_benchmarks.append(dict(zip(headers, task_data)))
34
- print(json.dumps(supported_benchmarks, indent=2))
45
+
46
+ if self.json:
47
+ print(json.dumps({"tasks": supported_benchmarks}, indent=2))
48
+ else:
49
+ self._print_table(supported_benchmarks)
50
+
51
+ def _print_table(self, tasks: list[dict]) -> None:
52
+ """Print tasks grouped by harness and container in table format."""
53
+ if not tasks:  # nothing to group: report it and stop
54
+ print("No tasks found.")
55
+ return
56
+
57
+ # Group tasks into a two-level mapping: harness -> container -> list of task dicts
58
+ grouped = defaultdict(lambda: defaultdict(list))
59
+ for task in tasks:  # each task dict is read via the keys "harness", "container", "task", "endpoint_type"
60
+ harness = task["harness"]
61
+ container = task["container"]
62
+ grouped[harness][container].append(task)
63
+
64
+ # Print one table per (harness, container) pair, in grouping order
65
+ for i, (harness, containers) in enumerate(grouped.items()):
66
+ if i > 0:
67
+ print() # Extra spacing between harnesses
68
+
69
+ for j, (container, container_tasks) in enumerate(containers.items()):
70
+ if j > 0:
71
+ print() # Spacing between containers
72
+
73
+ # Prepare task table first to get column widths
74
+ task_headers = ["task", "endpoint_type"]
75
+ rows = []
76
+ for task in container_tasks:
77
+ rows.append([task["task"], task["endpoint_type"]])
78
+
79
+ # Sort rows by task name (first column) for better readability
80
+ rows.sort(key=lambda x: x[0])
81
+
82
+ # Per column: the wider of the header text and the longest cell text, plus 2 characters of padding
83
+ widths = [
84
+ max(len(task_headers[i]), max(len(str(row[i])) for row in rows)) + 2
85
+ for i in range(len(task_headers))  # this i is local to the comprehension; the enumerate index above is not affected
86
+ ]
87
+
88
+ # Minimum table width: the column widths plus (number of columns + 1) extra characters
89
+ min_table_width = sum(widths) + len(widths) + 1
90
+
91
+ # Calculate required width for the "harness: ..." / "container: ..." header lines
92
+ harness_line = f"harness: {harness}"
93
+ container_line = f"container: {container}"
94
+ header_content_width = (
95
+ max(len(harness_line), len(container_line)) + 4
96
+ ) # +4 characters of slack; the print calls in this method emit no "|" border characters
97
+
98
+ # Use the larger of the two widths
99
+ table_width = max(min_table_width, header_content_width)
100
+
101
+ # Print a "=" rule followed by the harness and container lines
102
+ print("=" * table_width)
103
+ print(f"{harness_line}")
104
+ print(f"{container_line}")
105
+
106
+ # Adjust column widths to fill the full table width
107
+ available_width = table_width
108
+ # Give more space to the first column (task names can be long)
109
+ adjusted_widths = [
110
+ max(
111
+ widths[0], available_width * 2 // 3
112
+ ), # at least 2/3 of the table width, and never narrower than the measured task column
113
+ 0, # Placeholder; overwritten with the remainder just below
114
+ ]
115
+ adjusted_widths[1] = (
116
+ available_width - adjusted_widths[0]
117
+ ) # Remainder for endpoint_type
118
+
119
+ # Print a whitespace-only spacer line, then the column header row and a "-" rule
120
+ print(" " * table_width)
121
+ header_row = f"{task_headers[0]:<{adjusted_widths[0]}}{task_headers[1]:<{adjusted_widths[1]}}"
122
+ print(header_row)
123
+ print("-" * table_width)
124
+
125
+ # Print task rows, left-aligned to the adjusted column widths
126
+ for row in rows:
127
+ data_row = f"{str(row[0]):<{adjusted_widths[0]}}{str(row[1]):<{adjusted_widths[1]}}"
128
+ print(data_row)
129
+
130
+ print("-" * table_width)
131
+ # Show task count (pluralized unless exactly one task)
132
+ task_count = len(rows)
133
+ print(f" {task_count} task{'s' if task_count != 1 else ''} available")
134
+ print("=" * table_width)
135
+
136
+ print()  # NOTE(review): indentation is lost in this diff rendering, so it is unclear which loop level (if any) this trailing blank line belongs to - confirm against the source file
@@ -15,10 +15,14 @@
15
15
  #
16
16
  """Main CLI module using simple-parsing with subcommands."""
17
17
 
18
+ import os
19
+
18
20
  from simple_parsing import ArgumentParser
19
21
 
20
22
  import nemo_evaluator_launcher.cli.export as export
23
+ import nemo_evaluator_launcher.cli.info as info
21
24
  import nemo_evaluator_launcher.cli.kill as kill
25
+ import nemo_evaluator_launcher.cli.logs as logs
22
26
  import nemo_evaluator_launcher.cli.ls_runs as ls_runs
23
27
  import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
24
28
  import nemo_evaluator_launcher.cli.run as run
@@ -29,6 +33,32 @@ from nemo_evaluator_launcher.common.logging_utils import logger
29
33
  VERSION_HELP = "Show version information"
30
34
 
31
35
 
36
def is_verbose_enabled(args) -> bool:
    """Report whether verbose logging was requested anywhere in ``args``.

    Args:
        args: Parsed argument namespace. Missing attributes are tolerated.

    Returns:
        True when ``args.verbose`` is truthy, or when any of the known
        per-subcommand attributes on ``args`` holds an object whose
        ``verbose`` attribute is truthy; False otherwise.
    """
    # The top-level flag wins without looking at any subcommand object.
    if getattr(args, "verbose", False):
        return True

    # Attribute names under which subcommand objects may be stored on args.
    subcommand_dests = (
        "run",
        "status",
        "logs",
        "info",
        "kill",
        "tasks_alias",
        "tasks",
        "runs",
        "export",
    )
    # A missing subcommand attribute resolves to None, which has no
    # `verbose`, so the inner lookup falls back to False.
    return any(
        getattr(getattr(args, dest, None), "verbose", False)
        for dest in subcommand_dests
    )
60
+
61
+
32
62
  def create_parser() -> ArgumentParser:
33
63
  """Create and configure the CLI argument parser with subcommands."""
34
64
  parser = ArgumentParser()
@@ -36,6 +66,14 @@ def create_parser() -> ArgumentParser:
36
66
  # Add --version flag at the top level
37
67
  parser.add_argument("--version", action="store_true", help=VERSION_HELP)
38
68
 
69
+ # Add --verbose/-v flag for debug logging
70
+ parser.add_argument(
71
+ "-v",
72
+ "--verbose",
73
+ action="store_true",
74
+ help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
75
+ )
76
+
39
77
  subparsers = parser.add_subparsers(dest="command", required=False)
40
78
 
41
79
  # Version subcommand
@@ -50,26 +88,61 @@ def create_parser() -> ArgumentParser:
50
88
  run_parser = subparsers.add_parser(
51
89
  "run", help="Run evaluation", description="Run evaluation"
52
90
  )
91
+ run_parser.add_argument(
92
+ "-v",
93
+ "--verbose",
94
+ action="store_true",
95
+ help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
96
+ )
53
97
  run_parser.add_arguments(run.Cmd, dest="run")
54
98
 
55
99
  # Status subcommand
56
100
  status_parser = subparsers.add_parser(
57
101
  "status", help="Check job status", description="Check job status"
58
102
  )
103
+ status_parser.add_argument(
104
+ "-v",
105
+ "--verbose",
106
+ action="store_true",
107
+ help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
108
+ )
59
109
  status_parser.add_arguments(status.Cmd, dest="status")
60
110
 
111
+ # Logs subcommand
112
+ logs_parser = subparsers.add_parser(
113
+ "logs",
114
+ help="Stream logs from evaluation jobs",
115
+ description="Stream logs from evaluation jobs by invocation ID or job ID",
116
+ )
117
+ logs_parser.add_arguments(logs.Cmd, dest="logs")
118
+
61
119
  # Kill subcommand
62
120
  kill_parser = subparsers.add_parser(
63
121
  "kill",
64
122
  help="Kill a job or invocation",
65
123
  description="Kill a job (e.g., aefc4819.0) or entire invocation (e.g., aefc4819) by its ID",
66
124
  )
125
+ kill_parser.add_argument(
126
+ "-v",
127
+ "--verbose",
128
+ action="store_true",
129
+ help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
130
+ )
67
131
  kill_parser.add_arguments(kill.Cmd, dest="kill")
68
132
 
69
133
  # Ls subcommand (with nested subcommands)
70
134
  ls_parser = subparsers.add_parser(
71
135
  "ls", help="List resources", description="List tasks or runs"
72
136
  )
137
+ ls_parser.add_argument(
138
+ "-v",
139
+ "--verbose",
140
+ action="store_true",
141
+ help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
142
+ )
143
+ # Add arguments from `ls tasks` so that they work with `ls` as default alias
144
+ ls_parser.add_arguments(ls_tasks.Cmd, dest="tasks_alias")
145
+
73
146
  ls_sub = ls_parser.add_subparsers(dest="ls_command", required=False)
74
147
 
75
148
  # ls tasks (default)
@@ -92,8 +165,25 @@ def create_parser() -> ArgumentParser:
92
165
  help="Export evaluation results",
93
166
  description="Export evaluation results takes a List of invocation ids and a list of destinations(local, gitlab, wandb)",
94
167
  )
168
+ export_parser.add_argument(
169
+ "-v",
170
+ "--verbose",
171
+ action="store_true",
172
+ help="Enable verbose logging (sets LOG_LEVEL=DEBUG)",
173
+ )
95
174
  export_parser.add_arguments(export.ExportCmd, dest="export")
96
175
 
176
+ # Info subcommand
177
+ info_parser = subparsers.add_parser(
178
+ "info",
179
+ help="Display evaluation job information",
180
+ description="Info functionalities for nemo-evaluator-launcher",
181
+ )
182
+ info_parser.add_argument(
183
+ "-v", "--verbose", action="store_true", help="Enable verbose logging"
184
+ )
185
+ info_parser.add_arguments(info.InfoCmd, dest="info")
186
+
97
187
  return parser
98
188
 
99
189
 
@@ -102,6 +192,10 @@ def main() -> None:
102
192
  parser = create_parser()
103
193
  args = parser.parse_args()
104
194
 
195
+ # Handle --verbose flag
196
+ if is_verbose_enabled(args):
197
+ os.environ["LOG_LEVEL"] = "DEBUG"
198
+
105
199
  # Handle --version flag
106
200
  if hasattr(args, "version") and args.version:
107
201
  version_cmd = version.Cmd()
@@ -120,22 +214,24 @@ def main() -> None:
120
214
  args.run.execute()
121
215
  elif args.command == "status":
122
216
  args.status.execute()
217
+ elif args.command == "logs":
218
+ args.logs.execute()
123
219
  elif args.command == "kill":
124
220
  args.kill.execute()
125
221
  elif args.command == "ls":
126
222
  # Dispatch nested ls subcommands
127
223
  if args.ls_command is None or args.ls_command == "tasks":
128
224
  # Default to tasks when no subcommand specified
129
- if hasattr(args, "tasks"):
130
- args.tasks.execute()
225
+ if hasattr(args, "tasks_alias"):
226
+ args.tasks_alias.execute()
131
227
  else:
132
- # Create default tasks command if not specified
133
- tasks_cmd = ls_tasks.Cmd()
134
- tasks_cmd.execute()
228
+ args.tasks.execute()
135
229
  elif args.ls_command == "runs":
136
230
  args.runs.execute()
137
231
  elif args.command == "export":
138
232
  args.export.execute()
233
+ elif args.command == "info":
234
+ args.info.execute()
139
235
 
140
236
 
141
237
  if __name__ == "__main__":