nemo-evaluator-launcher 0.1.19__py3-none-any.whl → 0.1.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/api/functional.py +159 -5
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_task.py +280 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +208 -55
- nemo_evaluator_launcher/cli/main.py +29 -2
- nemo_evaluator_launcher/cli/run.py +114 -16
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/container_metadata/__init__.py +61 -0
- nemo_evaluator_launcher/common/container_metadata/intermediate_repr.py +530 -0
- nemo_evaluator_launcher/common/container_metadata/loading.py +1126 -0
- nemo_evaluator_launcher/common/container_metadata/registries.py +824 -0
- nemo_evaluator_launcher/common/container_metadata/utils.py +63 -0
- nemo_evaluator_launcher/common/helpers.py +200 -51
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/mapping.py +341 -155
- nemo_evaluator_launcher/common/printing_utils.py +25 -12
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +2 -3
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -1
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +14 -0
- nemo_evaluator_launcher/executors/base.py +31 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +36 -1
- nemo_evaluator_launcher/executors/lepton/executor.py +107 -9
- nemo_evaluator_launcher/executors/local/executor.py +383 -24
- nemo_evaluator_launcher/executors/local/run.template.sh +54 -2
- nemo_evaluator_launcher/executors/slurm/executor.py +559 -64
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/utils.py +32 -46
- nemo_evaluator_launcher/package_info.py +1 -1
- nemo_evaluator_launcher/resources/all_tasks_irs.yaml +17016 -0
- nemo_evaluator_launcher/resources/mapping.toml +64 -315
- {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/METADATA +4 -3
- nemo_evaluator_launcher-0.1.56.dist-info/RECORD +69 -0
- {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.19.dist-info/RECORD +0 -60
- {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.19.dist-info → nemo_evaluator_launcher-0.1.56.dist-info}/top_level.txt +0 -0
|
@@ -18,6 +18,13 @@ from dataclasses import dataclass
|
|
|
18
18
|
|
|
19
19
|
from simple_parsing import field
|
|
20
20
|
|
|
21
|
+
from nemo_evaluator_launcher.common.printing_utils import (
|
|
22
|
+
bold,
|
|
23
|
+
cyan,
|
|
24
|
+
grey,
|
|
25
|
+
magenta,
|
|
26
|
+
)
|
|
27
|
+
|
|
21
28
|
|
|
22
29
|
@dataclass
|
|
23
30
|
class Cmd:
|
|
@@ -28,20 +35,101 @@ class Cmd:
|
|
|
28
35
|
action="store_true",
|
|
29
36
|
help="Print output as JSON instead of table format",
|
|
30
37
|
)
|
|
38
|
+
from_container: str = field(
|
|
39
|
+
default="",
|
|
40
|
+
help="Load tasks from container image (e.g., nvcr.io/nvidia/eval-factory/simple-evals:25.10). "
|
|
41
|
+
"If provided, extracts framework.yml from container and lists tasks on-the-fly instead of using mapping.toml",
|
|
42
|
+
)
|
|
31
43
|
|
|
32
44
|
def execute(self) -> None:
|
|
33
45
|
# Import heavy dependencies only when needed
|
|
34
46
|
import json
|
|
35
47
|
|
|
36
|
-
|
|
48
|
+
if self.from_container:
|
|
49
|
+
# Load tasks from container
|
|
50
|
+
from nemo_evaluator_launcher.common.container_metadata import (
|
|
51
|
+
load_tasks_from_container,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
try:
|
|
55
|
+
tasks = load_tasks_from_container(self.from_container)
|
|
56
|
+
except ValueError as e:
|
|
57
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
58
|
+
|
|
59
|
+
logger.error(
|
|
60
|
+
"Failed to load tasks from container",
|
|
61
|
+
container=self.from_container,
|
|
62
|
+
error=str(e),
|
|
63
|
+
)
|
|
64
|
+
return
|
|
65
|
+
except Exception as e:
|
|
66
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
67
|
+
|
|
68
|
+
logger.error(
|
|
69
|
+
"Failed to load tasks from container",
|
|
70
|
+
container=self.from_container,
|
|
71
|
+
error=str(e),
|
|
72
|
+
exc_info=True,
|
|
73
|
+
)
|
|
74
|
+
return
|
|
75
|
+
|
|
76
|
+
if not tasks:
|
|
77
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
78
|
+
|
|
79
|
+
logger.error(
|
|
80
|
+
"No tasks found in container",
|
|
81
|
+
container=self.from_container,
|
|
82
|
+
)
|
|
83
|
+
return
|
|
37
84
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
85
|
+
# Convert TaskIntermediateRepresentation to format expected by get_tasks_list()
|
|
86
|
+
# Build data structure matching get_tasks_list() output format
|
|
87
|
+
data = []
|
|
88
|
+
for task in tasks:
|
|
89
|
+
# Extract endpoint types from defaults
|
|
90
|
+
endpoint_types = (
|
|
91
|
+
task.defaults.get("target", {})
|
|
92
|
+
.get("api_endpoint", {})
|
|
93
|
+
.get("type", "chat")
|
|
94
|
+
)
|
|
95
|
+
if isinstance(endpoint_types, str):
|
|
96
|
+
endpoint_types = [endpoint_types]
|
|
97
|
+
|
|
98
|
+
data.append(
|
|
99
|
+
[
|
|
100
|
+
task.name, # task
|
|
101
|
+
",".join(endpoint_types)
|
|
102
|
+
if isinstance(endpoint_types, list)
|
|
103
|
+
else endpoint_types, # endpoint_type
|
|
104
|
+
task.harness, # harness
|
|
105
|
+
task.container, # container
|
|
106
|
+
getattr(task, "container_arch", "") or "", # arch
|
|
107
|
+
task.description, # description
|
|
108
|
+
]
|
|
109
|
+
)
|
|
110
|
+
else:
|
|
111
|
+
# Default behavior: load from mapping.toml via get_tasks_list()
|
|
112
|
+
from nemo_evaluator_launcher.api.functional import get_tasks_list
|
|
113
|
+
|
|
114
|
+
# TODO(dfridman): modify `get_tasks_list` to return a list of dicts in the first place
|
|
115
|
+
data = get_tasks_list()
|
|
116
|
+
|
|
117
|
+
headers = [
|
|
118
|
+
"task",
|
|
119
|
+
"endpoint_type",
|
|
120
|
+
"harness",
|
|
121
|
+
"container",
|
|
122
|
+
"arch",
|
|
123
|
+
"description",
|
|
124
|
+
]
|
|
41
125
|
supported_benchmarks = []
|
|
42
126
|
for task_data in data:
|
|
43
|
-
|
|
44
|
-
|
|
127
|
+
if len(task_data) < len(headers):
|
|
128
|
+
raise ValueError(
|
|
129
|
+
f"Invalid task row shape: expected at least {len(headers)} columns, got {len(task_data)}"
|
|
130
|
+
)
|
|
131
|
+
# Backwards/forwards compat: allow extra columns and ignore them.
|
|
132
|
+
supported_benchmarks.append(dict(zip(headers, task_data[: len(headers)])))
|
|
45
133
|
|
|
46
134
|
if self.json:
|
|
47
135
|
print(json.dumps({"tasks": supported_benchmarks}, indent=2))
|
|
@@ -49,11 +137,55 @@ class Cmd:
|
|
|
49
137
|
self._print_table(supported_benchmarks)
|
|
50
138
|
|
|
51
139
|
def _print_table(self, tasks: list[dict]) -> None:
|
|
52
|
-
"""Print tasks grouped by harness and container in table format."""
|
|
140
|
+
"""Print tasks grouped by harness and container in table format with colorized output."""
|
|
53
141
|
if not tasks:
|
|
54
142
|
print("No tasks found.")
|
|
55
143
|
return
|
|
56
144
|
|
|
145
|
+
def _truncate(s: str, max_len: int) -> str:
|
|
146
|
+
s = s or ""
|
|
147
|
+
if max_len <= 0:
|
|
148
|
+
return ""
|
|
149
|
+
if len(s) <= max_len:
|
|
150
|
+
return s
|
|
151
|
+
if max_len <= 3:
|
|
152
|
+
return s[:max_len]
|
|
153
|
+
return s[: max_len - 3] + "..."
|
|
154
|
+
|
|
155
|
+
def _infer_arch(container: str, container_tasks: list[dict]) -> str:
|
|
156
|
+
# Prefer explicit arch from task IRs.
|
|
157
|
+
for t in container_tasks:
|
|
158
|
+
a = (t.get("arch") or "").strip()
|
|
159
|
+
if a:
|
|
160
|
+
return a
|
|
161
|
+
|
|
162
|
+
# Heuristic fallback: look for common suffixes in tag.
|
|
163
|
+
c = (container or "").lower()
|
|
164
|
+
if "arm64" in c or "aarch64" in c:
|
|
165
|
+
return "arm"
|
|
166
|
+
if "amd64" in c or "x86_64" in c:
|
|
167
|
+
return "amd"
|
|
168
|
+
return "unknown"
|
|
169
|
+
|
|
170
|
+
def _infer_registry(container: str) -> str:
|
|
171
|
+
try:
|
|
172
|
+
from nemo_evaluator_launcher.common.container_metadata.utils import (
|
|
173
|
+
parse_container_image,
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
registry_type, _registry_url, _repo, _ref = parse_container_image(
|
|
177
|
+
container
|
|
178
|
+
)
|
|
179
|
+
return str(registry_type)
|
|
180
|
+
except Exception:
|
|
181
|
+
# Best-effort fallback for unknown formats.
|
|
182
|
+
c = (container or "").lower()
|
|
183
|
+
if "nvcr.io/" in c or c.startswith("nvcr.io"):
|
|
184
|
+
return "nvcr"
|
|
185
|
+
if "gitlab" in c:
|
|
186
|
+
return "gitlab"
|
|
187
|
+
return ""
|
|
188
|
+
|
|
57
189
|
# Group tasks by harness and container
|
|
58
190
|
grouped = defaultdict(lambda: defaultdict(list))
|
|
59
191
|
for task in tasks:
|
|
@@ -70,67 +202,88 @@ class Cmd:
|
|
|
70
202
|
if j > 0:
|
|
71
203
|
print() # Spacing between containers
|
|
72
204
|
|
|
73
|
-
# Prepare task table first to get column widths
|
|
74
|
-
task_headers = ["task", "endpoint_type"]
|
|
75
205
|
rows = []
|
|
76
206
|
for task in container_tasks:
|
|
77
|
-
rows.append(
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
for i in range(len(task_headers))
|
|
86
|
-
]
|
|
87
|
-
|
|
88
|
-
# Calculate minimum table width based on task columns
|
|
89
|
-
min_table_width = sum(widths) + len(widths) + 1
|
|
207
|
+
rows.append(
|
|
208
|
+
{
|
|
209
|
+
"task": str(task.get("task", "")),
|
|
210
|
+
"endpoint": str(task.get("endpoint_type", "")),
|
|
211
|
+
"description": str(task.get("description", "")),
|
|
212
|
+
}
|
|
213
|
+
)
|
|
214
|
+
rows.sort(key=lambda r: r["task"].lower())
|
|
90
215
|
|
|
91
216
|
# Calculate required width for header content
|
|
92
217
|
harness_line = f"harness: {harness}"
|
|
93
218
|
container_line = f"container: {container}"
|
|
219
|
+
arch_line = f"arch: {_infer_arch(container, container_tasks)}"
|
|
220
|
+
registry_line = f"registry: {_infer_registry(container)}"
|
|
94
221
|
header_content_width = (
|
|
95
|
-
max(
|
|
222
|
+
max(
|
|
223
|
+
len(harness_line),
|
|
224
|
+
len(container_line),
|
|
225
|
+
len(arch_line),
|
|
226
|
+
len(registry_line),
|
|
227
|
+
)
|
|
228
|
+
+ 4
|
|
96
229
|
) # +4 for "| " and " |"
|
|
97
230
|
|
|
98
|
-
#
|
|
99
|
-
|
|
231
|
+
# Limit separator width to prevent overflow on small terminals
|
|
232
|
+
# Use terminal width if available, otherwise cap at 120 characters
|
|
233
|
+
import shutil
|
|
100
234
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
235
|
+
try:
|
|
236
|
+
terminal_width = shutil.get_terminal_size().columns
|
|
237
|
+
separator_width = min(terminal_width - 2, 160) # -2 safety margin
|
|
238
|
+
except Exception:
|
|
239
|
+
# Fallback if terminal size can't be determined
|
|
240
|
+
separator_width = 120
|
|
105
241
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
#
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
242
|
+
separator_width = max(separator_width, min(header_content_width, 160))
|
|
243
|
+
|
|
244
|
+
# Table columns (keep compact and stable).
|
|
245
|
+
col_task = 36
|
|
246
|
+
col_endpoint = 14
|
|
247
|
+
sep = " "
|
|
248
|
+
fixed = col_task + col_endpoint + len(sep) * 2
|
|
249
|
+
col_desc = max(20, separator_width - fixed)
|
|
250
|
+
|
|
251
|
+
# Print combined header with harness and container info - colorized
|
|
252
|
+
# Keys: magenta, Values: cyan (matching logging utils)
|
|
253
|
+
print(bold("=" * separator_width))
|
|
254
|
+
print(f"{magenta('harness:')} {cyan(str(harness))}")
|
|
255
|
+
print(f"{magenta('container:')} {cyan(str(container))}")
|
|
256
|
+
arch = _infer_arch(container, container_tasks)
|
|
257
|
+
registry = _infer_registry(container)
|
|
258
|
+
print(f"{magenta('arch:')} {cyan(str(arch))}")
|
|
259
|
+
if registry:
|
|
260
|
+
print(f"{magenta('registry:')} {cyan(str(registry))}")
|
|
118
261
|
|
|
119
262
|
# Print task table header separator
|
|
120
|
-
print(
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
263
|
+
print()
|
|
264
|
+
print(
|
|
265
|
+
bold(
|
|
266
|
+
f"{'task':<{col_task}}{sep}"
|
|
267
|
+
f"{'endpoint':<{col_endpoint}}{sep}"
|
|
268
|
+
f"{'description':<{col_desc}}"
|
|
269
|
+
)
|
|
270
|
+
)
|
|
271
|
+
print(bold("-" * separator_width))
|
|
272
|
+
|
|
273
|
+
# Print task rows - use grey for task descriptions
|
|
274
|
+
for r in rows:
|
|
275
|
+
line = (
|
|
276
|
+
f"{_truncate(r['task'], col_task):<{col_task}}{sep}"
|
|
277
|
+
f"{_truncate(r['endpoint'], col_endpoint):<{col_endpoint}}{sep}"
|
|
278
|
+
f"{_truncate(r['description'], col_desc):<{col_desc}}"
|
|
279
|
+
)
|
|
280
|
+
print(grey(line))
|
|
281
|
+
|
|
282
|
+
print(bold("-" * separator_width))
|
|
283
|
+
# Show task count - grey for count text
|
|
132
284
|
task_count = len(rows)
|
|
133
|
-
|
|
134
|
-
print("
|
|
285
|
+
task_word = "task" if task_count == 1 else "tasks"
|
|
286
|
+
print(f" {grey(f'{task_count} {task_word} available')}")
|
|
287
|
+
print(bold("=" * separator_width))
|
|
135
288
|
|
|
136
289
|
print()
|
|
@@ -22,7 +22,9 @@ from simple_parsing import ArgumentParser
|
|
|
22
22
|
import nemo_evaluator_launcher.cli.export as export
|
|
23
23
|
import nemo_evaluator_launcher.cli.info as info
|
|
24
24
|
import nemo_evaluator_launcher.cli.kill as kill
|
|
25
|
+
import nemo_evaluator_launcher.cli.logs as logs
|
|
25
26
|
import nemo_evaluator_launcher.cli.ls_runs as ls_runs
|
|
27
|
+
import nemo_evaluator_launcher.cli.ls_task as ls_task
|
|
26
28
|
import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
|
|
27
29
|
import nemo_evaluator_launcher.cli.run as run
|
|
28
30
|
import nemo_evaluator_launcher.cli.status as status
|
|
@@ -42,11 +44,13 @@ def is_verbose_enabled(args) -> bool:
|
|
|
42
44
|
subcommands = [
|
|
43
45
|
"run",
|
|
44
46
|
"status",
|
|
47
|
+
"logs",
|
|
45
48
|
"info",
|
|
46
49
|
"kill",
|
|
47
50
|
"tasks_alias",
|
|
48
51
|
"tasks",
|
|
49
52
|
"runs",
|
|
53
|
+
"task",
|
|
50
54
|
"export",
|
|
51
55
|
]
|
|
52
56
|
for subcmd in subcommands:
|
|
@@ -106,6 +110,14 @@ def create_parser() -> ArgumentParser:
|
|
|
106
110
|
)
|
|
107
111
|
status_parser.add_arguments(status.Cmd, dest="status")
|
|
108
112
|
|
|
113
|
+
# Logs subcommand
|
|
114
|
+
logs_parser = subparsers.add_parser(
|
|
115
|
+
"logs",
|
|
116
|
+
help="Stream logs from evaluation jobs",
|
|
117
|
+
description="Stream logs from evaluation jobs by invocation ID or job ID",
|
|
118
|
+
)
|
|
119
|
+
logs_parser.add_arguments(logs.Cmd, dest="logs")
|
|
120
|
+
|
|
109
121
|
# Kill subcommand
|
|
110
122
|
kill_parser = subparsers.add_parser(
|
|
111
123
|
"kill",
|
|
@@ -149,6 +161,14 @@ def create_parser() -> ArgumentParser:
|
|
|
149
161
|
)
|
|
150
162
|
ls_runs_parser.add_arguments(ls_runs.Cmd, dest="runs")
|
|
151
163
|
|
|
164
|
+
# ls task (task details)
|
|
165
|
+
ls_task_parser = ls_sub.add_parser(
|
|
166
|
+
"task",
|
|
167
|
+
help="Show task details",
|
|
168
|
+
description="Show detailed information about a specific task",
|
|
169
|
+
)
|
|
170
|
+
ls_task_parser.add_arguments(ls_task.Cmd, dest="task")
|
|
171
|
+
|
|
152
172
|
# Export subcommand
|
|
153
173
|
export_parser = subparsers.add_parser(
|
|
154
174
|
"export",
|
|
@@ -204,16 +224,23 @@ def main() -> None:
|
|
|
204
224
|
args.run.execute()
|
|
205
225
|
elif args.command == "status":
|
|
206
226
|
args.status.execute()
|
|
227
|
+
elif args.command == "logs":
|
|
228
|
+
args.logs.execute()
|
|
207
229
|
elif args.command == "kill":
|
|
208
230
|
args.kill.execute()
|
|
209
231
|
elif args.command == "ls":
|
|
210
232
|
# Dispatch nested ls subcommands
|
|
211
|
-
if args.ls_command
|
|
212
|
-
#
|
|
233
|
+
if args.ls_command == "tasks":
|
|
234
|
+
# When explicitly "ls tasks", use args.tasks (has correct from_container)
|
|
235
|
+
args.tasks.execute()
|
|
236
|
+
elif args.ls_command is None:
|
|
237
|
+
# When just "ls" (no subcommand), use args.tasks_alias
|
|
213
238
|
if hasattr(args, "tasks_alias"):
|
|
214
239
|
args.tasks_alias.execute()
|
|
215
240
|
else:
|
|
216
241
|
args.tasks.execute()
|
|
242
|
+
elif args.ls_command == "task":
|
|
243
|
+
args.task.execute()
|
|
217
244
|
elif args.ls_command == "runs":
|
|
218
245
|
args.runs.execute()
|
|
219
246
|
elif args.command == "export":
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
import pathlib
|
|
17
17
|
import time
|
|
18
18
|
from dataclasses import dataclass
|
|
19
|
+
from typing import Literal
|
|
19
20
|
|
|
20
21
|
from simple_parsing import field
|
|
21
22
|
|
|
@@ -26,6 +27,7 @@ from nemo_evaluator_launcher.common.printing_utils import (
|
|
|
26
27
|
green,
|
|
27
28
|
magenta,
|
|
28
29
|
red,
|
|
30
|
+
yellow,
|
|
29
31
|
)
|
|
30
32
|
|
|
31
33
|
|
|
@@ -33,6 +35,13 @@ from nemo_evaluator_launcher.common.printing_utils import (
|
|
|
33
35
|
class Cmd:
|
|
34
36
|
"""Run command parameters"""
|
|
35
37
|
|
|
38
|
+
config: str | None = field(
|
|
39
|
+
default=None,
|
|
40
|
+
alias=["--config"],
|
|
41
|
+
metadata={
|
|
42
|
+
"help": "Full path to config file. Uses Hydra by default (--config-mode=hydra). Use --config-mode=raw to load directly (bypasses Hydra)."
|
|
43
|
+
},
|
|
44
|
+
)
|
|
36
45
|
config_name: str = field(
|
|
37
46
|
default="default",
|
|
38
47
|
alias=["-c", "--config-name"],
|
|
@@ -47,11 +56,11 @@ class Cmd:
|
|
|
47
56
|
"help": "Path to user config directory. If provided, searches here first, then falls back to internal configs."
|
|
48
57
|
},
|
|
49
58
|
)
|
|
50
|
-
|
|
51
|
-
default=
|
|
52
|
-
alias=["
|
|
59
|
+
config_mode: Literal["hydra", "raw"] = field(
|
|
60
|
+
default="hydra",
|
|
61
|
+
alias=["--config-mode"],
|
|
53
62
|
metadata={
|
|
54
|
-
"help": "
|
|
63
|
+
"help": "Config loading mode: 'hydra' (default) uses Hydra config system, 'raw' loads config file directly bypassing Hydra."
|
|
55
64
|
},
|
|
56
65
|
)
|
|
57
66
|
override: list[str] = field(
|
|
@@ -68,6 +77,15 @@ class Cmd:
|
|
|
68
77
|
alias=["-n", "--dry-run"],
|
|
69
78
|
metadata={"help": "Do not run the evaluation, just print the config."},
|
|
70
79
|
)
|
|
80
|
+
tasks: list[str] = field(
|
|
81
|
+
default_factory=list,
|
|
82
|
+
action="append",
|
|
83
|
+
nargs="?",
|
|
84
|
+
alias=["-t"],
|
|
85
|
+
metadata={
|
|
86
|
+
"help": "Run only specific tasks from the config. Example: -t ifeval -t gsm8k"
|
|
87
|
+
},
|
|
88
|
+
)
|
|
71
89
|
config_output: str | None = field(
|
|
72
90
|
default=None,
|
|
73
91
|
alias=["--config-output"],
|
|
@@ -76,35 +94,97 @@ class Cmd:
|
|
|
76
94
|
},
|
|
77
95
|
)
|
|
78
96
|
|
|
97
|
+
def _parse_requested_tasks(self) -> list[str]:
|
|
98
|
+
"""Parse -t arguments into a list of task names.
|
|
99
|
+
|
|
100
|
+
Handles None values that can be appended when using nargs="?" with action="append".
|
|
101
|
+
"""
|
|
102
|
+
requested_tasks = []
|
|
103
|
+
for task_arg in self.tasks:
|
|
104
|
+
# Skip None or empty values (can happen with nargs="?")
|
|
105
|
+
if not task_arg:
|
|
106
|
+
continue
|
|
107
|
+
task_name = task_arg.strip()
|
|
108
|
+
if task_name and task_name not in requested_tasks:
|
|
109
|
+
requested_tasks.append(task_name)
|
|
110
|
+
return requested_tasks
|
|
111
|
+
|
|
79
112
|
def execute(self) -> None:
|
|
80
113
|
# Import heavy dependencies only when needed
|
|
81
114
|
import yaml
|
|
82
115
|
from omegaconf import OmegaConf
|
|
83
116
|
|
|
84
|
-
from nemo_evaluator_launcher.api.functional import
|
|
117
|
+
from nemo_evaluator_launcher.api.functional import (
|
|
118
|
+
RunConfig,
|
|
119
|
+
filter_tasks,
|
|
120
|
+
run_eval,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Validate config_mode value
|
|
124
|
+
if self.config_mode not in ["hydra", "raw"]:
|
|
125
|
+
raise ValueError(
|
|
126
|
+
f"Invalid --config-mode value: {self.config_mode}. Must be 'hydra' or 'raw'."
|
|
127
|
+
)
|
|
85
128
|
|
|
86
|
-
#
|
|
87
|
-
if self.
|
|
88
|
-
|
|
129
|
+
# Validate that raw mode requires --config
|
|
130
|
+
if self.config_mode == "raw" and self.config is None:
|
|
131
|
+
raise ValueError(
|
|
132
|
+
"--config-mode=raw requires --config to be specified. Raw mode loads config files directly."
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# Parse requested tasks if -t is specified
|
|
136
|
+
requested_tasks = self._parse_requested_tasks() if self.tasks else None
|
|
137
|
+
|
|
138
|
+
# Load configuration either from Hydra or directly from a config file
|
|
139
|
+
if self.config_mode == "raw" and self.config:
|
|
140
|
+
# Validate that raw config loading is not used with other config options
|
|
89
141
|
if self.config_name != "default":
|
|
90
|
-
raise ValueError(
|
|
142
|
+
raise ValueError(
|
|
143
|
+
"Cannot use --config-mode=raw with --config-name. Raw mode only works with --config."
|
|
144
|
+
)
|
|
91
145
|
if self.config_dir is not None:
|
|
92
|
-
raise ValueError(
|
|
146
|
+
raise ValueError(
|
|
147
|
+
"Cannot use --config-mode=raw with --config-dir. Raw mode only works with --config."
|
|
148
|
+
)
|
|
93
149
|
if self.override:
|
|
94
|
-
raise ValueError(
|
|
150
|
+
raise ValueError(
|
|
151
|
+
"Cannot use --config-mode=raw with --override. Raw mode only works with --config."
|
|
152
|
+
)
|
|
95
153
|
|
|
96
|
-
# Load from
|
|
97
|
-
with open(self.
|
|
154
|
+
# Load from config file directly (bypass Hydra)
|
|
155
|
+
with open(self.config, "r") as f:
|
|
98
156
|
config_dict = yaml.safe_load(f)
|
|
99
157
|
|
|
100
158
|
# Create RunConfig from the loaded data
|
|
101
159
|
config = OmegaConf.create(config_dict)
|
|
102
160
|
else:
|
|
161
|
+
# Handle --config parameter: split path into config_dir and config_name for Hydra
|
|
162
|
+
if self.config:
|
|
163
|
+
if self.config_name != "default":
|
|
164
|
+
raise ValueError("Cannot use --config with --config-name")
|
|
165
|
+
if self.config_dir is not None:
|
|
166
|
+
raise ValueError("Cannot use --config with --config-dir")
|
|
167
|
+
config_path = pathlib.Path(self.config)
|
|
168
|
+
config_dir = str(config_path.parent)
|
|
169
|
+
config_name = str(config_path.stem)
|
|
170
|
+
else:
|
|
171
|
+
config_dir = self.config_dir
|
|
172
|
+
config_name = self.config_name
|
|
173
|
+
|
|
103
174
|
# Load the complete Hydra configuration
|
|
104
175
|
config = RunConfig.from_hydra(
|
|
105
|
-
|
|
176
|
+
config_dir=config_dir,
|
|
177
|
+
config_name=config_name,
|
|
106
178
|
hydra_overrides=self.override,
|
|
107
|
-
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
# Apply task filtering if -t is specified
|
|
182
|
+
if requested_tasks:
|
|
183
|
+
config = filter_tasks(config, requested_tasks)
|
|
184
|
+
logger.info(
|
|
185
|
+
"Running filtered tasks",
|
|
186
|
+
count=len(config.evaluation.tasks),
|
|
187
|
+
tasks=[t.name for t in config.evaluation.tasks],
|
|
108
188
|
)
|
|
109
189
|
|
|
110
190
|
try:
|
|
@@ -150,7 +230,7 @@ class Cmd:
|
|
|
150
230
|
f.write("#\n")
|
|
151
231
|
f.write("# To rerun this exact configuration:\n")
|
|
152
232
|
f.write(
|
|
153
|
-
f"# nemo-evaluator-launcher run --
|
|
233
|
+
f"# nemo-evaluator-launcher run --config {config_path} --config-mode=raw\n"
|
|
154
234
|
)
|
|
155
235
|
f.write("#\n")
|
|
156
236
|
f.write(config_yaml)
|
|
@@ -164,6 +244,10 @@ class Cmd:
|
|
|
164
244
|
bold(cyan("To check status: "))
|
|
165
245
|
+ f"nemo-evaluator-launcher status {invocation_id}"
|
|
166
246
|
)
|
|
247
|
+
print(
|
|
248
|
+
bold(cyan("To view job info: "))
|
|
249
|
+
+ f"nemo-evaluator-launcher info {invocation_id}"
|
|
250
|
+
)
|
|
167
251
|
print(
|
|
168
252
|
bold(cyan("To kill all jobs: "))
|
|
169
253
|
+ f"nemo-evaluator-launcher kill {invocation_id}"
|
|
@@ -198,3 +282,17 @@ class Cmd:
|
|
|
198
282
|
)
|
|
199
283
|
)
|
|
200
284
|
)
|
|
285
|
+
|
|
286
|
+
# Warn if both config_dir and config_name are provided (and config_name is not default)
|
|
287
|
+
if (
|
|
288
|
+
self.config is None
|
|
289
|
+
and self.config_dir is not None
|
|
290
|
+
and self.config_name != "default"
|
|
291
|
+
):
|
|
292
|
+
joint_path = pathlib.Path(self.config_dir) / f"{self.config_name}.yaml"
|
|
293
|
+
print(
|
|
294
|
+
yellow(
|
|
295
|
+
f"Warning: Using --config-dir and --config-name together is deprecated. "
|
|
296
|
+
f"Please use --config {joint_path} instead."
|
|
297
|
+
)
|
|
298
|
+
)
|
|
@@ -19,6 +19,29 @@ import importlib
|
|
|
19
19
|
from dataclasses import dataclass
|
|
20
20
|
|
|
21
21
|
from nemo_evaluator_launcher import __package_name__, __version__
|
|
22
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_versions() -> dict:
|
|
26
|
+
internal_module_name = "nemo_evaluator_launcher_internal"
|
|
27
|
+
res = {__package_name__: __version__}
|
|
28
|
+
# Check for internal package
|
|
29
|
+
try:
|
|
30
|
+
internal_module = importlib.import_module(internal_module_name)
|
|
31
|
+
# Try to get version from internal package
|
|
32
|
+
internal_version = getattr(internal_module, "__version__", None)
|
|
33
|
+
if internal_version:
|
|
34
|
+
res[internal_module_name] = internal_version
|
|
35
|
+
else:
|
|
36
|
+
res[internal_module_name] = "available (version unknown)"
|
|
37
|
+
except ImportError:
|
|
38
|
+
# Internal package not available - this is expected in many cases
|
|
39
|
+
pass
|
|
40
|
+
except Exception as e:
|
|
41
|
+
logger.error(f"nemo_evaluator_launcher_internal: error loading ({e})")
|
|
42
|
+
raise
|
|
43
|
+
|
|
44
|
+
return res
|
|
22
45
|
|
|
23
46
|
|
|
24
47
|
@dataclass
|
|
@@ -27,26 +50,6 @@ class Cmd:
|
|
|
27
50
|
|
|
28
51
|
def execute(self) -> None:
|
|
29
52
|
"""Execute the version command."""
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
try:
|
|
34
|
-
internal_module = importlib.import_module(
|
|
35
|
-
"nemo_evaluator_launcher_internal"
|
|
36
|
-
)
|
|
37
|
-
# Try to get version from internal package
|
|
38
|
-
try:
|
|
39
|
-
internal_version = getattr(internal_module, "__version__", None)
|
|
40
|
-
if internal_version:
|
|
41
|
-
print(f"nemo-evaluator-launcher-internal: {internal_version}")
|
|
42
|
-
else:
|
|
43
|
-
print(
|
|
44
|
-
"nemo-evaluator-launcher-internal: available (version unknown)"
|
|
45
|
-
)
|
|
46
|
-
except Exception:
|
|
47
|
-
print("nemo-evaluator-launcher-internal: available (version unknown)")
|
|
48
|
-
except ImportError:
|
|
49
|
-
# Internal package not available - this is expected in many cases
|
|
50
|
-
pass
|
|
51
|
-
except Exception as e:
|
|
52
|
-
print(f"nemo-evaluator-launcher-internal: error loading ({e})")
|
|
53
|
+
res = get_versions()
|
|
54
|
+
for package, version in res.items():
|
|
55
|
+
print(f"{package}: {version}")
|