nemo_evaluator_launcher-0.1.28-py3-none-any.whl
This diff shows the content of a publicly available package version as released to its public registry, and is provided for informational purposes only.
Potentially problematic release: this version of nemo-evaluator-launcher has been flagged; see the registry page for details.
- nemo_evaluator_launcher/__init__.py +79 -0
- nemo_evaluator_launcher/api/__init__.py +24 -0
- nemo_evaluator_launcher/api/functional.py +698 -0
- nemo_evaluator_launcher/api/types.py +98 -0
- nemo_evaluator_launcher/api/utils.py +19 -0
- nemo_evaluator_launcher/cli/__init__.py +15 -0
- nemo_evaluator_launcher/cli/export.py +267 -0
- nemo_evaluator_launcher/cli/info.py +512 -0
- nemo_evaluator_launcher/cli/kill.py +41 -0
- nemo_evaluator_launcher/cli/ls_runs.py +134 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
- nemo_evaluator_launcher/cli/main.py +226 -0
- nemo_evaluator_launcher/cli/run.py +200 -0
- nemo_evaluator_launcher/cli/status.py +164 -0
- nemo_evaluator_launcher/cli/version.py +55 -0
- nemo_evaluator_launcher/common/__init__.py +16 -0
- nemo_evaluator_launcher/common/execdb.py +283 -0
- nemo_evaluator_launcher/common/helpers.py +366 -0
- nemo_evaluator_launcher/common/logging_utils.py +357 -0
- nemo_evaluator_launcher/common/mapping.py +295 -0
- nemo_evaluator_launcher/common/printing_utils.py +93 -0
- nemo_evaluator_launcher/configs/__init__.py +15 -0
- nemo_evaluator_launcher/configs/default.yaml +28 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
- nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
- nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
- nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
- nemo_evaluator_launcher/executors/__init__.py +22 -0
- nemo_evaluator_launcher/executors/base.py +120 -0
- nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
- nemo_evaluator_launcher/executors/local/__init__.py +15 -0
- nemo_evaluator_launcher/executors/local/executor.py +605 -0
- nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
- nemo_evaluator_launcher/executors/registry.py +38 -0
- nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
- nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
- nemo_evaluator_launcher/exporters/__init__.py +36 -0
- nemo_evaluator_launcher/exporters/base.py +121 -0
- nemo_evaluator_launcher/exporters/gsheets.py +409 -0
- nemo_evaluator_launcher/exporters/local.py +502 -0
- nemo_evaluator_launcher/exporters/mlflow.py +619 -0
- nemo_evaluator_launcher/exporters/registry.py +40 -0
- nemo_evaluator_launcher/exporters/utils.py +624 -0
- nemo_evaluator_launcher/exporters/wandb.py +490 -0
- nemo_evaluator_launcher/package_info.py +38 -0
- nemo_evaluator_launcher/resources/mapping.toml +380 -0
- nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
- nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
- nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
- nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
- nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
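The three hunks below add the info, kill, and ls_runs CLI modules. Each follows the same pattern: a simple_parsing dataclass whose fields become command-line arguments and whose execute() method does the work. As orientation, here is a minimal, hypothetical sketch of how such a dataclass command is typically parsed and dispatched; the package's real wiring lives in cli/main.py, which is not reproduced in this diff.

from dataclasses import dataclass

from simple_parsing import ArgumentParser, field


@dataclass
class Cmd:
    """Toy command mirroring the dataclass-command pattern used in this diff."""

    id: str = field(positional=True, help="Job or invocation ID")

    def execute(self) -> None:
        print(f"would operate on {self.id}")


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_arguments(Cmd, dest="cmd")  # dataclass fields become CLI args
    args = parser.parse_args()
    args.cmd.execute()  # dispatch to the parsed command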
nemo_evaluator_launcher/cli/info.py
@@ -0,0 +1,512 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Job information helper functionalities for nemo-evaluator-launcher."""

import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Tuple

from simple_parsing import field

from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
from nemo_evaluator_launcher.common.logging_utils import logger
from nemo_evaluator_launcher.exporters.local import LocalExporter
from nemo_evaluator_launcher.exporters.utils import get_task_name

# Local exporter helper to copy logs and artifacts
_EXPORT_HELPER = LocalExporter({})


@dataclass
class InfoCmd:
    """Job information functionalities for nemo-evaluator-launcher.

    Examples:
        nemo-evaluator-launcher info <inv>                         # Full job info
        nemo-evaluator-launcher info <inv> --config                # Show stored job config (YAML)
        nemo-evaluator-launcher info <inv> --artifacts             # Show artifact locations and key files
        nemo-evaluator-launcher info <inv> --logs                  # Show log locations and key files
        nemo-evaluator-launcher info <inv> --copy-logs <DIR>       # Copy logs to <DIR>
        nemo-evaluator-launcher info <inv> --copy-artifacts <DIR>  # Copy artifacts to <DIR>

    Notes:
        - Supports invocation IDs and job IDs (space-separated)
        - Shows local or remote paths depending on executor (local/slurm/lepton)
        - Copy operations work for both local and remote jobs (expect longer time for remote jobs)
        - Copy operations are not supported for Lepton executor (yet).
    """

    invocation_ids: List[str] = field(
        positional=True,
        help="IDs to show info for (space-separated). Accepts invocation IDs and/or job IDs.",
    )

    # info modes
    config: bool = field(
        default=False, action="store_true", help="Show job configuration"
    )
    artifacts: bool = field(
        default=False, action="store_true", help="Show artifact locations and key files"
    )
    logs: bool = field(
        default=False, action="store_true", help="Show log locations and key files"
    )

    # copy operations - work for both local and remote jobs
    copy_logs: str | None = field(
        default=None,
        alias=["--copy-logs"],
        help="Copy logs to a local directory",
        metavar="DIR",
    )
    copy_artifacts: str | None = field(
        default=None,
        alias=["--copy-artifacts"],
        help="Copy artifacts to a local directory",
        metavar="DIR",
    )

    def execute(self) -> None:
        VersionCmd().execute()
        logger.info("Info command started", invocation_ids=self.invocation_ids)

        if not self.invocation_ids:
            logger.error("No job or invocation IDs provided.")
            raise ValueError("No job or invocation IDs provided.")

        jobs = self._resolve_jobs()
        logger.info(
            "Resolved jobs",
            total_ids=len(self.invocation_ids),
            valid_jobs=len(jobs),
            job_ids=[jid for jid, _ in jobs],
        )

        if not jobs:
            logger.info(
                "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
            )
            print(
                "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
            )
            return

        # show ops
        if self.config:
            self._show_config_info(jobs)
        if self.logs:
            self._show_logs_info(jobs)
        if self.artifacts:
            self._show_artifacts_info(jobs)

        # copy ops
        args = sys.argv[1:]
        copy_logs_flag = "--copy-logs" in args
        copy_artifacts_flag = "--copy-artifacts" in args

        if copy_logs_flag:
            if self.copy_logs is None:
                raise ValueError("--copy-logs requires a directory path")
            if not self.copy_logs.strip():
                raise ValueError("--copy-logs requires a directory path")
            logger.info(
                "Copying logs to local directory",
                dest_dir=self.copy_logs,
                job_count=len(jobs),
            )
            self._copy_logs(jobs, self.copy_logs)

        if copy_artifacts_flag:
            if self.copy_artifacts is None:
                raise ValueError("--copy-artifacts requires a directory path")
            if not self.copy_artifacts.strip():
                raise ValueError("--copy-artifacts requires a directory path")
            logger.info(
                "Copying artifacts to local directory",
                dest_dir=self.copy_artifacts,
                job_count=len(jobs),
            )
            self._copy_artifacts(jobs, self.copy_artifacts)

        # default view when no flags
        if not any(
            [
                self.config,
                self.logs,
                self.artifacts,
                self.copy_logs,
                self.copy_artifacts,
            ]
        ):
            logger.info(
                "Job metadata details",
                invocation_id=jobs[0][1].invocation_id if jobs else None,
                jobs=len(jobs),
            )
            self._show_invocation_info(jobs)

    def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
        """Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
        db = ExecutionDB()
        found: list[tuple[str, JobData]] = []
        for id_or_prefix in self.invocation_ids:
            if "." in id_or_prefix:
                jd = db.get_job(id_or_prefix)
                if jd:
                    found.append((jd.job_id, jd))
            else:
                for jid, jd in db.get_jobs(id_or_prefix).items():
                    found.append((jid, jd))
        # deduplicate and stable sort
        seen: set[str] = set()
        uniq: list[tuple[str, JobData]] = []
        for jid, jd in found:
            if jid not in seen:
                seen.add(jid)
                uniq.append((jid, jd))
        return sorted(uniq, key=lambda p: p[0])

    def _show_invocation_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        inv = jobs[0][1].invocation_id if jobs else None
        logger.info("Job information", jobs=len(jobs), invocation=inv)
        print(
            f"Job information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
        )

        for job_id, job_data in jobs:
            self._show_job_info(job_id, job_data)
            print()

        # footer hint: where to find more metadata
        print(
            "For more details about this run, inspect the Execution DB under your home dir:"
        )
        print(f"Path: {EXEC_DB_FILE}")
        if inv:
            print(f"├── Lookup key: invocation_id={inv}")

        # Next steps hint
        print("\nNext steps:")
        print(" - Use --logs to show log locations.")
        print(" - Use --artifacts to show artifact locations.")
        print(" - Use --config to show stored job configuration (YAML).")
        print(
            " - Use --copy-logs [DIR] to copy logs to a local directory (works for local and remote jobs)."
        )
        print(
            " - Use --copy-artifacts [DIR] to copy artifacts to a local directory (works for local and remote jobs)."
        )

    def _show_job_info(self, job_id: str, job_data: JobData) -> None:
        logger.info("Job", job_id=job_id)
        print(f"Job {job_id}")

        # metadata
        try:
            when = datetime.fromtimestamp(job_data.timestamp).isoformat(
                timespec="seconds"
            )
        except Exception:
            when = str(job_data.timestamp)
        logger.info("Executor", job_id=job_id, executor=job_data.executor)
        logger.info("Created", job_id=job_id, created=when)
        print(f"├── Executor: {job_data.executor}")
        print(f"├── Created: {when}")

        task_name = get_task_name(job_data)
        if task_name:
            logger.info("Task", job_id=job_id, name=task_name)
            print(f"├── Task: {task_name}")

        # Determine executor type for file descriptions
        cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
        exec_type = (job_data.executor or cfg_exec_type or "").lower()

        # locations via exporter helper
        paths = _EXPORT_HELPER.get_job_paths(job_data)

        # Artifacts with file descriptions
        artifacts_list = _get_artifacts_file_list()
        if paths.get("storage_type") == "remote_ssh":
            artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
            logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
            print(f"├── Artifacts: {artifacts_path} (remote)")
            print("│   └── Key files:")
            for filename, desc in artifacts_list:
                print(f"│       ├── {filename} - {desc}")
        else:
            ap = paths.get("artifacts_dir")
            if ap:
                exists = self._check_path_exists(paths, "artifacts")
                logger.info(
                    "Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
                )
                print(f"├── Artifacts: {ap} {exists} (local)")
                print("│   └── Key files:")
                for filename, desc in artifacts_list:
                    print(f"│       ├── {filename} - {desc}")

        # Logs with file descriptions
        logs_list = _get_log_file_list(exec_type)
        if paths.get("storage_type") == "remote_ssh":
            logs_path = (
                f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
            )
            logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
            print(f"├── Logs: {logs_path} (remote)")
            print("│   └── Key files:")
            for filename, desc in logs_list:
                print(f"│       ├── {filename} - {desc}")
        else:
            lp = paths.get("logs_dir")
            if lp:
                exists = self._check_path_exists(paths, "logs")
                logger.info(
                    "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
                )
                print(f"├── Logs: {lp} {exists} (local)")
                print("│   └── Key files:")
                for filename, desc in logs_list:
                    print(f"│       ├── {filename} - {desc}")

        # executor-specific
        d = job_data.data or {}
        cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
        exec_type = (job_data.executor or cfg_exec_type or "").lower()

        if exec_type == "slurm":
            sj = d.get("slurm_job_id")
            if sj:
                print(f"├── Slurm Job ID: {sj}")
        elif exec_type == "gitlab":
            pid = d.get("pipeline_id")
            if pid:
                print(f"├── Pipeline ID: {pid}")
        elif exec_type == "lepton":
            jn = d.get("lepton_job_name")
            if jn:
                print(f"├── Lepton Job: {jn}")
            en = d.get("endpoint_name")
            if en:
                print(f"├── Endpoint: {en}")
            eu = d.get("endpoint_url")
            if eu:
                print(f"├── Endpoint URL: {eu}")

    def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        logger.info("Log locations")
        print("Log locations:\n")
        for job_id, job_data in jobs:
            paths = _EXPORT_HELPER.get_job_paths(job_data)
            cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
            exec_type = (job_data.executor or cfg_exec_type or "").lower()
            logs_list = _get_log_file_list(exec_type)

            if paths.get("storage_type") == "remote_ssh":
                logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
                logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
                print(f"{job_id}: {logs_path} (remote)")
                print("  └── Key files:")
                for filename, desc in logs_list:
                    print(f"      ├── {filename} - {desc}")
            else:
                lp = paths.get("logs_dir")
                if lp:
                    exists = self._check_path_exists(paths, "logs")
                    logger.info(
                        "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
                    )
                    print(f"{job_id}: {lp} {exists} (local)")
                    print("  └── Key files:")
                    for filename, desc in logs_list:
                        print(f"      ├── {filename} - {desc}")

    def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        logger.info("Artifact locations")
        print("Artifact locations:\n")
        for job_id, job_data in jobs:
            paths = _EXPORT_HELPER.get_job_paths(job_data)
            artifacts_list = _get_artifacts_file_list()

            if paths.get("storage_type") == "remote_ssh":
                artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
                logger.info(
                    "Artifacts", job_id=job_id, path=artifacts_path, remote=True
                )
                print(f"{job_id}: {artifacts_path} (remote)")
                print("  └── Key files:")
                for filename, desc in artifacts_list:
                    print(f"      ├── {filename} - {desc}")
            else:
                ap = paths.get("artifacts_dir")
                if ap:
                    exists = self._check_path_exists(paths, "artifacts")
                    logger.info(
                        "Artifacts",
                        job_id=job_id,
                        path=str(ap),
                        exists_indicator=exists,
                    )
                    print(f"{job_id}: {ap} {exists} (local)")
                    print("  └── Key files:")
                    for filename, desc in artifacts_list:
                        print(f"      ├── {filename} - {desc}")

    def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        for job_id, job_data in jobs:
            logger.info("Configuration for job", job_id=job_id)
            print(f"Configuration for {job_id}:")
            if job_data.config:
                import yaml

                config_yaml = yaml.dump(
                    job_data.config, default_flow_style=False, indent=2
                )
                logger.info("Configuration YAML", job_id=job_id, config=config_yaml)
                print(config_yaml)
            else:
                logger.info("No configuration stored for this job", job_id=job_id)
                print("  No configuration stored for this job.")
            print()

    def _copy_logs(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
        """Copy logs using export functionality."""
        self._copy_content(jobs, dest_dir, copy_logs=True, copy_artifacts=False)

    def _copy_artifacts(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
        """Copy artifacts using export functionality."""
        self._copy_content(jobs, dest_dir, copy_logs=False, copy_artifacts=True)

    def _copy_content(
        self,
        jobs: List[Tuple[str, JobData]],
        dest_dir: str,
        copy_logs: bool,
        copy_artifacts: bool,
    ) -> None:
        logger.debug(
            "Preparing export call",
            dest_dir=dest_dir,
            copy_logs=copy_logs,
            copy_artifacts=copy_artifacts,
            job_ids=[jid for jid, _ in jobs],
        )

        from nemo_evaluator_launcher.api.functional import export_results

        config = {
            "output_dir": dest_dir,
            "only_required": True,
            "copy_logs": bool(copy_logs) and not bool(copy_artifacts),
            "copy_artifacts": bool(copy_artifacts) and not bool(copy_logs),
        }
        # skip artifact validation
        if copy_logs and not copy_artifacts:
            config["skip_validation"] = True

        job_ids = [job_id for job_id, _ in jobs]
        kind = "logs" if copy_logs else "artifacts"
        logger.info(
            "Copying content", kind=kind, job_count=len(job_ids), dest_dir=dest_dir
        )
        print(f"Copying {kind} for {len(job_ids)} job(s) to {dest_dir}...")

        result = export_results(job_ids, "local", config)
        logger.debug("Export API call completed", success=result.get("success"))

        if result.get("success"):
            logger.info(
                "Content copy completed successfully",
                dest_dir=dest_dir,
                job_count=len(jobs),
            )
            if "jobs" in result:
                for jid, job_result in result["jobs"].items():
                    if job_result.get("success"):
                        print(f"{jid}: Success")
                    else:
                        print(
                            f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
                        )
            # Show full destination path
            full_dest_path = Path(dest_dir).resolve()
            print(f"Copied to: {full_dest_path}")
        else:
            err = result.get("error", "Unknown error")
            logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
            print(f"Failed to copy {kind}: {err}")

    def _check_path_exists(self, paths: Dict[str, Any], path_type: str) -> str:
        """Check if a path exists and return indicator."""
        try:
            if paths.get("storage_type") == "remote_ssh":
                # For remote paths, we can't easily check existence
                return "(remote)"
            elif path_type == "logs" and "logs_dir" in paths:
                logs_dir = Path(paths["logs_dir"])
                return "(exists)" if logs_dir.exists() else "(not found)"
            elif path_type == "artifacts" and "artifacts_dir" in paths:
                artifacts_dir = Path(paths["artifacts_dir"])
                return "(exists)" if artifacts_dir.exists() else "(not found)"
        except Exception:
            pass
        return ""


# Helper functions for file descriptions (based on actual code and content analysis)
def _get_artifacts_file_list() -> list[tuple[str, str]]:
    """Files generated in artifacts/."""
    return [
        (
            "results.yml",
            "Benchmark scores, task results and resolved run configuration.",
        ),
        (
            "eval_factory_metrics.json",
            "Response + runtime stats (latency, tokens count, memory)",
        ),
        ("metrics.json", "Harness/benchmark metric and configuration"),
        ("report.html", "Request-Response Pairs samples in HTML format (if enabled)"),
        ("report.json", "Report data in json format, if enabled"),
    ]


def _get_log_file_list(executor_type: str) -> list[tuple[str, str]]:
    """Files actually generated in logs/ - executor-specific."""
    et = (executor_type or "local").lower()
    if et == "slurm":
        return [
            ("client-{SLURM_JOB_ID}.out", "Evaluation container/process output"),
            (
                "slurm-{SLURM_JOB_ID}.out",
                "SLURM scheduler stdout/stderr (batch submission, export steps).",
            ),
            (
                "server-{SLURM_JOB_ID}.out",
                "Model server logs when a deployment is used.",
            ),
        ]
    # local executor
    return [
        (
            "stdout.log",
            "Complete evaluation output (timestamps, resolved config, run/export messages).",
        ),
    ]
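A standalone sketch of the ID-resolution rule implemented by InfoCmd._resolve_jobs above: an ID containing a dot (e.g. aefc4819.0) is treated as a fully qualified job ID, while a bare ID is an invocation ID that expands to all of its jobs; the combined list is then de-duplicated and sorted. The in-memory dict below is a hypothetical stand-in for ExecutionDB.

# Hypothetical stand-in for ExecutionDB: job IDs map to job data.
jobs_db = {
    "aefc4819.0": "job-data-0",
    "aefc4819.1": "job-data-1",
    "b1c2d3e4.0": "job-data-x",
}


def resolve(ids: list[str]) -> list[tuple[str, str]]:
    found = []
    for id_or_prefix in ids:
        if "." in id_or_prefix:
            # Fully qualified job ID: exact lookup.
            if id_or_prefix in jobs_db:
                found.append((id_or_prefix, jobs_db[id_or_prefix]))
        else:
            # Invocation ID: expand to every job under it.
            for jid, jd in jobs_db.items():
                if jid.startswith(id_or_prefix + "."):
                    found.append((jid, jd))
    # Deduplicate while keeping first occurrence, then sort by job ID.
    seen: set[str] = set()
    uniq: list[tuple[str, str]] = []
    for jid, jd in found:
        if jid not in seen:
            seen.add(jid)
            uniq.append((jid, jd))
    return sorted(uniq, key=lambda p: p[0])


# Overlapping inputs collapse to unique, sorted jobs:
print(resolve(["aefc4819", "aefc4819.1"]))
# [('aefc4819.0', 'job-data-0'), ('aefc4819.1', 'job-data-1')]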
nemo_evaluator_launcher/cli/kill.py
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from dataclasses import dataclass

from simple_parsing import field


@dataclass
class Cmd:
    """Kill command configuration."""

    id: str = field(
        positional=True,
        metadata={
            "help": "Job ID (e.g., aefc4819.0) or invocation ID (e.g., aefc4819) to kill"
        },
    )

    def execute(self) -> None:
        """Execute the kill command."""
        # Import heavy dependencies only when needed
        import json

        from nemo_evaluator_launcher.api.functional import kill_job_or_invocation

        result = kill_job_or_invocation(self.id)
        # Output as JSON
        print(json.dumps(result, indent=2))
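The kill command's output contract is simply json.dumps(result, indent=2) applied to whatever dict kill_job_or_invocation returns; the result schema itself is not shown in this diff. A purely illustrative example of that pattern, with a made-up result dict:

import json

# Hypothetical result shape, for illustration only; the API's actual schema
# is defined in api/functional.py, which this diff does not reproduce.
hypothetical_result = {"invocation_id": "aefc4819", "killed": ["aefc4819.0", "aefc4819.1"]}
print(json.dumps(hypothetical_result, indent=2))
# {
#   "invocation_id": "aefc4819",
#   "killed": [
#     "aefc4819.0",
#     "aefc4819.1"
#   ]
# }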
nemo_evaluator_launcher/cli/ls_runs.py
@@ -0,0 +1,134 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime as _dt
import sys
from dataclasses import dataclass
from typing import Optional

from simple_parsing import field

from nemo_evaluator_launcher.common.logging_utils import logger


@dataclass
class Cmd:
    """List invocations (runs) from the exec DB as a table."""

    limit: Optional[int] = field(default=None, alias=["--limit"], help="Max rows")
    executor: Optional[str] = field(
        default=None,
        alias=["--executor"],
        help="Filter by executor",
    )
    # TODO(agronskiy): think about if we can propagate a `--status` filter into here.
    since: Optional[str] = field(
        default=None,
        alias=["--since"],
        help="Filter by either ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00) or "
        "an interval into the past, e.g. `1d` or `3h`; formally `{N}[d|h]`.",
    )

    def execute(self) -> None:
        # Import heavy dependencies only when needed
        from nemo_evaluator_launcher.api.functional import (
            get_invocation_benchmarks,
            list_all_invocations_summary,
        )

        rows = list_all_invocations_summary()

        if self.executor:
            rows = [
                r
                for r in rows
                if (r.get("executor") or "").lower() == self.executor.lower()
            ]

        if self.since:
            try:
                # Check if it's a relative time format like "1d" or "3h"
                if self.since.lower().endswith("d") and len(self.since) > 1:
                    days = int(self.since[:-1])
                    if days < 0:
                        raise ValueError("Days should be non-negative")
                    since_ts = (
                        _dt.datetime.now() - _dt.timedelta(days=days)
                    ).timestamp()
                elif self.since.lower().endswith("h") and len(self.since) > 1:
                    hours = int(self.since[:-1])
                    if hours < 0:
                        raise ValueError("Hours should be non-negative")
                    since_ts = (
                        _dt.datetime.now() - _dt.timedelta(hours=hours)
                    ).timestamp()
                elif "T" in self.since:
                    since_ts = _dt.datetime.fromisoformat(self.since).timestamp()
                else:
                    since_ts = _dt.datetime.fromisoformat(
                        self.since + "T00:00:00"
                    ).timestamp()
                rows = [r for r in rows if (r.get("earliest_job_ts") or 0) >= since_ts]
            except Exception:
                logger.fatal(
                    f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or N[d|h] for N days|hours."
                )
                sys.exit(2)

        if self.limit is not None and self.limit >= 0:
            rows = rows[: self.limit]

        header = [
            "invocation_id",
            "earliest_job_ts",
            "num_jobs",
            "executor",
            "benchmarks",
        ]
        table_rows = []
        for r in rows:
            ts = r.get("earliest_job_ts", 0) or 0
            try:
                ts_iso = (
                    _dt.datetime.fromtimestamp(ts).replace(microsecond=0).isoformat()
                )
            except Exception:
                ts_iso = ""
            inv = r.get("invocation_id", "")
            try:
                bmarks = get_invocation_benchmarks(inv)
                bmarks_cell = ",".join(bmarks) if bmarks else "unknown"
            except Exception:
                bmarks_cell = "unknown"
            table_rows.append(
                [
                    str(inv),
                    ts_iso,
                    str(r.get("num_jobs", 0)),
                    str(r.get("executor", "")),
                    bmarks_cell,
                ]
            )

        widths = [len(h) for h in header]
        for tr in table_rows:
            for i, cell in enumerate(tr):
                if len(cell) > widths[i]:
                    widths[i] = len(cell)
        fmt = "  ".join([f"{{:<{w}}}" for w in widths])
        print(fmt.format(*header))
        print("  ".join(["-" * w for w in widths]))
        for tr in table_rows:
            print(fmt.format(*tr))
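The table printer at the end of execute() sizes each column to the widest of its header and cells, then builds a left-aligned format string from those widths. The same technique in isolation, with made-up rows:

header = ["invocation_id", "num_jobs", "executor"]
table_rows = [
    ["aefc4819", "2", "local"],
    ["b1c2d3e4", "8", "slurm"],
]

# Column width = max over the header and every cell in that column.
widths = [len(h) for h in header]
for tr in table_rows:
    for i, cell in enumerate(tr):
        widths[i] = max(widths[i], len(cell))

# Build a format string such as "{:<13}  {:<8}  {:<8}" and print the table.
fmt = "  ".join(f"{{:<{w}}}" for w in widths)
print(fmt.format(*header))
print("  ".join("-" * w for w in widths))
for tr in table_rows:
    print(fmt.format(*tr))
# invocation_id  num_jobs  executor
# -------------  --------  --------
# aefc4819       2         local
# b1c2d3e4       8         slurm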