nemo-evaluator-launcher 0.1.15__tar.gz → 0.1.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (64):
  1. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/PKG-INFO +1 -1
  2. nemo_evaluator_launcher-0.1.17/src/nemo_evaluator_launcher/cli/debug.py +405 -0
  3. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/ls_runs.py +26 -6
  4. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/main.py +24 -1
  5. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/run.py +4 -0
  6. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/lepton/executor.py +127 -22
  7. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/package_info.py +1 -1
  8. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
  9. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +1 -0
  10. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/LICENSE +0 -0
  11. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/README.md +0 -0
  12. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/pyproject.toml +0 -0
  13. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/setup.cfg +0 -0
  14. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/__init__.py +0 -0
  15. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
  16. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/api/functional.py +0 -0
  17. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/api/types.py +0 -0
  18. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/api/utils.py +0 -0
  19. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
  20. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/export.py +0 -0
  21. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
  22. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
  23. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/status.py +0 -0
  24. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/cli/version.py +0 -0
  25. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
  26. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
  27. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/common/helpers.py +0 -0
  28. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
  29. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
  30. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
  31. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
  32. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
  33. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
  34. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
  35. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
  36. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
  37. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -0
  38. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
  39. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
  40. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +0 -0
  41. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
  42. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/base.py +0 -0
  43. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
  44. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
  45. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
  46. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
  47. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/local/executor.py +0 -0
  48. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/local/run.template.sh +0 -0
  49. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
  50. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
  51. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/executors/slurm/executor.py +0 -0
  52. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
  53. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
  54. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
  55. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
  56. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
  57. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
  58. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/utils.py +0 -0
  59. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
  60. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
  61. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
  62. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
  63. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
  64. {nemo_evaluator_launcher-0.1.15 → nemo_evaluator_launcher-0.1.17}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nemo-evaluator-launcher
3
- Version: 0.1.15
3
+ Version: 0.1.17
4
4
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
5
5
  Author: NVIDIA
6
6
  Author-email: nemo-toolkit@nvidia.com
@@ -0,0 +1,405 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """Debugging helper functionalities for nemo-evaluator-launcher."""
18
+
19
+ from dataclasses import dataclass
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+ from typing import Any, Dict, List, Optional, Tuple
23
+
24
+ from simple_parsing import field
25
+
26
+ from nemo_evaluator_launcher.cli.export import ExportCmd
27
+ from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
28
+ from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
29
+ from nemo_evaluator_launcher.common.logging_utils import logger
30
+ from nemo_evaluator_launcher.exporters.local import LocalExporter
31
+ from nemo_evaluator_launcher.exporters.utils import get_task_name
32
+
33
+ # Local exporter helper to copy logs and artifacts
34
+ _EXPORT_HELPER = LocalExporter({})
35
+
36
+
37
@dataclass
class DebugCmd(ExportCmd):
    """Debugging functionalities for nemo-evaluator-launcher.

    Examples:
      nemo-evaluator-launcher debug <inv>                 # Full debug info
      nemo-evaluator-launcher debug <inv> --config        # Show stored job config (YAML)
      nemo-evaluator-launcher debug <inv> --artifacts     # Show artifact locations
      nemo-evaluator-launcher debug <inv> --logs          # Show log locations
      nemo-evaluator-launcher debug <inv> --copy-logs <path>       # Copy logs (default: current dir)
      nemo-evaluator-launcher debug <inv> --copy-artifacts <path>  # Copy artifacts (default: current dir)

    Notes:
      - Supports invocation IDs and job IDs
      - Shows local or remote paths depending on executor (local/slurm/lepton)
    """

    # local exporter destination defaults to local
    dest: str = field(default="local", init=False)

    # debug modes
    config: bool = field(default=False, help="Show job configuration")
    artifacts: bool = field(default=False, help="Show artifact locations")
    logs: bool = field(default=False, help="Show log locations")

    # copy operations
    # NOTE(review): with nargs="?" and no `const`, plain argparse stores None
    # when the flag is given without a value, so the `is not None` checks in
    # execute() would not select the copy mode and the "default: current dir"
    # branch would be unreachable. Confirm how simple_parsing forwards these
    # options and consider adding const="" if so.
    copy_logs: Optional[str] = field(
        default=None,
        alias=["--copy-logs"],
        nargs="?",
        help="Copy logs to local directory (default: current dir)",
    )
    copy_artifacts: Optional[str] = field(
        default=None,
        alias=["--copy-artifacts"],
        nargs="?",
        help="Copy artifacts to local directory (default: current dir)",
    )

    def execute(self) -> None:
        """Run the debug command for the requested invocation/job IDs.

        Prints the launcher version, resolves the IDs against the execution
        DB and then runs exactly one mode. Modes are checked in this order
        and the first match wins: ``--config``, ``--logs``, ``--artifacts``,
        ``--copy-logs``, ``--copy-artifacts``; with none set, the full
        per-job summary is printed.

        Raises:
            ValueError: If no invocation or job IDs were provided.
        """
        # show version
        VersionCmd().execute()

        logger.info("Debug command started", invocation_ids=self.invocation_ids)

        if not self.invocation_ids:
            logger.error("No invocation IDs provided")
            raise ValueError("No job or invocation IDs provided.")

        jobs = self._resolve_jobs()
        logger.info(
            "Resolved jobs",
            total_ids=len(self.invocation_ids),
            valid_jobs=len(jobs),
            job_ids=[jid for jid, _ in jobs],
        )

        if not jobs:
            # Same text goes to the log and to stdout so the two cannot drift.
            not_found_msg = (
                "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
            )
            logger.info(not_found_msg)
            print(not_found_msg)
            return

        if self.config:
            logger.info("Showing job configuration", job_count=len(jobs))
            self._show_config_info(jobs)
        elif self.logs:
            logger.info("Showing job logs locations", job_count=len(jobs))
            self._show_logs_info(jobs)
        elif self.artifacts:
            logger.info("Showing artifacts locations", job_count=len(jobs))
            self._show_artifacts_info(jobs)
        elif self.copy_logs is not None:
            # An empty value means "flag given without a path": use the cwd.
            dest = self.copy_logs or "."
            if not self.copy_logs:
                print(
                    "No destination provided for --copy-logs; defaulting to current dir"
                )
            logger.info(
                "Copying logs to local directory", dest_dir=dest, job_count=len(jobs)
            )
            self._copy_logs(jobs, dest)
        elif self.copy_artifacts is not None:
            dest = self.copy_artifacts or "."
            if not self.copy_artifacts:
                print(
                    "No destination provided for --copy-artifacts; defaulting to current dir"
                )
            logger.info(
                "Copying artifacts to local directory",
                dest_dir=dest,
                job_count=len(jobs),
            )
            self._copy_artifacts(jobs, dest)
        else:
            # `jobs` is known to be non-empty here (checked above).
            logger.info(
                "Job metadata details",
                invocation_id=jobs[0][1].invocation_id,
                jobs=len(jobs),
            )
            self._show_invocation_debug_info(jobs)

    def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
        """Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs).

        An ID containing a ``.`` is looked up as a single job; anything else
        is passed to ``ExecutionDB.get_jobs`` and may expand to several jobs.

        Returns:
            Unique ``(job_id, job_data)`` pairs sorted by job ID. When the same
            job is reachable through several IDs, the first match is kept.
        """
        db = ExecutionDB()
        # dict preserves first-seen entries, which gives de-duplication for free
        found: Dict[str, JobData] = {}
        for id_or_prefix in self.invocation_ids:
            if "." in id_or_prefix:
                jd = db.get_job(id_or_prefix)
                if jd:
                    found.setdefault(jd.job_id, jd)
            else:
                for jid, jd in db.get_jobs(id_or_prefix).items():
                    found.setdefault(jid, jd)
        return sorted(found.items(), key=lambda pair: pair[0])

    def _show_invocation_debug_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        """Print the full summary for every job plus footer hints.

        The invocation shown in the header and footer is taken from the first
        job only, even if ``jobs`` spans several invocations.
        """
        inv = jobs[0][1].invocation_id if jobs else None
        logger.info("Debug information", jobs=len(jobs), invocation=inv)
        inv_suffix = f" under invocation {inv}" if inv else ""
        print(f"Debug information for {len(jobs)} job(s){inv_suffix}:\n")

        for job_id, job_data in jobs:
            self._show_job_debug_info(job_id, job_data)
            print()

        # footer hint: where to find more metadata
        print(
            "For more details about this run, inspect the Execution DB under your home dir:"
        )
        print(f"Path: {EXEC_DB_FILE}")
        if inv:
            print(f"├── Lookup key: invocation_id={inv}")

        # Next steps hint
        print("\nNext steps:")
        print("  - Use --logs to show log locations.")
        print("  - Use --artifacts to show artifact locations.")
        print("  - Use --config to show stored job configuration (YAML).")
        print("  - Use --copy-logs [DIR] to copy logs to a local directory.")
        print("  - Use --copy-artifacts [DIR] to copy artifacts to a local directory.")

    def _show_job_debug_info(self, job_id: str, job_data: JobData) -> None:
        """Print metadata, artifact/log locations and executor details for one job."""
        logger.info("Job", job_id=job_id)
        print(f"Job {job_id}")

        # metadata
        try:
            when = datetime.fromtimestamp(job_data.timestamp).isoformat(
                timespec="seconds"
            )
        except (TypeError, ValueError, OverflowError, OSError):
            # Non-numeric or out-of-range timestamp: show the raw value instead.
            when = str(job_data.timestamp)
        logger.info("Executor", job_id=job_id, executor=job_data.executor)
        logger.info("Created", job_id=job_id, created=when)
        print(f"├── Executor: {job_data.executor}")
        print(f"├── Created: {when}")

        task_name = get_task_name(job_data)
        if task_name:
            logger.info("Task", job_id=job_id, name=task_name)
            print(f"├── Task: {task_name}")

        # locations via exporter helper
        paths = _EXPORT_HELPER.get_job_paths(job_data)
        is_remote = paths.get("storage_type") == "remote_ssh"

        # Artifacts
        if is_remote:
            artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
            logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
            print(f"├── Artifacts: {artifacts_path} (remote)")
        else:
            ap = paths.get("artifacts_dir")
            if ap:
                exists = self._check_path_exists(paths, "artifacts")
                logger.info(
                    "Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
                )
                print(f"├── Artifacts: {ap} {exists} (local)")

        # Logs
        if is_remote:
            logs_path = (
                f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
            )
            logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
            print(f"├── Logs: {logs_path} (remote)")
        else:
            lp = paths.get("logs_dir")
            if lp:
                exists = self._check_path_exists(paths, "logs")
                logger.info(
                    "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
                )
                print(f"├── Logs: {lp} {exists} (local)")

        # executor-specific
        d = job_data.data or {}
        cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
        # Prefer the executor recorded on the job; fall back to the stored config.
        exec_type = (job_data.executor or cfg_exec_type or "").lower()

        if exec_type == "slurm":
            sj = d.get("slurm_job_id")
            if sj:
                print(f"├── Slurm Job ID: {sj}")
        elif exec_type == "gitlab":
            pid = d.get("pipeline_id")
            if pid:
                print(f"├── Pipeline ID: {pid}")
        elif exec_type == "lepton":
            jn = d.get("lepton_job_name")
            if jn:
                print(f"├── Lepton Job: {jn}")
            en = d.get("endpoint_name")
            if en:
                print(f"├── Endpoint: {en}")
            eu = d.get("endpoint_url")
            if eu:
                print(f"├── Endpoint URL: {eu}")
        # local and others: paths already displayed above; no extra fields needed

    def _show_locations(
        self,
        jobs: List[Tuple[str, JobData]],
        kind: str,
        header: str,
        label: str,
    ) -> None:
        """Print one location line per job for either logs or artifacts.

        Args:
            jobs: ``(job_id, job_data)`` pairs to report on.
            kind: ``"logs"`` or ``"artifacts"``; selects the ``<kind>_dir`` key
                and the remote sub-directory.
            header: Heading logged and printed before the list.
            label: Event name used for the per-job log records.
        """
        logger.info(header)
        print(f"{header}:\n")
        for job_id, job_data in jobs:
            paths = _EXPORT_HELPER.get_job_paths(job_data)
            if paths.get("storage_type") == "remote_ssh":
                location = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/{kind}"
                logger.info(label, job_id=job_id, path=location, remote=True)
                print(f"{job_id}: {location} (remote)")
                continue
            local_dir = paths.get(f"{kind}_dir")
            if local_dir:
                exists = self._check_path_exists(paths, kind)
                logger.info(
                    label,
                    job_id=job_id,
                    path=str(local_dir),
                    exists_indicator=exists,
                )
                print(f"{job_id}: {local_dir} {exists} (local)")

    def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        """Print where the logs of each job live (local path or ssh URL)."""
        self._show_locations(jobs, kind="logs", header="Log locations", label="Logs")

    def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        """Print where the artifacts of each job live (local path or ssh URL)."""
        self._show_locations(
            jobs, kind="artifacts", header="Artifact locations", label="Artifacts"
        )

    def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
        """Print the stored configuration of each job as YAML."""
        for job_id, job_data in jobs:
            logger.info("Configuration for job", job_id=job_id)
            print(f"Configuration for {job_id}:")
            if job_data.config:
                # Imported lazily: only needed when a job actually has a config.
                import yaml

                config_yaml = yaml.dump(
                    job_data.config, default_flow_style=False, indent=2
                )
                logger.info("Configuration YAML", job_id=job_id, config=config_yaml)
                print(config_yaml)
            else:
                logger.info("No configuration stored for this job", job_id=job_id)
                print("  No configuration stored for this job.")
            print()

    def _copy_logs(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
        """Copy logs using export functionality."""
        self._copy_content(jobs, dest_dir, copy_logs=True, copy_artifacts=False)

    def _copy_artifacts(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
        """Copy artifacts using export functionality."""
        self._copy_content(jobs, dest_dir, copy_logs=False, copy_artifacts=True)

    def _copy_content(
        self,
        jobs: List[Tuple[str, JobData]],
        dest_dir: str,
        copy_logs: bool,
        copy_artifacts: bool,
    ) -> None:
        """Copy logs or artifacts for ``jobs`` into ``dest_dir`` via the local exporter.

        Args:
            jobs: ``(job_id, job_data)`` pairs whose content should be copied.
            dest_dir: Destination directory passed to the exporter as ``output_dir``.
            copy_logs: Request a logs copy.
            copy_artifacts: Request an artifacts copy.

        Exactly one of ``copy_logs`` / ``copy_artifacts`` is expected to be
        true; if both are set, both exporter flags end up false.
        """
        logger.debug(
            "Preparing export call",
            dest_dir=dest_dir,
            copy_logs=copy_logs,
            copy_artifacts=copy_artifacts,
            job_ids=[jid for jid, _ in jobs],
        )

        # Imported here rather than at module level, as in the original.
        from nemo_evaluator_launcher.api.functional import export_results

        config = {
            "output_dir": dest_dir,
            "only_required": True,
            "copy_logs": bool(copy_logs) and not bool(copy_artifacts),
            "copy_artifacts": bool(copy_artifacts) and not bool(copy_logs),
        }
        # skip artifact validation
        if copy_logs and not copy_artifacts:
            config["skip_validation"] = True

        job_ids = [job_id for job_id, _ in jobs]
        kind = "logs" if copy_logs else "artifacts"
        logger.info(
            "Copying content", kind=kind, job_count=len(job_ids), dest_dir=dest_dir
        )
        print(f"Copying {kind} for {len(job_ids)} job(s) to {dest_dir}...")

        result = export_results(job_ids, "local", config)
        logger.debug("Export API call completed", success=result.get("success"))

        if not result.get("success"):
            err = result.get("error", "Unknown error")
            logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
            print(f"Failed to copy {kind}: {err}")
            return

        logger.info(
            "Content copy completed successfully",
            dest_dir=dest_dir,
            job_count=len(jobs),
        )
        # Per-job outcomes are optional in the exporter result.
        for jid, job_result in result.get("jobs", {}).items():
            if job_result.get("success"):
                print(f"{jid}: Success")
            else:
                print(
                    f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
                )

    def _check_path_exists(self, paths: Dict[str, Any], path_type: str) -> str:
        """Check if a path exists and return indicator.

        Args:
            paths: Mapping returned by the exporter's ``get_job_paths``.
            path_type: ``"logs"`` or ``"artifacts"``.

        Returns:
            ``"(remote)"`` for remote-ssh storage, ``"(exists)"`` or
            ``"(not found)"`` for a local directory, and ``""`` when the
            path type is unknown, the key is missing, or the check fails.
        """
        if paths.get("storage_type") == "remote_ssh":
            # For remote paths, we can't easily check existence
            return "(remote)"

        key = f"{path_type}_dir"
        if path_type not in ("logs", "artifacts") or key not in paths:
            return ""

        try:
            return "(exists)" if Path(paths[key]).exists() else "(not found)"
        except (OSError, TypeError, ValueError) as exc:
            # Best-effort indicator only: record why it failed instead of
            # silently swallowing the error, but never break the listing.
            logger.debug(
                "Could not check path existence", path_type=path_type, error=str(exc)
            )
            return ""
@@ -20,6 +20,8 @@ from typing import Optional
20
20
 
21
21
  from simple_parsing import field
22
22
 
23
+ from nemo_evaluator_launcher.common.logging_utils import logger
24
+
23
25
 
24
26
  @dataclass
25
27
  class Cmd:
@@ -27,12 +29,16 @@ class Cmd:
27
29
 
28
30
  limit: Optional[int] = field(default=None, alias=["--limit"], help="Max rows")
29
31
  executor: Optional[str] = field(
30
- default=None, alias=["--executor"], help="Filter by executor"
32
+ default=None,
33
+ alias=["--executor"],
34
+ help="Filter by executor",
31
35
  )
36
+ # TODO(agronskiy): think about if we can propagate a `--status` filter into here.
32
37
  since: Optional[str] = field(
33
38
  default=None,
34
39
  alias=["--since"],
35
- help="Filter by ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00)",
40
+ help="Filter by either ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00) or "
41
+ "an interval into the past, e.g. `1d` or `3h`; formally `{N}[d|h]`.",
36
42
  )
37
43
 
38
44
  def execute(self) -> None:
@@ -53,7 +59,22 @@ class Cmd:
53
59
 
54
60
  if self.since:
55
61
  try:
56
- if "T" in self.since:
62
+ # Check if it's a relative time format like "1d" or "3h"
63
+ if self.since.lower().endswith("d") and len(self.since) > 1:
64
+ days = int(self.since[:-1])
65
+ if days < 0:
66
+ raise ValueError("Days should be non-negative")
67
+ since_ts = (
68
+ _dt.datetime.now() - _dt.timedelta(days=days)
69
+ ).timestamp()
70
+ elif self.since.lower().endswith("h") and len(self.since) > 1:
71
+ hours = int(self.since[:-1])
72
+ if hours < 0:
73
+ raise ValueError("Hours should be non-negative")
74
+ since_ts = (
75
+ _dt.datetime.now() - _dt.timedelta(hours=hours)
76
+ ).timestamp()
77
+ elif "T" in self.since:
57
78
  since_ts = _dt.datetime.fromisoformat(self.since).timestamp()
58
79
  else:
59
80
  since_ts = _dt.datetime.fromisoformat(
@@ -61,9 +82,8 @@ class Cmd:
61
82
  ).timestamp()
62
83
  rows = [r for r in rows if (r.get("earliest_job_ts") or 0) >= since_ts]
63
84
  except Exception:
64
- print(
65
- f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS",
66
- file=sys.stderr,
85
+ logger.fatal(
86
+ f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or N[d|h] for N days|hours."
67
87
  )
68
88
  sys.exit(2)
69
89
 
@@ -19,6 +19,7 @@ import os
19
19
 
20
20
  from simple_parsing import ArgumentParser
21
21
 
22
+ import nemo_evaluator_launcher.cli.debug as debug
22
23
  import nemo_evaluator_launcher.cli.export as export
23
24
  import nemo_evaluator_launcher.cli.kill as kill
24
25
  import nemo_evaluator_launcher.cli.ls_runs as ls_runs
@@ -38,7 +39,16 @@ def is_verbose_enabled(args) -> bool:
38
39
  return True
39
40
 
40
41
  # Check subcommand verbose flags
41
- subcommands = ["run", "status", "kill", "tasks_alias", "tasks", "runs", "export"]
42
+ subcommands = [
43
+ "run",
44
+ "status",
45
+ "kill",
46
+ "tasks_alias",
47
+ "tasks",
48
+ "runs",
49
+ "export",
50
+ "debug",
51
+ ]
42
52
  for subcmd in subcommands:
43
53
  if hasattr(args, subcmd) and hasattr(getattr(args, subcmd), "verbose"):
44
54
  if getattr(getattr(args, subcmd), "verbose"):
@@ -153,6 +163,17 @@ def create_parser() -> ArgumentParser:
153
163
  )
154
164
  export_parser.add_arguments(export.ExportCmd, dest="export")
155
165
 
166
+ # Debug helper subcommand
167
+ debug_parser = subparsers.add_parser(
168
+ "debug",
169
+ help="Display evaluation job information",
170
+ description="Debug helper functionalities for nemo-evaluator-launcher",
171
+ )
172
+ debug_parser.add_argument(
173
+ "-v", "--verbose", action="store_true", help="Enable verbose logging"
174
+ )
175
+ debug_parser.add_arguments(debug.DebugCmd, dest="debug")
176
+
156
177
  return parser
157
178
 
158
179
 
@@ -197,6 +218,8 @@ def main() -> None:
197
218
  args.runs.execute()
198
219
  elif args.command == "export":
199
220
  args.export.execute()
221
+ elif args.command == "debug":
222
+ args.debug.execute()
200
223
 
201
224
 
202
225
  if __name__ == "__main__":
@@ -168,3 +168,7 @@ class Cmd:
168
168
  for idx, task in enumerate(tasks):
169
169
  job_id = f"{invocation_id}.{idx}"
170
170
  print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
171
+ print(
172
+ "to print all jobs: nemo-evaluator-launcher ls runs"
173
+ "\n (--since 1d or --since 6h for time span, see --help)"
174
+ )
@@ -78,9 +78,32 @@ class LeptonExecutor(BaseExecutor):
78
78
  "LeptonExecutor supports deployment types: 'vllm', 'sglang', 'nim', 'none'"
79
79
  )
80
80
 
81
+ # Load tasks mapping
82
+ tasks_mapping = load_tasks_mapping()
83
+ job_ids = []
84
+ lepton_job_names = []
85
+ endpoint_names = [] # Track multiple endpoints
86
+ db = ExecutionDB()
87
+
81
88
  # Generate invocation ID
82
89
  invocation_id = generate_invocation_id()
83
90
 
91
+ # DRY-RUN mode
92
+ if dry_run:
93
+ output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
94
+ output_dir.mkdir(parents=True, exist_ok=True)
95
+
96
+ # Validate configuration
97
+ _dry_run_lepton(cfg, tasks_mapping, invocation_id=invocation_id)
98
+
99
+ if cfg.deployment.type == "none":
100
+ print("Using existing endpoint (deployment: none)")
101
+ print("using shared endpoint")
102
+ else:
103
+ print(f"with endpoint type '{cfg.deployment.type}'")
104
+
105
+ return invocation_id
106
+
84
107
  # For deployment: none, we use the existing endpoint for all tasks
85
108
  if cfg.deployment.type == "none":
86
109
  print("📌 Using existing endpoint (deployment: none)")
@@ -88,13 +111,6 @@ class LeptonExecutor(BaseExecutor):
88
111
  print(f"✅ Using shared endpoint: {shared_endpoint_url}")
89
112
 
90
113
  try:
91
- # Load tasks mapping
92
- tasks_mapping = load_tasks_mapping()
93
- job_ids = []
94
- lepton_job_names = []
95
- endpoint_names = [] # Track multiple endpoints
96
- db = ExecutionDB()
97
-
98
114
  # Create local directory for outputs
99
115
  output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
100
116
  output_dir.mkdir(parents=True, exist_ok=True)
@@ -139,8 +155,13 @@ class LeptonExecutor(BaseExecutor):
139
155
  task_index = str(idx)
140
156
  endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
141
157
 
142
- # Ensure we don't exceed 36 character limit
143
158
  if len(endpoint_name) > 36:
159
+ logger.info(
160
+ "Lepton endpoint name will be deployed under name {task_name}",
161
+ task_name=task.name,
162
+ original=endpoint_name,
163
+ limit=36,
164
+ )
144
165
  # Truncate task name further if needed
145
166
  max_task_len = (
146
167
  36
@@ -151,7 +172,19 @@ class LeptonExecutor(BaseExecutor):
151
172
  ) # 3 hyphens
152
173
  short_task_name = sanitized_task_name[:max_task_len]
153
174
  endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
175
+ logger.info(
176
+ "Lepton endpoint name is auto-generated",
177
+ task_name=task.name,
178
+ original=endpoint_name,
179
+ truncated=endpoint_name,
180
+ limit=36,
181
+ )
154
182
 
183
+ logger.info(
184
+ "Lepton endpoint name (auto-generated)",
185
+ task_name=task.name,
186
+ endpoint_name=endpoint_name,
187
+ )
155
188
  endpoint_names.append(endpoint_name)
156
189
  endpoint_creation_tasks.append((idx, task, endpoint_name))
157
190
 
@@ -298,20 +331,6 @@ class LeptonExecutor(BaseExecutor):
298
331
  f"✅ All {len(cfg.evaluation.tasks)} endpoints created successfully!"
299
332
  )
300
333
 
301
- if dry_run:
302
- print("🔍 DRY RUN: Lepton job configurations prepared")
303
- print(f" - Tasks: {len(cfg.evaluation.tasks)}")
304
- for idx, task in enumerate(cfg.evaluation.tasks):
305
- if cfg.deployment.type == "none":
306
- print(f" - Task {idx}: {task.name} using shared endpoint")
307
- else:
308
- print(
309
- f" - Task {idx}: {task.name} with endpoint {endpoint_names[idx]}"
310
- )
311
- print(f" - Output directory: {output_dir}")
312
- print("\nTo submit jobs, run the executor without --dry-run")
313
- return invocation_id
314
-
315
334
  # ================================================================
316
335
  # JOB SUBMISSION (Sequential, as before)
317
336
  # ================================================================
@@ -334,8 +353,18 @@ class LeptonExecutor(BaseExecutor):
334
353
  max_base_length = 36 - 1 - len(suffix) # -1 for the hyphen
335
354
  if len(base_job_name) > max_base_length:
336
355
  base_job_name = base_job_name[:max_base_length]
356
+ logger.info(
357
+ "Lepton job auto-generated name",
358
+ task_name=task.name,
359
+ job_name=f"{base_job_name}-{suffix}",
360
+ )
337
361
 
338
362
  lepton_job_name = f"{base_job_name}-{suffix}"
363
+ logger.info(
364
+ "Lepton job name (auto-generated)",
365
+ task_name=task.name,
366
+ job_name=lepton_job_name,
367
+ )
339
368
  job_ids.append(job_id)
340
369
  lepton_job_names.append(lepton_job_name)
341
370
 
@@ -773,6 +802,82 @@ exit 0
773
802
  return script
774
803
 
775
804
 
805
def _dry_run_cfg_get(node, key, default=None):
    """Read ``key`` from a config node that is either mapping-like or attribute-style.

    Nodes exposing a callable ``get`` are queried through it; anything else is
    read with ``getattr``. A ``None`` node or a ``None`` value yields ``default``
    so callers can chain lookups without guarding every level.
    """
    if node is None:
        return default
    getter = getattr(node, "get", None)
    value = getter(key, default) if callable(getter) else getattr(node, key, default)
    return default if value is None else value


def _dry_run_lepton(
    cfg: DictConfig, tasks_mapping: dict, invocation_id: str | None = None
) -> None:
    """Validate a Lepton run configuration and print a dry-run summary.

    Checks performed, in order:

    1. every task in ``cfg.evaluation.tasks`` resolves through
       ``get_task_from_mapping``;
    2. when ``cfg.deployment.type == "none"``, ``target.api_endpoint.url`` is a
       non-blank value; otherwise ``cfg.deployment.endpoints`` holds a path
       starting with ``/`` for each task's ``endpoint_type``;
    3. each task's ``required_env_vars`` are present under
       ``execution.lepton_platform.tasks.env_vars`` (``API_KEY`` may instead be
       satisfied by ``target.api_endpoint.api_key_name``).

    Args:
        cfg: Run configuration to validate.
        tasks_mapping: Mapping handed to ``get_task_from_mapping`` to resolve
            task definitions.
        invocation_id: When given, used as the last component of the previewed
            output directory; otherwise the literal ``<invocation_id>``
            placeholder is shown.

    Returns:
        None. Nothing is raised to the caller: any failure is printed as
        "Configuration invalid" and logged.
    """
    print("DRY RUN: Lepton job configurations prepared")
    try:
        tasks = list(cfg.evaluation.tasks)

        # Resolve each task definition once; this both validates the task names
        # and feeds the endpoint and env-var checks below.
        task_defs = [
            (task, get_task_from_mapping(task.name, tasks_mapping)) for task in tasks
        ]

        # Shared by the URL check and the API_KEY check so both read the
        # endpoint the same way, whether it is a dict or an attribute node.
        api_endpoint = _dry_run_cfg_get(getattr(cfg, "target", None), "api_endpoint")

        # nice-to-have checks (existing endpoint URL or endpoints mapping)
        if getattr(cfg.deployment, "type", None) == "none":
            url = _dry_run_cfg_get(api_endpoint, "url")
            if not url or not str(url).strip():
                raise ValueError(
                    "target.api_endpoint.url must be set when deployment.type == 'none'"
                )
        else:
            endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
            for task, task_def in task_defs:
                etype = task_def.get("endpoint_type")
                if etype not in endpoints_cfg:
                    raise ValueError(
                        f"deployment.endpoints missing path for endpoint_type '{etype}' (task '{task.name}')"
                    )
                path = endpoints_cfg.get(etype)
                if not isinstance(path, str) or not path.startswith("/"):
                    raise ValueError(
                        f"deployment.endpoints['{etype}'] must be a non-empty path starting with '/'"
                    )

        # lepton env var presence (reference-level); a missing or None
        # lepton_platform / tasks / env_vars level is treated as empty.
        lepton_platform = getattr(cfg.execution, "lepton_platform", None)
        tasks_cfg = _dry_run_cfg_get(lepton_platform, "tasks", {})
        lepton_env_vars = _dry_run_cfg_get(tasks_cfg, "env_vars", {})
        api_key_name = _dry_run_cfg_get(api_endpoint, "api_key_name")

        for task, task_def in task_defs:
            for var in task_def.get("required_env_vars", []) or []:
                if var == "API_KEY":
                    if "API_KEY" not in lepton_env_vars and not api_key_name:
                        raise ValueError(
                            f"Task '{task.name}' requires API_KEY: set execution.lepton_platform.tasks.env_vars.API_KEY "
                            "or target.api_endpoint.api_key_name"
                        )
                elif var not in lepton_env_vars:
                    raise ValueError(
                        f"Task '{task.name}' requires {var}: set it under execution.lepton_platform.tasks.env_vars"
                    )

        # success (use realized output directory if invocation_id is available)
        preview_output_dir = Path(cfg.execution.output_dir).absolute() / (
            invocation_id or "<invocation_id>"
        )
        print(f" - Tasks: {len(tasks)}")
        for idx, task in enumerate(tasks):
            print(f" - Task {idx}: {task.name}")
        print(f" - Output directory: {preview_output_dir}")
        print("\nTo run evaluation, execute run command without --dry-run")
    except Exception as exc:
        # Deliberately broad: a dry run reports every failure to the user
        # instead of propagating it.
        print(f"❌ Configuration invalid: {exc}")
        logger.error("Lepton dry-run validation failed", error=str(exc))
879
+
880
+
776
881
  def _get_statuses_for_invocation_id(id: str, db: ExecutionDB) -> List[ExecutionStatus]:
777
882
  """Helper method that returns statuses if id is the invocation id"""
778
883
  jobs = db.get_jobs(id)
@@ -16,7 +16,7 @@
16
16
  # Below is the _next_ version that will be published, not the currently published one.
17
17
  MAJOR = 0
18
18
  MINOR = 1
19
- PATCH = 15
19
+ PATCH = 17
20
20
  PRE_RELEASE = ""
21
21
 
22
22
  # Use the following formatting: (major, minor, patch, pre-release)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nemo-evaluator-launcher
3
- Version: 0.1.15
3
+ Version: 0.1.17
4
4
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
5
5
  Author: NVIDIA
6
6
  Author-email: nemo-toolkit@nvidia.com
@@ -14,6 +14,7 @@ src/nemo_evaluator_launcher/api/functional.py
14
14
  src/nemo_evaluator_launcher/api/types.py
15
15
  src/nemo_evaluator_launcher/api/utils.py
16
16
  src/nemo_evaluator_launcher/cli/__init__.py
17
+ src/nemo_evaluator_launcher/cli/debug.py
17
18
  src/nemo_evaluator_launcher/cli/export.py
18
19
  src/nemo_evaluator_launcher/cli/kill.py
19
20
  src/nemo_evaluator_launcher/cli/ls_runs.py