nemo-evaluator-launcher 0.1.17__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (65)
  1. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/PKG-INFO +1 -1
  2. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/types.py +9 -0
  3. nemo_evaluator_launcher-0.1.17/src/nemo_evaluator_launcher/cli/debug.py → nemo_evaluator_launcher-0.1.18/src/nemo_evaluator_launcher/cli/info.py +170 -63
  4. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/main.py +10 -10
  5. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/run.py +39 -13
  6. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/status.py +9 -8
  7. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/helpers.py +36 -4
  8. nemo_evaluator_launcher-0.1.18/src/nemo_evaluator_launcher/common/printing_utils.py +93 -0
  9. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +5 -4
  10. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/executor.py +11 -1
  11. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/local/executor.py +28 -13
  12. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/local/run.template.sh +4 -1
  13. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/slurm/executor.py +22 -7
  14. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/package_info.py +1 -1
  15. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
  16. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +2 -1
  17. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/LICENSE +0 -0
  18. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/README.md +0 -0
  19. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/pyproject.toml +0 -0
  20. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/setup.cfg +0 -0
  21. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/__init__.py +0 -0
  22. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
  23. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/functional.py +0 -0
  24. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/utils.py +0 -0
  25. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
  26. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/export.py +0 -0
  27. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
  28. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
  29. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
  30. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/version.py +0 -0
  31. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
  32. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
  33. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
  34. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
  35. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
  36. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
  37. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
  38. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
  39. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
  40. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
  41. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
  42. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -0
  43. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
  44. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
  45. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
  46. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/base.py +0 -0
  47. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
  48. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
  49. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
  50. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
  51. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
  52. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
  53. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
  54. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
  55. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
  56. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
  57. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
  58. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
  59. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/utils.py +0 -0
  60. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
  61. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
  62. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
  63. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
  64. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
  65. {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nemo-evaluator-launcher
- Version: 0.1.17
+ Version: 0.1.18
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
  Author: NVIDIA
  Author-email: nemo-toolkit@nvidia.com
@@ -19,9 +19,18 @@ This module defines data structures and helpers for configuration and type safet
  """
 
  import os
+ import warnings
  from dataclasses import dataclass
  from typing import cast
 
+ # ruff: noqa: E402
+ # Later when adding optional module to hydra, since the internal package is optional,
+ # will generate a hydra warning. We suppress it as distraction and bad UX, before hydra gets invoked.
+ warnings.filterwarnings(
+     "ignore",
+     message="provider=hydra.searchpath.*path=nemo_evaluator_launcher_internal.*is not available\\.",
+ )
+
  import hydra
  from hydra.core.global_hydra import GlobalHydra
  from omegaconf import DictConfig, OmegaConf
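The filter added to api/types.py silences one specific Hydra searchpath warning before `import hydra` runs, so other warnings still surface. A minimal standalone sketch of the same pattern (not launcher code; the warning text below is illustrative):

```python
import warnings

# Suppress a specific warning by message regex before the noisy import/call happens.
# The regex mirrors the one added in api/types.py above.
warnings.filterwarnings(
    "ignore",
    message=r"provider=hydra.searchpath.*path=nemo_evaluator_launcher_internal.*is not available\.",
)

# A warning whose message matches the regex is now dropped silently.
warnings.warn(
    "provider=hydra.searchpath, path=nemo_evaluator_launcher_internal is not available."
)
```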
@@ -14,16 +14,16 @@
  # limitations under the License.
  #
 
- """Debugging helper functionalities for nemo-evaluator-launcher."""
+ """Job information helper functionalities for nemo-evaluator-launcher."""
 
+ import sys
  from dataclasses import dataclass
  from datetime import datetime
  from pathlib import Path
- from typing import Any, Dict, List, Optional, Tuple
+ from typing import Any, Dict, List, Tuple
 
  from simple_parsing import field
 
- from nemo_evaluator_launcher.cli.export import ExportCmd
  from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
  from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
  from nemo_evaluator_launcher.common.logging_utils import logger
@@ -35,52 +35,60 @@ _EXPORT_HELPER = LocalExporter({})
 
 
  @dataclass
- class DebugCmd(ExportCmd):
-     """Debugging functionalities for nemo-evaluator-launcher.
+ class InfoCmd:
+     """Job information functionalities for nemo-evaluator-launcher.
 
      Examples:
-         nemo-evaluator-launcher debug <inv>  # Full debug info
-         nemo-evaluator-launcher debug <inv> --config  # Show stored job config (YAML)
-         nemo-evaluator-launcher debug <inv> --artifacts  # Show artifact locations
-         nemo-evaluator-launcher debug <inv> --logs  # Show log locations
-         nemo-evaluator-launcher debug <inv> --copy-logs <path>  # Copy logs (default: current dir)
-         nemo-evaluator-launcher debug <inv> --copy-artifacts <path>  # Copy artifacts (default: current dir)
+         nemo-evaluator-launcher info <inv>  # Full job info
+         nemo-evaluator-launcher info <inv> --config  # Show stored job config (YAML)
+         nemo-evaluator-launcher info <inv> --artifacts  # Show artifact locations and key files
+         nemo-evaluator-launcher info <inv> --logs  # Show log locations and key files
+         nemo-evaluator-launcher info <inv> --copy-logs <DIR>  # Copy logs to <DIR>
+         nemo-evaluator-launcher info <inv> --copy-artifacts <DIR>  # Copy artifacts to <DIR>
 
      Notes:
-     - Supports invocation IDs and job IDs
+     - Supports invocation IDs and job IDs (space-separated)
      - Shows local or remote paths depending on executor (local/slurm/lepton)
+     - Copy operations work for both local and remote jobs (expect longer time for remote jobs)
+     - Copy operations are not supported for Lepton executor (yet).
      """
 
-     # local exporter destination defaults to local
-     dest: str = field(default="local", init=False)
+     invocation_ids: List[str] = field(
+         positional=True,
+         help="IDs to show info for (space-separated). Accepts invocation IDs or/and job IDs.",
+     )
 
-     # debug modes
-     config: bool = field(default=False, help="Show job configuration")
-     artifacts: bool = field(default=False, help="Show artifact locations")
-     logs: bool = field(default=False, help="Show log locations")
+     # info modes
+     config: bool = field(
+         default=False, action="store_true", help="Show job configuration"
+     )
+     artifacts: bool = field(
+         default=False, action="store_true", help="Show artifact locations and key files"
+     )
+     logs: bool = field(
+         default=False, action="store_true", help="Show log locations and key files"
+     )
 
-     # copy operations
-     copy_logs: Optional[str] = field(
+     # copy operations - work for both local and remote jobs
+     copy_logs: str | None = field(
          default=None,
          alias=["--copy-logs"],
-         nargs="?",
-         help="Copy logs to local directory (default: current dir)",
+         help="Copy logs to a local directory",
+         metavar="DIR",
      )
-     copy_artifacts: Optional[str] = field(
+     copy_artifacts: str | None = field(
          default=None,
          alias=["--copy-artifacts"],
-         nargs="?",
-         help="Copy artifacts to local directory (default: current dir)",
+         help="Copy artifacts to a local directory",
+         metavar="DIR",
      )
 
      def execute(self) -> None:
-         # show version
          VersionCmd().execute()
-
-         logger.info("Debug command started", invocation_ids=self.invocation_ids)
+         logger.info("Info command started", invocation_ids=self.invocation_ids)
 
          if not self.invocation_ids:
-             logger.error("No invocation IDs provided")
+             logger.error("No job or invocation IDs provided.")
              raise ValueError("No job or invocation IDs provided.")
 
          jobs = self._resolve_jobs()
@@ -96,48 +104,63 @@ class DebugCmd(ExportCmd):
                  "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
              )
              print(
-                 "No valid jobs found (jobs may have been deletedd or IDs may be incorrect)."
+                 "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
              )
              return
 
+         # show ops
          if self.config:
-             logger.info("Showing job configuration", job_count=len(jobs))
              self._show_config_info(jobs)
-         elif self.logs:
-             logger.info("Showing job logs locations", job_count=len(jobs))
+         if self.logs:
              self._show_logs_info(jobs)
-         elif self.artifacts:
-             logger.info("Showing artifacts locations", job_count=len(jobs))
+         if self.artifacts:
              self._show_artifacts_info(jobs)
-         elif self.copy_logs is not None:
-             dest = self.copy_logs or "."
-             if not self.copy_logs:
-                 print(
-                     "No destination provided for --copy-logs; defaulting to current dir"
-                 )
+
+         # copy ops
+         args = sys.argv[1:]
+         copy_logs_flag = "--copy-logs" in args
+         copy_artifacts_flag = "--copy-artifacts" in args
+
+         if copy_logs_flag:
+             if self.copy_logs is None:
+                 raise ValueError("--copy-logs requires a directory path")
+             if not self.copy_logs.strip():
+                 raise ValueError("--copy-logs requires a directory path")
              logger.info(
-                 "Copying logs to local directory", dest_dir=dest, job_count=len(jobs)
+                 "Copying logs to local directory",
+                 dest_dir=self.copy_logs,
+                 job_count=len(jobs),
              )
-             self._copy_logs(jobs, dest)
-         elif self.copy_artifacts is not None:
-             dest = self.copy_artifacts or "."
-             if not self.copy_artifacts:
-                 print(
-                     "No destination provided for --copy-artifacts; defaulting to current dir)"
-                 )
+             self._copy_logs(jobs, self.copy_logs)
+
+         if copy_artifacts_flag:
+             if self.copy_artifacts is None:
+                 raise ValueError("--copy-artifacts requires a directory path")
+             if not self.copy_artifacts.strip():
+                 raise ValueError("--copy-artifacts requires a directory path")
              logger.info(
                  "Copying artifacts to local directory",
-                 dest_dir=dest,
+                 dest_dir=self.copy_artifacts,
                  job_count=len(jobs),
              )
-             self._copy_artifacts(jobs, dest)
-         else:
+             self._copy_artifacts(jobs, self.copy_artifacts)
+
+         # default view when no flags
+         if not any(
+             [
+                 self.config,
+                 self.logs,
+                 self.artifacts,
+                 self.copy_logs,
+                 self.copy_artifacts,
+             ]
+         ):
              logger.info(
                  "Job metadata details",
                  invocation_id=jobs[0][1].invocation_id if jobs else None,
                  jobs=len(jobs),
              )
-             self._show_invocation_debug_info(jobs)
+             self._show_invocation_info(jobs)
 
      def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
          """Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
@@ -160,15 +183,15 @@ class DebugCmd(ExportCmd):
              uniq.append((jid, jd))
          return sorted(uniq, key=lambda p: p[0])
 
-     def _show_invocation_debug_info(self, jobs: List[Tuple[str, JobData]]) -> None:
+     def _show_invocation_info(self, jobs: List[Tuple[str, JobData]]) -> None:
          inv = jobs[0][1].invocation_id if jobs else None
-         logger.info("Debug information", jobs=len(jobs), invocation=inv)
+         logger.info("Job information", jobs=len(jobs), invocation=inv)
          print(
-             f"Debug information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
+             f"Job information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
          )
 
          for job_id, job_data in jobs:
-             self._show_job_debug_info(job_id, job_data)
+             self._show_job_info(job_id, job_data)
              print()
 
          # footer hint: where to find more metadata
@@ -184,10 +207,14 @@ class DebugCmd(ExportCmd):
          print(" - Use --logs to show log locations.")
          print(" - Use --artifacts to show artifact locations.")
          print(" - Use --config to show stored job configuration (YAML).")
-         print(" - Use --copy-logs [DIR] to copy logs to a local directory.")
-         print(" - Use --copy-artifacts [DIR] to copy artifacts to a local directory.")
+         print(
+             " - Use --copy-logs [DIR] to copy logs to a local directory (works for local and remote jobs)."
+         )
+         print(
+             " - Use --copy-artifacts [DIR] to copy artifacts to a local directory (works for local and remote jobs)."
+         )
 
-     def _show_job_debug_info(self, job_id: str, job_data: JobData) -> None:
+     def _show_job_info(self, job_id: str, job_data: JobData) -> None:
          logger.info("Job", job_id=job_id)
          print(f"Job {job_id}")
 
@@ -208,14 +235,22 @@ class DebugCmd(ExportCmd):
          logger.info("Task", job_id=job_id, name=task_name)
          print(f"├── Task: {task_name}")
 
+         # Determine executor type for file descriptions
+         cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
+         exec_type = (job_data.executor or cfg_exec_type or "").lower()
+
          # locations via exporter helper
          paths = _EXPORT_HELPER.get_job_paths(job_data)
 
-         # Artifacts
+         # Artifacts with file descriptions
+         artifacts_list = _get_artifacts_file_list()
          if paths.get("storage_type") == "remote_ssh":
              artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
              logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
              print(f"├── Artifacts: {artifacts_path} (remote)")
+             print("│ └── Key files:")
+             for filename, desc in artifacts_list:
+                 print(f"│ ├── {filename} - {desc}")
          else:
              ap = paths.get("artifacts_dir")
              if ap:
@@ -224,14 +259,21 @@ class DebugCmd(ExportCmd):
                      "Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
                  )
                  print(f"├── Artifacts: {ap} {exists} (local)")
+                 print("│ └── Key files:")
+                 for filename, desc in artifacts_list:
+                     print(f"│ ├── {filename} - {desc}")
 
-         # Logs
+         # Logs with file descriptions
+         logs_list = _get_log_file_list(exec_type)
          if paths.get("storage_type") == "remote_ssh":
              logs_path = (
                  f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
              )
              logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
              print(f"├── Logs: {logs_path} (remote)")
+             print("│ └── Key files:")
+             for filename, desc in logs_list:
+                 print(f"│ ├── {filename} - {desc}")
          else:
              lp = paths.get("logs_dir")
              if lp:
@@ -240,6 +282,9 @@ class DebugCmd(ExportCmd):
                      "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
                  )
                  print(f"├── Logs: {lp} {exists} (local)")
+                 print("│ └── Key files:")
+                 for filename, desc in logs_list:
+                     print(f"│ ├── {filename} - {desc}")
 
          # executor-specific
          d = job_data.data or {}
@@ -264,17 +309,23 @@ class DebugCmd(ExportCmd):
              eu = d.get("endpoint_url")
              if eu:
                  print(f"├── Endpoint URL: {eu}")
-         # local and others: paths already displayed above; no extra fields needed
 
      def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
          logger.info("Log locations")
          print("Log locations:\n")
          for job_id, job_data in jobs:
              paths = _EXPORT_HELPER.get_job_paths(job_data)
+             cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
+             exec_type = (job_data.executor or cfg_exec_type or "").lower()
+             logs_list = _get_log_file_list(exec_type)
+
              if paths.get("storage_type") == "remote_ssh":
                  logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
                  logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
                  print(f"{job_id}: {logs_path} (remote)")
+                 print(" └── Key files:")
+                 for filename, desc in logs_list:
+                     print(f" ├── {filename} - {desc}")
              else:
                  lp = paths.get("logs_dir")
                  if lp:
@@ -283,18 +334,26 @@ class DebugCmd(ExportCmd):
                      "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
                  )
                  print(f"{job_id}: {lp} {exists} (local)")
+                 print(" └── Key files:")
+                 for filename, desc in logs_list:
+                     print(f" ├── {filename} - {desc}")
 
      def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
          logger.info("Artifact locations")
          print("Artifact locations:\n")
          for job_id, job_data in jobs:
              paths = _EXPORT_HELPER.get_job_paths(job_data)
+             artifacts_list = _get_artifacts_file_list()
+
              if paths.get("storage_type") == "remote_ssh":
                  artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
                  logger.info(
                      "Artifacts", job_id=job_id, path=artifacts_path, remote=True
                  )
                  print(f"{job_id}: {artifacts_path} (remote)")
+                 print(" └── Key files:")
+                 for filename, desc in artifacts_list:
+                     print(f" ├── {filename} - {desc}")
              else:
                  ap = paths.get("artifacts_dir")
                  if ap:
@@ -306,6 +365,9 @@ class DebugCmd(ExportCmd):
                      exists_indicator=exists,
                  )
                  print(f"{job_id}: {ap} {exists} (local)")
+                 print(" └── Key files:")
+                 for filename, desc in artifacts_list:
+                     print(f" ├── {filename} - {desc}")
 
      def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
          for job_id, job_data in jobs:
@@ -383,6 +445,9 @@ class DebugCmd(ExportCmd):
                      print(
                          f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
                      )
+             # Show full destination path
+             full_dest_path = Path(dest_dir).resolve()
+             print(f"Copied to: {full_dest_path}")
          else:
              err = result.get("error", "Unknown error")
              logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
@@ -403,3 +468,45 @@ class DebugCmd(ExportCmd):
          except Exception:
              pass
          return ""
+
+
+ # Helper functions for file descriptions (based on actual code and content analysis)
+ def _get_artifacts_file_list() -> list[tuple[str, str]]:
+     """Files generated in artifacts/."""
+     return [
+         (
+             "results.yml",
+             "Benchmark scores, task results and resolved run configuration.",
+         ),
+         (
+             "eval_factory_metrics.json",
+             "Response + runtime stats (latency, tokens count, memory)",
+         ),
+         ("metrics.json", "Harness/benchmark metric and configuration"),
+         ("report.html", "Request-Response Pairs samples in HTML format (if enabled)"),
+         ("report.json", "Report data in json format, if enabled"),
+     ]
+
+
+ def _get_log_file_list(executor_type: str) -> list[tuple[str, str]]:
+     """Files actually generated in logs/ - executor-specific."""
+     et = (executor_type or "local").lower()
+     if et == "slurm":
+         return [
+             ("client-{SLURM_JOB_ID}.out", "Evaluation container/process output"),
+             (
+                 "slurm-{SLURM_JOB_ID}.out",
+                 "SLURM scheduler stdout/stderr (batch submission, export steps).",
+             ),
+             (
+                 "server-{SLURM_JOB_ID}.out",
+                 "Model server logs when a deployment is used.",
+             ),
+         ]
+     # local executor
+     return [
+         (
+             "stdout.log",
+             "Complete evaluation output (timestamps, resolved config, run/export messages).",
+         ),
+     ]
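These two helpers back the "Key files" listings that the `info` command prints next to artifact and log locations. A quick standalone sketch of what they yield (hypothetical direct call, outside the CLI):

```python
# Hypothetical direct use of the helper added above: the key log files that
# `nemo-evaluator-launcher info <id> --logs` describes for a SLURM-executed job.
for filename, desc in _get_log_file_list("slurm"):
    print(f" ├── {filename} - {desc}")
# Prints client-{SLURM_JOB_ID}.out, slurm-{SLURM_JOB_ID}.out and
# server-{SLURM_JOB_ID}.out, each with the description from the list above.
```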
@@ -19,8 +19,8 @@ import os
 
  from simple_parsing import ArgumentParser
 
- import nemo_evaluator_launcher.cli.debug as debug
  import nemo_evaluator_launcher.cli.export as export
+ import nemo_evaluator_launcher.cli.info as info
  import nemo_evaluator_launcher.cli.kill as kill
  import nemo_evaluator_launcher.cli.ls_runs as ls_runs
  import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
@@ -42,12 +42,12 @@ def is_verbose_enabled(args) -> bool:
      subcommands = [
          "run",
          "status",
+         "info",
          "kill",
          "tasks_alias",
          "tasks",
          "runs",
          "export",
-         "debug",
      ]
      for subcmd in subcommands:
          if hasattr(args, subcmd) and hasattr(getattr(args, subcmd), "verbose"):
@@ -163,16 +163,16 @@ def create_parser() -> ArgumentParser:
      )
      export_parser.add_arguments(export.ExportCmd, dest="export")
 
-     # Debug helper subcommand
-     debug_parser = subparsers.add_parser(
-         "debug",
+     # Info subcommand
+     info_parser = subparsers.add_parser(
+         "info",
          help="Display evaluation job information",
-         description="Debug helper functionalities for nemo-evaluator-launcher",
+         description="Info functionalities for nemo-evaluator-launcher",
      )
-     debug_parser.add_argument(
+     info_parser.add_argument(
          "-v", "--verbose", action="store_true", help="Enable verbose logging"
      )
-     debug_parser.add_arguments(debug.DebugCmd, dest="debug")
+     info_parser.add_arguments(info.InfoCmd, dest="info")
 
      return parser
 
@@ -218,8 +218,8 @@ def main() -> None:
          args.runs.execute()
      elif args.command == "export":
          args.export.execute()
-     elif args.command == "debug":
-         args.debug.execute()
+     elif args.command == "info":
+         args.info.execute()
 
 
  if __name__ == "__main__":
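The cli/main.py changes amount to registering the renamed subcommand and dispatching to it. A rough sketch of that wiring (simple_parsing over argparse; the subparser and dest names match the diff above, everything else is illustrative):

```python
from simple_parsing import ArgumentParser

import nemo_evaluator_launcher.cli.info as info

# Illustrative wiring only, mirroring the `info` registration shown above.
parser = ArgumentParser()
subparsers = parser.add_subparsers(dest="command")
info_parser = subparsers.add_parser("info", help="Display evaluation job information")
info_parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")
info_parser.add_arguments(info.InfoCmd, dest="info")

args = parser.parse_args(["info", "<invocation-id>", "--logs"])
if args.command == "info":
    args.info.execute()
```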
@@ -19,6 +19,15 @@ from dataclasses import dataclass
 
  from simple_parsing import field
 
+ from nemo_evaluator_launcher.common.logging_utils import logger
+ from nemo_evaluator_launcher.common.printing_utils import (
+     bold,
+     cyan,
+     green,
+     magenta,
+     red,
+ )
+
 
  @dataclass
  class Cmd:
@@ -101,15 +110,10 @@ class Cmd:
          try:
              invocation_id = run_eval(config, self.dry_run)
          except Exception as e:
-             print(f"\033[31m✗ Job submission failed | Error: {e}\033[0m")
+             print(red(f"✗ Job submission failed, see logs | Error: {e}"))
+             logger.error("Job submission failed", error=e)
              raise
 
-         # Print general success message with invocation ID
-         if invocation_id is not None and not self.dry_run:
-             print(
-                 f"\033[32m✓ Job submission successful | Invocation ID: {invocation_id}\033[0m"
-             )
-
          # Save the complete configuration
          if not self.dry_run and invocation_id is not None:
              # Determine config output directory
@@ -151,14 +155,22 @@ class Cmd:
                  f.write("#\n")
                  f.write(config_yaml)
 
-             print(f"Complete run config saved to: {config_path}")
+             print(bold(cyan("Complete run config saved to: ")) + f"\n {config_path}\n")
+             logger.info("Saved complete config", path=config_path)
 
-         if invocation_id is not None:
-             print(f"to check status: nemo-evaluator-launcher status {invocation_id}")
-             print(f"to kill all jobs: nemo-evaluator-launcher kill {invocation_id}")
+         # Print general success message with invocation ID and helpful commands
+         if invocation_id is not None and not self.dry_run:
+             print(
+                 bold(cyan("To check status: "))
+                 + f"nemo-evaluator-launcher status {invocation_id}"
+             )
+             print(
+                 bold(cyan("To kill all jobs: "))
+                 + f"nemo-evaluator-launcher kill {invocation_id}"
+             )
 
              # Show actual job IDs and task names
-             print("to kill individual jobs:")
+             print(bold(cyan("To kill individual jobs:")))
              # Access tasks - will work after normalization in run_eval
              tasks = (
                  config.evaluation.tasks
@@ -168,7 +180,21 @@ class Cmd:
              for idx, task in enumerate(tasks):
                  job_id = f"{invocation_id}.{idx}"
                  print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
+
+             print(
+                 magenta(
+                     "(all commands accept shortened IDs as long as there are no conflicts)"
+                 )
+             )
              print(
-                 "to print all jobs: nemo-evaluator-launcher ls runs"
+                 bold(cyan("To print all jobs: ")) + "nemo-evaluator-launcher ls runs"
                  "\n (--since 1d or --since 6h for time span, see --help)"
              )
+
+             print(
+                 green(
+                     bold(
+                         f"✓ Job submission successful | Invocation ID: {invocation_id}"
+                     )
+                 )
+             )
@@ -17,6 +17,7 @@ from dataclasses import dataclass
 
  from simple_parsing import field
 
+ import nemo_evaluator_launcher.common.printing_utils as pu
  from nemo_evaluator_launcher.executors.base import ExecutionState
 
 
@@ -143,17 +144,17 @@ class Cmd:
          """Format status with Unicode visual indicators only."""
          # Status mapping based on ExecutionState enum
          status_formats = {
-             ExecutionState.SUCCESS.value: "\033[32m✓ SUCCESS\033[0m",  # Green Unicode checkmark
-             ExecutionState.FAILED.value: "\033[31m✗ FAILED\033[0m",  # Red Unicode X
-             ExecutionState.RUNNING.value: "\033[33m▶ RUNNING\033[0m",  # Yellow Unicode play button
-             ExecutionState.PENDING.value: "\033[36m⧗ PENDING\033[0m",  # Cyan Unicode hourglass (U+29D7)
-             ExecutionState.KILLED.value: "\033[35m✗ KILLED\033[0m",  # Magenta Unicode X
+             ExecutionState.SUCCESS.value: pu.green("✓ SUCCESS"),
+             ExecutionState.FAILED.value: pu.red("✗ FAILED"),
+             ExecutionState.RUNNING.value: pu.yellow("▶ RUNNING"),
+             ExecutionState.PENDING.value: pu.cyan("⧗ PENDING"),
+             ExecutionState.KILLED.value: pu.magenta("✗ KILLED"),
              # Additional states for error handling
-             "not_found": "\033[90m? NOT FOUND\033[0m",  # Gray question mark
-             "error": "\033[31m✗ ERROR\033[0m",  # Red Unicode X
+             "not_found": pu.grey("? NOT FOUND"),
+             "error": pu.red("✗ ERROR"),
          }
 
-         return status_formats.get(status.lower(), f"\033[90m? {status.upper()}\033[0m")
+         return status_formats.get(status.lower(), pu.grey(status.upper()))
 
      def _strip_ansi_codes(self, text: str) -> str:
          """Remove ANSI color codes from text for length calculation."""
@@ -16,6 +16,7 @@
  import base64
  import copy
  import datetime
+ from dataclasses import dataclass
  from typing import Optional
 
  import yaml
@@ -24,9 +25,36 @@ from omegaconf import DictConfig, OmegaConf
  from nemo_evaluator_launcher.common.logging_utils import logger
 
 
- def _yaml_to_echo_command(yaml_str: str, filename: str = "config_ef.yaml") -> str:
+ @dataclass(frozen=True)
+ class CmdAndReadableComment:
+     """See the comment to `_yaml_to_echo_command`."""
+
+     # Actual command. Might include hard-to-debug elements such as base64-encoded
+     # configs.
+     cmd: str
+     # A debuggale readable comment that can be passed along for accompanying
+     # the actual command
+     debug: str
+
+
+ def _yaml_to_echo_command(
+     yaml_str: str, filename: str = "config_ef.yaml"
+ ) -> CmdAndReadableComment:
+     """Create a safe (see below) echo command saving a yaml to file.
+
+     Safety in this context means the ability to pass such echo command through the
+     `bash -c '...'` boundaries for example.
+
+     Naturally, enconding with base64 creates debuggability issues. For that, the second
+     output of the function is the yaml string with bash comment signs prepended.
+     """
      yaml_str_b64 = base64.b64encode(yaml_str.encode("utf-8")).decode("utf-8")
-     return f'echo "{yaml_str_b64}" | base64 -d > {filename}'
+     debug_str = "\n".join(
+         [f"# Contents of {filename}"] + ["# " + s for s in yaml_str.splitlines()]
+     )
+     return CmdAndReadableComment(
+         cmd=f'echo "{yaml_str_b64}" | base64 -d > {filename}', debug=debug_str
+     )
 
 
  def get_eval_factory_config(
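The mechanism behind `_yaml_to_echo_command` is a plain base64 round trip: the YAML is encoded so it survives the quoting inside `bash -c '...'`, then decoded back into a file on the target side, while the `debug` field keeps a readable copy as bash comments. A minimal standalone sketch of the same idea (not the launcher helper itself; the YAML content is illustrative):

```python
import base64
import subprocess

yaml_str = "target:\n  api_endpoint:\n    url: http://localhost:8000\n"

# Encode the YAML so it crosses `bash -c '...'` quoting boundaries intact.
b64 = base64.b64encode(yaml_str.encode("utf-8")).decode("utf-8")
cmd = f'echo "{b64}" | base64 -d > config_ef.yaml'

# Readable counterpart, analogous to the `debug` field above.
debug = "\n".join(
    ["# Contents of config_ef.yaml"] + ["# " + s for s in yaml_str.splitlines()]
)

subprocess.run(["bash", "-c", cmd], check=True)  # writes config_ef.yaml verbatim
print(debug)
```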
@@ -55,7 +83,7 @@ def get_eval_factory_config(
 
  def get_eval_factory_command(
      cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
- ) -> str:
+ ) -> CmdAndReadableComment:
      config_fields = get_eval_factory_config(cfg, user_task_config, task_definition)
 
      overrides = copy.deepcopy(dict(cfg.evaluation.get("overrides", {})))
@@ -80,7 +108,11 @@ def get_eval_factory_command(
      if overrides:
          eval_command = f"{eval_command} --overrides {overrides_str}"
 
-     return create_file_cmd + " && " + "cat config_ef.yaml && " + eval_command
+     # We return both the command and the debugging base64-decoded strings, useful
+     # for exposing when building scripts.
+     return CmdAndReadableComment(
+         cmd=create_file_cmd.cmd + " && " + eval_command, debug=create_file_cmd.debug
+     )
 
 
  def get_endpoint_url(
@@ -0,0 +1,93 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ """Printing utils for more structured or visually appealing prints.
+
+ NOTE: use printing only for main application output that matters. For logging,
+ see `logging_utils.py`.
+
+ USAGE:
+ ```
+ from nemo_evaluator_launcher.common.printing_utils import red, bold
+ print(bold(red("some red bold")))
+ ```
+
+
+ """
+
+ import os
+
+ # If this env var is set, it will override a more standard "LOG_LEVEL". If
+ # both are unset, default would be used.
+ _DISABLE_COLOR_ENV_VAR = "NEMO_EVALUATOR_DISABLE_COLOR"
+
+
+ def _is_color_disabled():
+     env_var = os.environ.get(_DISABLE_COLOR_ENV_VAR, "0").lower()
+
+     if "1" in env_var or "yes" in env_var or "y" in env_var or "true" in env_var:
+         return True
+
+     return False
+
+
+ _CODES: dict[str, str] = dict(
+     green="\033[32m",
+     red="\033[31m",
+     red_bg="\033[41m",  # red background
+     cyan="\033[36m",
+     yellow="\033[33m",
+     magenta="\033[35m",
+     grey="\033[90m",
+     bold="\033[1m",
+     reset="\033[0m",
+ )
+
+ # If the colors are disabled, we null-out all the codes.
+ if _is_color_disabled():
+     for c in _CODES.keys():
+         _CODES[c] = ""
+
+
+ def green(s: str) -> str:
+     return _CODES["green"] + s + _CODES["reset"]
+
+
+ def red(s: str) -> str:
+     return _CODES["red"] + s + _CODES["reset"]
+
+
+ def red_bg(s: str) -> str:
+     return _CODES["red_bg"] + s + _CODES["reset"]
+
+
+ def cyan(s: str) -> str:
+     return _CODES["cyan"] + s + _CODES["reset"]
+
+
+ def yellow(s: str) -> str:
+     return _CODES["yellow"] + s + _CODES["reset"]
+
+
+ def magenta(s: str) -> str:
+     return _CODES["magenta"] + s + _CODES["reset"]
+
+
+ def grey(s: str) -> str:
+     return _CODES["grey"] + s + _CODES["reset"]
+
+
+ def bold(s: str) -> str:
+     return _CODES["bold"] + s + _CODES["reset"]
14
  # limitations under the License.
  #
  # Each slurm cluster has its own flavour, below we provide some defaults that might meet one's needs.
- hostname: ???
- username: ${oc.env:USER}
- account: ???
+ type: slurm  # Executor is chosen based on this field
+ hostname: ???  # SLURM headnode (login) hostname (required)
+ username: ${oc.env:USER}  # Defaults to $USER env var
+ account: ???  # SLURM account allocation (required)
+ output_dir: ???  # Absolute path accessible on compute nodes (required)
  partition: batch
  num_nodes: 1
  ntasks_per_node: 1
  gres: gpu:8
  walltime: 01:00:00
  subproject: nemo-evaluator-launcher
- output_dir: ???
  env_vars:
    deployment: {}
    evaluation: {}
406
                  cfg.target.api_endpoint.url = full_endpoint_url
 
                  # Generate command with the correct endpoint URL
-                 eval_command = get_eval_factory_command(cfg, task, task_definition)
+                 eval_command_struct = get_eval_factory_command(
+                     cfg, task, task_definition
+                 )
+                 eval_command = eval_command_struct.cmd
+                 # Debug string for explainability of some base64-parts of the command
+                 eval_command_debug_comment = eval_command_struct.debug
 
              finally:
                  # Restore original URL and struct mode
431
436
  task_name=task.name,
432
437
  invocation_id=invocation_id,
433
438
  eval_command=eval_command, # Pass the fixed command
439
+ eval_command_debug_comment=eval_command_debug_comment,
434
440
  )
435
441
 
436
442
  # Prepare job command to run the launch script
@@ -734,6 +740,7 @@ def _create_evaluation_launch_script(
734
740
  task_name: str,
735
741
  invocation_id: str,
736
742
  eval_command: str,
743
+ eval_command_debug_comment: str,
737
744
  ) -> str:
738
745
  """Create bash script for running evaluation in Lepton job container.
739
746
 
@@ -747,6 +754,7 @@ def _create_evaluation_launch_script(
747
754
  task_name: Name of the evaluation task.
748
755
  invocation_id: Unique invocation identifier.
749
756
  eval_command: The evaluation command with correct endpoint URL.
757
+ eval_command_debug_comment: The debug comment for placing into the script and easy debug
750
758
 
751
759
  Returns:
752
760
  String containing the bash launch script.
@@ -779,6 +787,8 @@ echo "Invocation ID: {invocation_id}"
779
787
  echo "Endpoint URL: {endpoint_url}"
780
788
  echo "Command: {eval_command_modified}"
781
789
 
790
+ {eval_command_debug_comment}
791
+
782
792
  # Execute the evaluation with proper error handling
783
793
  set +e
784
794
  {eval_command_modified}
@@ -47,6 +47,7 @@ from nemo_evaluator_launcher.common.mapping import (
47
      get_task_from_mapping,
      load_tasks_mapping,
  )
+ from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey
  from nemo_evaluator_launcher.executors.base import (
      BaseExecutor,
      ExecutionState,
155
156
 
156
157
  task_output_dir = output_dir / task.name
157
158
  task_output_dir.mkdir(parents=True, exist_ok=True)
159
+ eval_factory_command_struct = get_eval_factory_command(
160
+ cfg, task, task_definition
161
+ )
162
+ eval_factory_command = eval_factory_command_struct.cmd
163
+ # The debug comment for placing into the script and easy debug. Reason
164
+ # (see `CmdAndReadableComment`) is the current way of passing the command
165
+ # is base64-encoded config `echo`-ed into file.
166
+ # TODO(agronskiy): cleaner way is to encode everything with base64, not
167
+ # some parts (like ef_config.yaml) and just output as logs somewhere.
168
+ eval_factory_command_debug_comment = eval_factory_command_struct.debug
158
169
  evaluation_task = {
159
170
  "name": task.name,
160
171
  "job_id": job_id,
@@ -162,9 +173,8 @@ class LocalExecutor(BaseExecutor):
162
173
  "container_name": container_name,
163
174
  "env_vars": env_vars,
164
175
  "output_dir": task_output_dir,
165
- "eval_factory_command": get_eval_factory_command(
166
- cfg, task, task_definition
167
- ),
176
+ "eval_factory_command": eval_factory_command,
177
+ "eval_factory_command_debug_comment": eval_factory_command_debug_comment,
168
178
  }
169
179
  evaluation_tasks.append(evaluation_task)
170
180
 
@@ -198,23 +208,28 @@ class LocalExecutor(BaseExecutor):
198
208
  )
199
209
 
200
210
  if dry_run:
201
- print("\n\n=============================================\n\n")
202
- print(f"DRY RUN: Scripts prepared and saved to {output_dir}")
211
+ print(bold("\n\n=============================================\n\n"))
212
+ print(bold(cyan(f"DRY RUN: Scripts prepared and saved to {output_dir}")))
203
213
  if is_execution_mode_sequential:
204
214
  print(
205
- "\n\n =========== Main script | run_all.sequential.sh ===================== \n\n"
215
+ cyan(
216
+ "\n\n=========== Main script | run_all.sequential.sh =====================\n\n"
217
+ )
206
218
  )
219
+
207
220
  with open(output_dir / "run_all.sequential.sh", "r") as f:
208
- print(f.read())
221
+ print(grey(f.read()))
209
222
  else:
210
223
  for idx, task in enumerate(cfg.evaluation.tasks):
211
224
  task_output_dir = output_dir / task.name
212
225
  print(
213
- f"\n\n =========== Task script | {task.name}/run.sh ===================== \n\n"
226
+ cyan(
227
+ f"\n\n=========== Task script | {task.name}/run.sh =====================\n\n"
228
+ )
214
229
  )
215
230
  with open(task_output_dir / "run.sh", "r") as f:
216
- print(f.read())
217
- print("\nTo execute, run without --dry-run")
231
+ print(grey(f.read()))
232
+ print(bold("\nTo execute, run without --dry-run"))
218
233
  return invocation_id
219
234
 
220
235
  # Save launched jobs metadata
@@ -284,13 +299,13 @@ class LocalExecutor(BaseExecutor):
                  error_msg = f"Script for {name} exited with code {exit_code}"
                  raise RuntimeError(f"Job startup failed | {error_msg}")
 
-         print("\nCommands for real-time monitoring:")
+         print(bold(cyan("\nCommands for real-time monitoring:")))
          for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
              log_file = evaluation_task["output_dir"] / "logs" / "stdout.log"
              print(f" tail -f {log_file}")
 
-         print("\nFollow all logs for this invocation:")
-         print(f" tail -f {output_dir}/*/logs/stdout.log")
+         print(bold(cyan("\nFollow all logs for this invocation:")))
+         print(f" tail -f {output_dir}/*/logs/stdout.log\n")
 
          return invocation_id
 
@@ -40,6 +40,9 @@ else
      # Create pre-start stage file
      echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
 
+     # Debug contents of the eval factory command's config
+     {{ task.eval_factory_command_debug_comment | indent(4) }}
+
      # Docker run with eval factory command
      (
          echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
@@ -51,7 +54,7 @@ else
      {% endfor -%}
      {{ task.eval_image }} \
      bash -c '
-         {{ task.eval_factory_command }} ;
+         {{ task.eval_factory_command | indent(8) }} ;
          exit_code=$?
          chmod 777 -R /results;
          if [ "$exit_code" -ne 0 ]; then
@@ -50,6 +50,7 @@ from nemo_evaluator_launcher.common.mapping import (
      get_task_from_mapping,
      load_tasks_mapping,
  )
+ from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey
  from nemo_evaluator_launcher.executors.base import (
      BaseExecutor,
      ExecutionState,
@@ -130,13 +131,13 @@ class SlurmExecutor(BaseExecutor):
              remote_runsub_paths.append(remote_runsub_path)
 
          if dry_run:
-             print("\n\n=============================================\n\n")
-             print("DRY RUN: SLURM scripts prepared")
+             print(bold("\n\n=============================================\n\n"))
+             print(bold(cyan("DRY RUN: SLURM scripts prepared")))
              for idx, local_runsub_path in enumerate(local_runsub_paths):
-                 print(f"\n\n =========== Task {idx} ===================== \n\n")
+                 print(cyan(f"\n\n=========== Task {idx} =====================\n\n"))
                  with open(local_runsub_path, "r") as f:
-                     print(f.read())
-             print("\nTo submit jobs, run the executor without --dry-run")
+                     print(grey(f.read()))
+             print(bold("To submit jobs") + ", run the executor without --dry-run")
              return invocation_id
 
          socket = str(Path(tmpdirname) / "socket")
@@ -589,7 +590,20 @@ def _create_slurm_sbatch_script(
      ):
          evaluation_mounts_list.append(f"{source_mnt}:{target_mnt}")
 
+     eval_factory_command_struct = get_eval_factory_command(cfg, task, task_definition)
+     eval_factory_command = eval_factory_command_struct.cmd
+     # The debug comment for placing into the script and easy debug. Reason
+     # (see `CmdAndReadableComment`) is the current way of passing the command
+     # is base64-encoded config `echo`-ed into file.
+     # TODO(agronskiy): cleaner way is to encode everything with base64, not
+     # some parts (like ef_config.yaml) and just output as logs somewhere.
+     eval_factory_command_debug_comment = eval_factory_command_struct.debug
+
      # add evaluation srun command
+     s += "# Debug contents of the eval factory command's config\n"
+     s += eval_factory_command_debug_comment
+     s += "\n\n"
+
      s += "# evaluation client\n"
      s += "srun --mpi pmix --overlap "
      s += "--container-image {} ".format(eval_image)
@@ -600,10 +614,11 @@ def _create_slurm_sbatch_script(
      s += "--container-env {} ".format(",".join(evaluation_env_var_names))
      if not cfg.execution.get("mounts", {}).get("mount_home", True):
          s += "--no-container-mount-home "
+
      s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
      s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.out")
-     s += "bash -c '"
-     s += get_eval_factory_command(cfg, task, task_definition)
+     s += "bash -c '\n"
+     s += eval_factory_command
      s += "'\n\n"
 
      # terminate the server after all evaluation clients finish
@@ -16,7 +16,7 @@
  # Below is the _next_ version that will be published, not the currently published one.
  MAJOR = 0
  MINOR = 1
- PATCH = 17
+ PATCH = 18
  PRE_RELEASE = ""
 
  # Use the following formatting: (major, minor, patch, pre-release)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nemo-evaluator-launcher
- Version: 0.1.17
+ Version: 0.1.18
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
  Author: NVIDIA
  Author-email: nemo-toolkit@nvidia.com
@@ -14,8 +14,8 @@ src/nemo_evaluator_launcher/api/functional.py
  src/nemo_evaluator_launcher/api/types.py
  src/nemo_evaluator_launcher/api/utils.py
  src/nemo_evaluator_launcher/cli/__init__.py
- src/nemo_evaluator_launcher/cli/debug.py
  src/nemo_evaluator_launcher/cli/export.py
+ src/nemo_evaluator_launcher/cli/info.py
  src/nemo_evaluator_launcher/cli/kill.py
  src/nemo_evaluator_launcher/cli/ls_runs.py
  src/nemo_evaluator_launcher/cli/ls_tasks.py
@@ -28,6 +28,7 @@ src/nemo_evaluator_launcher/common/execdb.py
  src/nemo_evaluator_launcher/common/helpers.py
  src/nemo_evaluator_launcher/common/logging_utils.py
  src/nemo_evaluator_launcher/common/mapping.py
+ src/nemo_evaluator_launcher/common/printing_utils.py
  src/nemo_evaluator_launcher/configs/__init__.py
  src/nemo_evaluator_launcher/configs/default.yaml
  src/nemo_evaluator_launcher/configs/deployment/generic.yaml