nemo-evaluator-launcher 0.1.17__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nemo-evaluator-launcher might be problematic. Click here for more details.
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher-0.1.17/src/nemo_evaluator_launcher/cli/debug.py → nemo_evaluator_launcher-0.1.18/src/nemo_evaluator_launcher/cli/info.py +170 -63
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/main.py +10 -10
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/run.py +39 -13
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/status.py +9 -8
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/helpers.py +36 -4
- nemo_evaluator_launcher-0.1.18/src/nemo_evaluator_launcher/common/printing_utils.py +93 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +5 -4
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/executor.py +11 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/local/executor.py +28 -13
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/local/run.template.sh +4 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/slurm/executor.py +22 -7
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +2 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/README.md +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/pyproject.toml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/setup.cfg +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/functional.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/api/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/export.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/cli/version.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/base.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.18}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
|
@@ -19,9 +19,18 @@ This module defines data structures and helpers for configuration and type safet
|
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
21
|
import os
|
|
22
|
+
import warnings
|
|
22
23
|
from dataclasses import dataclass
|
|
23
24
|
from typing import cast
|
|
24
25
|
|
|
26
|
+
# ruff: noqa: E402
|
|
27
|
+
# Later, when adding the optional module to hydra, since the internal package is optional,
|
|
28
|
+
# it will generate a hydra warning. We suppress it, as it is a distraction and bad UX, before hydra gets invoked.
|
|
29
|
+
warnings.filterwarnings(
|
|
30
|
+
"ignore",
|
|
31
|
+
message="provider=hydra.searchpath.*path=nemo_evaluator_launcher_internal.*is not available\\.",
|
|
32
|
+
)
|
|
33
|
+
|
|
25
34
|
import hydra
|
|
26
35
|
from hydra.core.global_hydra import GlobalHydra
|
|
27
36
|
from omegaconf import DictConfig, OmegaConf
|
|
@@ -14,16 +14,16 @@
|
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
#
|
|
16
16
|
|
|
17
|
-
"""
|
|
17
|
+
"""Job information helper functionalities for nemo-evaluator-launcher."""
|
|
18
18
|
|
|
19
|
+
import sys
|
|
19
20
|
from dataclasses import dataclass
|
|
20
21
|
from datetime import datetime
|
|
21
22
|
from pathlib import Path
|
|
22
|
-
from typing import Any, Dict, List,
|
|
23
|
+
from typing import Any, Dict, List, Tuple
|
|
23
24
|
|
|
24
25
|
from simple_parsing import field
|
|
25
26
|
|
|
26
|
-
from nemo_evaluator_launcher.cli.export import ExportCmd
|
|
27
27
|
from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
|
|
28
28
|
from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
|
|
29
29
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
@@ -35,52 +35,60 @@ _EXPORT_HELPER = LocalExporter({})
|
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
@dataclass
|
|
38
|
-
class
|
|
39
|
-
"""
|
|
38
|
+
class InfoCmd:
|
|
39
|
+
"""Job information functionalities for nemo-evaluator-launcher.
|
|
40
40
|
|
|
41
41
|
Examples:
|
|
42
|
-
nemo-evaluator-launcher
|
|
43
|
-
nemo-evaluator-launcher
|
|
44
|
-
nemo-evaluator-launcher
|
|
45
|
-
nemo-evaluator-launcher
|
|
46
|
-
nemo-evaluator-launcher
|
|
47
|
-
nemo-evaluator-launcher
|
|
42
|
+
nemo-evaluator-launcher info <inv> # Full job info
|
|
43
|
+
nemo-evaluator-launcher info <inv> --config # Show stored job config (YAML)
|
|
44
|
+
nemo-evaluator-launcher info <inv> --artifacts # Show artifact locations and key files
|
|
45
|
+
nemo-evaluator-launcher info <inv> --logs # Show log locations and key files
|
|
46
|
+
nemo-evaluator-launcher info <inv> --copy-logs <DIR> # Copy logs to <DIR>
|
|
47
|
+
nemo-evaluator-launcher info <inv> --copy-artifacts <DIR> # Copy artifacts to <DIR>
|
|
48
48
|
|
|
49
49
|
Notes:
|
|
50
|
-
- Supports invocation IDs and job IDs
|
|
50
|
+
- Supports invocation IDs and job IDs (space-separated)
|
|
51
51
|
- Shows local or remote paths depending on executor (local/slurm/lepton)
|
|
52
|
+
- Copy operations work for both local and remote jobs (expect longer time for remote jobs)
|
|
53
|
+
- Copy operations are not supported for Lepton executor (yet).
|
|
52
54
|
"""
|
|
53
55
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
+
invocation_ids: List[str] = field(
|
|
57
|
+
positional=True,
|
|
58
|
+
help="IDs to show info for (space-separated). Accepts invocation IDs or/and job IDs.",
|
|
59
|
+
)
|
|
56
60
|
|
|
57
|
-
#
|
|
58
|
-
config: bool = field(
|
|
59
|
-
|
|
60
|
-
|
|
61
|
+
# info modes
|
|
62
|
+
config: bool = field(
|
|
63
|
+
default=False, action="store_true", help="Show job configuration"
|
|
64
|
+
)
|
|
65
|
+
artifacts: bool = field(
|
|
66
|
+
default=False, action="store_true", help="Show artifact locations and key files"
|
|
67
|
+
)
|
|
68
|
+
logs: bool = field(
|
|
69
|
+
default=False, action="store_true", help="Show log locations and key files"
|
|
70
|
+
)
|
|
61
71
|
|
|
62
|
-
# copy operations
|
|
63
|
-
copy_logs:
|
|
72
|
+
# copy operations - work for both local and remote jobs
|
|
73
|
+
copy_logs: str | None = field(
|
|
64
74
|
default=None,
|
|
65
75
|
alias=["--copy-logs"],
|
|
66
|
-
|
|
67
|
-
|
|
76
|
+
help="Copy logs to a local directory",
|
|
77
|
+
metavar="DIR",
|
|
68
78
|
)
|
|
69
|
-
copy_artifacts:
|
|
79
|
+
copy_artifacts: str | None = field(
|
|
70
80
|
default=None,
|
|
71
81
|
alias=["--copy-artifacts"],
|
|
72
|
-
|
|
73
|
-
|
|
82
|
+
help="Copy artifacts to a local directory",
|
|
83
|
+
metavar="DIR",
|
|
74
84
|
)
|
|
75
85
|
|
|
76
86
|
def execute(self) -> None:
|
|
77
|
-
# show version
|
|
78
87
|
VersionCmd().execute()
|
|
79
|
-
|
|
80
|
-
logger.info("Debug command started", invocation_ids=self.invocation_ids)
|
|
88
|
+
logger.info("Info command started", invocation_ids=self.invocation_ids)
|
|
81
89
|
|
|
82
90
|
if not self.invocation_ids:
|
|
83
|
-
logger.error("No invocation IDs provided")
|
|
91
|
+
logger.error("No job or invocation IDs provided.")
|
|
84
92
|
raise ValueError("No job or invocation IDs provided.")
|
|
85
93
|
|
|
86
94
|
jobs = self._resolve_jobs()
|
|
@@ -96,48 +104,63 @@ class DebugCmd(ExportCmd):
|
|
|
96
104
|
"No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
|
|
97
105
|
)
|
|
98
106
|
print(
|
|
99
|
-
"No valid jobs found (jobs may have been
|
|
107
|
+
"No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
|
|
100
108
|
)
|
|
101
109
|
return
|
|
102
110
|
|
|
111
|
+
# show ops
|
|
103
112
|
if self.config:
|
|
104
|
-
logger.info("Showing job configuration", job_count=len(jobs))
|
|
105
113
|
self._show_config_info(jobs)
|
|
106
|
-
|
|
107
|
-
logger.info("Showing job logs locations", job_count=len(jobs))
|
|
114
|
+
if self.logs:
|
|
108
115
|
self._show_logs_info(jobs)
|
|
109
|
-
|
|
110
|
-
logger.info("Showing artifacts locations", job_count=len(jobs))
|
|
116
|
+
if self.artifacts:
|
|
111
117
|
self._show_artifacts_info(jobs)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
+
|
|
119
|
+
# copy ops
|
|
120
|
+
args = sys.argv[1:]
|
|
121
|
+
copy_logs_flag = "--copy-logs" in args
|
|
122
|
+
copy_artifacts_flag = "--copy-artifacts" in args
|
|
123
|
+
|
|
124
|
+
if copy_logs_flag:
|
|
125
|
+
if self.copy_logs is None:
|
|
126
|
+
raise ValueError("--copy-logs requires a directory path")
|
|
127
|
+
if not self.copy_logs.strip():
|
|
128
|
+
raise ValueError("--copy-logs requires a directory path")
|
|
118
129
|
logger.info(
|
|
119
|
-
"Copying logs to local directory",
|
|
130
|
+
"Copying logs to local directory",
|
|
131
|
+
dest_dir=self.copy_logs,
|
|
132
|
+
job_count=len(jobs),
|
|
120
133
|
)
|
|
121
|
-
self._copy_logs(jobs,
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
if
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
)
|
|
134
|
+
self._copy_logs(jobs, self.copy_logs)
|
|
135
|
+
|
|
136
|
+
if copy_artifacts_flag:
|
|
137
|
+
if self.copy_artifacts is None:
|
|
138
|
+
raise ValueError("--copy-artifacts requires a directory path")
|
|
139
|
+
if not self.copy_artifacts.strip():
|
|
140
|
+
raise ValueError("--copy-artifacts requires a directory path")
|
|
128
141
|
logger.info(
|
|
129
142
|
"Copying artifacts to local directory",
|
|
130
|
-
dest_dir=
|
|
143
|
+
dest_dir=self.copy_artifacts,
|
|
131
144
|
job_count=len(jobs),
|
|
132
145
|
)
|
|
133
|
-
self._copy_artifacts(jobs,
|
|
134
|
-
|
|
146
|
+
self._copy_artifacts(jobs, self.copy_artifacts)
|
|
147
|
+
|
|
148
|
+
# default view when no flags
|
|
149
|
+
if not any(
|
|
150
|
+
[
|
|
151
|
+
self.config,
|
|
152
|
+
self.logs,
|
|
153
|
+
self.artifacts,
|
|
154
|
+
self.copy_logs,
|
|
155
|
+
self.copy_artifacts,
|
|
156
|
+
]
|
|
157
|
+
):
|
|
135
158
|
logger.info(
|
|
136
159
|
"Job metadata details",
|
|
137
160
|
invocation_id=jobs[0][1].invocation_id if jobs else None,
|
|
138
161
|
jobs=len(jobs),
|
|
139
162
|
)
|
|
140
|
-
self.
|
|
163
|
+
self._show_invocation_info(jobs)
|
|
141
164
|
|
|
142
165
|
def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
|
|
143
166
|
"""Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
|
|
@@ -160,15 +183,15 @@ class DebugCmd(ExportCmd):
|
|
|
160
183
|
uniq.append((jid, jd))
|
|
161
184
|
return sorted(uniq, key=lambda p: p[0])
|
|
162
185
|
|
|
163
|
-
def
|
|
186
|
+
def _show_invocation_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
164
187
|
inv = jobs[0][1].invocation_id if jobs else None
|
|
165
|
-
logger.info("
|
|
188
|
+
logger.info("Job information", jobs=len(jobs), invocation=inv)
|
|
166
189
|
print(
|
|
167
|
-
f"
|
|
190
|
+
f"Job information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
|
|
168
191
|
)
|
|
169
192
|
|
|
170
193
|
for job_id, job_data in jobs:
|
|
171
|
-
self.
|
|
194
|
+
self._show_job_info(job_id, job_data)
|
|
172
195
|
print()
|
|
173
196
|
|
|
174
197
|
# footer hint: where to find more metadata
|
|
@@ -184,10 +207,14 @@ class DebugCmd(ExportCmd):
|
|
|
184
207
|
print(" - Use --logs to show log locations.")
|
|
185
208
|
print(" - Use --artifacts to show artifact locations.")
|
|
186
209
|
print(" - Use --config to show stored job configuration (YAML).")
|
|
187
|
-
print(
|
|
188
|
-
|
|
210
|
+
print(
|
|
211
|
+
" - Use --copy-logs [DIR] to copy logs to a local directory (works for local and remote jobs)."
|
|
212
|
+
)
|
|
213
|
+
print(
|
|
214
|
+
" - Use --copy-artifacts [DIR] to copy artifacts to a local directory (works for local and remote jobs)."
|
|
215
|
+
)
|
|
189
216
|
|
|
190
|
-
def
|
|
217
|
+
def _show_job_info(self, job_id: str, job_data: JobData) -> None:
|
|
191
218
|
logger.info("Job", job_id=job_id)
|
|
192
219
|
print(f"Job {job_id}")
|
|
193
220
|
|
|
@@ -208,14 +235,22 @@ class DebugCmd(ExportCmd):
|
|
|
208
235
|
logger.info("Task", job_id=job_id, name=task_name)
|
|
209
236
|
print(f"├── Task: {task_name}")
|
|
210
237
|
|
|
238
|
+
# Determine executor type for file descriptions
|
|
239
|
+
cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
|
|
240
|
+
exec_type = (job_data.executor or cfg_exec_type or "").lower()
|
|
241
|
+
|
|
211
242
|
# locations via exporter helper
|
|
212
243
|
paths = _EXPORT_HELPER.get_job_paths(job_data)
|
|
213
244
|
|
|
214
|
-
# Artifacts
|
|
245
|
+
# Artifacts with file descriptions
|
|
246
|
+
artifacts_list = _get_artifacts_file_list()
|
|
215
247
|
if paths.get("storage_type") == "remote_ssh":
|
|
216
248
|
artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
|
|
217
249
|
logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
|
|
218
250
|
print(f"├── Artifacts: {artifacts_path} (remote)")
|
|
251
|
+
print("│ └── Key files:")
|
|
252
|
+
for filename, desc in artifacts_list:
|
|
253
|
+
print(f"│ ├── {filename} - {desc}")
|
|
219
254
|
else:
|
|
220
255
|
ap = paths.get("artifacts_dir")
|
|
221
256
|
if ap:
|
|
@@ -224,14 +259,21 @@ class DebugCmd(ExportCmd):
|
|
|
224
259
|
"Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
|
|
225
260
|
)
|
|
226
261
|
print(f"├── Artifacts: {ap} {exists} (local)")
|
|
262
|
+
print("│ └── Key files:")
|
|
263
|
+
for filename, desc in artifacts_list:
|
|
264
|
+
print(f"│ ├── {filename} - {desc}")
|
|
227
265
|
|
|
228
|
-
# Logs
|
|
266
|
+
# Logs with file descriptions
|
|
267
|
+
logs_list = _get_log_file_list(exec_type)
|
|
229
268
|
if paths.get("storage_type") == "remote_ssh":
|
|
230
269
|
logs_path = (
|
|
231
270
|
f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
|
|
232
271
|
)
|
|
233
272
|
logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
|
|
234
273
|
print(f"├── Logs: {logs_path} (remote)")
|
|
274
|
+
print("│ └── Key files:")
|
|
275
|
+
for filename, desc in logs_list:
|
|
276
|
+
print(f"│ ├── {filename} - {desc}")
|
|
235
277
|
else:
|
|
236
278
|
lp = paths.get("logs_dir")
|
|
237
279
|
if lp:
|
|
@@ -240,6 +282,9 @@ class DebugCmd(ExportCmd):
|
|
|
240
282
|
"Logs", job_id=job_id, path=str(lp), exists_indicator=exists
|
|
241
283
|
)
|
|
242
284
|
print(f"├── Logs: {lp} {exists} (local)")
|
|
285
|
+
print("│ └── Key files:")
|
|
286
|
+
for filename, desc in logs_list:
|
|
287
|
+
print(f"│ ├── {filename} - {desc}")
|
|
243
288
|
|
|
244
289
|
# executor-specific
|
|
245
290
|
d = job_data.data or {}
|
|
@@ -264,17 +309,23 @@ class DebugCmd(ExportCmd):
|
|
|
264
309
|
eu = d.get("endpoint_url")
|
|
265
310
|
if eu:
|
|
266
311
|
print(f"├── Endpoint URL: {eu}")
|
|
267
|
-
# local and others: paths already displayed above; no extra fields needed
|
|
268
312
|
|
|
269
313
|
def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
270
314
|
logger.info("Log locations")
|
|
271
315
|
print("Log locations:\n")
|
|
272
316
|
for job_id, job_data in jobs:
|
|
273
317
|
paths = _EXPORT_HELPER.get_job_paths(job_data)
|
|
318
|
+
cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
|
|
319
|
+
exec_type = (job_data.executor or cfg_exec_type or "").lower()
|
|
320
|
+
logs_list = _get_log_file_list(exec_type)
|
|
321
|
+
|
|
274
322
|
if paths.get("storage_type") == "remote_ssh":
|
|
275
323
|
logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
|
|
276
324
|
logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
|
|
277
325
|
print(f"{job_id}: {logs_path} (remote)")
|
|
326
|
+
print(" └── Key files:")
|
|
327
|
+
for filename, desc in logs_list:
|
|
328
|
+
print(f" ├── {filename} - {desc}")
|
|
278
329
|
else:
|
|
279
330
|
lp = paths.get("logs_dir")
|
|
280
331
|
if lp:
|
|
@@ -283,18 +334,26 @@ class DebugCmd(ExportCmd):
|
|
|
283
334
|
"Logs", job_id=job_id, path=str(lp), exists_indicator=exists
|
|
284
335
|
)
|
|
285
336
|
print(f"{job_id}: {lp} {exists} (local)")
|
|
337
|
+
print(" └── Key files:")
|
|
338
|
+
for filename, desc in logs_list:
|
|
339
|
+
print(f" ├── {filename} - {desc}")
|
|
286
340
|
|
|
287
341
|
def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
288
342
|
logger.info("Artifact locations")
|
|
289
343
|
print("Artifact locations:\n")
|
|
290
344
|
for job_id, job_data in jobs:
|
|
291
345
|
paths = _EXPORT_HELPER.get_job_paths(job_data)
|
|
346
|
+
artifacts_list = _get_artifacts_file_list()
|
|
347
|
+
|
|
292
348
|
if paths.get("storage_type") == "remote_ssh":
|
|
293
349
|
artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
|
|
294
350
|
logger.info(
|
|
295
351
|
"Artifacts", job_id=job_id, path=artifacts_path, remote=True
|
|
296
352
|
)
|
|
297
353
|
print(f"{job_id}: {artifacts_path} (remote)")
|
|
354
|
+
print(" └── Key files:")
|
|
355
|
+
for filename, desc in artifacts_list:
|
|
356
|
+
print(f" ├── {filename} - {desc}")
|
|
298
357
|
else:
|
|
299
358
|
ap = paths.get("artifacts_dir")
|
|
300
359
|
if ap:
|
|
@@ -306,6 +365,9 @@ class DebugCmd(ExportCmd):
|
|
|
306
365
|
exists_indicator=exists,
|
|
307
366
|
)
|
|
308
367
|
print(f"{job_id}: {ap} {exists} (local)")
|
|
368
|
+
print(" └── Key files:")
|
|
369
|
+
for filename, desc in artifacts_list:
|
|
370
|
+
print(f" ├── {filename} - {desc}")
|
|
309
371
|
|
|
310
372
|
def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
311
373
|
for job_id, job_data in jobs:
|
|
@@ -383,6 +445,9 @@ class DebugCmd(ExportCmd):
|
|
|
383
445
|
print(
|
|
384
446
|
f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
|
|
385
447
|
)
|
|
448
|
+
# Show full destination path
|
|
449
|
+
full_dest_path = Path(dest_dir).resolve()
|
|
450
|
+
print(f"Copied to: {full_dest_path}")
|
|
386
451
|
else:
|
|
387
452
|
err = result.get("error", "Unknown error")
|
|
388
453
|
logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
|
|
@@ -403,3 +468,45 @@ class DebugCmd(ExportCmd):
|
|
|
403
468
|
except Exception:
|
|
404
469
|
pass
|
|
405
470
|
return ""
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
# Helper functions for file descriptions (based on actual code and content analysis)
|
|
474
|
+
def _get_artifacts_file_list() -> list[tuple[str, str]]:
|
|
475
|
+
"""Files generated in artifacts/."""
|
|
476
|
+
return [
|
|
477
|
+
(
|
|
478
|
+
"results.yml",
|
|
479
|
+
"Benchmark scores, task results and resolved run configuration.",
|
|
480
|
+
),
|
|
481
|
+
(
|
|
482
|
+
"eval_factory_metrics.json",
|
|
483
|
+
"Response + runtime stats (latency, tokens count, memory)",
|
|
484
|
+
),
|
|
485
|
+
("metrics.json", "Harness/benchmark metric and configuration"),
|
|
486
|
+
("report.html", "Request-Response Pairs samples in HTML format (if enabled)"),
|
|
487
|
+
("report.json", "Report data in json format, if enabled"),
|
|
488
|
+
]
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def _get_log_file_list(executor_type: str) -> list[tuple[str, str]]:
|
|
492
|
+
"""Files actually generated in logs/ - executor-specific."""
|
|
493
|
+
et = (executor_type or "local").lower()
|
|
494
|
+
if et == "slurm":
|
|
495
|
+
return [
|
|
496
|
+
("client-{SLURM_JOB_ID}.out", "Evaluation container/process output"),
|
|
497
|
+
(
|
|
498
|
+
"slurm-{SLURM_JOB_ID}.out",
|
|
499
|
+
"SLURM scheduler stdout/stderr (batch submission, export steps).",
|
|
500
|
+
),
|
|
501
|
+
(
|
|
502
|
+
"server-{SLURM_JOB_ID}.out",
|
|
503
|
+
"Model server logs when a deployment is used.",
|
|
504
|
+
),
|
|
505
|
+
]
|
|
506
|
+
# local executor
|
|
507
|
+
return [
|
|
508
|
+
(
|
|
509
|
+
"stdout.log",
|
|
510
|
+
"Complete evaluation output (timestamps, resolved config, run/export messages).",
|
|
511
|
+
),
|
|
512
|
+
]
|
|
@@ -19,8 +19,8 @@ import os
|
|
|
19
19
|
|
|
20
20
|
from simple_parsing import ArgumentParser
|
|
21
21
|
|
|
22
|
-
import nemo_evaluator_launcher.cli.debug as debug
|
|
23
22
|
import nemo_evaluator_launcher.cli.export as export
|
|
23
|
+
import nemo_evaluator_launcher.cli.info as info
|
|
24
24
|
import nemo_evaluator_launcher.cli.kill as kill
|
|
25
25
|
import nemo_evaluator_launcher.cli.ls_runs as ls_runs
|
|
26
26
|
import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
|
|
@@ -42,12 +42,12 @@ def is_verbose_enabled(args) -> bool:
|
|
|
42
42
|
subcommands = [
|
|
43
43
|
"run",
|
|
44
44
|
"status",
|
|
45
|
+
"info",
|
|
45
46
|
"kill",
|
|
46
47
|
"tasks_alias",
|
|
47
48
|
"tasks",
|
|
48
49
|
"runs",
|
|
49
50
|
"export",
|
|
50
|
-
"debug",
|
|
51
51
|
]
|
|
52
52
|
for subcmd in subcommands:
|
|
53
53
|
if hasattr(args, subcmd) and hasattr(getattr(args, subcmd), "verbose"):
|
|
@@ -163,16 +163,16 @@ def create_parser() -> ArgumentParser:
|
|
|
163
163
|
)
|
|
164
164
|
export_parser.add_arguments(export.ExportCmd, dest="export")
|
|
165
165
|
|
|
166
|
-
#
|
|
167
|
-
|
|
168
|
-
"
|
|
166
|
+
# Info subcommand
|
|
167
|
+
info_parser = subparsers.add_parser(
|
|
168
|
+
"info",
|
|
169
169
|
help="Display evaluation job information",
|
|
170
|
-
description="
|
|
170
|
+
description="Info functionalities for nemo-evaluator-launcher",
|
|
171
171
|
)
|
|
172
|
-
|
|
172
|
+
info_parser.add_argument(
|
|
173
173
|
"-v", "--verbose", action="store_true", help="Enable verbose logging"
|
|
174
174
|
)
|
|
175
|
-
|
|
175
|
+
info_parser.add_arguments(info.InfoCmd, dest="info")
|
|
176
176
|
|
|
177
177
|
return parser
|
|
178
178
|
|
|
@@ -218,8 +218,8 @@ def main() -> None:
|
|
|
218
218
|
args.runs.execute()
|
|
219
219
|
elif args.command == "export":
|
|
220
220
|
args.export.execute()
|
|
221
|
-
elif args.command == "
|
|
222
|
-
args.
|
|
221
|
+
elif args.command == "info":
|
|
222
|
+
args.info.execute()
|
|
223
223
|
|
|
224
224
|
|
|
225
225
|
if __name__ == "__main__":
|
|
@@ -19,6 +19,15 @@ from dataclasses import dataclass
|
|
|
19
19
|
|
|
20
20
|
from simple_parsing import field
|
|
21
21
|
|
|
22
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
23
|
+
from nemo_evaluator_launcher.common.printing_utils import (
|
|
24
|
+
bold,
|
|
25
|
+
cyan,
|
|
26
|
+
green,
|
|
27
|
+
magenta,
|
|
28
|
+
red,
|
|
29
|
+
)
|
|
30
|
+
|
|
22
31
|
|
|
23
32
|
@dataclass
|
|
24
33
|
class Cmd:
|
|
@@ -101,15 +110,10 @@ class Cmd:
|
|
|
101
110
|
try:
|
|
102
111
|
invocation_id = run_eval(config, self.dry_run)
|
|
103
112
|
except Exception as e:
|
|
104
|
-
print(f"
|
|
113
|
+
print(red(f"✗ Job submission failed, see logs | Error: {e}"))
|
|
114
|
+
logger.error("Job submission failed", error=e)
|
|
105
115
|
raise
|
|
106
116
|
|
|
107
|
-
# Print general success message with invocation ID
|
|
108
|
-
if invocation_id is not None and not self.dry_run:
|
|
109
|
-
print(
|
|
110
|
-
f"\033[32m✓ Job submission successful | Invocation ID: {invocation_id}\033[0m"
|
|
111
|
-
)
|
|
112
|
-
|
|
113
117
|
# Save the complete configuration
|
|
114
118
|
if not self.dry_run and invocation_id is not None:
|
|
115
119
|
# Determine config output directory
|
|
@@ -151,14 +155,22 @@ class Cmd:
|
|
|
151
155
|
f.write("#\n")
|
|
152
156
|
f.write(config_yaml)
|
|
153
157
|
|
|
154
|
-
print(
|
|
158
|
+
print(bold(cyan("Complete run config saved to: ")) + f"\n {config_path}\n")
|
|
159
|
+
logger.info("Saved complete config", path=config_path)
|
|
155
160
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
print(
|
|
161
|
+
# Print general success message with invocation ID and helpful commands
|
|
162
|
+
if invocation_id is not None and not self.dry_run:
|
|
163
|
+
print(
|
|
164
|
+
bold(cyan("To check status: "))
|
|
165
|
+
+ f"nemo-evaluator-launcher status {invocation_id}"
|
|
166
|
+
)
|
|
167
|
+
print(
|
|
168
|
+
bold(cyan("To kill all jobs: "))
|
|
169
|
+
+ f"nemo-evaluator-launcher kill {invocation_id}"
|
|
170
|
+
)
|
|
159
171
|
|
|
160
172
|
# Show actual job IDs and task names
|
|
161
|
-
print("
|
|
173
|
+
print(bold(cyan("To kill individual jobs:")))
|
|
162
174
|
# Access tasks - will work after normalization in run_eval
|
|
163
175
|
tasks = (
|
|
164
176
|
config.evaluation.tasks
|
|
@@ -168,7 +180,21 @@ class Cmd:
|
|
|
168
180
|
for idx, task in enumerate(tasks):
|
|
169
181
|
job_id = f"{invocation_id}.{idx}"
|
|
170
182
|
print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
|
|
183
|
+
|
|
184
|
+
print(
|
|
185
|
+
magenta(
|
|
186
|
+
"(all commands accept shortened IDs as long as there are no conflicts)"
|
|
187
|
+
)
|
|
188
|
+
)
|
|
171
189
|
print(
|
|
172
|
-
"
|
|
190
|
+
bold(cyan("To print all jobs: ")) + "nemo-evaluator-launcher ls runs"
|
|
173
191
|
"\n (--since 1d or --since 6h for time span, see --help)"
|
|
174
192
|
)
|
|
193
|
+
|
|
194
|
+
print(
|
|
195
|
+
green(
|
|
196
|
+
bold(
|
|
197
|
+
f"✓ Job submission successful | Invocation ID: {invocation_id}"
|
|
198
|
+
)
|
|
199
|
+
)
|
|
200
|
+
)
|
|
@@ -17,6 +17,7 @@ from dataclasses import dataclass
|
|
|
17
17
|
|
|
18
18
|
from simple_parsing import field
|
|
19
19
|
|
|
20
|
+
import nemo_evaluator_launcher.common.printing_utils as pu
|
|
20
21
|
from nemo_evaluator_launcher.executors.base import ExecutionState
|
|
21
22
|
|
|
22
23
|
|
|
@@ -143,17 +144,17 @@ class Cmd:
|
|
|
143
144
|
"""Format status with Unicode visual indicators only."""
|
|
144
145
|
# Status mapping based on ExecutionState enum
|
|
145
146
|
status_formats = {
|
|
146
|
-
ExecutionState.SUCCESS.value: "
|
|
147
|
-
ExecutionState.FAILED.value: "
|
|
148
|
-
ExecutionState.RUNNING.value: "
|
|
149
|
-
ExecutionState.PENDING.value: "
|
|
150
|
-
ExecutionState.KILLED.value: "
|
|
147
|
+
ExecutionState.SUCCESS.value: pu.green("✓ SUCCESS"),
|
|
148
|
+
ExecutionState.FAILED.value: pu.red("✗ FAILED"),
|
|
149
|
+
ExecutionState.RUNNING.value: pu.yellow("▶ RUNNING"),
|
|
150
|
+
ExecutionState.PENDING.value: pu.cyan("⧗ PENDING"),
|
|
151
|
+
ExecutionState.KILLED.value: pu.magenta("✗ KILLED"),
|
|
151
152
|
# Additional states for error handling
|
|
152
|
-
"not_found": "
|
|
153
|
-
"error": "
|
|
153
|
+
"not_found": pu.grey("? NOT FOUND"),
|
|
154
|
+
"error": pu.red("✗ ERROR"),
|
|
154
155
|
}
|
|
155
156
|
|
|
156
|
-
return status_formats.get(status.lower(),
|
|
157
|
+
return status_formats.get(status.lower(), pu.grey(status.upper()))
|
|
157
158
|
|
|
158
159
|
def _strip_ansi_codes(self, text: str) -> str:
|
|
159
160
|
"""Remove ANSI color codes from text for length calculation."""
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
import base64
|
|
17
17
|
import copy
|
|
18
18
|
import datetime
|
|
19
|
+
from dataclasses import dataclass
|
|
19
20
|
from typing import Optional
|
|
20
21
|
|
|
21
22
|
import yaml
|
|
@@ -24,9 +25,36 @@ from omegaconf import DictConfig, OmegaConf
|
|
|
24
25
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
25
26
|
|
|
26
27
|
|
|
27
|
-
|
|
28
|
+
@dataclass(frozen=True)
|
|
29
|
+
class CmdAndReadableComment:
|
|
30
|
+
"""See the comment to `_yaml_to_echo_command`."""
|
|
31
|
+
|
|
32
|
+
# Actual command. Might include hard-to-debug elements such as base64-encoded
|
|
33
|
+
# configs.
|
|
34
|
+
cmd: str
|
|
35
|
+
# A debuggable, human-readable comment that can be passed along for accompanying
|
|
36
|
+
# the actual command
|
|
37
|
+
debug: str
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _yaml_to_echo_command(
|
|
41
|
+
yaml_str: str, filename: str = "config_ef.yaml"
|
|
42
|
+
) -> CmdAndReadableComment:
|
|
43
|
+
"""Create a safe (see below) echo command saving a yaml to file.
|
|
44
|
+
|
|
45
|
+
Safety in this context means the ability to pass such echo command through the
|
|
46
|
+
`bash -c '...'` boundaries for example.
|
|
47
|
+
|
|
48
|
+
Naturally, encoding with base64 creates debuggability issues. For that reason, the second
|
|
49
|
+
output of the function is the yaml string with bash comment signs prepended.
|
|
50
|
+
"""
|
|
28
51
|
yaml_str_b64 = base64.b64encode(yaml_str.encode("utf-8")).decode("utf-8")
|
|
29
|
-
|
|
52
|
+
debug_str = "\n".join(
|
|
53
|
+
[f"# Contents of {filename}"] + ["# " + s for s in yaml_str.splitlines()]
|
|
54
|
+
)
|
|
55
|
+
return CmdAndReadableComment(
|
|
56
|
+
cmd=f'echo "{yaml_str_b64}" | base64 -d > {filename}', debug=debug_str
|
|
57
|
+
)
|
|
30
58
|
|
|
31
59
|
|
|
32
60
|
def get_eval_factory_config(
|
|
@@ -55,7 +83,7 @@ def get_eval_factory_config(
|
|
|
55
83
|
|
|
56
84
|
def get_eval_factory_command(
|
|
57
85
|
cfg: DictConfig, user_task_config: DictConfig, task_definition: dict
|
|
58
|
-
) ->
|
|
86
|
+
) -> CmdAndReadableComment:
|
|
59
87
|
config_fields = get_eval_factory_config(cfg, user_task_config, task_definition)
|
|
60
88
|
|
|
61
89
|
overrides = copy.deepcopy(dict(cfg.evaluation.get("overrides", {})))
|
|
@@ -80,7 +108,11 @@ def get_eval_factory_command(
|
|
|
80
108
|
if overrides:
|
|
81
109
|
eval_command = f"{eval_command} --overrides {overrides_str}"
|
|
82
110
|
|
|
83
|
-
|
|
111
|
+
# We return both the command and the debugging base64-decoded strings, useful
|
|
112
|
+
# for exposing when building scripts.
|
|
113
|
+
return CmdAndReadableComment(
|
|
114
|
+
cmd=create_file_cmd.cmd + " && " + eval_command, debug=create_file_cmd.debug
|
|
115
|
+
)
|
|
84
116
|
|
|
85
117
|
|
|
86
118
|
def get_endpoint_url(
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Printing utils for more structured or visually appealing prints.
|
|
17
|
+
|
|
18
|
+
NOTE: use printing only for main application output that matters. For logging,
|
|
19
|
+
see `logging_utils.py`.
|
|
20
|
+
|
|
21
|
+
USAGE:
|
|
22
|
+
```
|
|
23
|
+
from nemo_evaluator_launcher.common.printing_utils import red, bold
|
|
24
|
+
print(bold(red("some red bold")))
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import os
|
|
31
|
+
|
|
32
|
+
# If this env var is set to a truthy value (e.g. "1", "yes", "true"), ANSI color
|
|
33
|
+
# codes are disabled. If it is unset, colored output stays enabled.
|
|
34
|
+
_DISABLE_COLOR_ENV_VAR = "NEMO_EVALUATOR_DISABLE_COLOR"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _is_color_disabled():
|
|
38
|
+
env_var = os.environ.get(_DISABLE_COLOR_ENV_VAR, "0").lower()
|
|
39
|
+
|
|
40
|
+
if "1" in env_var or "yes" in env_var or "y" in env_var or "true" in env_var:
|
|
41
|
+
return True
|
|
42
|
+
|
|
43
|
+
return False
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
_CODES: dict[str, str] = dict(
|
|
47
|
+
green="\033[32m",
|
|
48
|
+
red="\033[31m",
|
|
49
|
+
red_bg="\033[41m", # red background
|
|
50
|
+
cyan="\033[36m",
|
|
51
|
+
yellow="\033[33m",
|
|
52
|
+
magenta="\033[35m",
|
|
53
|
+
grey="\033[90m",
|
|
54
|
+
bold="\033[1m",
|
|
55
|
+
reset="\033[0m",
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# If the colors are disabled, we null-out all the codes.
|
|
59
|
+
if _is_color_disabled():
|
|
60
|
+
for c in _CODES.keys():
|
|
61
|
+
_CODES[c] = ""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def green(s: str) -> str:
|
|
65
|
+
return _CODES["green"] + s + _CODES["reset"]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def red(s: str) -> str:
|
|
69
|
+
return _CODES["red"] + s + _CODES["reset"]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def red_bg(s: str) -> str:
|
|
73
|
+
return _CODES["red_bg"] + s + _CODES["reset"]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def cyan(s: str) -> str:
|
|
77
|
+
return _CODES["cyan"] + s + _CODES["reset"]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def yellow(s: str) -> str:
|
|
81
|
+
return _CODES["yellow"] + s + _CODES["reset"]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def magenta(s: str) -> str:
|
|
85
|
+
return _CODES["magenta"] + s + _CODES["reset"]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def grey(s: str) -> str:
|
|
89
|
+
return _CODES["grey"] + s + _CODES["reset"]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def bold(s: str) -> str:
|
|
93
|
+
return _CODES["bold"] + s + _CODES["reset"]
|
|
@@ -14,16 +14,17 @@
|
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
#
|
|
16
16
|
# Each slurm cluster has its own flavour, below we provide some defaults that might meet one's needs.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
type: slurm # Executor is chosen based on this field
|
|
18
|
+
hostname: ??? # SLURM headnode (login) hostname (required)
|
|
19
|
+
username: ${oc.env:USER} # Defaults to $USER env var
|
|
20
|
+
account: ??? # SLURM account allocation (required)
|
|
21
|
+
output_dir: ??? # Absolute path accessible on compute nodes (required)
|
|
20
22
|
partition: batch
|
|
21
23
|
num_nodes: 1
|
|
22
24
|
ntasks_per_node: 1
|
|
23
25
|
gres: gpu:8
|
|
24
26
|
walltime: 01:00:00
|
|
25
27
|
subproject: nemo-evaluator-launcher
|
|
26
|
-
output_dir: ???
|
|
27
28
|
env_vars:
|
|
28
29
|
deployment: {}
|
|
29
30
|
evaluation: {}
|
|
@@ -406,7 +406,12 @@ class LeptonExecutor(BaseExecutor):
|
|
|
406
406
|
cfg.target.api_endpoint.url = full_endpoint_url
|
|
407
407
|
|
|
408
408
|
# Generate command with the correct endpoint URL
|
|
409
|
-
|
|
409
|
+
eval_command_struct = get_eval_factory_command(
|
|
410
|
+
cfg, task, task_definition
|
|
411
|
+
)
|
|
412
|
+
eval_command = eval_command_struct.cmd
|
|
413
|
+
# Debug string for explainability of some base64-parts of the command
|
|
414
|
+
eval_command_debug_comment = eval_command_struct.debug
|
|
410
415
|
|
|
411
416
|
finally:
|
|
412
417
|
# Restore original URL and struct mode
|
|
@@ -431,6 +436,7 @@ class LeptonExecutor(BaseExecutor):
|
|
|
431
436
|
task_name=task.name,
|
|
432
437
|
invocation_id=invocation_id,
|
|
433
438
|
eval_command=eval_command, # Pass the fixed command
|
|
439
|
+
eval_command_debug_comment=eval_command_debug_comment,
|
|
434
440
|
)
|
|
435
441
|
|
|
436
442
|
# Prepare job command to run the launch script
|
|
@@ -734,6 +740,7 @@ def _create_evaluation_launch_script(
|
|
|
734
740
|
task_name: str,
|
|
735
741
|
invocation_id: str,
|
|
736
742
|
eval_command: str,
|
|
743
|
+
eval_command_debug_comment: str,
|
|
737
744
|
) -> str:
|
|
738
745
|
"""Create bash script for running evaluation in Lepton job container.
|
|
739
746
|
|
|
@@ -747,6 +754,7 @@ def _create_evaluation_launch_script(
|
|
|
747
754
|
task_name: Name of the evaluation task.
|
|
748
755
|
invocation_id: Unique invocation identifier.
|
|
749
756
|
eval_command: The evaluation command with correct endpoint URL.
|
|
757
|
+
eval_command_debug_comment: The debug comment for placing into the script and easy debug
|
|
750
758
|
|
|
751
759
|
Returns:
|
|
752
760
|
String containing the bash launch script.
|
|
@@ -779,6 +787,8 @@ echo "Invocation ID: {invocation_id}"
|
|
|
779
787
|
echo "Endpoint URL: {endpoint_url}"
|
|
780
788
|
echo "Command: {eval_command_modified}"
|
|
781
789
|
|
|
790
|
+
{eval_command_debug_comment}
|
|
791
|
+
|
|
782
792
|
# Execute the evaluation with proper error handling
|
|
783
793
|
set +e
|
|
784
794
|
{eval_command_modified}
|
|
@@ -47,6 +47,7 @@ from nemo_evaluator_launcher.common.mapping import (
|
|
|
47
47
|
get_task_from_mapping,
|
|
48
48
|
load_tasks_mapping,
|
|
49
49
|
)
|
|
50
|
+
from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey
|
|
50
51
|
from nemo_evaluator_launcher.executors.base import (
|
|
51
52
|
BaseExecutor,
|
|
52
53
|
ExecutionState,
|
|
@@ -155,6 +156,16 @@ class LocalExecutor(BaseExecutor):
|
|
|
155
156
|
|
|
156
157
|
task_output_dir = output_dir / task.name
|
|
157
158
|
task_output_dir.mkdir(parents=True, exist_ok=True)
|
|
159
|
+
eval_factory_command_struct = get_eval_factory_command(
|
|
160
|
+
cfg, task, task_definition
|
|
161
|
+
)
|
|
162
|
+
eval_factory_command = eval_factory_command_struct.cmd
|
|
163
|
+
# The debug comment to place into the script for easy debugging. It is needed
|
|
163
|
+
# (see `CmdAndReadableComment`) because the command currently passes the
|
|
164
|
+
# config base64-encoded and `echo`-ed into a file.
|
|
166
|
+
# TODO(agronskiy): cleaner way is to encode everything with base64, not
|
|
167
|
+
# some parts (like ef_config.yaml) and just output as logs somewhere.
|
|
168
|
+
eval_factory_command_debug_comment = eval_factory_command_struct.debug
|
|
158
169
|
evaluation_task = {
|
|
159
170
|
"name": task.name,
|
|
160
171
|
"job_id": job_id,
|
|
@@ -162,9 +173,8 @@ class LocalExecutor(BaseExecutor):
|
|
|
162
173
|
"container_name": container_name,
|
|
163
174
|
"env_vars": env_vars,
|
|
164
175
|
"output_dir": task_output_dir,
|
|
165
|
-
"eval_factory_command":
|
|
166
|
-
|
|
167
|
-
),
|
|
176
|
+
"eval_factory_command": eval_factory_command,
|
|
177
|
+
"eval_factory_command_debug_comment": eval_factory_command_debug_comment,
|
|
168
178
|
}
|
|
169
179
|
evaluation_tasks.append(evaluation_task)
|
|
170
180
|
|
|
@@ -198,23 +208,28 @@ class LocalExecutor(BaseExecutor):
|
|
|
198
208
|
)
|
|
199
209
|
|
|
200
210
|
if dry_run:
|
|
201
|
-
print("\n\n=============================================\n\n")
|
|
202
|
-
print(f"DRY RUN: Scripts prepared and saved to {output_dir}")
|
|
211
|
+
print(bold("\n\n=============================================\n\n"))
|
|
212
|
+
print(bold(cyan(f"DRY RUN: Scripts prepared and saved to {output_dir}")))
|
|
203
213
|
if is_execution_mode_sequential:
|
|
204
214
|
print(
|
|
205
|
-
|
|
215
|
+
cyan(
|
|
216
|
+
"\n\n=========== Main script | run_all.sequential.sh =====================\n\n"
|
|
217
|
+
)
|
|
206
218
|
)
|
|
219
|
+
|
|
207
220
|
with open(output_dir / "run_all.sequential.sh", "r") as f:
|
|
208
|
-
print(f.read())
|
|
221
|
+
print(grey(f.read()))
|
|
209
222
|
else:
|
|
210
223
|
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
211
224
|
task_output_dir = output_dir / task.name
|
|
212
225
|
print(
|
|
213
|
-
|
|
226
|
+
cyan(
|
|
227
|
+
f"\n\n=========== Task script | {task.name}/run.sh =====================\n\n"
|
|
228
|
+
)
|
|
214
229
|
)
|
|
215
230
|
with open(task_output_dir / "run.sh", "r") as f:
|
|
216
|
-
print(f.read())
|
|
217
|
-
print("\nTo execute, run without --dry-run")
|
|
231
|
+
print(grey(f.read()))
|
|
232
|
+
print(bold("\nTo execute, run without --dry-run"))
|
|
218
233
|
return invocation_id
|
|
219
234
|
|
|
220
235
|
# Save launched jobs metadata
|
|
@@ -284,13 +299,13 @@ class LocalExecutor(BaseExecutor):
|
|
|
284
299
|
error_msg = f"Script for {name} exited with code {exit_code}"
|
|
285
300
|
raise RuntimeError(f"Job startup failed | {error_msg}")
|
|
286
301
|
|
|
287
|
-
print("\nCommands for real-time monitoring:")
|
|
302
|
+
print(bold(cyan("\nCommands for real-time monitoring:")))
|
|
288
303
|
for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
|
|
289
304
|
log_file = evaluation_task["output_dir"] / "logs" / "stdout.log"
|
|
290
305
|
print(f" tail -f {log_file}")
|
|
291
306
|
|
|
292
|
-
print("\nFollow all logs for this invocation:")
|
|
293
|
-
print(f" tail -f {output_dir}/*/logs/stdout.log")
|
|
307
|
+
print(bold(cyan("\nFollow all logs for this invocation:")))
|
|
308
|
+
print(f" tail -f {output_dir}/*/logs/stdout.log\n")
|
|
294
309
|
|
|
295
310
|
return invocation_id
|
|
296
311
|
|
|
@@ -40,6 +40,9 @@ else
|
|
|
40
40
|
# Create pre-start stage file
|
|
41
41
|
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
|
|
42
42
|
|
|
43
|
+
# Debug contents of the eval factory command's config
|
|
44
|
+
{{ task.eval_factory_command_debug_comment | indent(4) }}
|
|
45
|
+
|
|
43
46
|
# Docker run with eval factory command
|
|
44
47
|
(
|
|
45
48
|
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
|
|
@@ -51,7 +54,7 @@ else
|
|
|
51
54
|
{% endfor -%}
|
|
52
55
|
{{ task.eval_image }} \
|
|
53
56
|
bash -c '
|
|
54
|
-
{{ task.eval_factory_command }} ;
|
|
57
|
+
{{ task.eval_factory_command | indent(8) }} ;
|
|
55
58
|
exit_code=$?
|
|
56
59
|
chmod 777 -R /results;
|
|
57
60
|
if [ "$exit_code" -ne 0 ]; then
|
|
@@ -50,6 +50,7 @@ from nemo_evaluator_launcher.common.mapping import (
|
|
|
50
50
|
get_task_from_mapping,
|
|
51
51
|
load_tasks_mapping,
|
|
52
52
|
)
|
|
53
|
+
from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey
|
|
53
54
|
from nemo_evaluator_launcher.executors.base import (
|
|
54
55
|
BaseExecutor,
|
|
55
56
|
ExecutionState,
|
|
@@ -130,13 +131,13 @@ class SlurmExecutor(BaseExecutor):
|
|
|
130
131
|
remote_runsub_paths.append(remote_runsub_path)
|
|
131
132
|
|
|
132
133
|
if dry_run:
|
|
133
|
-
print("\n\n=============================================\n\n")
|
|
134
|
-
print("DRY RUN: SLURM scripts prepared")
|
|
134
|
+
print(bold("\n\n=============================================\n\n"))
|
|
135
|
+
print(bold(cyan("DRY RUN: SLURM scripts prepared")))
|
|
135
136
|
for idx, local_runsub_path in enumerate(local_runsub_paths):
|
|
136
|
-
print(f"\n\n
|
|
137
|
+
print(cyan(f"\n\n=========== Task {idx} =====================\n\n"))
|
|
137
138
|
with open(local_runsub_path, "r") as f:
|
|
138
|
-
print(f.read())
|
|
139
|
-
print("
|
|
139
|
+
print(grey(f.read()))
|
|
140
|
+
print(bold("To submit jobs") + ", run the executor without --dry-run")
|
|
140
141
|
return invocation_id
|
|
141
142
|
|
|
142
143
|
socket = str(Path(tmpdirname) / "socket")
|
|
@@ -589,7 +590,20 @@ def _create_slurm_sbatch_script(
|
|
|
589
590
|
):
|
|
590
591
|
evaluation_mounts_list.append(f"{source_mnt}:{target_mnt}")
|
|
591
592
|
|
|
593
|
+
eval_factory_command_struct = get_eval_factory_command(cfg, task, task_definition)
|
|
594
|
+
eval_factory_command = eval_factory_command_struct.cmd
|
|
595
|
+
# The debug comment to place into the script for easy debugging. It is needed
|
|
596
|
+
# (see `CmdAndReadableComment`) because the command currently passes the
|
|
597
|
+
# config base64-encoded and `echo`-ed into a file.
|
|
598
|
+
# TODO(agronskiy): cleaner way is to encode everything with base64, not
|
|
599
|
+
# some parts (like ef_config.yaml) and just output as logs somewhere.
|
|
600
|
+
eval_factory_command_debug_comment = eval_factory_command_struct.debug
|
|
601
|
+
|
|
592
602
|
# add evaluation srun command
|
|
603
|
+
s += "# Debug contents of the eval factory command's config\n"
|
|
604
|
+
s += eval_factory_command_debug_comment
|
|
605
|
+
s += "\n\n"
|
|
606
|
+
|
|
593
607
|
s += "# evaluation client\n"
|
|
594
608
|
s += "srun --mpi pmix --overlap "
|
|
595
609
|
s += "--container-image {} ".format(eval_image)
|
|
@@ -600,10 +614,11 @@ def _create_slurm_sbatch_script(
|
|
|
600
614
|
s += "--container-env {} ".format(",".join(evaluation_env_var_names))
|
|
601
615
|
if not cfg.execution.get("mounts", {}).get("mount_home", True):
|
|
602
616
|
s += "--no-container-mount-home "
|
|
617
|
+
|
|
603
618
|
s += "--container-mounts {} ".format(",".join(evaluation_mounts_list))
|
|
604
619
|
s += "--output {} ".format(remote_task_subdir / "logs" / "client-%A.out")
|
|
605
|
-
s += "bash -c '"
|
|
606
|
-
s +=
|
|
620
|
+
s += "bash -c '\n"
|
|
621
|
+
s += eval_factory_command
|
|
607
622
|
s += "'\n\n"
|
|
608
623
|
|
|
609
624
|
# terminate the server after all evaluation clients finish
|
|
@@ -14,8 +14,8 @@ src/nemo_evaluator_launcher/api/functional.py
|
|
|
14
14
|
src/nemo_evaluator_launcher/api/types.py
|
|
15
15
|
src/nemo_evaluator_launcher/api/utils.py
|
|
16
16
|
src/nemo_evaluator_launcher/cli/__init__.py
|
|
17
|
-
src/nemo_evaluator_launcher/cli/debug.py
|
|
18
17
|
src/nemo_evaluator_launcher/cli/export.py
|
|
18
|
+
src/nemo_evaluator_launcher/cli/info.py
|
|
19
19
|
src/nemo_evaluator_launcher/cli/kill.py
|
|
20
20
|
src/nemo_evaluator_launcher/cli/ls_runs.py
|
|
21
21
|
src/nemo_evaluator_launcher/cli/ls_tasks.py
|
|
@@ -28,6 +28,7 @@ src/nemo_evaluator_launcher/common/execdb.py
|
|
|
28
28
|
src/nemo_evaluator_launcher/common/helpers.py
|
|
29
29
|
src/nemo_evaluator_launcher/common/logging_utils.py
|
|
30
30
|
src/nemo_evaluator_launcher/common/mapping.py
|
|
31
|
+
src/nemo_evaluator_launcher/common/printing_utils.py
|
|
31
32
|
src/nemo_evaluator_launcher/configs/__init__.py
|
|
32
33
|
src/nemo_evaluator_launcher/configs/default.yaml
|
|
33
34
|
src/nemo_evaluator_launcher/configs/deployment/generic.yaml
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|