nemo-evaluator-launcher 0.1.17__tar.gz → 0.1.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/__init__.py +15 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher-0.1.17/src/nemo_evaluator_launcher/cli/debug.py → nemo_evaluator_launcher-0.1.19/src/nemo_evaluator_launcher/cli/info.py +170 -63
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/main.py +10 -10
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/run.py +39 -13
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/status.py +9 -8
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/common/helpers.py +55 -8
- nemo_evaluator_launcher-0.1.19/src/nemo_evaluator_launcher/common/printing_utils.py +93 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +5 -4
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/lepton/executor.py +11 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/local/executor.py +28 -13
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/local/run.template.sh +4 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/slurm/executor.py +22 -7
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +2 -1
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/README.md +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/pyproject.toml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/setup.cfg +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/api/functional.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/api/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/export.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/cli/version.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/base.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
- {nemo_evaluator_launcher-0.1.17 → nemo_evaluator_launcher-0.1.19}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
|
@@ -20,6 +20,7 @@ It automatically initializes logging and conditionally loads internal components
|
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
22
|
import importlib
|
|
23
|
+
import warnings
|
|
23
24
|
|
|
24
25
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
25
26
|
from nemo_evaluator_launcher.package_info import (
|
|
@@ -32,9 +33,22 @@ from nemo_evaluator_launcher.package_info import (
|
|
|
32
33
|
__version__,
|
|
33
34
|
)
|
|
34
35
|
|
|
35
|
-
|
|
36
|
+
# Suppress pydantic warnings from third-party libraries (e.g., wandb) that are not
|
|
37
|
+
# compatible with Pydantic 2.x field metadata on Python 3.13+
|
|
38
|
+
warnings.filterwarnings(
|
|
39
|
+
"ignore",
|
|
40
|
+
message=r"The 'repr' attribute.*Field\(\).*",
|
|
41
|
+
category=Warning,
|
|
42
|
+
)
|
|
43
|
+
warnings.filterwarnings(
|
|
44
|
+
"ignore",
|
|
45
|
+
message=r"The 'frozen' attribute.*Field\(\).*",
|
|
46
|
+
category=Warning,
|
|
47
|
+
)
|
|
36
48
|
|
|
37
49
|
|
|
50
|
+
logger.info("Version info", pkg=__package_name__, ver=__version__)
|
|
51
|
+
|
|
38
52
|
try:
|
|
39
53
|
importlib.import_module("nemo_evaluator_launcher_internal")
|
|
40
54
|
logger.debug(
|
|
@@ -19,9 +19,18 @@ This module defines data structures and helpers for configuration and type safet
|
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
21
|
import os
|
|
22
|
+
import warnings
|
|
22
23
|
from dataclasses import dataclass
|
|
23
24
|
from typing import cast
|
|
24
25
|
|
|
26
|
+
# ruff: noqa: E402
|
|
27
|
+
# Later when adding optional module to hydra, since the internal package is optional,
|
|
28
|
+
# will generate a hydra warning. We suppress it as distraction and bad UX, before hydra gets invoked.
|
|
29
|
+
warnings.filterwarnings(
|
|
30
|
+
"ignore",
|
|
31
|
+
message="provider=hydra.searchpath.*path=nemo_evaluator_launcher_internal.*is not available\\.",
|
|
32
|
+
)
|
|
33
|
+
|
|
25
34
|
import hydra
|
|
26
35
|
from hydra.core.global_hydra import GlobalHydra
|
|
27
36
|
from omegaconf import DictConfig, OmegaConf
|
|
@@ -14,16 +14,16 @@
|
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
#
|
|
16
16
|
|
|
17
|
-
"""
|
|
17
|
+
"""Job information helper functionalities for nemo-evaluator-launcher."""
|
|
18
18
|
|
|
19
|
+
import sys
|
|
19
20
|
from dataclasses import dataclass
|
|
20
21
|
from datetime import datetime
|
|
21
22
|
from pathlib import Path
|
|
22
|
-
from typing import Any, Dict, List,
|
|
23
|
+
from typing import Any, Dict, List, Tuple
|
|
23
24
|
|
|
24
25
|
from simple_parsing import field
|
|
25
26
|
|
|
26
|
-
from nemo_evaluator_launcher.cli.export import ExportCmd
|
|
27
27
|
from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
|
|
28
28
|
from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
|
|
29
29
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
@@ -35,52 +35,60 @@ _EXPORT_HELPER = LocalExporter({})
|
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
@dataclass
|
|
38
|
-
class
|
|
39
|
-
"""
|
|
38
|
+
class InfoCmd:
|
|
39
|
+
"""Job information functionalities for nemo-evaluator-launcher.
|
|
40
40
|
|
|
41
41
|
Examples:
|
|
42
|
-
nemo-evaluator-launcher
|
|
43
|
-
nemo-evaluator-launcher
|
|
44
|
-
nemo-evaluator-launcher
|
|
45
|
-
nemo-evaluator-launcher
|
|
46
|
-
nemo-evaluator-launcher
|
|
47
|
-
nemo-evaluator-launcher
|
|
42
|
+
nemo-evaluator-launcher info <inv> # Full job info
|
|
43
|
+
nemo-evaluator-launcher info <inv> --config # Show stored job config (YAML)
|
|
44
|
+
nemo-evaluator-launcher info <inv> --artifacts # Show artifact locations and key files
|
|
45
|
+
nemo-evaluator-launcher info <inv> --logs # Show log locations and key files
|
|
46
|
+
nemo-evaluator-launcher info <inv> --copy-logs <DIR> # Copy logs to <DIR>
|
|
47
|
+
nemo-evaluator-launcher info <inv> --copy-artifacts <DIR> # Copy artifacts to <DIR>
|
|
48
48
|
|
|
49
49
|
Notes:
|
|
50
|
-
- Supports invocation IDs and job IDs
|
|
50
|
+
- Supports invocation IDs and job IDs (space-separated)
|
|
51
51
|
- Shows local or remote paths depending on executor (local/slurm/lepton)
|
|
52
|
+
- Copy operations work for both local and remote jobs (expect longer time for remote jobs)
|
|
53
|
+
- Copy operations are not supported for Lepton executor (yet).
|
|
52
54
|
"""
|
|
53
55
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
+
invocation_ids: List[str] = field(
|
|
57
|
+
positional=True,
|
|
58
|
+
help="IDs to show info for (space-separated). Accepts invocation IDs or/and job IDs.",
|
|
59
|
+
)
|
|
56
60
|
|
|
57
|
-
#
|
|
58
|
-
config: bool = field(
|
|
59
|
-
|
|
60
|
-
|
|
61
|
+
# info modes
|
|
62
|
+
config: bool = field(
|
|
63
|
+
default=False, action="store_true", help="Show job configuration"
|
|
64
|
+
)
|
|
65
|
+
artifacts: bool = field(
|
|
66
|
+
default=False, action="store_true", help="Show artifact locations and key files"
|
|
67
|
+
)
|
|
68
|
+
logs: bool = field(
|
|
69
|
+
default=False, action="store_true", help="Show log locations and key files"
|
|
70
|
+
)
|
|
61
71
|
|
|
62
|
-
# copy operations
|
|
63
|
-
copy_logs:
|
|
72
|
+
# copy operations - work for both local and remote jobs
|
|
73
|
+
copy_logs: str | None = field(
|
|
64
74
|
default=None,
|
|
65
75
|
alias=["--copy-logs"],
|
|
66
|
-
|
|
67
|
-
|
|
76
|
+
help="Copy logs to a local directory",
|
|
77
|
+
metavar="DIR",
|
|
68
78
|
)
|
|
69
|
-
copy_artifacts:
|
|
79
|
+
copy_artifacts: str | None = field(
|
|
70
80
|
default=None,
|
|
71
81
|
alias=["--copy-artifacts"],
|
|
72
|
-
|
|
73
|
-
|
|
82
|
+
help="Copy artifacts to a local directory",
|
|
83
|
+
metavar="DIR",
|
|
74
84
|
)
|
|
75
85
|
|
|
76
86
|
def execute(self) -> None:
|
|
77
|
-
# show version
|
|
78
87
|
VersionCmd().execute()
|
|
79
|
-
|
|
80
|
-
logger.info("Debug command started", invocation_ids=self.invocation_ids)
|
|
88
|
+
logger.info("Info command started", invocation_ids=self.invocation_ids)
|
|
81
89
|
|
|
82
90
|
if not self.invocation_ids:
|
|
83
|
-
logger.error("No invocation IDs provided")
|
|
91
|
+
logger.error("No job or invocation IDs provided.")
|
|
84
92
|
raise ValueError("No job or invocation IDs provided.")
|
|
85
93
|
|
|
86
94
|
jobs = self._resolve_jobs()
|
|
@@ -96,48 +104,63 @@ class DebugCmd(ExportCmd):
|
|
|
96
104
|
"No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
|
|
97
105
|
)
|
|
98
106
|
print(
|
|
99
|
-
"No valid jobs found (jobs may have been
|
|
107
|
+
"No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
|
|
100
108
|
)
|
|
101
109
|
return
|
|
102
110
|
|
|
111
|
+
# show ops
|
|
103
112
|
if self.config:
|
|
104
|
-
logger.info("Showing job configuration", job_count=len(jobs))
|
|
105
113
|
self._show_config_info(jobs)
|
|
106
|
-
|
|
107
|
-
logger.info("Showing job logs locations", job_count=len(jobs))
|
|
114
|
+
if self.logs:
|
|
108
115
|
self._show_logs_info(jobs)
|
|
109
|
-
|
|
110
|
-
logger.info("Showing artifacts locations", job_count=len(jobs))
|
|
116
|
+
if self.artifacts:
|
|
111
117
|
self._show_artifacts_info(jobs)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
+
|
|
119
|
+
# copy ops
|
|
120
|
+
args = sys.argv[1:]
|
|
121
|
+
copy_logs_flag = "--copy-logs" in args
|
|
122
|
+
copy_artifacts_flag = "--copy-artifacts" in args
|
|
123
|
+
|
|
124
|
+
if copy_logs_flag:
|
|
125
|
+
if self.copy_logs is None:
|
|
126
|
+
raise ValueError("--copy-logs requires a directory path")
|
|
127
|
+
if not self.copy_logs.strip():
|
|
128
|
+
raise ValueError("--copy-logs requires a directory path")
|
|
118
129
|
logger.info(
|
|
119
|
-
"Copying logs to local directory",
|
|
130
|
+
"Copying logs to local directory",
|
|
131
|
+
dest_dir=self.copy_logs,
|
|
132
|
+
job_count=len(jobs),
|
|
120
133
|
)
|
|
121
|
-
self._copy_logs(jobs,
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
if
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
)
|
|
134
|
+
self._copy_logs(jobs, self.copy_logs)
|
|
135
|
+
|
|
136
|
+
if copy_artifacts_flag:
|
|
137
|
+
if self.copy_artifacts is None:
|
|
138
|
+
raise ValueError("--copy-artifacts requires a directory path")
|
|
139
|
+
if not self.copy_artifacts.strip():
|
|
140
|
+
raise ValueError("--copy-artifacts requires a directory path")
|
|
128
141
|
logger.info(
|
|
129
142
|
"Copying artifacts to local directory",
|
|
130
|
-
dest_dir=
|
|
143
|
+
dest_dir=self.copy_artifacts,
|
|
131
144
|
job_count=len(jobs),
|
|
132
145
|
)
|
|
133
|
-
self._copy_artifacts(jobs,
|
|
134
|
-
|
|
146
|
+
self._copy_artifacts(jobs, self.copy_artifacts)
|
|
147
|
+
|
|
148
|
+
# default view when no flags
|
|
149
|
+
if not any(
|
|
150
|
+
[
|
|
151
|
+
self.config,
|
|
152
|
+
self.logs,
|
|
153
|
+
self.artifacts,
|
|
154
|
+
self.copy_logs,
|
|
155
|
+
self.copy_artifacts,
|
|
156
|
+
]
|
|
157
|
+
):
|
|
135
158
|
logger.info(
|
|
136
159
|
"Job metadata details",
|
|
137
160
|
invocation_id=jobs[0][1].invocation_id if jobs else None,
|
|
138
161
|
jobs=len(jobs),
|
|
139
162
|
)
|
|
140
|
-
self.
|
|
163
|
+
self._show_invocation_info(jobs)
|
|
141
164
|
|
|
142
165
|
def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
|
|
143
166
|
"""Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
|
|
@@ -160,15 +183,15 @@ class DebugCmd(ExportCmd):
|
|
|
160
183
|
uniq.append((jid, jd))
|
|
161
184
|
return sorted(uniq, key=lambda p: p[0])
|
|
162
185
|
|
|
163
|
-
def
|
|
186
|
+
def _show_invocation_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
164
187
|
inv = jobs[0][1].invocation_id if jobs else None
|
|
165
|
-
logger.info("
|
|
188
|
+
logger.info("Job information", jobs=len(jobs), invocation=inv)
|
|
166
189
|
print(
|
|
167
|
-
f"
|
|
190
|
+
f"Job information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
|
|
168
191
|
)
|
|
169
192
|
|
|
170
193
|
for job_id, job_data in jobs:
|
|
171
|
-
self.
|
|
194
|
+
self._show_job_info(job_id, job_data)
|
|
172
195
|
print()
|
|
173
196
|
|
|
174
197
|
# footer hint: where to find more metadata
|
|
@@ -184,10 +207,14 @@ class DebugCmd(ExportCmd):
|
|
|
184
207
|
print(" - Use --logs to show log locations.")
|
|
185
208
|
print(" - Use --artifacts to show artifact locations.")
|
|
186
209
|
print(" - Use --config to show stored job configuration (YAML).")
|
|
187
|
-
print(
|
|
188
|
-
|
|
210
|
+
print(
|
|
211
|
+
" - Use --copy-logs [DIR] to copy logs to a local directory (works for local and remote jobs)."
|
|
212
|
+
)
|
|
213
|
+
print(
|
|
214
|
+
" - Use --copy-artifacts [DIR] to copy artifacts to a local directory (works for local and remote jobs)."
|
|
215
|
+
)
|
|
189
216
|
|
|
190
|
-
def
|
|
217
|
+
def _show_job_info(self, job_id: str, job_data: JobData) -> None:
|
|
191
218
|
logger.info("Job", job_id=job_id)
|
|
192
219
|
print(f"Job {job_id}")
|
|
193
220
|
|
|
@@ -208,14 +235,22 @@ class DebugCmd(ExportCmd):
|
|
|
208
235
|
logger.info("Task", job_id=job_id, name=task_name)
|
|
209
236
|
print(f"├── Task: {task_name}")
|
|
210
237
|
|
|
238
|
+
# Determine executor type for file descriptions
|
|
239
|
+
cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
|
|
240
|
+
exec_type = (job_data.executor or cfg_exec_type or "").lower()
|
|
241
|
+
|
|
211
242
|
# locations via exporter helper
|
|
212
243
|
paths = _EXPORT_HELPER.get_job_paths(job_data)
|
|
213
244
|
|
|
214
|
-
# Artifacts
|
|
245
|
+
# Artifacts with file descriptions
|
|
246
|
+
artifacts_list = _get_artifacts_file_list()
|
|
215
247
|
if paths.get("storage_type") == "remote_ssh":
|
|
216
248
|
artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
|
|
217
249
|
logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
|
|
218
250
|
print(f"├── Artifacts: {artifacts_path} (remote)")
|
|
251
|
+
print("│ └── Key files:")
|
|
252
|
+
for filename, desc in artifacts_list:
|
|
253
|
+
print(f"│ ├── {filename} - {desc}")
|
|
219
254
|
else:
|
|
220
255
|
ap = paths.get("artifacts_dir")
|
|
221
256
|
if ap:
|
|
@@ -224,14 +259,21 @@ class DebugCmd(ExportCmd):
|
|
|
224
259
|
"Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
|
|
225
260
|
)
|
|
226
261
|
print(f"├── Artifacts: {ap} {exists} (local)")
|
|
262
|
+
print("│ └── Key files:")
|
|
263
|
+
for filename, desc in artifacts_list:
|
|
264
|
+
print(f"│ ├── {filename} - {desc}")
|
|
227
265
|
|
|
228
|
-
# Logs
|
|
266
|
+
# Logs with file descriptions
|
|
267
|
+
logs_list = _get_log_file_list(exec_type)
|
|
229
268
|
if paths.get("storage_type") == "remote_ssh":
|
|
230
269
|
logs_path = (
|
|
231
270
|
f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
|
|
232
271
|
)
|
|
233
272
|
logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
|
|
234
273
|
print(f"├── Logs: {logs_path} (remote)")
|
|
274
|
+
print("│ └── Key files:")
|
|
275
|
+
for filename, desc in logs_list:
|
|
276
|
+
print(f"│ ├── {filename} - {desc}")
|
|
235
277
|
else:
|
|
236
278
|
lp = paths.get("logs_dir")
|
|
237
279
|
if lp:
|
|
@@ -240,6 +282,9 @@ class DebugCmd(ExportCmd):
|
|
|
240
282
|
"Logs", job_id=job_id, path=str(lp), exists_indicator=exists
|
|
241
283
|
)
|
|
242
284
|
print(f"├── Logs: {lp} {exists} (local)")
|
|
285
|
+
print("│ └── Key files:")
|
|
286
|
+
for filename, desc in logs_list:
|
|
287
|
+
print(f"│ ├── {filename} - {desc}")
|
|
243
288
|
|
|
244
289
|
# executor-specific
|
|
245
290
|
d = job_data.data or {}
|
|
@@ -264,17 +309,23 @@ class DebugCmd(ExportCmd):
|
|
|
264
309
|
eu = d.get("endpoint_url")
|
|
265
310
|
if eu:
|
|
266
311
|
print(f"├── Endpoint URL: {eu}")
|
|
267
|
-
# local and others: paths already displayed above; no extra fields needed
|
|
268
312
|
|
|
269
313
|
def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
270
314
|
logger.info("Log locations")
|
|
271
315
|
print("Log locations:\n")
|
|
272
316
|
for job_id, job_data in jobs:
|
|
273
317
|
paths = _EXPORT_HELPER.get_job_paths(job_data)
|
|
318
|
+
cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
|
|
319
|
+
exec_type = (job_data.executor or cfg_exec_type or "").lower()
|
|
320
|
+
logs_list = _get_log_file_list(exec_type)
|
|
321
|
+
|
|
274
322
|
if paths.get("storage_type") == "remote_ssh":
|
|
275
323
|
logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
|
|
276
324
|
logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
|
|
277
325
|
print(f"{job_id}: {logs_path} (remote)")
|
|
326
|
+
print(" └── Key files:")
|
|
327
|
+
for filename, desc in logs_list:
|
|
328
|
+
print(f" ├── {filename} - {desc}")
|
|
278
329
|
else:
|
|
279
330
|
lp = paths.get("logs_dir")
|
|
280
331
|
if lp:
|
|
@@ -283,18 +334,26 @@ class DebugCmd(ExportCmd):
|
|
|
283
334
|
"Logs", job_id=job_id, path=str(lp), exists_indicator=exists
|
|
284
335
|
)
|
|
285
336
|
print(f"{job_id}: {lp} {exists} (local)")
|
|
337
|
+
print(" └── Key files:")
|
|
338
|
+
for filename, desc in logs_list:
|
|
339
|
+
print(f" ├── {filename} - {desc}")
|
|
286
340
|
|
|
287
341
|
def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
288
342
|
logger.info("Artifact locations")
|
|
289
343
|
print("Artifact locations:\n")
|
|
290
344
|
for job_id, job_data in jobs:
|
|
291
345
|
paths = _EXPORT_HELPER.get_job_paths(job_data)
|
|
346
|
+
artifacts_list = _get_artifacts_file_list()
|
|
347
|
+
|
|
292
348
|
if paths.get("storage_type") == "remote_ssh":
|
|
293
349
|
artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
|
|
294
350
|
logger.info(
|
|
295
351
|
"Artifacts", job_id=job_id, path=artifacts_path, remote=True
|
|
296
352
|
)
|
|
297
353
|
print(f"{job_id}: {artifacts_path} (remote)")
|
|
354
|
+
print(" └── Key files:")
|
|
355
|
+
for filename, desc in artifacts_list:
|
|
356
|
+
print(f" ├── {filename} - {desc}")
|
|
298
357
|
else:
|
|
299
358
|
ap = paths.get("artifacts_dir")
|
|
300
359
|
if ap:
|
|
@@ -306,6 +365,9 @@ class DebugCmd(ExportCmd):
|
|
|
306
365
|
exists_indicator=exists,
|
|
307
366
|
)
|
|
308
367
|
print(f"{job_id}: {ap} {exists} (local)")
|
|
368
|
+
print(" └── Key files:")
|
|
369
|
+
for filename, desc in artifacts_list:
|
|
370
|
+
print(f" ├── {filename} - {desc}")
|
|
309
371
|
|
|
310
372
|
def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
|
|
311
373
|
for job_id, job_data in jobs:
|
|
@@ -383,6 +445,9 @@ class DebugCmd(ExportCmd):
|
|
|
383
445
|
print(
|
|
384
446
|
f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
|
|
385
447
|
)
|
|
448
|
+
# Show full destination path
|
|
449
|
+
full_dest_path = Path(dest_dir).resolve()
|
|
450
|
+
print(f"Copied to: {full_dest_path}")
|
|
386
451
|
else:
|
|
387
452
|
err = result.get("error", "Unknown error")
|
|
388
453
|
logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
|
|
@@ -403,3 +468,45 @@ class DebugCmd(ExportCmd):
|
|
|
403
468
|
except Exception:
|
|
404
469
|
pass
|
|
405
470
|
return ""
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
# Helper functions for file descriptions (based on actual code and content analysis)
|
|
474
|
+
def _get_artifacts_file_list() -> list[tuple[str, str]]:
|
|
475
|
+
"""Files generated in artifacts/."""
|
|
476
|
+
return [
|
|
477
|
+
(
|
|
478
|
+
"results.yml",
|
|
479
|
+
"Benchmark scores, task results and resolved run configuration.",
|
|
480
|
+
),
|
|
481
|
+
(
|
|
482
|
+
"eval_factory_metrics.json",
|
|
483
|
+
"Response + runtime stats (latency, tokens count, memory)",
|
|
484
|
+
),
|
|
485
|
+
("metrics.json", "Harness/benchmark metric and configuration"),
|
|
486
|
+
("report.html", "Request-Response Pairs samples in HTML format (if enabled)"),
|
|
487
|
+
("report.json", "Report data in json format, if enabled"),
|
|
488
|
+
]
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def _get_log_file_list(executor_type: str) -> list[tuple[str, str]]:
|
|
492
|
+
"""Files actually generated in logs/ - executor-specific."""
|
|
493
|
+
et = (executor_type or "local").lower()
|
|
494
|
+
if et == "slurm":
|
|
495
|
+
return [
|
|
496
|
+
("client-{SLURM_JOB_ID}.out", "Evaluation container/process output"),
|
|
497
|
+
(
|
|
498
|
+
"slurm-{SLURM_JOB_ID}.out",
|
|
499
|
+
"SLURM scheduler stdout/stderr (batch submission, export steps).",
|
|
500
|
+
),
|
|
501
|
+
(
|
|
502
|
+
"server-{SLURM_JOB_ID}.out",
|
|
503
|
+
"Model server logs when a deployment is used.",
|
|
504
|
+
),
|
|
505
|
+
]
|
|
506
|
+
# local executor
|
|
507
|
+
return [
|
|
508
|
+
(
|
|
509
|
+
"stdout.log",
|
|
510
|
+
"Complete evaluation output (timestamps, resolved config, run/export messages).",
|
|
511
|
+
),
|
|
512
|
+
]
|
|
@@ -19,8 +19,8 @@ import os
|
|
|
19
19
|
|
|
20
20
|
from simple_parsing import ArgumentParser
|
|
21
21
|
|
|
22
|
-
import nemo_evaluator_launcher.cli.debug as debug
|
|
23
22
|
import nemo_evaluator_launcher.cli.export as export
|
|
23
|
+
import nemo_evaluator_launcher.cli.info as info
|
|
24
24
|
import nemo_evaluator_launcher.cli.kill as kill
|
|
25
25
|
import nemo_evaluator_launcher.cli.ls_runs as ls_runs
|
|
26
26
|
import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
|
|
@@ -42,12 +42,12 @@ def is_verbose_enabled(args) -> bool:
|
|
|
42
42
|
subcommands = [
|
|
43
43
|
"run",
|
|
44
44
|
"status",
|
|
45
|
+
"info",
|
|
45
46
|
"kill",
|
|
46
47
|
"tasks_alias",
|
|
47
48
|
"tasks",
|
|
48
49
|
"runs",
|
|
49
50
|
"export",
|
|
50
|
-
"debug",
|
|
51
51
|
]
|
|
52
52
|
for subcmd in subcommands:
|
|
53
53
|
if hasattr(args, subcmd) and hasattr(getattr(args, subcmd), "verbose"):
|
|
@@ -163,16 +163,16 @@ def create_parser() -> ArgumentParser:
|
|
|
163
163
|
)
|
|
164
164
|
export_parser.add_arguments(export.ExportCmd, dest="export")
|
|
165
165
|
|
|
166
|
-
#
|
|
167
|
-
|
|
168
|
-
"
|
|
166
|
+
# Info subcommand
|
|
167
|
+
info_parser = subparsers.add_parser(
|
|
168
|
+
"info",
|
|
169
169
|
help="Display evaluation job information",
|
|
170
|
-
description="
|
|
170
|
+
description="Info functionalities for nemo-evaluator-launcher",
|
|
171
171
|
)
|
|
172
|
-
|
|
172
|
+
info_parser.add_argument(
|
|
173
173
|
"-v", "--verbose", action="store_true", help="Enable verbose logging"
|
|
174
174
|
)
|
|
175
|
-
|
|
175
|
+
info_parser.add_arguments(info.InfoCmd, dest="info")
|
|
176
176
|
|
|
177
177
|
return parser
|
|
178
178
|
|
|
@@ -218,8 +218,8 @@ def main() -> None:
|
|
|
218
218
|
args.runs.execute()
|
|
219
219
|
elif args.command == "export":
|
|
220
220
|
args.export.execute()
|
|
221
|
-
elif args.command == "
|
|
222
|
-
args.
|
|
221
|
+
elif args.command == "info":
|
|
222
|
+
args.info.execute()
|
|
223
223
|
|
|
224
224
|
|
|
225
225
|
if __name__ == "__main__":
|
|
@@ -19,6 +19,15 @@ from dataclasses import dataclass
|
|
|
19
19
|
|
|
20
20
|
from simple_parsing import field
|
|
21
21
|
|
|
22
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
23
|
+
from nemo_evaluator_launcher.common.printing_utils import (
|
|
24
|
+
bold,
|
|
25
|
+
cyan,
|
|
26
|
+
green,
|
|
27
|
+
magenta,
|
|
28
|
+
red,
|
|
29
|
+
)
|
|
30
|
+
|
|
22
31
|
|
|
23
32
|
@dataclass
|
|
24
33
|
class Cmd:
|
|
@@ -101,15 +110,10 @@ class Cmd:
|
|
|
101
110
|
try:
|
|
102
111
|
invocation_id = run_eval(config, self.dry_run)
|
|
103
112
|
except Exception as e:
|
|
104
|
-
print(f"
|
|
113
|
+
print(red(f"✗ Job submission failed, see logs | Error: {e}"))
|
|
114
|
+
logger.error("Job submission failed", error=e)
|
|
105
115
|
raise
|
|
106
116
|
|
|
107
|
-
# Print general success message with invocation ID
|
|
108
|
-
if invocation_id is not None and not self.dry_run:
|
|
109
|
-
print(
|
|
110
|
-
f"\033[32m✓ Job submission successful | Invocation ID: {invocation_id}\033[0m"
|
|
111
|
-
)
|
|
112
|
-
|
|
113
117
|
# Save the complete configuration
|
|
114
118
|
if not self.dry_run and invocation_id is not None:
|
|
115
119
|
# Determine config output directory
|
|
@@ -151,14 +155,22 @@ class Cmd:
|
|
|
151
155
|
f.write("#\n")
|
|
152
156
|
f.write(config_yaml)
|
|
153
157
|
|
|
154
|
-
print(
|
|
158
|
+
print(bold(cyan("Complete run config saved to: ")) + f"\n {config_path}\n")
|
|
159
|
+
logger.info("Saved complete config", path=config_path)
|
|
155
160
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
print(
|
|
161
|
+
# Print general success message with invocation ID and helpful commands
|
|
162
|
+
if invocation_id is not None and not self.dry_run:
|
|
163
|
+
print(
|
|
164
|
+
bold(cyan("To check status: "))
|
|
165
|
+
+ f"nemo-evaluator-launcher status {invocation_id}"
|
|
166
|
+
)
|
|
167
|
+
print(
|
|
168
|
+
bold(cyan("To kill all jobs: "))
|
|
169
|
+
+ f"nemo-evaluator-launcher kill {invocation_id}"
|
|
170
|
+
)
|
|
159
171
|
|
|
160
172
|
# Show actual job IDs and task names
|
|
161
|
-
print("
|
|
173
|
+
print(bold(cyan("To kill individual jobs:")))
|
|
162
174
|
# Access tasks - will work after normalization in run_eval
|
|
163
175
|
tasks = (
|
|
164
176
|
config.evaluation.tasks
|
|
@@ -168,7 +180,21 @@ class Cmd:
|
|
|
168
180
|
for idx, task in enumerate(tasks):
|
|
169
181
|
job_id = f"{invocation_id}.{idx}"
|
|
170
182
|
print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
|
|
183
|
+
|
|
184
|
+
print(
|
|
185
|
+
magenta(
|
|
186
|
+
"(all commands accept shortened IDs as long as there are no conflicts)"
|
|
187
|
+
)
|
|
188
|
+
)
|
|
171
189
|
print(
|
|
172
|
-
"
|
|
190
|
+
bold(cyan("To print all jobs: ")) + "nemo-evaluator-launcher ls runs"
|
|
173
191
|
"\n (--since 1d or --since 6h for time span, see --help)"
|
|
174
192
|
)
|
|
193
|
+
|
|
194
|
+
print(
|
|
195
|
+
green(
|
|
196
|
+
bold(
|
|
197
|
+
f"✓ Job submission successful | Invocation ID: {invocation_id}"
|
|
198
|
+
)
|
|
199
|
+
)
|
|
200
|
+
)
|
|
@@ -17,6 +17,7 @@ from dataclasses import dataclass
|
|
|
17
17
|
|
|
18
18
|
from simple_parsing import field
|
|
19
19
|
|
|
20
|
+
import nemo_evaluator_launcher.common.printing_utils as pu
|
|
20
21
|
from nemo_evaluator_launcher.executors.base import ExecutionState
|
|
21
22
|
|
|
22
23
|
|
|
@@ -143,17 +144,17 @@ class Cmd:
|
|
|
143
144
|
"""Format status with Unicode visual indicators only."""
|
|
144
145
|
# Status mapping based on ExecutionState enum
|
|
145
146
|
status_formats = {
|
|
146
|
-
ExecutionState.SUCCESS.value: "
|
|
147
|
-
ExecutionState.FAILED.value: "
|
|
148
|
-
ExecutionState.RUNNING.value: "
|
|
149
|
-
ExecutionState.PENDING.value: "
|
|
150
|
-
ExecutionState.KILLED.value: "
|
|
147
|
+
ExecutionState.SUCCESS.value: pu.green("✓ SUCCESS"),
|
|
148
|
+
ExecutionState.FAILED.value: pu.red("✗ FAILED"),
|
|
149
|
+
ExecutionState.RUNNING.value: pu.yellow("▶ RUNNING"),
|
|
150
|
+
ExecutionState.PENDING.value: pu.cyan("⧗ PENDING"),
|
|
151
|
+
ExecutionState.KILLED.value: pu.magenta("✗ KILLED"),
|
|
151
152
|
# Additional states for error handling
|
|
152
|
-
"not_found": "
|
|
153
|
-
"error": "
|
|
153
|
+
"not_found": pu.grey("? NOT FOUND"),
|
|
154
|
+
"error": pu.red("✗ ERROR"),
|
|
154
155
|
}
|
|
155
156
|
|
|
156
|
-
return status_formats.get(status.lower(),
|
|
157
|
+
return status_formats.get(status.lower(), pu.grey(status.upper()))
|
|
157
158
|
|
|
158
159
|
def _strip_ansi_codes(self, text: str) -> str:
|
|
159
160
|
"""Remove ANSI color codes from text for length calculation."""
|