nemo-evaluator-launcher 0.1.16__tar.gz → 0.1.44__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/PKG-INFO +3 -3
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/pyproject.toml +4 -2
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/__init__.py +15 -1
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/functional.py +106 -2
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher-0.1.16/src/nemo_evaluator_launcher/cli/debug.py → nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/cli/info.py +170 -63
- nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/cli/logs.py +102 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/main.py +22 -10
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/run.py +112 -28
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/status.py +9 -8
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/common/helpers.py +374 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/common/printing_utils.py +100 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +2 -3
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -1
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/base.py +31 -1
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +36 -1
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/executor.py +219 -24
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/executor.py +403 -33
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/run.template.sh +58 -3
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/slurm/executor.py +442 -64
- nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/utils.py +32 -46
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/resources/mapping.toml +57 -16
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +3 -3
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +4 -1
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +1 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/requires.txt +1 -1
- nemo_evaluator_launcher-0.1.16/src/nemo_evaluator_launcher/common/helpers.py +0 -194
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/README.md +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/setup.cfg +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/export.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
- {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
{nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nemo-evaluator-launcher
-Version: 0.1.16
+Version: 0.1.44
 Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
 Author: NVIDIA
 Author-email: nemo-toolkit@nvidia.com

@@ -458,7 +458,7 @@ License:
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.

-Project-URL: homepage, https://github.com/NVIDIA-NeMo/
+Project-URL: homepage, https://github.com/NVIDIA-NeMo/Evaluator
 Project-URL: repository, https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-launcher
 Keywords: deep learning,evaluations,machine learning,gpu,NLP,pytorch,torch
 Requires-Python: <3.14,>=3.10

@@ -478,7 +478,7 @@ Requires-Dist: mlflow>=2.8.0; extra == "mlflow"
 Provides-Extra: wandb
 Requires-Dist: wandb>=0.15.0; extra == "wandb"
 Provides-Extra: gsheets
-Requires-Dist:
+Requires-Dist: gspread>=5.0.0; extra == "gsheets"
 Provides-Extra: exporters
 Requires-Dist: mlflow; extra == "exporters"
 Requires-Dist: wandb; extra == "exporters"
{nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/pyproject.toml

@@ -33,20 +33,21 @@ keywords = [

 [project.urls]
 # BEGIN(if-changed): check package_info.py
-homepage = "https://github.com/NVIDIA-NeMo/
+homepage = "https://github.com/NVIDIA-NeMo/Evaluator"
 repository = "https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-launcher"
 # END(if-changed)

 [project.optional-dependencies]
 mlflow = ["mlflow>=2.8.0"]
 wandb = ["wandb>=0.15.0"]
-gsheets = ["
+gsheets = ["gspread>=5.0.0"]
 exporters = ["mlflow", "wandb", "gsheets"]
 all = ["mlflow", "wandb", "gsheets"]

 [project.scripts]
 nemo-evaluator-launcher = "nemo_evaluator_launcher.cli.main:main"
 nv-eval = "nemo_evaluator_launcher.cli.main:main"
+nel = "nemo_evaluator_launcher.cli.main:main"

 [dependency-groups]
 dev = [

@@ -75,6 +76,7 @@ where = ["src"]
     "resources/**/*",
     "configs/**/*",
     "executors/**/*.sh",
+    "executors/**/*.template",
 ]

 [tool.setuptools.dynamic]
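The [project.scripts] hunk above registers a third console-script alias, nel, pointing at the same nemo_evaluator_launcher.cli.main:main callable as nemo-evaluator-launcher and nv-eval. A minimal sketch of that equivalence, assuming the usual zero-argument console-script convention (arguments come from sys.argv); the info <invocation_id> arguments are placeholders taken from the CLI docstring further down, not a prescribed usage:

import sys

from nemo_evaluator_launcher.cli.main import main

# Equivalent to running `nel info <invocation_id>` from a shell: all three
# script aliases dispatch to the same entry point, which parses sys.argv itself.
sys.argv = ["nel", "info", "<invocation_id>"]
main()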
{nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/__init__.py

@@ -20,6 +20,7 @@ It automatically initializes logging and conditionally loads internal components
 """

 import importlib
+import warnings

 from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.package_info import (

@@ -32,9 +33,22 @@ from nemo_evaluator_launcher.package_info import (
     __version__,
 )

-
+# Suppress pydantic warnings from third-party libraries (e.g., wandb) that are not
+# compatible with Pydantic 2.x field metadata on Python 3.13+
+warnings.filterwarnings(
+    "ignore",
+    message=r"The 'repr' attribute.*Field\(\).*",
+    category=Warning,
+)
+warnings.filterwarnings(
+    "ignore",
+    message=r"The 'frozen' attribute.*Field\(\).*",
+    category=Warning,
+)


+logger.info("Version info", pkg=__package_name__, ver=__version__)
+
 try:
     importlib.import_module("nemo_evaluator_launcher_internal")
     logger.debug(
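For context on the two filters added above: warnings.filterwarnings treats message as a regular expression matched against the beginning of the warning text, so only warnings starting with the quoted phrases are silenced. A standalone sketch (the warning strings below are invented for illustration, not taken from wandb or pydantic):

import warnings

# Same filter shape as in the package __init__: `message` is a regex anchored at
# the start of the warning text, and `category=Warning` matches all subclasses.
warnings.filterwarnings(
    "ignore",
    message=r"The 'repr' attribute.*Field\(\).*",
    category=Warning,
)

warnings.warn("The 'repr' attribute passed to Field() is ignored")  # suppressed: matches the pattern
warnings.warn("Some unrelated warning")  # still emitted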
{nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/functional.py

@@ -19,7 +19,7 @@ This module provides the main functional entry points for running evaluations, q
 """

 from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

 import yaml
 from omegaconf import DictConfig, OmegaConf

@@ -116,6 +116,7 @@ def get_status(ids_or_prefixes: list[str]) -> list[dict[str, Any]]:
     db = ExecutionDB()
     results: List[dict[str, Any]] = []

+    # TODO(agronskiy): refactor the `.`-checking job in all the functions.
     for id_or_prefix in ids_or_prefixes:
         # If id looks like an invocation_id (no dot), get all jobs for it
         if "." not in id_or_prefix:

@@ -259,6 +260,108 @@ def get_status(ids_or_prefixes: list[str]) -> list[dict[str, Any]]:
     return results


+def stream_logs(
+    ids_or_prefixes: Union[str, list[str]],
+) -> Iterator[Tuple[str, str, str]]:
+    """Stream logs from jobs or invocations by their IDs or invocation IDs.
+
+    Args:
+        ids_or_prefixes: Single ID/prefix or list of job IDs or invocation IDs to stream logs from.
+            Short prefixes are allowed, we would try to match the full ones from
+            prefixes if no collisions are present.
+
+    Yields:
+        Tuple[str, str, str]: Tuples of (job_id, task_name, log_line) for each log line.
+            Empty lines are yielded as empty strings.
+
+    Raises:
+        ValueError: If the executor doesn't support log streaming.
+    """
+    db = ExecutionDB()
+
+    # Normalize to list for consistent processing
+    if isinstance(ids_or_prefixes, str):
+        ids_or_prefixes = [ids_or_prefixes]
+
+    # Collect all jobs from all IDs, grouped by executor
+    executor_to_jobs: Dict[str, Dict[str, JobData]] = {}
+    executor_to_invocations: Dict[str, List[str]] = {}
+
+    # TODO(agronskiy): refactor the `.`-checking job in all the functions.
+    for id_or_prefix in ids_or_prefixes:
+        # Determine if this is a job ID or invocation ID
+        if "." in id_or_prefix:
+            # This is a job ID
+            job_data = db.get_job(id_or_prefix)
+            if job_data is None:
+                continue
+
+            executor = job_data.executor
+            if executor not in executor_to_jobs:
+                executor_to_jobs[executor] = {}
+            executor_to_jobs[executor][id_or_prefix] = job_data
+        else:
+            # This is an invocation ID
+            jobs = db.get_jobs(id_or_prefix)
+            if not jobs:
+                continue
+
+            # Get the executor class from the first job
+            first_job_data = next(iter(jobs.values()))
+            executor = first_job_data.executor
+            if executor not in executor_to_invocations:
+                executor_to_invocations[executor] = []
+            executor_to_invocations[executor].append(id_or_prefix)
+
+    # Stream logs from each executor simultaneously
+    # For each executor, collect all job IDs and stream them together
+    for executor, jobs_dict in executor_to_jobs.items():
+        try:
+            executor_cls = get_executor(executor)
+        except ValueError:
+            continue
+
+        # For local executor with multiple jobs, pass list to stream simultaneously
+        # For other executors or single jobs, pass individual job IDs
+        if executor == "local" and len(jobs_dict) > 1:
+            # Pass all job IDs as a list to stream simultaneously
+            try:
+                yield from executor_cls.stream_logs(
+                    list(jobs_dict.keys()), executor_name=executor
+                )
+            except NotImplementedError:
+                raise ValueError(
+                    f"Log streaming is not yet implemented for executor '{executor}'"
+                )
+        else:
+            # Single job or non-local executor
+            for job_id in jobs_dict.keys():
+                try:
+                    yield from executor_cls.stream_logs(job_id, executor_name=executor)
+                except NotImplementedError:
+                    raise ValueError(
+                        f"Log streaming is not yet implemented for executor '{executor}'"
+                    )
+
+    # Stream logs from invocation IDs
+    for executor, invocation_ids in executor_to_invocations.items():
+        try:
+            executor_cls = get_executor(executor)
+        except ValueError:
+            continue
+
+        # Stream each invocation (each invocation already handles multiple jobs internally)
+        for invocation_id in invocation_ids:
+            try:
+                yield from executor_cls.stream_logs(
+                    invocation_id, executor_name=executor
+                )
+            except NotImplementedError:
+                raise ValueError(
+                    f"Log streaming is not yet implemented for executor '{executor}'"
+                )
+
+
 def list_all_invocations_summary() -> list[dict[str, Any]]:
     """Return a concise per-invocation summary from the exec DB.

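The stream_logs hunk above yields (job_id, task_name, log_line) tuples, which lets a caller interleave log lines from several jobs or invocations in a single loop. A minimal consumption sketch (the invocation ID is a placeholder):

from nemo_evaluator_launcher.api.functional import stream_logs

# Prefix each streamed line with its job and task so output from multiple jobs
# stays readable; empty log lines arrive as empty strings per the docstring.
for job_id, task_name, log_line in stream_logs("<invocation_id>"):
    print(f"[{job_id}/{task_name}] {log_line}")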
@@ -378,6 +481,7 @@ def kill_job_or_invocation(id: str) -> list[dict[str, Any]]:
             "data": {"error": f"Unexpected error: {str(e)}"},
         }

+    # TODO(agronskiy): refactor the `.`-checking job in all the functions.
    # Determine if this is a job ID or invocation ID
    if "." in id:
        # This is a job ID - kill single job

@@ -442,7 +546,7 @@ def export_results(
        if "." in single_id:  # job_id
            # Try reading config from artifacts working dir (auto-export on remote node)
            cfg_file = None
-            for name in ("
+            for name in ("config.yml", "run_config.yml"):
                p = Path(name)
                if p.exists():
                    cfg_file = p
{nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/types.py

@@ -19,9 +19,18 @@ This module defines data structures and helpers for configuration and type safety
 """

 import os
+import warnings
 from dataclasses import dataclass
 from typing import cast

+# ruff: noqa: E402
+# Later when adding optional module to hydra, since the internal package is optional,
+# will generate a hydra warning. We suppress it as distraction and bad UX, before hydra gets invoked.
+warnings.filterwarnings(
+    "ignore",
+    message="provider=hydra.searchpath.*path=nemo_evaluator_launcher_internal.*is not available\\.",
+)
+
 import hydra
 from hydra.core.global_hydra import GlobalHydra
 from omegaconf import DictConfig, OmegaConf
nemo_evaluator_launcher-0.1.16/src/nemo_evaluator_launcher/cli/debug.py → nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/cli/info.py

@@ -14,16 +14,16 @@
 # limitations under the License.
 #

-"""
+"""Job information helper functionalities for nemo-evaluator-launcher."""

+import sys
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, Tuple

 from simple_parsing import field

-from nemo_evaluator_launcher.cli.export import ExportCmd
 from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
 from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
 from nemo_evaluator_launcher.common.logging_utils import logger

@@ -35,52 +35,60 @@ _EXPORT_HELPER = LocalExporter({})


 @dataclass
-class DebugCmd(ExportCmd):
-    """
+class InfoCmd:
+    """Job information functionalities for nemo-evaluator-launcher.

     Examples:
-        nemo-evaluator-launcher
-        nemo-evaluator-launcher
-        nemo-evaluator-launcher
-        nemo-evaluator-launcher
-        nemo-evaluator-launcher
-        nemo-evaluator-launcher
+        nemo-evaluator-launcher info <inv>                         # Full job info
+        nemo-evaluator-launcher info <inv> --config                # Show stored job config (YAML)
+        nemo-evaluator-launcher info <inv> --artifacts             # Show artifact locations and key files
+        nemo-evaluator-launcher info <inv> --logs                  # Show log locations and key files
+        nemo-evaluator-launcher info <inv> --copy-logs <DIR>       # Copy logs to <DIR>
+        nemo-evaluator-launcher info <inv> --copy-artifacts <DIR>  # Copy artifacts to <DIR>

     Notes:
-        - Supports invocation IDs and job IDs
+        - Supports invocation IDs and job IDs (space-separated)
         - Shows local or remote paths depending on executor (local/slurm/lepton)
+        - Copy operations work for both local and remote jobs (expect longer time for remote jobs)
+        - Copy operations are not supported for Lepton executor (yet).
     """

-
-
+    invocation_ids: List[str] = field(
+        positional=True,
+        help="IDs to show info for (space-separated). Accepts invocation IDs or/and job IDs.",
+    )

-    #
-    config: bool = field(
-
-
+    # info modes
+    config: bool = field(
+        default=False, action="store_true", help="Show job configuration"
+    )
+    artifacts: bool = field(
+        default=False, action="store_true", help="Show artifact locations and key files"
+    )
+    logs: bool = field(
+        default=False, action="store_true", help="Show log locations and key files"
+    )

-    # copy operations
-    copy_logs:
+    # copy operations - work for both local and remote jobs
+    copy_logs: str | None = field(
         default=None,
         alias=["--copy-logs"],
-
-
+        help="Copy logs to a local directory",
+        metavar="DIR",
     )
-    copy_artifacts:
+    copy_artifacts: str | None = field(
         default=None,
         alias=["--copy-artifacts"],
-
-
+        help="Copy artifacts to a local directory",
+        metavar="DIR",
     )

     def execute(self) -> None:
-        # show version
         VersionCmd().execute()
-
-        logger.info("Debug command started", invocation_ids=self.invocation_ids)
+        logger.info("Info command started", invocation_ids=self.invocation_ids)

         if not self.invocation_ids:
-            logger.error("No invocation IDs provided")
+            logger.error("No job or invocation IDs provided.")
             raise ValueError("No job or invocation IDs provided.")

         jobs = self._resolve_jobs()
@@ -96,48 +104,63 @@ class DebugCmd(ExportCmd):
                 "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
             )
             print(
-                "No valid jobs found (jobs may have been
+                "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
             )
             return

+        # show ops
         if self.config:
-            logger.info("Showing job configuration", job_count=len(jobs))
             self._show_config_info(jobs)
-
-            logger.info("Showing job logs locations", job_count=len(jobs))
+        if self.logs:
             self._show_logs_info(jobs)
-
-            logger.info("Showing artifacts locations", job_count=len(jobs))
+        if self.artifacts:
             self._show_artifacts_info(jobs)
-
-
-
-
-
-
+
+        # copy ops
+        args = sys.argv[1:]
+        copy_logs_flag = "--copy-logs" in args
+        copy_artifacts_flag = "--copy-artifacts" in args
+
+        if copy_logs_flag:
+            if self.copy_logs is None:
+                raise ValueError("--copy-logs requires a directory path")
+            if not self.copy_logs.strip():
+                raise ValueError("--copy-logs requires a directory path")
             logger.info(
-                "Copying logs to local directory",
+                "Copying logs to local directory",
+                dest_dir=self.copy_logs,
+                job_count=len(jobs),
             )
-            self._copy_logs(jobs,
-
-
-            if
-
-
-            )
+            self._copy_logs(jobs, self.copy_logs)
+
+        if copy_artifacts_flag:
+            if self.copy_artifacts is None:
+                raise ValueError("--copy-artifacts requires a directory path")
+            if not self.copy_artifacts.strip():
+                raise ValueError("--copy-artifacts requires a directory path")
             logger.info(
                 "Copying artifacts to local directory",
-                dest_dir=
+                dest_dir=self.copy_artifacts,
                 job_count=len(jobs),
             )
-            self._copy_artifacts(jobs,
-
+            self._copy_artifacts(jobs, self.copy_artifacts)
+
+        # default view when no flags
+        if not any(
+            [
+                self.config,
+                self.logs,
+                self.artifacts,
+                self.copy_logs,
+                self.copy_artifacts,
+            ]
+        ):
             logger.info(
                 "Job metadata details",
                 invocation_id=jobs[0][1].invocation_id if jobs else None,
                 jobs=len(jobs),
             )
-            self.
+            self._show_invocation_info(jobs)

     def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
         """Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
@@ -160,15 +183,15 @@ class DebugCmd(ExportCmd):
             uniq.append((jid, jd))
         return sorted(uniq, key=lambda p: p[0])

-    def
+    def _show_invocation_info(self, jobs: List[Tuple[str, JobData]]) -> None:
         inv = jobs[0][1].invocation_id if jobs else None
-        logger.info("
+        logger.info("Job information", jobs=len(jobs), invocation=inv)
         print(
-            f"
+            f"Job information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
         )

         for job_id, job_data in jobs:
-            self.
+            self._show_job_info(job_id, job_data)
             print()

         # footer hint: where to find more metadata

@@ -184,10 +207,14 @@ class DebugCmd(ExportCmd):
         print(" - Use --logs to show log locations.")
         print(" - Use --artifacts to show artifact locations.")
         print(" - Use --config to show stored job configuration (YAML).")
-        print(
-
+        print(
+            " - Use --copy-logs [DIR] to copy logs to a local directory (works for local and remote jobs)."
+        )
+        print(
+            " - Use --copy-artifacts [DIR] to copy artifacts to a local directory (works for local and remote jobs)."
+        )

-    def
+    def _show_job_info(self, job_id: str, job_data: JobData) -> None:
         logger.info("Job", job_id=job_id)
         print(f"Job {job_id}")

@@ -208,14 +235,22 @@ class DebugCmd(ExportCmd):
             logger.info("Task", job_id=job_id, name=task_name)
             print(f"├── Task: {task_name}")

+        # Determine executor type for file descriptions
+        cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
+        exec_type = (job_data.executor or cfg_exec_type or "").lower()
+
         # locations via exporter helper
         paths = _EXPORT_HELPER.get_job_paths(job_data)

-        # Artifacts
+        # Artifacts with file descriptions
+        artifacts_list = _get_artifacts_file_list()
         if paths.get("storage_type") == "remote_ssh":
             artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
             logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
             print(f"├── Artifacts: {artifacts_path} (remote)")
+            print("│ └── Key files:")
+            for filename, desc in artifacts_list:
+                print(f"│ ├── {filename} - {desc}")
         else:
             ap = paths.get("artifacts_dir")
             if ap:

@@ -224,14 +259,21 @@ class DebugCmd(ExportCmd):
                     "Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
                 )
                 print(f"├── Artifacts: {ap} {exists} (local)")
+                print("│ └── Key files:")
+                for filename, desc in artifacts_list:
+                    print(f"│ ├── {filename} - {desc}")

-        # Logs
+        # Logs with file descriptions
+        logs_list = _get_log_file_list(exec_type)
         if paths.get("storage_type") == "remote_ssh":
             logs_path = (
                 f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
             )
             logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
             print(f"├── Logs: {logs_path} (remote)")
+            print("│ └── Key files:")
+            for filename, desc in logs_list:
+                print(f"│ ├── {filename} - {desc}")
         else:
             lp = paths.get("logs_dir")
             if lp:

@@ -240,6 +282,9 @@ class DebugCmd(ExportCmd):
                     "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
                 )
                 print(f"├── Logs: {lp} {exists} (local)")
+                print("│ └── Key files:")
+                for filename, desc in logs_list:
+                    print(f"│ ├── {filename} - {desc}")

         # executor-specific
         d = job_data.data or {}
@@ -264,17 +309,23 @@ class DebugCmd(ExportCmd):
             eu = d.get("endpoint_url")
             if eu:
                 print(f"├── Endpoint URL: {eu}")
-        # local and others: paths already displayed above; no extra fields needed

     def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
         logger.info("Log locations")
         print("Log locations:\n")
         for job_id, job_data in jobs:
             paths = _EXPORT_HELPER.get_job_paths(job_data)
+            cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
+            exec_type = (job_data.executor or cfg_exec_type or "").lower()
+            logs_list = _get_log_file_list(exec_type)
+
             if paths.get("storage_type") == "remote_ssh":
                 logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
                 logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
                 print(f"{job_id}: {logs_path} (remote)")
+                print(" └── Key files:")
+                for filename, desc in logs_list:
+                    print(f" ├── {filename} - {desc}")
             else:
                 lp = paths.get("logs_dir")
                 if lp:

@@ -283,18 +334,26 @@ class DebugCmd(ExportCmd):
                         "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
                     )
                     print(f"{job_id}: {lp} {exists} (local)")
+                    print(" └── Key files:")
+                    for filename, desc in logs_list:
+                        print(f" ├── {filename} - {desc}")

     def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
         logger.info("Artifact locations")
         print("Artifact locations:\n")
         for job_id, job_data in jobs:
             paths = _EXPORT_HELPER.get_job_paths(job_data)
+            artifacts_list = _get_artifacts_file_list()
+
             if paths.get("storage_type") == "remote_ssh":
                 artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
                 logger.info(
                     "Artifacts", job_id=job_id, path=artifacts_path, remote=True
                 )
                 print(f"{job_id}: {artifacts_path} (remote)")
+                print(" └── Key files:")
+                for filename, desc in artifacts_list:
+                    print(f" ├── {filename} - {desc}")
             else:
                 ap = paths.get("artifacts_dir")
                 if ap:

@@ -306,6 +365,9 @@ class DebugCmd(ExportCmd):
                         exists_indicator=exists,
                     )
                     print(f"{job_id}: {ap} {exists} (local)")
+                    print(" └── Key files:")
+                    for filename, desc in artifacts_list:
+                        print(f" ├── {filename} - {desc}")

     def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
         for job_id, job_data in jobs:

@@ -383,6 +445,9 @@ class DebugCmd(ExportCmd):
                         print(
                             f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
                         )
+                # Show full destination path
+                full_dest_path = Path(dest_dir).resolve()
+                print(f"Copied to: {full_dest_path}")
             else:
                 err = result.get("error", "Unknown error")
                 logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
@@ -403,3 +468,45 @@ class DebugCmd(ExportCmd):
         except Exception:
             pass
         return ""
+
+
+# Helper functions for file descriptions (based on actual code and content analysis)
+def _get_artifacts_file_list() -> list[tuple[str, str]]:
+    """Files generated in artifacts/."""
+    return [
+        (
+            "results.yml",
+            "Benchmark scores, task results and resolved run configuration.",
+        ),
+        (
+            "eval_factory_metrics.json",
+            "Response + runtime stats (latency, tokens count, memory)",
+        ),
+        ("metrics.json", "Harness/benchmark metric and configuration"),
+        ("report.html", "Request-Response Pairs samples in HTML format (if enabled)"),
+        ("report.json", "Report data in json format, if enabled"),
+    ]
+
+
+def _get_log_file_list(executor_type: str) -> list[tuple[str, str]]:
+    """Files actually generated in logs/ - executor-specific."""
+    et = (executor_type or "local").lower()
+    if et == "slurm":
+        return [
+            ("client-{SLURM_JOB_ID}.out", "Evaluation container/process output"),
+            (
+                "slurm-{SLURM_JOB_ID}.out",
+                "SLURM scheduler stdout/stderr (batch submission, export steps).",
+            ),
+            (
+                "server-{SLURM_JOB_ID}.out",
+                "Model server logs when a deployment is used.",
+            ),
+        ]
+    # local executor
+    return [
+        (
+            "stdout.log",
+            "Complete evaluation output (timestamps, resolved config, run/export messages).",
+        ),
+    ]