nemo-evaluator-launcher 0.1.14__tar.gz → 0.1.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nemo-evaluator-launcher might be problematic.
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/functional.py +19 -29
- nemo_evaluator_launcher-0.1.16/src/nemo_evaluator_launcher/cli/debug.py +405 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/ls_runs.py +26 -6
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/main.py +24 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/run.py +4 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/execution/local.yaml +1 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/base.py +23 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/executor.py +17 -71
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/local/executor.py +48 -7
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/local/run.template.sh +18 -6
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/slurm/executor.py +40 -22
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/local.py +25 -16
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/mlflow.py +168 -70
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/utils.py +85 -33
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/wandb.py +40 -5
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +1 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/README.md +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/pyproject.toml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/setup.cfg +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/types.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/export.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/status.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/version.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
- {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
src/nemo_evaluator_launcher/api/functional.py

@@ -440,23 +440,28 @@ def export_results(
         single_id = invocation_ids[0]

         if "." in single_id:  # job_id
+            # Try reading config from artifacts working dir (auto-export on remote node)
+            cfg_file = None
+            for name in ("run_config.yml", "config.yml"):
+                p = Path(name)
+                if p.exists():
+                    cfg_file = p
+                    break
+
             md_job_data = None
-
-            ypath_artifacts = Path("run_config.yml")
-            if ypath_artifacts.exists():
+            if cfg_file:
                 try:
                     cfg_yaml = (
-                        yaml.safe_load(ypath_artifacts.read_text(encoding="utf-8"))
-                        or {}
+                        yaml.safe_load(cfg_file.read_text(encoding="utf-8")) or {}
                     )
-
+
+                    # Merge exporter override file if present
                     ypath_export = Path("export_config.yml")
                     if ypath_export.exists():
                         exp_yaml = (
                             yaml.safe_load(ypath_export.read_text(encoding="utf-8"))
                             or {}
                         )
-                        # execution.auto_export contains auto-export destinations
                         exec_cfg = cfg_yaml.get("execution") or {}
                         auto_exp = (exp_yaml.get("execution") or {}).get(
                             "auto_export"
@@ -464,42 +469,30 @@ def export_results(
                         if auto_exp is not None:
                             exec_cfg["auto_export"] = auto_exp
                             cfg_yaml["execution"] = exec_cfg
-
-                        # top-level export block contains exporter config
                         if "export" in exp_yaml:
                             cfg_yaml["export"] = exp_yaml["export"]
-
-                        # Merge evaluation.tasks from export_config (Slurm writes it there)
                         if "evaluation" in exp_yaml and exp_yaml["evaluation"]:
                             eval_cfg = cfg_yaml.get("evaluation") or {}
                             eval_cfg.update(exp_yaml["evaluation"])
                             cfg_yaml["evaluation"] = eval_cfg

-                    # metadata
                     executor_name = (cfg_yaml.get("execution") or {}).get(
                         "type", "local"
                     )
-
                     md_job_data = JobData(
                         invocation_id=single_id.split(".")[0],
                         job_id=single_id,
                         timestamp=0.0,
-                        executor=executor_name,
+                        executor=executor_name,  # ensures slurm tag is preserved
                         data={
                             "output_dir": str(Path.cwd().parent),
-                            "storage_type": "remote_local",
+                            "storage_type": "remote_local",  # no SSH in auto-export path
                         },
                         config=cfg_yaml,
                     )
-                    # DEBUG: print what we loaded
-                    print(f"DEBUG: cfg_yaml keys: {list(cfg_yaml.keys())}")
-                    if "evaluation" in cfg_yaml:
-                        print(
-                            f"DEBUG: evaluation.tasks: {cfg_yaml.get('evaluation', {}).get('tasks')}"
-                        )
                 except Exception:
                     md_job_data = None
-
+
             job_data = md_job_data or ExecutionDB().get_job(single_id)
             if job_data is None:
                 return {
@@ -507,7 +500,6 @@ def export_results(
                     "error": f"Job {single_id} not found in ExecutionDB",
                 }

-            # Convert single job result to invocation-like structure
             job_result = exporter.export_job(job_data)
             return {
                 "success": job_result.success,
@@ -522,10 +514,9 @@ def export_results(
                 },
                 "metadata": job_result.metadata or {},
             }
+
         elif single_id.isdigit():  # pipeline_id
-            # Find job by pipeline_id
             db = ExecutionDB()
-            # Search all jobs for matching pipeline_id
             for job_id, job_data in db._jobs.items():
                 if job_data.data.get("pipeline_id") == int(single_id):
                     job_result = exporter.export_job(job_data)
@@ -542,14 +533,13 @@ def export_results(
                         "metadata": job_result.metadata or {},
                     }
             return {"success": False, "error": f"Pipeline {single_id} not found"}
+
         else:  # invocation_id
             result = exporter.export_invocation(single_id)
-            # Ensure metadata is present in job results to prevent KeyError
             if "jobs" in result:
                 for job_id, job_result in result["jobs"].items():
-                    if "metadata" not in job_result:
-                        job_result["metadata"] = {}
-            return result  # type: ignore[no-any-return]
+                    job_result.setdefault("metadata", {})
+            return result
     else:
         # Multiple IDs - parse and group
         db = ExecutionDB()
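Taken together, these functional.py hunks replace a hard-coded run_config.yml lookup (and leftover DEBUG prints) with a two-filename fallback plus a documented merge order for the auto-export path. A minimal standalone sketch of that merge order, paraphrasing the hunks above (load_autoexport_config is an illustrative name, not a function in the package):

    # Sketch: how the auto-export path assembles its config, per the diff above.
    from pathlib import Path
    from typing import Optional

    import yaml


    def load_autoexport_config() -> Optional[dict]:
        # 1. Base config: first of run_config.yml / config.yml in the working dir.
        cfg_file = next(
            (p for p in (Path("run_config.yml"), Path("config.yml")) if p.exists()),
            None,
        )
        if cfg_file is None:
            return None
        cfg = yaml.safe_load(cfg_file.read_text(encoding="utf-8")) or {}

        # 2. Optional overrides from export_config.yml: execution.auto_export,
        #    the top-level export block, and the evaluation block are merged in.
        export_file = Path("export_config.yml")
        if export_file.exists():
            exp = yaml.safe_load(export_file.read_text(encoding="utf-8")) or {}
            exec_cfg = cfg.get("execution") or {}
            auto_exp = (exp.get("execution") or {}).get("auto_export")
            if auto_exp is not None:
                exec_cfg["auto_export"] = auto_exp
                cfg["execution"] = exec_cfg
            if "export" in exp:
                cfg["export"] = exp["export"]
            if exp.get("evaluation"):
                eval_cfg = cfg.get("evaluation") or {}
                eval_cfg.update(exp["evaluation"])
                cfg["evaluation"] = eval_cfg
        return cfg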
src/nemo_evaluator_launcher/cli/debug.py (new file)

@@ -0,0 +1,405 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Debugging helper functionalities for nemo-evaluator-launcher."""
+
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from simple_parsing import field
+
+from nemo_evaluator_launcher.cli.export import ExportCmd
+from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
+from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
+from nemo_evaluator_launcher.common.logging_utils import logger
+from nemo_evaluator_launcher.exporters.local import LocalExporter
+from nemo_evaluator_launcher.exporters.utils import get_task_name
+
+# Local exporter helper to copy logs and artifacts
+_EXPORT_HELPER = LocalExporter({})
+
+
+@dataclass
+class DebugCmd(ExportCmd):
+    """Debugging functionalities for nemo-evaluator-launcher.
+
+    Examples:
+        nemo-evaluator-launcher debug <inv>                          # Full debug info
+        nemo-evaluator-launcher debug <inv> --config                 # Show stored job config (YAML)
+        nemo-evaluator-launcher debug <inv> --artifacts              # Show artifact locations
+        nemo-evaluator-launcher debug <inv> --logs                   # Show log locations
+        nemo-evaluator-launcher debug <inv> --copy-logs <path>       # Copy logs (default: current dir)
+        nemo-evaluator-launcher debug <inv> --copy-artifacts <path>  # Copy artifacts (default: current dir)
+
+    Notes:
+        - Supports invocation IDs and job IDs
+        - Shows local or remote paths depending on executor (local/slurm/lepton)
+    """
+
+    # local exporter destination defaults to local
+    dest: str = field(default="local", init=False)
+
+    # debug modes
+    config: bool = field(default=False, help="Show job configuration")
+    artifacts: bool = field(default=False, help="Show artifact locations")
+    logs: bool = field(default=False, help="Show log locations")
+
+    # copy operations
+    copy_logs: Optional[str] = field(
+        default=None,
+        alias=["--copy-logs"],
+        nargs="?",
+        help="Copy logs to local directory (default: current dir)",
+    )
+    copy_artifacts: Optional[str] = field(
+        default=None,
+        alias=["--copy-artifacts"],
+        nargs="?",
+        help="Copy artifacts to local directory (default: current dir)",
+    )
+
+    def execute(self) -> None:
+        # show version
+        VersionCmd().execute()
+
+        logger.info("Debug command started", invocation_ids=self.invocation_ids)
+
+        if not self.invocation_ids:
+            logger.error("No invocation IDs provided")
+            raise ValueError("No job or invocation IDs provided.")
+
+        jobs = self._resolve_jobs()
+        logger.info(
+            "Resolved jobs",
+            total_ids=len(self.invocation_ids),
+            valid_jobs=len(jobs),
+            job_ids=[jid for jid, _ in jobs],
+        )
+
+        if not jobs:
+            logger.info(
+                "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
+            )
+            print(
+                "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
+            )
+            return
+
+        if self.config:
+            logger.info("Showing job configuration", job_count=len(jobs))
+            self._show_config_info(jobs)
+        elif self.logs:
+            logger.info("Showing job logs locations", job_count=len(jobs))
+            self._show_logs_info(jobs)
+        elif self.artifacts:
+            logger.info("Showing artifacts locations", job_count=len(jobs))
+            self._show_artifacts_info(jobs)
+        elif self.copy_logs is not None:
+            dest = self.copy_logs or "."
+            if not self.copy_logs:
+                print(
+                    "No destination provided for --copy-logs; defaulting to current dir"
+                )
+            logger.info(
+                "Copying logs to local directory", dest_dir=dest, job_count=len(jobs)
+            )
+            self._copy_logs(jobs, dest)
+        elif self.copy_artifacts is not None:
+            dest = self.copy_artifacts or "."
+            if not self.copy_artifacts:
+                print(
+                    "No destination provided for --copy-artifacts; defaulting to current dir"
+                )
+            logger.info(
+                "Copying artifacts to local directory",
+                dest_dir=dest,
+                job_count=len(jobs),
+            )
+            self._copy_artifacts(jobs, dest)
+        else:
+            logger.info(
+                "Job metadata details",
+                invocation_id=jobs[0][1].invocation_id if jobs else None,
+                jobs=len(jobs),
+            )
+            self._show_invocation_debug_info(jobs)
+
+    def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
+        """Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
+        db = ExecutionDB()
+        found: list[tuple[str, JobData]] = []
+        for id_or_prefix in self.invocation_ids:
+            if "." in id_or_prefix:
+                jd = db.get_job(id_or_prefix)
+                if jd:
+                    found.append((jd.job_id, jd))
+            else:
+                for jid, jd in db.get_jobs(id_or_prefix).items():
+                    found.append((jid, jd))
+        # deduplicate and stable sort
+        seen: set[str] = set()
+        uniq: list[tuple[str, JobData]] = []
+        for jid, jd in found:
+            if jid not in seen:
+                seen.add(jid)
+                uniq.append((jid, jd))
+        return sorted(uniq, key=lambda p: p[0])
+
+    def _show_invocation_debug_info(self, jobs: List[Tuple[str, JobData]]) -> None:
+        inv = jobs[0][1].invocation_id if jobs else None
+        logger.info("Debug information", jobs=len(jobs), invocation=inv)
+        print(
+            f"Debug information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
+        )
+
+        for job_id, job_data in jobs:
+            self._show_job_debug_info(job_id, job_data)
+            print()
+
+        # footer hint: where to find more metadata
+        print(
+            "For more details about this run, inspect the Execution DB under your home dir:"
+        )
+        print(f"Path: {EXEC_DB_FILE}")
+        if inv:
+            print(f"├── Lookup key: invocation_id={inv}")
+
+        # Next steps hint
+        print("\nNext steps:")
+        print(" - Use --logs to show log locations.")
+        print(" - Use --artifacts to show artifact locations.")
+        print(" - Use --config to show stored job configuration (YAML).")
+        print(" - Use --copy-logs [DIR] to copy logs to a local directory.")
+        print(" - Use --copy-artifacts [DIR] to copy artifacts to a local directory.")
+
+    def _show_job_debug_info(self, job_id: str, job_data: JobData) -> None:
+        logger.info("Job", job_id=job_id)
+        print(f"Job {job_id}")
+
+        # metadata
+        try:
+            when = datetime.fromtimestamp(job_data.timestamp).isoformat(
+                timespec="seconds"
+            )
+        except Exception:
+            when = str(job_data.timestamp)
+        logger.info("Executor", job_id=job_id, executor=job_data.executor)
+        logger.info("Created", job_id=job_id, created=when)
+        print(f"├── Executor: {job_data.executor}")
+        print(f"├── Created: {when}")
+
+        task_name = get_task_name(job_data)
+        if task_name:
+            logger.info("Task", job_id=job_id, name=task_name)
+            print(f"├── Task: {task_name}")
+
+        # locations via exporter helper
+        paths = _EXPORT_HELPER.get_job_paths(job_data)
+
+        # Artifacts
+        if paths.get("storage_type") == "remote_ssh":
+            artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
+            logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
+            print(f"├── Artifacts: {artifacts_path} (remote)")
+        else:
+            ap = paths.get("artifacts_dir")
+            if ap:
+                exists = self._check_path_exists(paths, "artifacts")
+                logger.info(
+                    "Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
+                )
+                print(f"├── Artifacts: {ap} {exists} (local)")
+
+        # Logs
+        if paths.get("storage_type") == "remote_ssh":
+            logs_path = (
+                f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
+            )
+            logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
+            print(f"├── Logs: {logs_path} (remote)")
+        else:
+            lp = paths.get("logs_dir")
+            if lp:
+                exists = self._check_path_exists(paths, "logs")
+                logger.info(
+                    "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
+                )
+                print(f"├── Logs: {lp} {exists} (local)")
+
+        # executor-specific
+        d = job_data.data or {}
+        cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
+        exec_type = (job_data.executor or cfg_exec_type or "").lower()
+
+        if exec_type == "slurm":
+            sj = d.get("slurm_job_id")
+            if sj:
+                print(f"├── Slurm Job ID: {sj}")
+        elif exec_type == "gitlab":
+            pid = d.get("pipeline_id")
+            if pid:
+                print(f"├── Pipeline ID: {pid}")
+        elif exec_type == "lepton":
+            jn = d.get("lepton_job_name")
+            if jn:
+                print(f"├── Lepton Job: {jn}")
+            en = d.get("endpoint_name")
+            if en:
+                print(f"├── Endpoint: {en}")
+            eu = d.get("endpoint_url")
+            if eu:
+                print(f"├── Endpoint URL: {eu}")
+        # local and others: paths already displayed above; no extra fields needed
+
+    def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
+        logger.info("Log locations")
+        print("Log locations:\n")
+        for job_id, job_data in jobs:
+            paths = _EXPORT_HELPER.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
+                logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
+                print(f"{job_id}: {logs_path} (remote)")
+            else:
+                lp = paths.get("logs_dir")
+                if lp:
+                    exists = self._check_path_exists(paths, "logs")
+                    logger.info(
+                        "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
+                    )
+                    print(f"{job_id}: {lp} {exists} (local)")
+
+    def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
+        logger.info("Artifact locations")
+        print("Artifact locations:\n")
+        for job_id, job_data in jobs:
+            paths = _EXPORT_HELPER.get_job_paths(job_data)
+            if paths.get("storage_type") == "remote_ssh":
+                artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
+                logger.info(
+                    "Artifacts", job_id=job_id, path=artifacts_path, remote=True
+                )
+                print(f"{job_id}: {artifacts_path} (remote)")
+            else:
+                ap = paths.get("artifacts_dir")
+                if ap:
+                    exists = self._check_path_exists(paths, "artifacts")
+                    logger.info(
+                        "Artifacts",
+                        job_id=job_id,
+                        path=str(ap),
+                        exists_indicator=exists,
+                    )
+                    print(f"{job_id}: {ap} {exists} (local)")
+
+    def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
+        for job_id, job_data in jobs:
+            logger.info("Configuration for job", job_id=job_id)
+            print(f"Configuration for {job_id}:")
+            if job_data.config:
+                import yaml
+
+                config_yaml = yaml.dump(
+                    job_data.config, default_flow_style=False, indent=2
+                )
+                logger.info("Configuration YAML", job_id=job_id, config=config_yaml)
+                print(config_yaml)
+            else:
+                logger.info("No configuration stored for this job", job_id=job_id)
+                print("  No configuration stored for this job.")
+            print()
+
+    def _copy_logs(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
+        """Copy logs using export functionality."""
+        self._copy_content(jobs, dest_dir, copy_logs=True, copy_artifacts=False)
+
+    def _copy_artifacts(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
+        """Copy artifacts using export functionality."""
+        self._copy_content(jobs, dest_dir, copy_logs=False, copy_artifacts=True)
+
+    def _copy_content(
+        self,
+        jobs: List[Tuple[str, JobData]],
+        dest_dir: str,
+        copy_logs: bool,
+        copy_artifacts: bool,
+    ) -> None:
+        logger.debug(
+            "Preparing export call",
+            dest_dir=dest_dir,
+            copy_logs=copy_logs,
+            copy_artifacts=copy_artifacts,
+            job_ids=[jid for jid, _ in jobs],
+        )
+
+        from nemo_evaluator_launcher.api.functional import export_results
+
+        config = {
+            "output_dir": dest_dir,
+            "only_required": True,
+            "copy_logs": bool(copy_logs) and not bool(copy_artifacts),
+            "copy_artifacts": bool(copy_artifacts) and not bool(copy_logs),
+        }
+        # skip artifact validation
+        if copy_logs and not copy_artifacts:
+            config["skip_validation"] = True
+
+        job_ids = [job_id for job_id, _ in jobs]
+        kind = "logs" if copy_logs else "artifacts"
+        logger.info(
+            "Copying content", kind=kind, job_count=len(job_ids), dest_dir=dest_dir
+        )
+        print(f"Copying {kind} for {len(job_ids)} job(s) to {dest_dir}...")
+
+        result = export_results(job_ids, "local", config)
+        logger.debug("Export API call completed", success=result.get("success"))
+
+        if result.get("success"):
+            logger.info(
+                "Content copy completed successfully",
+                dest_dir=dest_dir,
+                job_count=len(jobs),
+            )
+            if "jobs" in result:
+                for jid, job_result in result["jobs"].items():
+                    if job_result.get("success"):
+                        print(f"{jid}: Success")
+                    else:
+                        print(
+                            f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
+                        )
+        else:
+            err = result.get("error", "Unknown error")
+            logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
+            print(f"Failed to copy {kind}: {err}")
+
+    def _check_path_exists(self, paths: Dict[str, Any], path_type: str) -> str:
+        """Check if a path exists and return indicator."""
+        try:
+            if paths.get("storage_type") == "remote_ssh":
+                # For remote paths, we can't easily check existence
+                return "(remote)"
+            elif path_type == "logs" and "logs_dir" in paths:
+                logs_dir = Path(paths["logs_dir"])
+                return "(exists)" if logs_dir.exists() else "(not found)"
+            elif path_type == "artifacts" and "artifacts_dir" in paths:
+                artifacts_dir = Path(paths["artifacts_dir"])
+                return "(exists)" if artifacts_dir.exists() else "(not found)"
+        except Exception:
+            pass
+        return ""
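With the main.py wiring shown below, the new debug verb is invoked like the existing subcommands; for example (IDs are placeholders):

    nemo-evaluator-launcher debug 1a2b3c4d --logs
    nemo-evaluator-launcher debug 1a2b3c4d.0 --copy-artifacts ./out

Both invocation IDs (1a2b3c4d) and job IDs (1a2b3c4d.0) are accepted, per the class docstring above.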
src/nemo_evaluator_launcher/cli/ls_runs.py

@@ -20,6 +20,8 @@ from typing import Optional

 from simple_parsing import field

+from nemo_evaluator_launcher.common.logging_utils import logger
+

 @dataclass
 class Cmd:
@@ -27,12 +29,16 @@ class Cmd:

     limit: Optional[int] = field(default=None, alias=["--limit"], help="Max rows")
     executor: Optional[str] = field(
-        default=None, alias=["--executor"], help="Filter by executor"
+        default=None,
+        alias=["--executor"],
+        help="Filter by executor",
     )
+    # TODO(agronskiy): think about if we can propagate a `--status` filter into here.
     since: Optional[str] = field(
         default=None,
         alias=["--since"],
-        help="Filter by ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00)"
+        help="Filter by either ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00) or "
+        "an interval into the past, e.g. `1d` or `3h`; formally `{N}[d|h]`.",
     )

     def execute(self) -> None:
@@ -53,7 +59,22 @@ class Cmd:

         if self.since:
             try:
-                if "T" in self.since:
+                # Check if it's a relative time format like "1d" or "3h"
+                if self.since.lower().endswith("d") and len(self.since) > 1:
+                    days = int(self.since[:-1])
+                    if days < 0:
+                        raise ValueError("Days should be non-negative")
+                    since_ts = (
+                        _dt.datetime.now() - _dt.timedelta(days=days)
+                    ).timestamp()
+                elif self.since.lower().endswith("h") and len(self.since) > 1:
+                    hours = int(self.since[:-1])
+                    if hours < 0:
+                        raise ValueError("Hours should be non-negative")
+                    since_ts = (
+                        _dt.datetime.now() - _dt.timedelta(hours=hours)
+                    ).timestamp()
+                elif "T" in self.since:
                     since_ts = _dt.datetime.fromisoformat(self.since).timestamp()
                 else:
                     since_ts = _dt.datetime.fromisoformat(
@@ -61,9 +82,8 @@ class Cmd:
                     ).timestamp()
                 rows = [r for r in rows if (r.get("earliest_job_ts") or 0) >= since_ts]
             except Exception:
-                print(
-                    f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS",
-                    file=sys.stderr,
+                logger.fatal(
+                    f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or N[d|h] for N days|hours."
                 )
                 sys.exit(2)

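The reworked --since filter accepts both forms advertised in the new help string, for example:

    nemo-evaluator-launcher ls runs --since 2025-08-20T12:00:00
    nemo-evaluator-launcher ls runs --since 1d   # runs from the last day
    nemo-evaluator-launcher ls runs --since 3h   # runs from the last 3 hours

A malformed value (including a negative interval, which raises inside the try block) now exits with code 2 via logger.fatal instead of a bare print to stderr.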
src/nemo_evaluator_launcher/cli/main.py

@@ -19,6 +19,7 @@ import os

 from simple_parsing import ArgumentParser

+import nemo_evaluator_launcher.cli.debug as debug
 import nemo_evaluator_launcher.cli.export as export
 import nemo_evaluator_launcher.cli.kill as kill
 import nemo_evaluator_launcher.cli.ls_runs as ls_runs
@@ -38,7 +39,16 @@ def is_verbose_enabled(args) -> bool:
         return True

     # Check subcommand verbose flags
-    subcommands = ["run", "status", "kill", "tasks_alias", "tasks", "runs", "export"]
+    subcommands = [
+        "run",
+        "status",
+        "kill",
+        "tasks_alias",
+        "tasks",
+        "runs",
+        "export",
+        "debug",
+    ]
     for subcmd in subcommands:
         if hasattr(args, subcmd) and hasattr(getattr(args, subcmd), "verbose"):
             if getattr(getattr(args, subcmd), "verbose"):
@@ -153,6 +163,17 @@ def create_parser() -> ArgumentParser:
     )
     export_parser.add_arguments(export.ExportCmd, dest="export")

+    # Debug helper subcommand
+    debug_parser = subparsers.add_parser(
+        "debug",
+        help="Display evaluation job information",
+        description="Debug helper functionalities for nemo-evaluator-launcher",
+    )
+    debug_parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Enable verbose logging"
+    )
+    debug_parser.add_arguments(debug.DebugCmd, dest="debug")
+
     return parser


@@ -197,6 +218,8 @@ def main() -> None:
         args.runs.execute()
     elif args.command == "export":
         args.export.execute()
+    elif args.command == "debug":
+        args.debug.execute()


 if __name__ == "__main__":
src/nemo_evaluator_launcher/cli/run.py

@@ -168,3 +168,7 @@ class Cmd:
         for idx, task in enumerate(tasks):
             job_id = f"{invocation_id}.{idx}"
             print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
+        print(
+            "to print all jobs: nemo-evaluator-launcher ls runs"
+            "\n (--since 1d or --since 6h for time span, see --help)"
+        )
src/nemo_evaluator_launcher/configs/deployment/vllm.yaml

@@ -21,6 +21,7 @@ port: 8000
 tensor_parallel_size: 8
 pipeline_parallel_size: 1
 data_parallel_size: 1
+gpu_memory_utilization: 0.95
 extra_args: ""
 env_vars: {} # {name: value} dict

@@ -37,5 +38,5 @@ command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
   --trust-remote-code
   --served-model-name ${deployment.served_model_name}
   --enforce-eager
-  --gpu-memory-utilization
+  --gpu-memory-utilization ${deployment.gpu_memory_utilization}
   ${deployment.extra_args}
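Because the GPU memory utilization is now templated, it can be tuned per run through the deployment config instead of editing the serve command. A sketch of a config fragment, assuming the OmegaConf-style interpolation this file already uses (exact override mechanics are not shown in this diff):

    deployment:
      gpu_memory_utilization: 0.85  # default is 0.95, per the hunk above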