nemo-evaluator-launcher 0.1.14__tar.gz → 0.1.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (64)
  1. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/PKG-INFO +1 -1
  2. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/functional.py +19 -29
  3. nemo_evaluator_launcher-0.1.16/src/nemo_evaluator_launcher/cli/debug.py +405 -0
  4. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/ls_runs.py +26 -6
  5. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/main.py +24 -1
  6. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/run.py +4 -0
  7. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -1
  8. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/execution/local.yaml +1 -0
  9. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/base.py +23 -0
  10. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/executor.py +17 -71
  11. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/local/executor.py +48 -7
  12. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/local/run.template.sh +18 -6
  13. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/slurm/executor.py +40 -22
  14. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/local.py +25 -16
  15. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/mlflow.py +168 -70
  16. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/utils.py +85 -33
  17. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/wandb.py +40 -5
  18. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/package_info.py +1 -1
  19. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +1 -1
  20. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +1 -0
  21. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/LICENSE +0 -0
  22. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/README.md +0 -0
  23. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/pyproject.toml +0 -0
  24. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/setup.cfg +0 -0
  25. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/__init__.py +0 -0
  26. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
  27. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/types.py +0 -0
  28. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/api/utils.py +0 -0
  29. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
  30. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/export.py +0 -0
  31. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
  32. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
  33. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/status.py +0 -0
  34. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/cli/version.py +0 -0
  35. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
  36. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
  37. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/helpers.py +0 -0
  38. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/logging_utils.py +0 -0
  39. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
  40. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
  41. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
  42. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
  43. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
  44. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
  45. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +0 -0
  46. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +0 -0
  47. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
  48. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +0 -0
  49. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
  50. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
  51. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +0 -0
  52. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
  53. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
  54. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
  55. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
  56. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
  57. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
  58. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
  59. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
  60. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher/resources/mapping.toml +0 -0
  61. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
  62. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +0 -0
  63. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/requires.txt +0 -0
  64. {nemo_evaluator_launcher-0.1.14 → nemo_evaluator_launcher-0.1.16}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nemo-evaluator-launcher
3
- Version: 0.1.14
3
+ Version: 0.1.16
4
4
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
5
5
  Author: NVIDIA
6
6
  Author-email: nemo-toolkit@nvidia.com
@@ -440,23 +440,28 @@ def export_results(
440
440
  single_id = invocation_ids[0]
441
441
 
442
442
  if "." in single_id: # job_id
443
+ # Try reading config from artifacts working dir (auto-export on remote node)
444
+ cfg_file = None
445
+ for name in ("run_config.yml", "config.yml"):
446
+ p = Path(name)
447
+ if p.exists():
448
+ cfg_file = p
449
+ break
450
+
443
451
  md_job_data = None
444
- # Use artifacts/run_config.yml if present
445
- ypath_artifacts = Path("run_config.yml")
446
- if ypath_artifacts.exists():
452
+ if cfg_file:
447
453
  try:
448
454
  cfg_yaml = (
449
- yaml.safe_load(ypath_artifacts.read_text(encoding="utf-8"))
450
- or {}
455
+ yaml.safe_load(cfg_file.read_text(encoding="utf-8")) or {}
451
456
  )
452
- # merge exporter config if present
457
+
458
+ # Merge exporter override file if present
453
459
  ypath_export = Path("export_config.yml")
454
460
  if ypath_export.exists():
455
461
  exp_yaml = (
456
462
  yaml.safe_load(ypath_export.read_text(encoding="utf-8"))
457
463
  or {}
458
464
  )
459
- # execution.auto_export contains auto-export destinations
460
465
  exec_cfg = cfg_yaml.get("execution") or {}
461
466
  auto_exp = (exp_yaml.get("execution") or {}).get(
462
467
  "auto_export"
@@ -464,42 +469,30 @@ def export_results(
464
469
  if auto_exp is not None:
465
470
  exec_cfg["auto_export"] = auto_exp
466
471
  cfg_yaml["execution"] = exec_cfg
467
-
468
- # top-level export block contains exporter config
469
472
  if "export" in exp_yaml:
470
473
  cfg_yaml["export"] = exp_yaml["export"]
471
-
472
- # Merge evaluation.tasks from export_config (Slurm writes it there)
473
474
  if "evaluation" in exp_yaml and exp_yaml["evaluation"]:
474
475
  eval_cfg = cfg_yaml.get("evaluation") or {}
475
476
  eval_cfg.update(exp_yaml["evaluation"])
476
477
  cfg_yaml["evaluation"] = eval_cfg
477
478
 
478
- # metadata
479
479
  executor_name = (cfg_yaml.get("execution") or {}).get(
480
480
  "type", "local"
481
481
  )
482
-
483
482
  md_job_data = JobData(
484
483
  invocation_id=single_id.split(".")[0],
485
484
  job_id=single_id,
486
485
  timestamp=0.0,
487
- executor=executor_name,
486
+ executor=executor_name, # ensures slurm tag is preserved
488
487
  data={
489
488
  "output_dir": str(Path.cwd().parent),
490
- "storage_type": "remote_local",
489
+ "storage_type": "remote_local", # no SSH in auto-export path
491
490
  },
492
491
  config=cfg_yaml,
493
492
  )
494
- # DEBUG: print what we loaded
495
- print(f"DEBUG: cfg_yaml keys: {list(cfg_yaml.keys())}")
496
- if "evaluation" in cfg_yaml:
497
- print(
498
- f"DEBUG: evaluation.tasks: {cfg_yaml.get('evaluation', {}).get('tasks')}"
499
- )
500
493
  except Exception:
501
494
  md_job_data = None
502
- # fallback to execDB only
495
+
503
496
  job_data = md_job_data or ExecutionDB().get_job(single_id)
504
497
  if job_data is None:
505
498
  return {
@@ -507,7 +500,6 @@ def export_results(
507
500
  "error": f"Job {single_id} not found in ExecutionDB",
508
501
  }
509
502
 
510
- # Convert single job result to invocation-like structure
511
503
  job_result = exporter.export_job(job_data)
512
504
  return {
513
505
  "success": job_result.success,
@@ -522,10 +514,9 @@ def export_results(
522
514
  },
523
515
  "metadata": job_result.metadata or {},
524
516
  }
517
+
525
518
  elif single_id.isdigit(): # pipeline_id
526
- # Find job by pipeline_id
527
519
  db = ExecutionDB()
528
- # Search all jobs for matching pipeline_id
529
520
  for job_id, job_data in db._jobs.items():
530
521
  if job_data.data.get("pipeline_id") == int(single_id):
531
522
  job_result = exporter.export_job(job_data)
@@ -542,14 +533,13 @@ def export_results(
542
533
  "metadata": job_result.metadata or {},
543
534
  }
544
535
  return {"success": False, "error": f"Pipeline {single_id} not found"}
536
+
545
537
  else: # invocation_id
546
538
  result = exporter.export_invocation(single_id)
547
- # Ensure metadata is present in job results to prevent KeyError
548
539
  if "jobs" in result:
549
540
  for job_id, job_result in result["jobs"].items():
550
- if "metadata" not in job_result:
551
- job_result["metadata"] = {}
552
- return result # type: ignore[no-any-return]
541
+ job_result.setdefault("metadata", {})
542
+ return result
553
543
  else:
554
544
  # Multiple IDs - parse and group
555
545
  db = ExecutionDB()
@@ -0,0 +1,405 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """Debugging helper functionalities for nemo-evaluator-launcher."""
18
+
19
+ from dataclasses import dataclass
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+ from typing import Any, Dict, List, Optional, Tuple
23
+
24
+ from simple_parsing import field
25
+
26
+ from nemo_evaluator_launcher.cli.export import ExportCmd
27
+ from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
28
+ from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
29
+ from nemo_evaluator_launcher.common.logging_utils import logger
30
+ from nemo_evaluator_launcher.exporters.local import LocalExporter
31
+ from nemo_evaluator_launcher.exporters.utils import get_task_name
32
+
33
+ # Local exporter helper to copy logs and artifacts
34
+ _EXPORT_HELPER = LocalExporter({})
35
+
36
+
37
+ @dataclass
38
+ class DebugCmd(ExportCmd):
39
+ """Debugging functionalities for nemo-evaluator-launcher.
40
+
41
+ Examples:
42
+ nemo-evaluator-launcher debug <inv> # Full debug info
43
+ nemo-evaluator-launcher debug <inv> --config # Show stored job config (YAML)
44
+ nemo-evaluator-launcher debug <inv> --artifacts # Show artifact locations
45
+ nemo-evaluator-launcher debug <inv> --logs # Show log locations
46
+ nemo-evaluator-launcher debug <inv> --copy-logs <path> # Copy logs (default: current dir)
47
+ nemo-evaluator-launcher debug <inv> --copy-artifacts <path> # Copy artifacts (default: current dir)
48
+
49
+ Notes:
50
+ - Supports invocation IDs and job IDs
51
+ - Shows local or remote paths depending on executor (local/slurm/lepton)
52
+ """
53
+
54
+ # local exporter destination defaults to local
55
+ dest: str = field(default="local", init=False)
56
+
57
+ # debug modes
58
+ config: bool = field(default=False, help="Show job configuration")
59
+ artifacts: bool = field(default=False, help="Show artifact locations")
60
+ logs: bool = field(default=False, help="Show log locations")
61
+
62
+ # copy operations
63
+ copy_logs: Optional[str] = field(
64
+ default=None,
65
+ alias=["--copy-logs"],
66
+ nargs="?",
67
+ help="Copy logs to local directory (default: current dir)",
68
+ )
69
+ copy_artifacts: Optional[str] = field(
70
+ default=None,
71
+ alias=["--copy-artifacts"],
72
+ nargs="?",
73
+ help="Copy artifacts to local directory (default: current dir)",
74
+ )
75
+
76
+ def execute(self) -> None:
77
+ # show version
78
+ VersionCmd().execute()
79
+
80
+ logger.info("Debug command started", invocation_ids=self.invocation_ids)
81
+
82
+ if not self.invocation_ids:
83
+ logger.error("No invocation IDs provided")
84
+ raise ValueError("No job or invocation IDs provided.")
85
+
86
+ jobs = self._resolve_jobs()
87
+ logger.info(
88
+ "Resolved jobs",
89
+ total_ids=len(self.invocation_ids),
90
+ valid_jobs=len(jobs),
91
+ job_ids=[jid for jid, _ in jobs],
92
+ )
93
+
94
+ if not jobs:
95
+ logger.info(
96
+ "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
97
+ )
98
+ print(
99
+ "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
100
+ )
101
+ return
102
+
103
+ if self.config:
104
+ logger.info("Showing job configuration", job_count=len(jobs))
105
+ self._show_config_info(jobs)
106
+ elif self.logs:
107
+ logger.info("Showing job logs locations", job_count=len(jobs))
108
+ self._show_logs_info(jobs)
109
+ elif self.artifacts:
110
+ logger.info("Showing artifacts locations", job_count=len(jobs))
111
+ self._show_artifacts_info(jobs)
112
+ elif self.copy_logs is not None:
113
+ dest = self.copy_logs or "."
114
+ if not self.copy_logs:
115
+ print(
116
+ "No destination provided for --copy-logs; defaulting to current dir"
117
+ )
118
+ logger.info(
119
+ "Copying logs to local directory", dest_dir=dest, job_count=len(jobs)
120
+ )
121
+ self._copy_logs(jobs, dest)
122
+ elif self.copy_artifacts is not None:
123
+ dest = self.copy_artifacts or "."
124
+ if not self.copy_artifacts:
125
+ print(
126
+ "No destination provided for --copy-artifacts; defaulting to current dir"
127
+ )
128
+ logger.info(
129
+ "Copying artifacts to local directory",
130
+ dest_dir=dest,
131
+ job_count=len(jobs),
132
+ )
133
+ self._copy_artifacts(jobs, dest)
134
+ else:
135
+ logger.info(
136
+ "Job metadata details",
137
+ invocation_id=jobs[0][1].invocation_id if jobs else None,
138
+ jobs=len(jobs),
139
+ )
140
+ self._show_invocation_debug_info(jobs)
141
+
142
+ def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
143
+ """Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
144
+ db = ExecutionDB()
145
+ found: list[tuple[str, JobData]] = []
146
+ for id_or_prefix in self.invocation_ids:
147
+ if "." in id_or_prefix:
148
+ jd = db.get_job(id_or_prefix)
149
+ if jd:
150
+ found.append((jd.job_id, jd))
151
+ else:
152
+ for jid, jd in db.get_jobs(id_or_prefix).items():
153
+ found.append((jid, jd))
154
+ # deduplicate and stable sort
155
+ seen: set[str] = set()
156
+ uniq: list[tuple[str, JobData]] = []
157
+ for jid, jd in found:
158
+ if jid not in seen:
159
+ seen.add(jid)
160
+ uniq.append((jid, jd))
161
+ return sorted(uniq, key=lambda p: p[0])
162
+
163
+ def _show_invocation_debug_info(self, jobs: List[Tuple[str, JobData]]) -> None:
164
+ inv = jobs[0][1].invocation_id if jobs else None
165
+ logger.info("Debug information", jobs=len(jobs), invocation=inv)
166
+ print(
167
+ f"Debug information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
168
+ )
169
+
170
+ for job_id, job_data in jobs:
171
+ self._show_job_debug_info(job_id, job_data)
172
+ print()
173
+
174
+ # footer hint: where to find more metadata
175
+ print(
176
+ "For more details about this run, inspect the Execution DB under your home dir:"
177
+ )
178
+ print(f"Path: {EXEC_DB_FILE}")
179
+ if inv:
180
+ print(f"├── Lookup key: invocation_id={inv}")
181
+
182
+ # Next steps hint
183
+ print("\nNext steps:")
184
+ print(" - Use --logs to show log locations.")
185
+ print(" - Use --artifacts to show artifact locations.")
186
+ print(" - Use --config to show stored job configuration (YAML).")
187
+ print(" - Use --copy-logs [DIR] to copy logs to a local directory.")
188
+ print(" - Use --copy-artifacts [DIR] to copy artifacts to a local directory.")
189
+
190
+ def _show_job_debug_info(self, job_id: str, job_data: JobData) -> None:
191
+ logger.info("Job", job_id=job_id)
192
+ print(f"Job {job_id}")
193
+
194
+ # metadata
195
+ try:
196
+ when = datetime.fromtimestamp(job_data.timestamp).isoformat(
197
+ timespec="seconds"
198
+ )
199
+ except Exception:
200
+ when = str(job_data.timestamp)
201
+ logger.info("Executor", job_id=job_id, executor=job_data.executor)
202
+ logger.info("Created", job_id=job_id, created=when)
203
+ print(f"├── Executor: {job_data.executor}")
204
+ print(f"├── Created: {when}")
205
+
206
+ task_name = get_task_name(job_data)
207
+ if task_name:
208
+ logger.info("Task", job_id=job_id, name=task_name)
209
+ print(f"├── Task: {task_name}")
210
+
211
+ # locations via exporter helper
212
+ paths = _EXPORT_HELPER.get_job_paths(job_data)
213
+
214
+ # Artifacts
215
+ if paths.get("storage_type") == "remote_ssh":
216
+ artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
217
+ logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
218
+ print(f"├── Artifacts: {artifacts_path} (remote)")
219
+ else:
220
+ ap = paths.get("artifacts_dir")
221
+ if ap:
222
+ exists = self._check_path_exists(paths, "artifacts")
223
+ logger.info(
224
+ "Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
225
+ )
226
+ print(f"├── Artifacts: {ap} {exists} (local)")
227
+
228
+ # Logs
229
+ if paths.get("storage_type") == "remote_ssh":
230
+ logs_path = (
231
+ f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
232
+ )
233
+ logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
234
+ print(f"├── Logs: {logs_path} (remote)")
235
+ else:
236
+ lp = paths.get("logs_dir")
237
+ if lp:
238
+ exists = self._check_path_exists(paths, "logs")
239
+ logger.info(
240
+ "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
241
+ )
242
+ print(f"├── Logs: {lp} {exists} (local)")
243
+
244
+ # executor-specific
245
+ d = job_data.data or {}
246
+ cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
247
+ exec_type = (job_data.executor or cfg_exec_type or "").lower()
248
+
249
+ if exec_type == "slurm":
250
+ sj = d.get("slurm_job_id")
251
+ if sj:
252
+ print(f"├── Slurm Job ID: {sj}")
253
+ elif exec_type == "gitlab":
254
+ pid = d.get("pipeline_id")
255
+ if pid:
256
+ print(f"├── Pipeline ID: {pid}")
257
+ elif exec_type == "lepton":
258
+ jn = d.get("lepton_job_name")
259
+ if jn:
260
+ print(f"├── Lepton Job: {jn}")
261
+ en = d.get("endpoint_name")
262
+ if en:
263
+ print(f"├── Endpoint: {en}")
264
+ eu = d.get("endpoint_url")
265
+ if eu:
266
+ print(f"├── Endpoint URL: {eu}")
267
+ # local and others: paths already displayed above; no extra fields needed
268
+
269
+ def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
270
+ logger.info("Log locations")
271
+ print("Log locations:\n")
272
+ for job_id, job_data in jobs:
273
+ paths = _EXPORT_HELPER.get_job_paths(job_data)
274
+ if paths.get("storage_type") == "remote_ssh":
275
+ logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
276
+ logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
277
+ print(f"{job_id}: {logs_path} (remote)")
278
+ else:
279
+ lp = paths.get("logs_dir")
280
+ if lp:
281
+ exists = self._check_path_exists(paths, "logs")
282
+ logger.info(
283
+ "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
284
+ )
285
+ print(f"{job_id}: {lp} {exists} (local)")
286
+
287
+ def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
288
+ logger.info("Artifact locations")
289
+ print("Artifact locations:\n")
290
+ for job_id, job_data in jobs:
291
+ paths = _EXPORT_HELPER.get_job_paths(job_data)
292
+ if paths.get("storage_type") == "remote_ssh":
293
+ artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
294
+ logger.info(
295
+ "Artifacts", job_id=job_id, path=artifacts_path, remote=True
296
+ )
297
+ print(f"{job_id}: {artifacts_path} (remote)")
298
+ else:
299
+ ap = paths.get("artifacts_dir")
300
+ if ap:
301
+ exists = self._check_path_exists(paths, "artifacts")
302
+ logger.info(
303
+ "Artifacts",
304
+ job_id=job_id,
305
+ path=str(ap),
306
+ exists_indicator=exists,
307
+ )
308
+ print(f"{job_id}: {ap} {exists} (local)")
309
+
310
+ def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
311
+ for job_id, job_data in jobs:
312
+ logger.info("Configuration for job", job_id=job_id)
313
+ print(f"Configuration for {job_id}:")
314
+ if job_data.config:
315
+ import yaml
316
+
317
+ config_yaml = yaml.dump(
318
+ job_data.config, default_flow_style=False, indent=2
319
+ )
320
+ logger.info("Configuration YAML", job_id=job_id, config=config_yaml)
321
+ print(config_yaml)
322
+ else:
323
+ logger.info("No configuration stored for this job", job_id=job_id)
324
+ print(" No configuration stored for this job.")
325
+ print()
326
+
327
+ def _copy_logs(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
328
+ """Copy logs using export functionality."""
329
+ self._copy_content(jobs, dest_dir, copy_logs=True, copy_artifacts=False)
330
+
331
+ def _copy_artifacts(self, jobs: List[Tuple[str, JobData]], dest_dir: str) -> None:
332
+ """Copy artifacts using export functionality."""
333
+ self._copy_content(jobs, dest_dir, copy_logs=False, copy_artifacts=True)
334
+
335
+ def _copy_content(
336
+ self,
337
+ jobs: List[Tuple[str, JobData]],
338
+ dest_dir: str,
339
+ copy_logs: bool,
340
+ copy_artifacts: bool,
341
+ ) -> None:
342
+ logger.debug(
343
+ "Preparing export call",
344
+ dest_dir=dest_dir,
345
+ copy_logs=copy_logs,
346
+ copy_artifacts=copy_artifacts,
347
+ job_ids=[jid for jid, _ in jobs],
348
+ )
349
+
350
+ from nemo_evaluator_launcher.api.functional import export_results
351
+
352
+ config = {
353
+ "output_dir": dest_dir,
354
+ "only_required": True,
355
+ "copy_logs": bool(copy_logs) and not bool(copy_artifacts),
356
+ "copy_artifacts": bool(copy_artifacts) and not bool(copy_logs),
357
+ }
358
+ # skip artifact validation when only logs are being copied
359
+ if copy_logs and not copy_artifacts:
360
+ config["skip_validation"] = True
361
+
362
+ job_ids = [job_id for job_id, _ in jobs]
363
+ kind = "logs" if copy_logs else "artifacts"
364
+ logger.info(
365
+ "Copying content", kind=kind, job_count=len(job_ids), dest_dir=dest_dir
366
+ )
367
+ print(f"Copying {kind} for {len(job_ids)} job(s) to {dest_dir}...")
368
+
369
+ result = export_results(job_ids, "local", config)
370
+ logger.debug("Export API call completed", success=result.get("success"))
371
+
372
+ if result.get("success"):
373
+ logger.info(
374
+ "Content copy completed successfully",
375
+ dest_dir=dest_dir,
376
+ job_count=len(jobs),
377
+ )
378
+ if "jobs" in result:
379
+ for jid, job_result in result["jobs"].items():
380
+ if job_result.get("success"):
381
+ print(f"{jid}: Success")
382
+ else:
383
+ print(
384
+ f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
385
+ )
386
+ else:
387
+ err = result.get("error", "Unknown error")
388
+ logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
389
+ print(f"Failed to copy {kind}: {err}")
390
+
391
+ def _check_path_exists(self, paths: Dict[str, Any], path_type: str) -> str:
392
+ """Check if a path exists and return indicator."""
393
+ try:
394
+ if paths.get("storage_type") == "remote_ssh":
395
+ # For remote paths, we can't easily check existence
396
+ return "(remote)"
397
+ elif path_type == "logs" and "logs_dir" in paths:
398
+ logs_dir = Path(paths["logs_dir"])
399
+ return "(exists)" if logs_dir.exists() else "(not found)"
400
+ elif path_type == "artifacts" and "artifacts_dir" in paths:
401
+ artifacts_dir = Path(paths["artifacts_dir"])
402
+ return "(exists)" if artifacts_dir.exists() else "(not found)"
403
+ except Exception:
404
+ pass
405
+ return ""
@@ -20,6 +20,8 @@ from typing import Optional
20
20
 
21
21
  from simple_parsing import field
22
22
 
23
+ from nemo_evaluator_launcher.common.logging_utils import logger
24
+
23
25
 
24
26
  @dataclass
25
27
  class Cmd:
@@ -27,12 +29,16 @@ class Cmd:
27
29
 
28
30
  limit: Optional[int] = field(default=None, alias=["--limit"], help="Max rows")
29
31
  executor: Optional[str] = field(
30
- default=None, alias=["--executor"], help="Filter by executor"
32
+ default=None,
33
+ alias=["--executor"],
34
+ help="Filter by executor",
31
35
  )
36
+ # TODO(agronskiy): consider whether a `--status` filter can be propagated here.
32
37
  since: Optional[str] = field(
33
38
  default=None,
34
39
  alias=["--since"],
35
- help="Filter by ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00)",
40
+ help="Filter by either ISO date/time (e.g., 2025-08-20 or 2025-08-20T12:00:00) or "
41
+ "an interval into the past, e.g. `1d` or `3h`; formally `{N}[d|h]`.",
36
42
  )
37
43
 
38
44
  def execute(self) -> None:
@@ -53,7 +59,22 @@ class Cmd:
53
59
 
54
60
  if self.since:
55
61
  try:
56
- if "T" in self.since:
62
+ # Check if it's a relative time format like "1d" or "3h"
63
+ if self.since.lower().endswith("d") and len(self.since) > 1:
64
+ days = int(self.since[:-1])
65
+ if days < 0:
66
+ raise ValueError("Days should be non-negative")
67
+ since_ts = (
68
+ _dt.datetime.now() - _dt.timedelta(days=days)
69
+ ).timestamp()
70
+ elif self.since.lower().endswith("h") and len(self.since) > 1:
71
+ hours = int(self.since[:-1])
72
+ if hours < 0:
73
+ raise ValueError("Hours should be non-negative")
74
+ since_ts = (
75
+ _dt.datetime.now() - _dt.timedelta(hours=hours)
76
+ ).timestamp()
77
+ elif "T" in self.since:
57
78
  since_ts = _dt.datetime.fromisoformat(self.since).timestamp()
58
79
  else:
59
80
  since_ts = _dt.datetime.fromisoformat(
@@ -61,9 +82,8 @@ class Cmd:
61
82
  ).timestamp()
62
83
  rows = [r for r in rows if (r.get("earliest_job_ts") or 0) >= since_ts]
63
84
  except Exception:
64
- print(
65
- f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS",
66
- file=sys.stderr,
85
+ logger.fatal(
86
+ f"Invalid --since value: {self.since}. Use YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or N[d|h] for N days|hours."
67
87
  )
68
88
  sys.exit(2)
69
89
 
@@ -19,6 +19,7 @@ import os
19
19
 
20
20
  from simple_parsing import ArgumentParser
21
21
 
22
+ import nemo_evaluator_launcher.cli.debug as debug
22
23
  import nemo_evaluator_launcher.cli.export as export
23
24
  import nemo_evaluator_launcher.cli.kill as kill
24
25
  import nemo_evaluator_launcher.cli.ls_runs as ls_runs
@@ -38,7 +39,16 @@ def is_verbose_enabled(args) -> bool:
38
39
  return True
39
40
 
40
41
  # Check subcommand verbose flags
41
- subcommands = ["run", "status", "kill", "tasks_alias", "tasks", "runs", "export"]
42
+ subcommands = [
43
+ "run",
44
+ "status",
45
+ "kill",
46
+ "tasks_alias",
47
+ "tasks",
48
+ "runs",
49
+ "export",
50
+ "debug",
51
+ ]
42
52
  for subcmd in subcommands:
43
53
  if hasattr(args, subcmd) and hasattr(getattr(args, subcmd), "verbose"):
44
54
  if getattr(getattr(args, subcmd), "verbose"):
@@ -153,6 +163,17 @@ def create_parser() -> ArgumentParser:
153
163
  )
154
164
  export_parser.add_arguments(export.ExportCmd, dest="export")
155
165
 
166
+ # Debug helper subcommand
167
+ debug_parser = subparsers.add_parser(
168
+ "debug",
169
+ help="Display evaluation job information",
170
+ description="Debug helper functionalities for nemo-evaluator-launcher",
171
+ )
172
+ debug_parser.add_argument(
173
+ "-v", "--verbose", action="store_true", help="Enable verbose logging"
174
+ )
175
+ debug_parser.add_arguments(debug.DebugCmd, dest="debug")
176
+
156
177
  return parser
157
178
 
158
179
 
@@ -197,6 +218,8 @@ def main() -> None:
197
218
  args.runs.execute()
198
219
  elif args.command == "export":
199
220
  args.export.execute()
221
+ elif args.command == "debug":
222
+ args.debug.execute()
200
223
 
201
224
 
202
225
  if __name__ == "__main__":
@@ -168,3 +168,7 @@ class Cmd:
168
168
  for idx, task in enumerate(tasks):
169
169
  job_id = f"{invocation_id}.{idx}"
170
170
  print(f" nemo-evaluator-launcher kill {job_id} # {task.name}")
171
+ print(
172
+ "to print all jobs: nemo-evaluator-launcher ls runs"
173
+ "\n (--since 1d or --since 6h for time span, see --help)"
174
+ )
@@ -21,6 +21,7 @@ port: 8000
21
21
  tensor_parallel_size: 8
22
22
  pipeline_parallel_size: 1
23
23
  data_parallel_size: 1
24
+ gpu_memory_utilization: 0.95
24
25
  extra_args: ""
25
26
  env_vars: {} # {name: value} dict
26
27
 
@@ -37,5 +38,5 @@ command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
37
38
  --trust-remote-code
38
39
  --served-model-name ${deployment.served_model_name}
39
40
  --enforce-eager
40
- --gpu-memory-utilization 0.95
41
+ --gpu-memory-utilization ${deployment.gpu_memory_utilization}
41
42
  ${deployment.extra_args}
@@ -16,3 +16,4 @@
16
16
  type: local
17
17
  output_dir: ???
18
18
  extra_docker_args: ""
19
+ mode: sequential