nemo-evaluator-launcher 0.1.16__tar.gz → 0.1.44__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (68)
  1. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/PKG-INFO +3 -3
  2. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/pyproject.toml +4 -2
  3. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/__init__.py +15 -1
  4. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/functional.py +106 -2
  5. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/types.py +9 -0
  6. nemo_evaluator_launcher-0.1.16/src/nemo_evaluator_launcher/cli/debug.py → nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/cli/info.py +170 -63
  7. nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/cli/logs.py +102 -0
  8. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/main.py +22 -10
  9. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/run.py +112 -28
  10. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/status.py +9 -8
  11. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/version.py +26 -23
  12. nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/common/helpers.py +374 -0
  13. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/logging_utils.py +16 -5
  14. nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/common/printing_utils.py +100 -0
  15. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  16. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +2 -3
  17. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -1
  18. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  19. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/base.py +31 -1
  20. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +36 -1
  21. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/executor.py +219 -24
  22. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/executor.py +403 -33
  23. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/run.template.sh +58 -3
  24. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/slurm/executor.py +442 -64
  25. nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  26. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/utils.py +32 -46
  27. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/package_info.py +1 -1
  28. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/resources/mapping.toml +57 -16
  29. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +3 -3
  30. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +4 -1
  31. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +1 -0
  32. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/requires.txt +1 -1
  33. nemo_evaluator_launcher-0.1.16/src/nemo_evaluator_launcher/common/helpers.py +0 -194
  34. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/LICENSE +0 -0
  35. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/README.md +0 -0
  36. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/setup.cfg +0 -0
  37. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
  38. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/utils.py +0 -0
  39. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
  40. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/export.py +0 -0
  41. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
  42. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
  43. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
  44. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
  45. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
  46. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
  47. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
  48. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
  49. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
  50. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
  51. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
  52. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
  53. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
  54. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
  55. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
  56. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
  57. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
  58. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
  59. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
  60. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
  61. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
  62. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
  63. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
  64. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
  65. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
  66. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
  67. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
  68. {nemo_evaluator_launcher-0.1.16 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nemo-evaluator-launcher
3
- Version: 0.1.16
3
+ Version: 0.1.44
4
4
  Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
5
5
  Author: NVIDIA
6
6
  Author-email: nemo-toolkit@nvidia.com
@@ -458,7 +458,7 @@ License:
458
458
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
459
459
  SOFTWARE.
460
460
 
461
- Project-URL: homepage, https://github.com/NVIDIA-NeMo/Eval
461
+ Project-URL: homepage, https://github.com/NVIDIA-NeMo/Evaluator
462
462
  Project-URL: repository, https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-launcher
463
463
  Keywords: deep learning,evaluations,machine learning,gpu,NLP,pytorch,torch
464
464
  Requires-Python: <3.14,>=3.10
@@ -478,7 +478,7 @@ Requires-Dist: mlflow>=2.8.0; extra == "mlflow"
478
478
  Provides-Extra: wandb
479
479
  Requires-Dist: wandb>=0.15.0; extra == "wandb"
480
480
  Provides-Extra: gsheets
481
- Requires-Dist: gsheets>=0.1.0; extra == "gsheets"
481
+ Requires-Dist: gspread>=5.0.0; extra == "gsheets"
482
482
  Provides-Extra: exporters
483
483
  Requires-Dist: mlflow; extra == "exporters"
484
484
  Requires-Dist: wandb; extra == "exporters"
@@ -33,20 +33,21 @@ keywords = [
33
33
 
34
34
  [project.urls]
35
35
  # BEGIN(if-changed): check package_info.py
36
- homepage = "https://github.com/NVIDIA-NeMo/Eval"
36
+ homepage = "https://github.com/NVIDIA-NeMo/Evaluator"
37
37
  repository = "https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-launcher"
38
38
  # END(if-changed)
39
39
 
40
40
  [project.optional-dependencies]
41
41
  mlflow = ["mlflow>=2.8.0"]
42
42
  wandb = ["wandb>=0.15.0"]
43
- gsheets = ["gsheets>=0.1.0"]
43
+ gsheets = ["gspread>=5.0.0"]
44
44
  exporters = ["mlflow", "wandb", "gsheets"]
45
45
  all = ["mlflow", "wandb", "gsheets"]
46
46
 
47
47
  [project.scripts]
48
48
  nemo-evaluator-launcher = "nemo_evaluator_launcher.cli.main:main"
49
49
  nv-eval = "nemo_evaluator_launcher.cli.main:main"
50
+ nel = "nemo_evaluator_launcher.cli.main:main"
50
51
 
51
52
  [dependency-groups]
52
53
  dev = [
@@ -75,6 +76,7 @@ where = ["src"]
75
76
  "resources/**/*",
76
77
  "configs/**/*",
77
78
  "executors/**/*.sh",
79
+ "executors/**/*.template",
78
80
  ]
79
81
 
80
82
  [tool.setuptools.dynamic]
@@ -20,6 +20,7 @@ It automatically initializes logging and conditionally loads internal components
20
20
  """
21
21
 
22
22
  import importlib
23
+ import warnings
23
24
 
24
25
  from nemo_evaluator_launcher.common.logging_utils import logger
25
26
  from nemo_evaluator_launcher.package_info import (
@@ -32,9 +33,22 @@ from nemo_evaluator_launcher.package_info import (
32
33
  __version__,
33
34
  )
34
35
 
35
- logger.info("Version info", pkg=__package_name__, ver=__version__)
36
+ # Suppress pydantic warnings from third-party libraries (e.g., wandb) that are not
37
+ # compatible with Pydantic 2.x field metadata on Python 3.13+
38
+ warnings.filterwarnings(
39
+ "ignore",
40
+ message=r"The 'repr' attribute.*Field\(\).*",
41
+ category=Warning,
42
+ )
43
+ warnings.filterwarnings(
44
+ "ignore",
45
+ message=r"The 'frozen' attribute.*Field\(\).*",
46
+ category=Warning,
47
+ )
36
48
 
37
49
 
50
+ logger.info("Version info", pkg=__package_name__, ver=__version__)
51
+
38
52
  try:
39
53
  importlib.import_module("nemo_evaluator_launcher_internal")
40
54
  logger.debug(
@@ -19,7 +19,7 @@ This module provides the main functional entry points for running evaluations, q
19
19
  """
20
20
 
21
21
  from pathlib import Path
22
- from typing import Any, List, Optional, Union
22
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
23
23
 
24
24
  import yaml
25
25
  from omegaconf import DictConfig, OmegaConf
@@ -116,6 +116,7 @@ def get_status(ids_or_prefixes: list[str]) -> list[dict[str, Any]]:
116
116
  db = ExecutionDB()
117
117
  results: List[dict[str, Any]] = []
118
118
 
119
+ # TODO(agronskiy): refactor the `.`-checking job in all the functions.
119
120
  for id_or_prefix in ids_or_prefixes:
120
121
  # If id looks like an invocation_id (no dot), get all jobs for it
121
122
  if "." not in id_or_prefix:
@@ -259,6 +260,108 @@ def get_status(ids_or_prefixes: list[str]) -> list[dict[str, Any]]:
259
260
  return results
260
261
 
261
262
 
263
+ def stream_logs(
264
+ ids_or_prefixes: Union[str, list[str]],
265
+ ) -> Iterator[Tuple[str, str, str]]:
266
+ """Stream logs from jobs or invocations by their IDs or invocation IDs.
267
+
268
+ Args:
269
+ ids_or_prefixes: Single ID/prefix or list of job IDs or invocation IDs to stream logs from.
270
+ Short prefixes are allowed, we would try to match the full ones from
271
+ prefixes if no collisions are present.
272
+
273
+ Yields:
274
+ Tuple[str, str, str]: Tuples of (job_id, task_name, log_line) for each log line.
275
+ Empty lines are yielded as empty strings.
276
+
277
+ Raises:
278
+ ValueError: If the executor doesn't support log streaming.
279
+ """
280
+ db = ExecutionDB()
281
+
282
+ # Normalize to list for consistent processing
283
+ if isinstance(ids_or_prefixes, str):
284
+ ids_or_prefixes = [ids_or_prefixes]
285
+
286
+ # Collect all jobs from all IDs, grouped by executor
287
+ executor_to_jobs: Dict[str, Dict[str, JobData]] = {}
288
+ executor_to_invocations: Dict[str, List[str]] = {}
289
+
290
+ # TODO(agronskiy): refactor the `.`-checking job in all the functions.
291
+ for id_or_prefix in ids_or_prefixes:
292
+ # Determine if this is a job ID or invocation ID
293
+ if "." in id_or_prefix:
294
+ # This is a job ID
295
+ job_data = db.get_job(id_or_prefix)
296
+ if job_data is None:
297
+ continue
298
+
299
+ executor = job_data.executor
300
+ if executor not in executor_to_jobs:
301
+ executor_to_jobs[executor] = {}
302
+ executor_to_jobs[executor][id_or_prefix] = job_data
303
+ else:
304
+ # This is an invocation ID
305
+ jobs = db.get_jobs(id_or_prefix)
306
+ if not jobs:
307
+ continue
308
+
309
+ # Get the executor class from the first job
310
+ first_job_data = next(iter(jobs.values()))
311
+ executor = first_job_data.executor
312
+ if executor not in executor_to_invocations:
313
+ executor_to_invocations[executor] = []
314
+ executor_to_invocations[executor].append(id_or_prefix)
315
+
316
+ # Stream logs from each executor simultaneously
317
+ # For each executor, collect all job IDs and stream them together
318
+ for executor, jobs_dict in executor_to_jobs.items():
319
+ try:
320
+ executor_cls = get_executor(executor)
321
+ except ValueError:
322
+ continue
323
+
324
+ # For local executor with multiple jobs, pass list to stream simultaneously
325
+ # For other executors or single jobs, pass individual job IDs
326
+ if executor == "local" and len(jobs_dict) > 1:
327
+ # Pass all job IDs as a list to stream simultaneously
328
+ try:
329
+ yield from executor_cls.stream_logs(
330
+ list(jobs_dict.keys()), executor_name=executor
331
+ )
332
+ except NotImplementedError:
333
+ raise ValueError(
334
+ f"Log streaming is not yet implemented for executor '{executor}'"
335
+ )
336
+ else:
337
+ # Single job or non-local executor
338
+ for job_id in jobs_dict.keys():
339
+ try:
340
+ yield from executor_cls.stream_logs(job_id, executor_name=executor)
341
+ except NotImplementedError:
342
+ raise ValueError(
343
+ f"Log streaming is not yet implemented for executor '{executor}'"
344
+ )
345
+
346
+ # Stream logs from invocation IDs
347
+ for executor, invocation_ids in executor_to_invocations.items():
348
+ try:
349
+ executor_cls = get_executor(executor)
350
+ except ValueError:
351
+ continue
352
+
353
+ # Stream each invocation (each invocation already handles multiple jobs internally)
354
+ for invocation_id in invocation_ids:
355
+ try:
356
+ yield from executor_cls.stream_logs(
357
+ invocation_id, executor_name=executor
358
+ )
359
+ except NotImplementedError:
360
+ raise ValueError(
361
+ f"Log streaming is not yet implemented for executor '{executor}'"
362
+ )
363
+
364
+
262
365
  def list_all_invocations_summary() -> list[dict[str, Any]]:
263
366
  """Return a concise per-invocation summary from the exec DB.
264
367
 
@@ -378,6 +481,7 @@ def kill_job_or_invocation(id: str) -> list[dict[str, Any]]:
378
481
  "data": {"error": f"Unexpected error: {str(e)}"},
379
482
  }
380
483
 
484
+ # TODO(agronskiy): refactor the `.`-checking job in all the functions.
381
485
  # Determine if this is a job ID or invocation ID
382
486
  if "." in id:
383
487
  # This is a job ID - kill single job
@@ -442,7 +546,7 @@ def export_results(
442
546
  if "." in single_id: # job_id
443
547
  # Try reading config from artifacts working dir (auto-export on remote node)
444
548
  cfg_file = None
445
- for name in ("run_config.yml", "config.yml"):
549
+ for name in ("config.yml", "run_config.yml"):
446
550
  p = Path(name)
447
551
  if p.exists():
448
552
  cfg_file = p
@@ -19,9 +19,18 @@ This module defines data structures and helpers for configuration and type safet
19
19
  """
20
20
 
21
21
  import os
22
+ import warnings
22
23
  from dataclasses import dataclass
23
24
  from typing import cast
24
25
 
26
+ # ruff: noqa: E402
27
+ # Later when adding optional module to hydra, since the internal package is optional,
28
+ # will generate a hydra warning. We suppress it as distraction and bad UX, before hydra gets invoked.
29
+ warnings.filterwarnings(
30
+ "ignore",
31
+ message="provider=hydra.searchpath.*path=nemo_evaluator_launcher_internal.*is not available\\.",
32
+ )
33
+
25
34
  import hydra
26
35
  from hydra.core.global_hydra import GlobalHydra
27
36
  from omegaconf import DictConfig, OmegaConf
@@ -14,16 +14,16 @@
14
14
  # limitations under the License.
15
15
  #
16
16
 
17
- """Debugging helper functionalities for nemo-evaluator-launcher."""
17
+ """Job information helper functionalities for nemo-evaluator-launcher."""
18
18
 
19
+ import sys
19
20
  from dataclasses import dataclass
20
21
  from datetime import datetime
21
22
  from pathlib import Path
22
- from typing import Any, Dict, List, Optional, Tuple
23
+ from typing import Any, Dict, List, Tuple
23
24
 
24
25
  from simple_parsing import field
25
26
 
26
- from nemo_evaluator_launcher.cli.export import ExportCmd
27
27
  from nemo_evaluator_launcher.cli.version import Cmd as VersionCmd
28
28
  from nemo_evaluator_launcher.common.execdb import EXEC_DB_FILE, ExecutionDB, JobData
29
29
  from nemo_evaluator_launcher.common.logging_utils import logger
@@ -35,52 +35,60 @@ _EXPORT_HELPER = LocalExporter({})
35
35
 
36
36
 
37
37
  @dataclass
38
- class DebugCmd(ExportCmd):
39
- """Debugging functionalities for nemo-evaluator-launcher.
38
+ class InfoCmd:
39
+ """Job information functionalities for nemo-evaluator-launcher.
40
40
 
41
41
  Examples:
42
- nemo-evaluator-launcher debug <inv> # Full debug info
43
- nemo-evaluator-launcher debug <inv> --config # Show stored job config (YAML)
44
- nemo-evaluator-launcher debug <inv> --artifacts # Show artifact locations
45
- nemo-evaluator-launcher debug <inv> --logs # Show log locations
46
- nemo-evaluator-launcher debug <inv> --copy-logs <path> # Copy logs (default: current dir)
47
- nemo-evaluator-launcher debug <inv> --copy-artifacts <path> # Copy artifacts (default: current dir)
42
+ nemo-evaluator-launcher info <inv> # Full job info
43
+ nemo-evaluator-launcher info <inv> --config # Show stored job config (YAML)
44
+ nemo-evaluator-launcher info <inv> --artifacts # Show artifact locations and key files
45
+ nemo-evaluator-launcher info <inv> --logs # Show log locations and key files
46
+ nemo-evaluator-launcher info <inv> --copy-logs <DIR> # Copy logs to <DIR>
47
+ nemo-evaluator-launcher info <inv> --copy-artifacts <DIR> # Copy artifacts to <DIR>
48
48
 
49
49
  Notes:
50
- - Supports invocation IDs and job IDs
50
+ - Supports invocation IDs and job IDs (space-separated)
51
51
  - Shows local or remote paths depending on executor (local/slurm/lepton)
52
+ - Copy operations work for both local and remote jobs (expect longer time for remote jobs)
53
+ - Copy operations are not supported for Lepton executor (yet).
52
54
  """
53
55
 
54
- # local exporter destination defaults to local
55
- dest: str = field(default="local", init=False)
56
+ invocation_ids: List[str] = field(
57
+ positional=True,
58
+ help="IDs to show info for (space-separated). Accepts invocation IDs or/and job IDs.",
59
+ )
56
60
 
57
- # debug modes
58
- config: bool = field(default=False, help="Show job configuration")
59
- artifacts: bool = field(default=False, help="Show artifact locations")
60
- logs: bool = field(default=False, help="Show log locations")
61
+ # info modes
62
+ config: bool = field(
63
+ default=False, action="store_true", help="Show job configuration"
64
+ )
65
+ artifacts: bool = field(
66
+ default=False, action="store_true", help="Show artifact locations and key files"
67
+ )
68
+ logs: bool = field(
69
+ default=False, action="store_true", help="Show log locations and key files"
70
+ )
61
71
 
62
- # copy operations
63
- copy_logs: Optional[str] = field(
72
+ # copy operations - work for both local and remote jobs
73
+ copy_logs: str | None = field(
64
74
  default=None,
65
75
  alias=["--copy-logs"],
66
- nargs="?",
67
- help="Copy logs to local directory (default: current dir)",
76
+ help="Copy logs to a local directory",
77
+ metavar="DIR",
68
78
  )
69
- copy_artifacts: Optional[str] = field(
79
+ copy_artifacts: str | None = field(
70
80
  default=None,
71
81
  alias=["--copy-artifacts"],
72
- nargs="?",
73
- help="Copy artifacts to local directory (default: current dir)",
82
+ help="Copy artifacts to a local directory",
83
+ metavar="DIR",
74
84
  )
75
85
 
76
86
  def execute(self) -> None:
77
- # show version
78
87
  VersionCmd().execute()
79
-
80
- logger.info("Debug command started", invocation_ids=self.invocation_ids)
88
+ logger.info("Info command started", invocation_ids=self.invocation_ids)
81
89
 
82
90
  if not self.invocation_ids:
83
- logger.error("No invocation IDs provided")
91
+ logger.error("No job or invocation IDs provided.")
84
92
  raise ValueError("No job or invocation IDs provided.")
85
93
 
86
94
  jobs = self._resolve_jobs()
@@ -96,48 +104,63 @@ class DebugCmd(ExportCmd):
96
104
  "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
97
105
  )
98
106
  print(
99
- "No valid jobs found (jobs may have been deletedd or IDs may be incorrect)."
107
+ "No valid jobs found (jobs may have been deleted or IDs may be incorrect)."
100
108
  )
101
109
  return
102
110
 
111
+ # show ops
103
112
  if self.config:
104
- logger.info("Showing job configuration", job_count=len(jobs))
105
113
  self._show_config_info(jobs)
106
- elif self.logs:
107
- logger.info("Showing job logs locations", job_count=len(jobs))
114
+ if self.logs:
108
115
  self._show_logs_info(jobs)
109
- elif self.artifacts:
110
- logger.info("Showing artifacts locations", job_count=len(jobs))
116
+ if self.artifacts:
111
117
  self._show_artifacts_info(jobs)
112
- elif self.copy_logs is not None:
113
- dest = self.copy_logs or "."
114
- if not self.copy_logs:
115
- print(
116
- "No destination provided for --copy-logs; defaulting to current dir"
117
- )
118
+
119
+ # copy ops
120
+ args = sys.argv[1:]
121
+ copy_logs_flag = "--copy-logs" in args
122
+ copy_artifacts_flag = "--copy-artifacts" in args
123
+
124
+ if copy_logs_flag:
125
+ if self.copy_logs is None:
126
+ raise ValueError("--copy-logs requires a directory path")
127
+ if not self.copy_logs.strip():
128
+ raise ValueError("--copy-logs requires a directory path")
118
129
  logger.info(
119
- "Copying logs to local directory", dest_dir=dest, job_count=len(jobs)
130
+ "Copying logs to local directory",
131
+ dest_dir=self.copy_logs,
132
+ job_count=len(jobs),
120
133
  )
121
- self._copy_logs(jobs, dest)
122
- elif self.copy_artifacts is not None:
123
- dest = self.copy_artifacts or "."
124
- if not self.copy_artifacts:
125
- print(
126
- "No destination provided for --copy-artifacts; defaulting to current dir)"
127
- )
134
+ self._copy_logs(jobs, self.copy_logs)
135
+
136
+ if copy_artifacts_flag:
137
+ if self.copy_artifacts is None:
138
+ raise ValueError("--copy-artifacts requires a directory path")
139
+ if not self.copy_artifacts.strip():
140
+ raise ValueError("--copy-artifacts requires a directory path")
128
141
  logger.info(
129
142
  "Copying artifacts to local directory",
130
- dest_dir=dest,
143
+ dest_dir=self.copy_artifacts,
131
144
  job_count=len(jobs),
132
145
  )
133
- self._copy_artifacts(jobs, dest)
134
- else:
146
+ self._copy_artifacts(jobs, self.copy_artifacts)
147
+
148
+ # default view when no flags
149
+ if not any(
150
+ [
151
+ self.config,
152
+ self.logs,
153
+ self.artifacts,
154
+ self.copy_logs,
155
+ self.copy_artifacts,
156
+ ]
157
+ ):
135
158
  logger.info(
136
159
  "Job metadata details",
137
160
  invocation_id=jobs[0][1].invocation_id if jobs else None,
138
161
  jobs=len(jobs),
139
162
  )
140
- self._show_invocation_debug_info(jobs)
163
+ self._show_invocation_info(jobs)
141
164
 
142
165
  def _resolve_jobs(self) -> List[Tuple[str, JobData]]:
143
166
  """Resolve jobs from ExecDB using IDs (job IDs and/or invocation IDs)."""
@@ -160,15 +183,15 @@ class DebugCmd(ExportCmd):
160
183
  uniq.append((jid, jd))
161
184
  return sorted(uniq, key=lambda p: p[0])
162
185
 
163
- def _show_invocation_debug_info(self, jobs: List[Tuple[str, JobData]]) -> None:
186
+ def _show_invocation_info(self, jobs: List[Tuple[str, JobData]]) -> None:
164
187
  inv = jobs[0][1].invocation_id if jobs else None
165
- logger.info("Debug information", jobs=len(jobs), invocation=inv)
188
+ logger.info("Job information", jobs=len(jobs), invocation=inv)
166
189
  print(
167
- f"Debug information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
190
+ f"Job information for {len(jobs)} job(s){f' under invocation {inv}' if inv else ''}:\n"
168
191
  )
169
192
 
170
193
  for job_id, job_data in jobs:
171
- self._show_job_debug_info(job_id, job_data)
194
+ self._show_job_info(job_id, job_data)
172
195
  print()
173
196
 
174
197
  # footer hint: where to find more metadata
@@ -184,10 +207,14 @@ class DebugCmd(ExportCmd):
184
207
  print(" - Use --logs to show log locations.")
185
208
  print(" - Use --artifacts to show artifact locations.")
186
209
  print(" - Use --config to show stored job configuration (YAML).")
187
- print(" - Use --copy-logs [DIR] to copy logs to a local directory.")
188
- print(" - Use --copy-artifacts [DIR] to copy artifacts to a local directory.")
210
+ print(
211
+ " - Use --copy-logs [DIR] to copy logs to a local directory (works for local and remote jobs)."
212
+ )
213
+ print(
214
+ " - Use --copy-artifacts [DIR] to copy artifacts to a local directory (works for local and remote jobs)."
215
+ )
189
216
 
190
- def _show_job_debug_info(self, job_id: str, job_data: JobData) -> None:
217
+ def _show_job_info(self, job_id: str, job_data: JobData) -> None:
191
218
  logger.info("Job", job_id=job_id)
192
219
  print(f"Job {job_id}")
193
220
 
@@ -208,14 +235,22 @@ class DebugCmd(ExportCmd):
208
235
  logger.info("Task", job_id=job_id, name=task_name)
209
236
  print(f"├── Task: {task_name}")
210
237
 
238
+ # Determine executor type for file descriptions
239
+ cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
240
+ exec_type = (job_data.executor or cfg_exec_type or "").lower()
241
+
211
242
  # locations via exporter helper
212
243
  paths = _EXPORT_HELPER.get_job_paths(job_data)
213
244
 
214
- # Artifacts
245
+ # Artifacts with file descriptions
246
+ artifacts_list = _get_artifacts_file_list()
215
247
  if paths.get("storage_type") == "remote_ssh":
216
248
  artifacts_path = f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts"
217
249
  logger.info("Artifacts", job_id=job_id, path=artifacts_path, remote=True)
218
250
  print(f"├── Artifacts: {artifacts_path} (remote)")
251
+ print("│ └── Key files:")
252
+ for filename, desc in artifacts_list:
253
+ print(f"│ ├── {filename} - {desc}")
219
254
  else:
220
255
  ap = paths.get("artifacts_dir")
221
256
  if ap:
@@ -224,14 +259,21 @@ class DebugCmd(ExportCmd):
224
259
  "Artifacts", job_id=job_id, path=str(ap), exists_indicator=exists
225
260
  )
226
261
  print(f"├── Artifacts: {ap} {exists} (local)")
262
+ print("│ └── Key files:")
263
+ for filename, desc in artifacts_list:
264
+ print(f"│ ├── {filename} - {desc}")
227
265
 
228
- # Logs
266
+ # Logs with file descriptions
267
+ logs_list = _get_log_file_list(exec_type)
229
268
  if paths.get("storage_type") == "remote_ssh":
230
269
  logs_path = (
231
270
  f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/logs"
232
271
  )
233
272
  logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
234
273
  print(f"├── Logs: {logs_path} (remote)")
274
+ print("│ └── Key files:")
275
+ for filename, desc in logs_list:
276
+ print(f"│ ├── {filename} - {desc}")
235
277
  else:
236
278
  lp = paths.get("logs_dir")
237
279
  if lp:
@@ -240,6 +282,9 @@ class DebugCmd(ExportCmd):
240
282
  "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
241
283
  )
242
284
  print(f"├── Logs: {lp} {exists} (local)")
285
+ print("│ └── Key files:")
286
+ for filename, desc in logs_list:
287
+ print(f"│ ├── {filename} - {desc}")
243
288
 
244
289
  # executor-specific
245
290
  d = job_data.data or {}
@@ -264,17 +309,23 @@ class DebugCmd(ExportCmd):
264
309
  eu = d.get("endpoint_url")
265
310
  if eu:
266
311
  print(f"├── Endpoint URL: {eu}")
267
- # local and others: paths already displayed above; no extra fields needed
268
312
 
269
313
  def _show_logs_info(self, jobs: List[Tuple[str, JobData]]) -> None:
270
314
  logger.info("Log locations")
271
315
  print("Log locations:\n")
272
316
  for job_id, job_data in jobs:
273
317
  paths = _EXPORT_HELPER.get_job_paths(job_data)
318
+ cfg_exec_type = ((job_data.config or {}).get("execution") or {}).get("type")
319
+ exec_type = (job_data.executor or cfg_exec_type or "").lower()
320
+ logs_list = _get_log_file_list(exec_type)
321
+
274
322
  if paths.get("storage_type") == "remote_ssh":
275
323
  logs_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/logs"
276
324
  logger.info("Logs", job_id=job_id, path=logs_path, remote=True)
277
325
  print(f"{job_id}: {logs_path} (remote)")
326
+ print(" └── Key files:")
327
+ for filename, desc in logs_list:
328
+ print(f" ├── {filename} - {desc}")
278
329
  else:
279
330
  lp = paths.get("logs_dir")
280
331
  if lp:
@@ -283,18 +334,26 @@ class DebugCmd(ExportCmd):
283
334
  "Logs", job_id=job_id, path=str(lp), exists_indicator=exists
284
335
  )
285
336
  print(f"{job_id}: {lp} {exists} (local)")
337
+ print(" └── Key files:")
338
+ for filename, desc in logs_list:
339
+ print(f" ├── {filename} - {desc}")
286
340
 
287
341
  def _show_artifacts_info(self, jobs: List[Tuple[str, JobData]]) -> None:
288
342
  logger.info("Artifact locations")
289
343
  print("Artifact locations:\n")
290
344
  for job_id, job_data in jobs:
291
345
  paths = _EXPORT_HELPER.get_job_paths(job_data)
346
+ artifacts_list = _get_artifacts_file_list()
347
+
292
348
  if paths.get("storage_type") == "remote_ssh":
293
349
  artifacts_path = f"ssh://{paths['username']}@{paths['hostname']}{paths['remote_path']}/artifacts"
294
350
  logger.info(
295
351
  "Artifacts", job_id=job_id, path=artifacts_path, remote=True
296
352
  )
297
353
  print(f"{job_id}: {artifacts_path} (remote)")
354
+ print(" └── Key files:")
355
+ for filename, desc in artifacts_list:
356
+ print(f" ├── {filename} - {desc}")
298
357
  else:
299
358
  ap = paths.get("artifacts_dir")
300
359
  if ap:
@@ -306,6 +365,9 @@ class DebugCmd(ExportCmd):
306
365
  exists_indicator=exists,
307
366
  )
308
367
  print(f"{job_id}: {ap} {exists} (local)")
368
+ print(" └── Key files:")
369
+ for filename, desc in artifacts_list:
370
+ print(f" ├── {filename} - {desc}")
309
371
 
310
372
  def _show_config_info(self, jobs: List[Tuple[str, JobData]]) -> None:
311
373
  for job_id, job_data in jobs:
@@ -383,6 +445,9 @@ class DebugCmd(ExportCmd):
383
445
  print(
384
446
  f"{jid}: Failed - {job_result.get('message', 'Unknown error')}"
385
447
  )
448
+ # Show full destination path
449
+ full_dest_path = Path(dest_dir).resolve()
450
+ print(f"Copied to: {full_dest_path}")
386
451
  else:
387
452
  err = result.get("error", "Unknown error")
388
453
  logger.warning("Content copy failed", error=err, dest_dir=dest_dir)
@@ -403,3 +468,45 @@ class DebugCmd(ExportCmd):
403
468
  except Exception:
404
469
  pass
405
470
  return ""
471
+
472
+
473
+ # Helper functions for file descriptions (based on actual code and content analysis)
474
+ def _get_artifacts_file_list() -> list[tuple[str, str]]:
475
+ """Files generated in artifacts/."""
476
+ return [
477
+ (
478
+ "results.yml",
479
+ "Benchmark scores, task results and resolved run configuration.",
480
+ ),
481
+ (
482
+ "eval_factory_metrics.json",
483
+ "Response + runtime stats (latency, tokens count, memory)",
484
+ ),
485
+ ("metrics.json", "Harness/benchmark metric and configuration"),
486
+ ("report.html", "Request-Response Pairs samples in HTML format (if enabled)"),
487
+ ("report.json", "Report data in json format, if enabled"),
488
+ ]
489
+
490
+
491
+ def _get_log_file_list(executor_type: str) -> list[tuple[str, str]]:
492
+ """Files actually generated in logs/ - executor-specific."""
493
+ et = (executor_type or "local").lower()
494
+ if et == "slurm":
495
+ return [
496
+ ("client-{SLURM_JOB_ID}.out", "Evaluation container/process output"),
497
+ (
498
+ "slurm-{SLURM_JOB_ID}.out",
499
+ "SLURM scheduler stdout/stderr (batch submission, export steps).",
500
+ ),
501
+ (
502
+ "server-{SLURM_JOB_ID}.out",
503
+ "Model server logs when a deployment is used.",
504
+ ),
505
+ ]
506
+ # local executor
507
+ return [
508
+ (
509
+ "stdout.log",
510
+ "Complete evaluation output (timestamps, resolved config, run/export messages).",
511
+ ),
512
+ ]