nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nemo-evaluator-launcher might be problematic. Click here for more details.
- nemo_evaluator_launcher/__init__.py +65 -0
- nemo_evaluator_launcher/api/__init__.py +24 -0
- nemo_evaluator_launcher/api/functional.py +641 -0
- nemo_evaluator_launcher/api/types.py +89 -0
- nemo_evaluator_launcher/api/utils.py +19 -0
- nemo_evaluator_launcher/cli/__init__.py +15 -0
- nemo_evaluator_launcher/cli/export.py +148 -0
- nemo_evaluator_launcher/cli/info.py +117 -0
- nemo_evaluator_launcher/cli/kill.py +39 -0
- nemo_evaluator_launcher/cli/ls_runs.py +113 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
- nemo_evaluator_launcher/cli/main.py +136 -0
- nemo_evaluator_launcher/cli/run.py +135 -0
- nemo_evaluator_launcher/cli/status.py +118 -0
- nemo_evaluator_launcher/cli/version.py +52 -0
- nemo_evaluator_launcher/common/__init__.py +16 -0
- nemo_evaluator_launcher/common/execdb.py +189 -0
- nemo_evaluator_launcher/common/helpers.py +157 -0
- nemo_evaluator_launcher/common/logging_utils.py +349 -0
- nemo_evaluator_launcher/common/mapping.py +310 -0
- nemo_evaluator_launcher/configs/__init__.py +15 -0
- nemo_evaluator_launcher/configs/default.yaml +28 -0
- nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
- nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
- nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
- nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
- nemo_evaluator_launcher/executors/__init__.py +22 -0
- nemo_evaluator_launcher/executors/base.py +97 -0
- nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
- nemo_evaluator_launcher/executors/local/__init__.py +15 -0
- nemo_evaluator_launcher/executors/local/executor.py +491 -0
- nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
- nemo_evaluator_launcher/executors/registry.py +38 -0
- nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
- nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
- nemo_evaluator_launcher/exporters/__init__.py +36 -0
- nemo_evaluator_launcher/exporters/base.py +112 -0
- nemo_evaluator_launcher/exporters/gsheets.py +391 -0
- nemo_evaluator_launcher/exporters/local.py +488 -0
- nemo_evaluator_launcher/exporters/mlflow.py +448 -0
- nemo_evaluator_launcher/exporters/registry.py +40 -0
- nemo_evaluator_launcher/exporters/utils.py +669 -0
- nemo_evaluator_launcher/exporters/wandb.py +376 -0
- nemo_evaluator_launcher/package_info.py +35 -0
- nemo_evaluator_launcher/resources/mapping.toml +344 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Main CLI module using simple-parsing with subcommands."""
|
|
17
|
+
|
|
18
|
+
from simple_parsing import ArgumentParser
|
|
19
|
+
|
|
20
|
+
import nemo_evaluator_launcher.cli.export as export
|
|
21
|
+
import nemo_evaluator_launcher.cli.kill as kill
|
|
22
|
+
import nemo_evaluator_launcher.cli.ls_runs as ls_runs
|
|
23
|
+
import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
|
|
24
|
+
import nemo_evaluator_launcher.cli.run as run
|
|
25
|
+
import nemo_evaluator_launcher.cli.status as status
|
|
26
|
+
import nemo_evaluator_launcher.cli.version as version
|
|
27
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
28
|
+
|
|
29
|
+
VERSION_HELP = "Show version information"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def create_parser() -> ArgumentParser:
    """Build the top-level CLI parser and register every subcommand.

    Returns:
        An ``ArgumentParser`` carrying a global ``--version`` flag plus the
        ``version``, ``run``, ``status``, ``kill``, ``ls`` (``tasks`` / ``runs``)
        and ``export`` subcommands.
    """

    def _attach(group, name, cmd_cls, help_text, description):
        # Register one subcommand; its parsed dataclass is stored on the
        # namespace under the subcommand's own name (``dest=name``).
        sub = group.add_parser(name, help=help_text, description=description)
        sub.add_arguments(cmd_cls, dest=name)

    parser = ArgumentParser()

    # Top-level flag so `--version` works without naming a subcommand.
    parser.add_argument("--version", action="store_true", help=VERSION_HELP)

    # required=False: parsing succeeds even when no subcommand is given.
    commands = parser.add_subparsers(dest="command", required=False)

    simple_commands = (
        ("version", version.Cmd, VERSION_HELP, VERSION_HELP),
        ("run", run.Cmd, "Run evaluation", "Run evaluation"),
        ("status", status.Cmd, "Check job status", "Check job status"),
        (
            "kill",
            kill.Cmd,
            "Kill a job or invocation",
            "Kill a job (e.g., aefc4819.0) or entire invocation (e.g., aefc4819) by its ID",
        ),
    )
    for name, cmd_cls, help_text, description in simple_commands:
        _attach(commands, name, cmd_cls, help_text, description)

    # `ls` has its own mandatory nested subcommand (`tasks` or `runs`).
    listing = commands.add_parser(
        "ls", help="List resources", description="List tasks or runs"
    )
    listing_commands = listing.add_subparsers(dest="ls_command", required=True)
    _attach(
        listing_commands,
        "tasks",
        ls_tasks.Cmd,
        "List available tasks",
        "List available tasks",
    )
    _attach(
        listing_commands,
        "runs",
        ls_runs.Cmd,
        "List invocations (runs)",
        "Show a concise table of invocations from the exec DB",
    )

    _attach(
        commands,
        "export",
        export.ExportCmd,
        "Export evaluation results",
        "Export evaluation results takes a List of invocation ids and a list of destinations(local, gitlab, wandb)",
    )

    return parser
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def main() -> None:
    """Parse the command line and run the selected subcommand.

    A truthy ``args.version`` prints version information and returns. When no
    subcommand was selected the parser's help text is printed instead.
    """
    parser = create_parser()
    args = parser.parse_args()

    # Version request: handled before any subcommand dispatch.
    if getattr(args, "version", None):
        version.Cmd().execute()
        return

    # Nothing selected: show usage rather than doing nothing silently.
    command = getattr(args, "command", None)
    if command is None:
        parser.print_help()
        return

    logger.debug("Parsed arguments", args=args)

    if command in ("version", "run", "status", "kill", "export"):
        # Each of these stores its parsed dataclass under its own name.
        getattr(args, command).execute()
    elif command == "ls":
        # Nested dispatch: only the chosen `ls` subcommand's attribute exists.
        for nested in ("tasks", "runs"):
            if hasattr(args, nested):
                getattr(args, nested).execute()
                break
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
|
|
136
|
+
main()
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import pathlib
|
|
17
|
+
import time
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
|
|
20
|
+
import yaml
|
|
21
|
+
from omegaconf import OmegaConf
|
|
22
|
+
from simple_parsing import field
|
|
23
|
+
|
|
24
|
+
from nemo_evaluator_launcher.api.functional import RunConfig, run_eval
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class Cmd:
    """Run command parameters"""

    config_name: str = field(
        default="default",
        alias=["-c", "--config-name"],
        metadata={
            "help": "Config name to use. Consult `nemo_evaluator_launcher.configs`"
        },
    )
    config_dir: str | None = field(
        default=None,
        alias=["-d", "--config-dir"],
        metadata={
            "help": "Path to user config directory. If provided, searches here first, then falls back to internal configs."
        },
    )
    run_config_file: str | None = field(
        default=None,
        alias=["-f", "--run-config-file"],
        metadata={
            "help": "Path to a run config file to load directly (bypasses Hydra config loading)."
        },
    )
    override: list[str] = field(
        default_factory=list,
        action="append",
        nargs="?",
        alias=["-o"],
        metadata={
            "help": "Hydra override in the form some.param.path=value (pass multiple `-o` for multiple overrides).",
        },
    )
    dry_run: bool = field(
        default=False,
        alias=["-n", "--dry-run"],
        metadata={"help": "Do not run the evaluation, just print the config."},
    )

    def execute(self) -> None:
        """Load the configuration, launch the evaluation and report the result.

        Raises:
            ValueError: If ``--run-config-file`` is combined with
                ``--config-name``, ``--config-dir`` or ``--override``.
        """
        config = self._load_config()
        invocation_id = run_eval(config, self.dry_run)

        # Without an invocation ID there is nothing to save or report.
        if invocation_id is None:
            return

        if not self.dry_run:
            self._save_run_config(config, invocation_id)

        print(f"to check status: nemo-evaluator-launcher status {invocation_id}")
        print(f"to kill all jobs: nemo-evaluator-launcher kill {invocation_id}")
        print(
            f"to kill individual jobs: nemo-evaluator-launcher kill <job_id> (e.g., {invocation_id}.0)"
        )

    def _load_config(self):
        """Return the run configuration from a run config file or from Hydra."""
        if not self.run_config_file:
            # Regular path: compose the complete Hydra configuration.
            return RunConfig.from_hydra(
                config_name=self.config_name,
                hydra_overrides=self.override,
                config_dir=self.config_dir,
            )

        # A run config file is self-contained, so it excludes the other
        # configuration-selection options.
        conflicts = (
            (
                self.config_name != "default",
                "Cannot use --run-config-file with --config-name",
            ),
            (
                self.config_dir is not None,
                "Cannot use --run-config-file with --config-dir",
            ),
            (bool(self.override), "Cannot use --run-config-file with --override"),
        )
        for is_set, message in conflicts:
            if is_set:
                raise ValueError(message)

        with open(self.run_config_file, "r") as f:
            loaded = yaml.safe_load(f)
        return OmegaConf.create(loaded)

    def _save_run_config(self, config, invocation_id) -> None:
        """Write the resolved config to ~/.nemo-evaluator/run_configs as YAML."""
        target_dir = pathlib.Path.home() / ".nemo-evaluator" / "run_configs"
        target_dir.mkdir(parents=True, exist_ok=True)

        # Resolve interpolations so the saved file contains concrete values.
        rendered = yaml.dump(
            OmegaConf.to_container(config, resolve=True),
            default_flow_style=False,
            sort_keys=False,
            indent=2,
        )

        config_path = target_dir / f"{invocation_id}_config.yml"
        generated_at = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime())
        header = [
            "# Complete configuration from nemo-evaluator-launcher\n",
            f"# Generated at: {generated_at}\n",
            f"# Invocation ID: {invocation_id}\n",
            "#\n",
            "# This is the complete raw configuration\n",
            "#\n",
            "# To rerun this exact configuration:\n",
            f"# nemo-evaluator-launcher run --run-config-file {config_path}\n",
            "#\n",
        ]

        with open(config_path, "w") as f:
            f.writelines(header)
            f.write(rendered)

        print(f"Complete run config saved to: {config_path}")
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import json
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
from simple_parsing import field
|
|
20
|
+
|
|
21
|
+
from nemo_evaluator_launcher.api.functional import get_status
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# NOTE: no comments are placed next to the fields below on purpose;
# simple-parsing may turn field-adjacent comments into --help text.
@dataclass
class Cmd:
    """Status command configuration."""

    job_ids: list[str] = field(
        default_factory=list,
        positional=True,
    )
    json: bool = field(
        default=False,
        action="store_true",
        help="Print output as JSON instead of table format",
    )

    def execute(self) -> None:
        """Look up the status of ``self.job_ids`` and print it.

        Prints a JSON array when ``self.json`` is set, otherwise a text table.
        """
        res = get_status(self.job_ids)
        if not self.json:
            self._print_table(res)
            return

        # Remove progress field from JSON output as it's a WIP feature.
        # Work on shallow copies so the status records are not mutated.
        filtered_res = []
        for job in res:
            job_copy = job.copy()
            job_copy.pop("progress", None)
            filtered_res.append(job_copy)
        # `json` here is the module: methods do not see the class-level field.
        print(json.dumps(filtered_res, indent=2))

    def _print_table(self, jobs: list[dict]) -> None:
        """Print job status as a table.

        Args:
            jobs: Status records; each may carry ``job_id``, ``status`` and a
                ``data`` mapping with executor-specific keys.
        """
        if not jobs:
            print("No jobs found.")
            return

        # Executor-specific key in a job's data -> column header to display.
        executor_headers = {
            "slurm_job_id": "Slurm Job ID",
            "lepton_job_name": "Lepton Job Name",
            "pipeline_id": "Pipeline ID",
            "container": "Container",
        }

        # The executor type is inferred from the first job only and then
        # applied to every row.
        first_data = jobs[0].get("data", {})
        executor_key = next((k for k in executor_headers if k in first_data), None)
        info_header = executor_headers.get(executor_key, "Executor Info")
        headers = ["Job ID", "Status", info_header, "Location"]

        rows = []
        for job in jobs:
            data = job.get("data", {})

            executor_info = str(data.get(executor_key, "")) if executor_key else ""

            if executor_key == "slurm_job_id":
                # Show only the last two path components of the remote run dir.
                path = data.get("remote_rundir_path", "")
                location = (
                    "<output_dir>/" + "/".join(path.split("/")[-2:]) if path else ""
                )
            elif executor_key == "lepton_job_name":
                location = data.get("endpoint_name") or "shared-endpoint"
            elif executor_key == "pipeline_id":
                # Fall back to "" so a missing URL is not rendered as "None".
                location = data.get("pipeline_web_url") or ""
            elif executor_key == "container":
                path = data.get("output_dir", "")
                location = path.split("/")[-1] if path else ""
            else:
                location = ""

            rows.append(
                [
                    job.get("job_id", ""),
                    job.get("status", ""),
                    # job.get("progress", ""), temporarily disabled as this is a WIP feature
                    executor_info,
                    location,
                ]
            )

        # Each column is as wide as its longest cell, header included.
        widths = [
            max(len(header), *(len(str(cell)) for cell in column))
            for header, column in zip(headers, zip(*rows))
        ]

        header_row = " | ".join(
            header.ljust(width) for header, width in zip(headers, widths)
        )
        print(header_row)
        print("-" * len(header_row))

        for row in rows:
            print(" | ".join(str(cell).ljust(width) for cell, width in zip(row, widths)))
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Version command for nemo-evaluator-launcher."""
|
|
17
|
+
|
|
18
|
+
import importlib
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
|
|
21
|
+
from nemo_evaluator_launcher import __package_name__, __version__
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class Cmd:
    """Show version information for nemo-evaluator-launcher and internal packages."""

    def execute(self) -> None:
        """Print this package's version, then the internal package's if importable."""
        print(f"{__package_name__}: {__version__}")

        internal_name = "nemo_evaluator_launcher_internal"
        unknown_msg = "nemo-evaluator-launcher-internal: available (version unknown)"

        try:
            module = importlib.import_module(internal_name)
            # The module imported; report its version when it exposes one.
            try:
                found = getattr(module, "__version__", None)
                print(
                    f"nemo-evaluator-launcher-internal: {found}"
                    if found
                    else unknown_msg
                )
            except Exception:
                print(unknown_msg)
        except ImportError:
            # Internal package not available - this is expected in many cases
            return
        except Exception as exc:
            print(f"nemo-evaluator-launcher-internal: error loading ({exc})")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Common utilities and configurations for nemo-evaluator-launcher."""
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Execution database module for tracking job executions."""
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import pathlib
|
|
20
|
+
import secrets
|
|
21
|
+
from dataclasses import asdict, dataclass
|
|
22
|
+
from typing import Any, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
25
|
+
|
|
26
|
+
# Configuration constants
|
|
27
|
+
EXEC_DB_DIR = pathlib.Path.home() / ".nemo-evaluator" / "exec-db"
|
|
28
|
+
EXEC_DB_FILE = EXEC_DB_DIR / "exec.v1.jsonl"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def generate_invocation_id() -> str:
    """Return a random invocation ID: 4 random bytes as 8 lowercase hex characters."""
    random_bytes = secrets.token_bytes(4)
    return random_bytes.hex()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def generate_job_id(invocation_id: str, index: int) -> str:
    """Compose a job ID of the form ``<invocation_id>.<index>``.

    Args:
        invocation_id: ID of the invocation the job belongs to.
        index: Position of the job within that invocation.

    Returns:
        The two parts joined by a dot, e.g. ``"aefc4819.0"``.
    """
    return "{}.{}".format(invocation_id, index)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class JobData:
    """One job record as kept in the execution database.

    Attributes:
        invocation_id: ID of the invocation this job belongs to.
        job_id: Job identifier, ``<invocation_id>.<n>``.
        timestamp: Creation time of the job as a Unix timestamp.
        executor: Name of the executor responsible for the job.
        data: Executor-specific details about the job.
        config: Configuration the job was set up with, if recorded.
    """

    # Identity of the record.
    invocation_id: str
    job_id: str
    # Bookkeeping about when and by which executor the job was created.
    timestamp: float
    executor: str
    # Free-form payloads; ``config`` is optional and defaults to None.
    data: Dict[str, Any]
    config: Optional[Dict[str, Any]] = None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ExecutionDB:
    """Singleton class for managing execution database with invocation and job hierarchy.

    Records are kept in memory (``_jobs`` and ``_invocations``) and appended as
    one JSON object per line to ``EXEC_DB_FILE``. When the same job ID is
    written more than once, the most recent record replaces the earlier one in
    memory; the file keeps every appended line.
    """

    _instance: Optional["ExecutionDB"] = None
    _jobs: Dict[str, JobData] = {}  # job_id -> JobData
    _invocations: Dict[str, List[str]] = {}  # invocation_id -> list of job_ids

    def __new__(cls) -> "ExecutionDB":
        # Always hand back the same instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self) -> None:
        # __init__ runs on every ExecutionDB() call; load from disk only once.
        if not hasattr(self, "_initialized"):
            self._ensure_db_dir()
            self._load_existing_jobs()
            self._initialized = True

    def _ensure_db_dir(self) -> None:
        """Create the database directory if it does not exist yet."""
        EXEC_DB_DIR.mkdir(parents=True, exist_ok=True)

    def _index_job(self, job: JobData) -> None:
        """Record ``job`` in the in-memory maps without duplicating its ID."""
        if job.job_id:
            self._jobs[job.job_id] = job
        job_ids = self._invocations.setdefault(job.invocation_id, [])
        if job.job_id and job.job_id not in job_ids:
            job_ids.append(job.job_id)

    def _load_existing_jobs(self) -> None:
        """Populate the in-memory maps from the JSONL file, if present.

        Blank lines, unparsable lines, non-object records and records missing
        ``invocation_id``, ``job_id`` or ``executor`` are skipped. Read errors
        are logged as warnings rather than raised.
        """
        if not EXEC_DB_FILE.exists():
            return
        try:
            with open(EXEC_DB_FILE, "r") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as e:
                        logger.warning("Failed to parse JSONL line", error=str(e))
                        continue
                    # Valid JSON that is not an object (e.g. a list or number)
                    # has no .get(); skip it instead of crashing the load.
                    if not isinstance(record, dict):
                        logger.warning(
                            "Failed to parse JSONL line",
                            error="record is not a JSON object",
                        )
                        continue
                    invocation_id = record.get("invocation_id")
                    job_id = record.get("job_id")
                    executor = record.get("executor")
                    if not (invocation_id and job_id and executor):
                        continue
                    # The same job may appear on several lines (each write
                    # appends); _index_job keeps the latest record and lists
                    # the job ID only once per invocation.
                    self._index_job(
                        JobData(
                            invocation_id=invocation_id,
                            job_id=job_id,
                            timestamp=record.get("timestamp", 0.0),
                            executor=executor,
                            data=record.get("data", {}),
                            config=record.get("config", {}),
                        )
                    )
        except OSError as e:
            logger.warning("Failed to load existing jobs", error=str(e))

    def write_job(self, job: JobData) -> None:
        """Index ``job`` in memory and append it to the JSONL file.

        Raises:
            OSError: If the record cannot be appended to the database file.
        """
        self._index_job(job)
        record = asdict(job)
        try:
            with open(EXEC_DB_FILE, "a") as f:
                f.write(json.dumps(record) + "\n")
            logger.info(
                "Job written to execution database",
                invocation_id=job.invocation_id,
                job_id=job.job_id,
                executor=job.executor,
            )
        except OSError as e:
            logger.error(
                "Failed to write job to database",
                invocation_id=job.invocation_id,
                job_id=job.job_id,
                error=str(e),
            )
            raise

    def get_job(self, job_id: str) -> Optional[JobData]:
        """Return the record for ``job_id``, or ``None`` if it is unknown."""
        return self._jobs.get(job_id)

    def get_jobs(self, invocation_id: str) -> Dict[str, JobData]:
        """Return a ``job_id -> JobData`` mapping for one invocation."""
        job_ids = self._invocations.get(invocation_id, [])
        return {
            job_id: self._jobs[job_id] for job_id in job_ids if job_id in self._jobs
        }

    def get_invocation_jobs(self, invocation_id: str) -> List[str]:
        """Return the job IDs recorded for ``invocation_id`` (empty if unknown)."""
        return self._invocations.get(invocation_id, [])

    def get_all_jobs(self) -> Dict[str, JobData]:
        """Return a copy of all jobs in the execution DB."""
        return dict(self._jobs)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def write_job(job: JobData) -> None:
    """Store ``job`` through the shared ``ExecutionDB`` instance."""
    ExecutionDB().write_job(job)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def get_job(job_id: str) -> Optional[JobData]:
    """Look up ``job_id`` in the shared ``ExecutionDB`` instance."""
    return ExecutionDB().get_job(job_id)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def get_jobs(invocation_id: str) -> Dict[str, JobData]:
    """Return the jobs of ``invocation_id`` from the shared ``ExecutionDB`` instance."""
    return ExecutionDB().get_jobs(invocation_id)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_all_jobs() -> Dict[str, JobData]:
    """Return every job known to the shared ``ExecutionDB`` instance."""
    return ExecutionDB().get_all_jobs()
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# Ensure all the paths
|
|
189
|
+
_DB = ExecutionDB()
|