nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nemo-evaluator-launcher might be problematic. Click here for more details.

Files changed (57) hide show
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +641 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
  12. nemo_evaluator_launcher/cli/main.py +136 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +118 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +157 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +310 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +35 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
  53. nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,136 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Main CLI module using simple-parsing with subcommands."""
17
+
18
+ from simple_parsing import ArgumentParser
19
+
20
+ import nemo_evaluator_launcher.cli.export as export
21
+ import nemo_evaluator_launcher.cli.kill as kill
22
+ import nemo_evaluator_launcher.cli.ls_runs as ls_runs
23
+ import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
24
+ import nemo_evaluator_launcher.cli.run as run
25
+ import nemo_evaluator_launcher.cli.status as status
26
+ import nemo_evaluator_launcher.cli.version as version
27
+ from nemo_evaluator_launcher.common.logging_utils import logger
28
+
29
# Help text shared by the top-level `--version` flag and the `version` subcommand.
VERSION_HELP = "Show version information"
30
+
31
+
32
def create_parser() -> ArgumentParser:
    """Build the top-level CLI parser and register every subcommand on it.

    Returns:
        The configured parser. The chosen subcommand name is stored under
        ``command`` (``ls_command`` for the nested ``ls`` group), and each
        command dataclass is stored under a destination equal to its
        subcommand name.
    """

    def attach(group, name, cmd_cls, summary, details=None):
        # Register `name` on `group` and bind its dataclass under dest == name.
        sub = group.add_parser(
            name,
            help=summary,
            description=summary if details is None else details,
        )
        sub.add_arguments(cmd_cls, dest=name)

    root = ArgumentParser()

    # Top-level --version flag (shares its destination name with the subcommand)
    root.add_argument("--version", action="store_true", help=VERSION_HELP)

    commands = root.add_subparsers(dest="command", required=False)

    attach(commands, "version", version.Cmd, VERSION_HELP)
    attach(commands, "run", run.Cmd, "Run evaluation")
    attach(commands, "status", status.Cmd, "Check job status")
    attach(
        commands,
        "kill",
        kill.Cmd,
        "Kill a job or invocation",
        "Kill a job (e.g., aefc4819.0) or entire invocation (e.g., aefc4819) by its ID",
    )

    # `ls` carries no arguments of its own; it only groups nested subcommands.
    ls_group = commands.add_parser(
        "ls", help="List resources", description="List tasks or runs"
    ).add_subparsers(dest="ls_command", required=True)
    attach(ls_group, "tasks", ls_tasks.Cmd, "List available tasks")
    attach(
        ls_group,
        "runs",
        ls_runs.Cmd,
        "List invocations (runs)",
        "Show a concise table of invocations from the exec DB",
    )

    attach(
        commands,
        "export",
        export.ExportCmd,
        "Export evaluation results",
        "Export evaluation results takes a List of invocation ids and a list of destinations(local, gitlab, wandb)",
    )

    return root
98
+
99
+
100
def main() -> None:
    """Parse the command line and run the selected subcommand.

    Prints version information when ``args.version`` is truthy, prints the
    help text when no subcommand was given, and otherwise calls ``execute()``
    on the command object stored on the parsed namespace.
    """
    parser = create_parser()
    args = parser.parse_args()

    # A truthy `version` attribute means version output was requested.
    if getattr(args, "version", False):
        version.Cmd().execute()
        return

    # No subcommand and no --version: show usage instead of failing.
    command = getattr(args, "command", None)
    if command is None:
        parser.print_help()
        return

    logger.debug("Parsed arguments", args=args)
    if command == "ls":
        # Nested ls subcommands: whichever dataclass was parsed is present.
        for nested in ("tasks", "runs"):
            if hasattr(args, nested):
                getattr(args, nested).execute()
                break
    elif command in ("version", "run", "status", "kill", "export"):
        # Each of these stores its command object under the command's own name.
        getattr(args, command).execute()
133
+
134
+
135
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,135 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import pathlib
17
+ import time
18
+ from dataclasses import dataclass
19
+
20
+ import yaml
21
+ from omegaconf import OmegaConf
22
+ from simple_parsing import field
23
+
24
+ from nemo_evaluator_launcher.api.functional import RunConfig, run_eval
25
+
26
+
27
@dataclass
class Cmd:
    """Run command parameters.

    The configuration comes either from Hydra (``config_name``,
    ``config_dir``, ``override``) or from a single run config file
    (``run_config_file``); the two sources are mutually exclusive.
    """

    config_name: str = field(
        default="default",
        alias=["-c", "--config-name"],
        metadata={
            "help": "Config name to use. Consult `nemo_evaluator_launcher.configs`"
        },
    )
    config_dir: str | None = field(
        default=None,
        alias=["-d", "--config-dir"],
        metadata={
            "help": "Path to user config directory. If provided, searches here first, then falls back to internal configs."
        },
    )
    run_config_file: str | None = field(
        default=None,
        alias=["-f", "--run-config-file"],
        metadata={
            "help": "Path to a run config file to load directly (bypasses Hydra config loading)."
        },
    )
    override: list[str] = field(
        default_factory=list,
        action="append",
        nargs="?",
        alias=["-o"],
        metadata={
            "help": "Hydra override in the form some.param.path=value (pass multiple `-o` for multiple overrides).",
        },
    )
    dry_run: bool = field(
        default=False,
        alias=["-n", "--dry-run"],
        metadata={"help": "Do not run the evaluation, just print the config."},
    )

    def execute(self) -> None:
        """Load the configuration, launch the evaluation and report the result.

        For a real (non dry-run) launch that returns an invocation ID, the
        resolved configuration is also saved under
        ``~/.nemo-evaluator/run_configs``.

        Raises:
            ValueError: If ``run_config_file`` is combined with
                ``config_name``, ``config_dir`` or ``override``.
        """
        config = self._load_config()
        invocation_id = run_eval(config, self.dry_run)
        if invocation_id is None:
            return

        if not self.dry_run:
            config_path = self._save_run_config(config, invocation_id)
            print(f"Complete run config saved to: {config_path}")

        print(f"to check status: nemo-evaluator-launcher status {invocation_id}")
        print(f"to kill all jobs: nemo-evaluator-launcher kill {invocation_id}")
        print(
            f"to kill individual jobs: nemo-evaluator-launcher kill <job_id> (e.g., {invocation_id}.0)"
        )

    def _load_config(self):
        """Return the run configuration from the run config file or from Hydra."""
        if not self.run_config_file:
            # Load the complete Hydra configuration
            return RunConfig.from_hydra(
                config_name=self.config_name,
                hydra_overrides=self.override,
                config_dir=self.config_dir,
            )

        # A run config file is already complete, so reject the Hydra options.
        if self.config_name != "default":
            raise ValueError("Cannot use --run-config-file with --config-name")
        if self.config_dir is not None:
            raise ValueError("Cannot use --run-config-file with --config-dir")
        if self.override:
            raise ValueError("Cannot use --run-config-file with --override")

        # Explicit UTF-8: the platform default encoding is locale-dependent.
        with open(self.run_config_file, "r", encoding="utf-8") as f:
            config_dict = yaml.safe_load(f)
        return OmegaConf.create(config_dict)

    @staticmethod
    def _save_run_config(config, invocation_id: str) -> pathlib.Path:
        """Write the resolved configuration to the run_configs directory.

        Returns:
            Path of the YAML file that was written.
        """
        run_configs_dir = pathlib.Path.home() / ".nemo-evaluator" / "run_configs"
        run_configs_dir.mkdir(parents=True, exist_ok=True)
        config_path = run_configs_dir / f"{invocation_id}_config.yml"

        # Convert DictConfig to a plain container (interpolations resolved)
        config_dict = OmegaConf.to_container(config, resolve=True)
        config_yaml = yaml.dump(
            config_dict, default_flow_style=False, sort_keys=False, indent=2
        )

        generated_at = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime())
        header = (
            "# Complete configuration from nemo-evaluator-launcher\n"
            f"# Generated at: {generated_at}\n"
            f"# Invocation ID: {invocation_id}\n"
            "#\n"
            "# This is the complete raw configuration\n"
            "#\n"
            "# To rerun this exact configuration:\n"
            f"# nemo-evaluator-launcher run --run-config-file {config_path}\n"
            "#\n"
        )
        # Written as UTF-8 so the file reads back identically on any platform.
        with open(config_path, "w", encoding="utf-8") as f:
            f.write(header)
            f.write(config_yaml)
        return config_path
@@ -0,0 +1,118 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import json
17
+ from dataclasses import dataclass
18
+
19
+ from simple_parsing import field
20
+
21
+ from nemo_evaluator_launcher.api.functional import get_status
22
+
23
+
24
@dataclass
class Cmd:
    """Status command configuration."""

    job_ids: list[str] = field(
        default_factory=list,
        positional=True,
    )
    json: bool = field(
        default=False,
        action="store_true",
        help="Print output as JSON instead of table format",
    )

    def execute(self) -> None:
        """Fetch the status of the requested jobs and print it as JSON or a table."""
        res = get_status(self.job_ids)
        if self.json:
            # Remove progress field from JSON output as it's a WIP feature
            filtered_res = [
                {key: value for key, value in job.items() if key != "progress"}
                for job in res
            ]
            print(json.dumps(filtered_res, indent=2))
        else:
            self._print_table(res)

    @staticmethod
    def _location_for(executor_key: str | None, data: dict) -> str:
        """Return the Location cell for one job, depending on the executor key."""
        if executor_key == "slurm_job_id":
            path = data.get("remote_rundir_path", "")
            # Show only the last two path components under a placeholder root.
            return "<output_dir>/" + "/".join(path.split("/")[-2:]) if path else ""
        if executor_key == "lepton_job_name":
            return data.get("endpoint_name") or "shared-endpoint"
        if executor_key == "pipeline_id":
            # A missing URL must render as an empty cell, not the text "None".
            return data.get("pipeline_web_url") or ""
        if executor_key == "container":
            path = data.get("output_dir", "")
            return path.split("/")[-1] if path else ""
        return ""

    def _print_table(self, jobs: list[dict]) -> None:
        """Print job status as a table.

        The executor-specific column is chosen from the first job's ``data``
        keys and applied to every row.
        """
        if not jobs:
            print("No jobs found.")
            return

        # Define executor-specific mappings
        executor_headers = {
            "slurm_job_id": "Slurm Job ID",
            "lepton_job_name": "Lepton Job Name",
            "pipeline_id": "Pipeline ID",
            "container": "Container",
        }

        # Determine executor type and headers
        first_data = jobs[0].get("data") or {}
        executor_key = next((k for k in executor_headers if k in first_data), None)
        info_header = executor_headers.get(executor_key, "Executor Info")
        headers = ["Job ID", "Status", info_header, "Location"]

        # Build rows
        rows = []
        for job in jobs:
            # Tolerate a job whose "data" is missing or explicitly None.
            data = job.get("data") or {}
            executor_info = str(data.get(executor_key, "")) if executor_key else ""
            rows.append(
                [
                    job.get("job_id", ""),
                    job.get("status", ""),
                    # progress column temporarily disabled as this is a WIP feature
                    executor_info,
                    self._location_for(executor_key, data),
                ]
            )

        # Each column is as wide as its longest cell, header included.
        widths = [
            max(len(str(cell)) for cell in column) for column in zip(headers, *rows)
        ]

        header_row = " | ".join(
            header.ljust(width) for header, width in zip(headers, widths)
        )
        print(header_row)
        print("-" * len(header_row))

        for row in rows:
            print(
                " | ".join(str(cell).ljust(width) for cell, width in zip(row, widths))
            )
@@ -0,0 +1,52 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Version command for nemo-evaluator-launcher."""
17
+
18
+ import importlib
19
+ from dataclasses import dataclass
20
+
21
+ from nemo_evaluator_launcher import __package_name__, __version__
22
+
23
+
24
@dataclass
class Cmd:
    """Show version information for nemo-evaluator-launcher and internal packages."""

    def execute(self) -> None:
        """Print the launcher version, plus the internal package version if importable."""
        print(f"{__package_name__}: {__version__}")

        # Probe for the optional internal package
        try:
            internal = importlib.import_module("nemo_evaluator_launcher_internal")
            unknown_msg = "nemo-evaluator-launcher-internal: available (version unknown)"
            try:
                found = getattr(internal, "__version__", None)
                print(
                    f"nemo-evaluator-launcher-internal: {found}"
                    if found
                    else unknown_msg
                )
            except Exception:
                print(unknown_msg)
        except ImportError:
            # Internal package not available - this is expected in many cases
            pass
        except Exception as e:
            print(f"nemo-evaluator-launcher-internal: error loading ({e})")
@@ -0,0 +1,16 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Common utilities and configurations for nemo-evaluator-launcher."""
@@ -0,0 +1,189 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Execution database module for tracking job executions."""
17
+
18
+ import json
19
+ import pathlib
20
+ import secrets
21
+ from dataclasses import asdict, dataclass
22
+ from typing import Any, Dict, List, Optional
23
+
24
+ from nemo_evaluator_launcher.common.logging_utils import logger
25
+
26
# Configuration constants
# Directory under the user's home that holds the execution database.
EXEC_DB_DIR = pathlib.Path.home() / ".nemo-evaluator" / "exec-db"
# JSONL file inside EXEC_DB_DIR; ExecutionDB appends one JSON record per written job.
EXEC_DB_FILE = EXEC_DB_DIR / "exec.v1.jsonl"
29
+
30
+
31
def generate_invocation_id() -> str:
    """Return a fresh random invocation ID: 8 lowercase hexadecimal characters."""
    random_bytes = secrets.token_bytes(4)  # 4 bytes -> 8 hex digits
    return random_bytes.hex()
34
+
35
+
36
def generate_job_id(invocation_id: str, index: int) -> str:
    """Compose the ID of one job within an invocation.

    Args:
        invocation_id: ID of the invocation the job belongs to (8-digit hex).
        index: Position of the job within the invocation, counted from 0.

    Returns:
        The string ``<invocation_id>.<index>``.
    """
    return "{}.{}".format(invocation_id, index)
46
+
47
+
48
@dataclass
class JobData:
    """Record describing a single job execution.

    Attributes:
        invocation_id: ID of the invocation that owns the job (8-digit hex).
        job_id: Job identifier of the form ``<invocation_id>.<n>``.
        timestamp: Creation time of the job as a Unix timestamp.
        executor: Name of the executor responsible for the job.
        data: Free-form, job-specific details.
        config: Configuration the job was set up with, if recorded.
    """

    invocation_id: str
    job_id: str
    timestamp: float
    executor: str
    data: Dict[str, Any]
    config: Optional[Dict[str, Any]] = None
67
+
68
+
69
class ExecutionDB:
    """Singleton class for managing execution database with invocation and job hierarchy.

    Jobs are kept in memory in two indexes (job_id -> job, invocation_id ->
    job IDs) and appended to ``EXEC_DB_FILE`` as one JSON record per line.
    """

    _instance: Optional["ExecutionDB"] = None
    _jobs: Dict[str, "JobData"] = {}  # job_id -> JobData
    _invocations: Dict[str, List[str]] = {}  # invocation_id -> list of job_ids

    def __new__(cls) -> "ExecutionDB":
        # Always hand out the same instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self) -> None:
        # __init__ runs on every ExecutionDB() call; load from disk only once.
        if not hasattr(self, "_initialized"):
            self._ensure_db_dir()
            self._load_existing_jobs()
            self._initialized = True

    def _ensure_db_dir(self) -> None:
        """Create the database directory if it does not exist yet."""
        EXEC_DB_DIR.mkdir(parents=True, exist_ok=True)

    def _index_job(self, job: "JobData") -> None:
        """Register ``job`` in the in-memory indexes without duplicating its ID."""
        if job.job_id:
            self._jobs[job.job_id] = job
        job_ids = self._invocations.setdefault(job.invocation_id, [])
        if job.job_id and job.job_id not in job_ids:
            job_ids.append(job.job_id)

    def _load_existing_jobs(self) -> None:
        """Populate the in-memory indexes from the JSONL file, if present.

        Unparsable lines and lines that are not JSON objects are skipped with
        a warning. Records missing ``invocation_id``, ``job_id`` or
        ``executor`` are ignored. When a job ID occurs more than once, the
        last record wins and the ID is listed once per invocation.
        """
        if not EXEC_DB_FILE.exists():
            return
        try:
            with open(EXEC_DB_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as e:
                        logger.warning("Failed to parse JSONL line", error=str(e))
                        continue
                    if not isinstance(record, dict):
                        # Valid JSON that is not an object cannot be a job record.
                        logger.warning(
                            "Skipping non-object JSONL record",
                            record_type=type(record).__name__,
                        )
                        continue
                    invocation_id = record.get("invocation_id")
                    job_id = record.get("job_id")
                    executor = record.get("executor")
                    if invocation_id and job_id and executor:
                        self._index_job(
                            JobData(
                                invocation_id=invocation_id,
                                job_id=job_id,
                                timestamp=record.get("timestamp", 0.0),
                                executor=executor,
                                data=record.get("data", {}),
                                config=record.get("config", {}),
                            )
                        )
        except OSError as e:
            logger.warning("Failed to load existing jobs", error=str(e))

    def write_job(self, job: "JobData") -> None:
        """Index ``job`` in memory and append it to the JSONL file.

        Raises:
            OSError: If the record cannot be appended to the database file.
        """
        self._index_job(job)
        record = asdict(job)
        try:
            with open(EXEC_DB_FILE, "a", encoding="utf-8") as f:
                f.write(json.dumps(record) + "\n")
            logger.info(
                "Job written to execution database",
                invocation_id=job.invocation_id,
                job_id=job.job_id,
                executor=job.executor,
            )
        except OSError as e:
            logger.error(
                "Failed to write job to database",
                invocation_id=job.invocation_id,
                job_id=job.job_id,
                error=str(e),
            )
            raise

    def get_job(self, job_id: str) -> Optional["JobData"]:
        """Return the job stored under ``job_id``, or ``None`` if unknown."""
        return self._jobs.get(job_id)

    def get_jobs(self, invocation_id: str) -> Dict[str, "JobData"]:
        """Return a ``job_id -> job`` mapping for the jobs of one invocation."""
        job_ids = self._invocations.get(invocation_id, [])
        return {
            job_id: self._jobs[job_id] for job_id in job_ids if job_id in self._jobs
        }

    def get_invocation_jobs(self, invocation_id: str) -> List[str]:
        """Return a copy of the job IDs recorded for ``invocation_id``."""
        # Copy so callers cannot mutate the internal index by accident.
        return list(self._invocations.get(invocation_id, []))

    def get_all_jobs(self) -> Dict[str, "JobData"]:
        """Return a copy of all jobs in the execution DB."""
        return dict(self._jobs)
166
+
167
+
168
def write_job(job: JobData) -> None:
    """Record ``job`` through the shared ExecutionDB instance."""
    ExecutionDB().write_job(job)
171
+
172
+
173
def get_job(job_id: str) -> Optional[JobData]:
    """Look up one job by ID in the shared ExecutionDB; ``None`` if unknown."""
    return ExecutionDB().get_job(job_id)
176
+
177
+
178
def get_jobs(invocation_id: str) -> Dict[str, JobData]:
    """Return the ``job_id -> job`` mapping of one invocation from the shared ExecutionDB."""
    return ExecutionDB().get_jobs(invocation_id)
181
+
182
+
183
def get_all_jobs() -> Dict[str, JobData]:
    """Return every job known to the shared ExecutionDB, keyed by job ID."""
    return ExecutionDB().get_all_jobs()
186
+
187
+
188
# Instantiate the singleton at import time: its constructor creates the
# exec-db directory and loads any jobs already recorded in the JSONL file.
_DB = ExecutionDB()