nemo-evaluator-launcher 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nemo-evaluator-launcher might be problematic. Click here for more details.
- nemo_evaluator_launcher/__init__.py +79 -0
- nemo_evaluator_launcher/api/__init__.py +24 -0
- nemo_evaluator_launcher/api/functional.py +698 -0
- nemo_evaluator_launcher/api/types.py +98 -0
- nemo_evaluator_launcher/api/utils.py +19 -0
- nemo_evaluator_launcher/cli/__init__.py +15 -0
- nemo_evaluator_launcher/cli/export.py +267 -0
- nemo_evaluator_launcher/cli/info.py +512 -0
- nemo_evaluator_launcher/cli/kill.py +41 -0
- nemo_evaluator_launcher/cli/ls_runs.py +134 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
- nemo_evaluator_launcher/cli/main.py +226 -0
- nemo_evaluator_launcher/cli/run.py +200 -0
- nemo_evaluator_launcher/cli/status.py +164 -0
- nemo_evaluator_launcher/cli/version.py +55 -0
- nemo_evaluator_launcher/common/__init__.py +16 -0
- nemo_evaluator_launcher/common/execdb.py +283 -0
- nemo_evaluator_launcher/common/helpers.py +366 -0
- nemo_evaluator_launcher/common/logging_utils.py +357 -0
- nemo_evaluator_launcher/common/mapping.py +295 -0
- nemo_evaluator_launcher/common/printing_utils.py +93 -0
- nemo_evaluator_launcher/configs/__init__.py +15 -0
- nemo_evaluator_launcher/configs/default.yaml +28 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
- nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
- nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
- nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
- nemo_evaluator_launcher/executors/__init__.py +22 -0
- nemo_evaluator_launcher/executors/base.py +120 -0
- nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
- nemo_evaluator_launcher/executors/local/__init__.py +15 -0
- nemo_evaluator_launcher/executors/local/executor.py +605 -0
- nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
- nemo_evaluator_launcher/executors/registry.py +38 -0
- nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
- nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
- nemo_evaluator_launcher/exporters/__init__.py +36 -0
- nemo_evaluator_launcher/exporters/base.py +121 -0
- nemo_evaluator_launcher/exporters/gsheets.py +409 -0
- nemo_evaluator_launcher/exporters/local.py +502 -0
- nemo_evaluator_launcher/exporters/mlflow.py +619 -0
- nemo_evaluator_launcher/exporters/registry.py +40 -0
- nemo_evaluator_launcher/exporters/utils.py +624 -0
- nemo_evaluator_launcher/exporters/wandb.py +490 -0
- nemo_evaluator_launcher/package_info.py +38 -0
- nemo_evaluator_launcher/resources/mapping.toml +380 -0
- nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
- nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
- nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
- nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
- nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
from simple_parsing import field
|
|
19
|
+
|
|
20
|
+
import nemo_evaluator_launcher.common.printing_utils as pu
|
|
21
|
+
from nemo_evaluator_launcher.executors.base import ExecutionState
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class Cmd:
    """Status command configuration."""

    job_ids: list[str] = field(
        default_factory=list,
        positional=True,
    )
    json: bool = field(
        default=False,
        action="store_true",
        help="Print output as JSON instead of table format",
    )

    def execute(self) -> None:
        """Fetch the status of ``job_ids`` and print it as JSON or as a table."""
        # Import heavy dependencies only when needed
        import json

        from nemo_evaluator_launcher.api.functional import get_status

        res = get_status(self.job_ids)
        if not self.json:
            self._print_table(res)
            return

        # Remove progress field from JSON output as it's a WIP feature
        filtered_res = [
            {key: value for key, value in job.items() if key != "progress"}
            for job in res
        ]
        print(json.dumps(filtered_res, indent=2))

    @staticmethod
    def _short_output_location(path: str) -> str:
        """Return ``<output_dir>/`` plus the last two components of ``path``.

        An empty ``path`` yields an empty string.
        """
        if not path:
            return ""
        return "<output_dir>/" + "/".join(path.split("/")[-2:])

    def _print_table(self, jobs: list[dict]) -> None:
        """Print job status as a table.

        Args:
            jobs: Status dictionaries; the keys ``job_id``, ``status`` and
                ``data`` are read from each entry.
        """
        if not jobs:
            print("No jobs found.")
            return

        # Define executor-specific mappings
        executor_headers = {
            "slurm_job_id": "Slurm Job ID",
            "lepton_job_name": "Lepton Job Name",
            "pipeline_id": "Pipeline ID",
            "container": "Container",
        }

        # The first job decides which executor column is shown for all rows.
        first_data = jobs[0].get("data") or {}
        executor_key = next((k for k in executor_headers if k in first_data), None)
        info_header = executor_headers.get(executor_key, "Executor Info")
        headers = ["Job ID", "Status", info_header, "Location"]

        # Build rows
        rows = []
        for job in jobs:
            data = job.get("data") or {}

            # Extract executor info
            executor_info = str(data.get(executor_key, "")) if executor_key else ""

            # Extract location
            if executor_key == "slurm_job_id":
                location = self._short_output_location(
                    data.get("remote_rundir_path", "")
                )
            elif executor_key == "lepton_job_name":
                location = data.get("endpoint_name") or "shared-endpoint"
            elif executor_key == "pipeline_id":
                # Fall back to "" so a missing URL is not rendered as "None".
                location = data.get("pipeline_web_url") or ""
            elif executor_key == "container":
                location = self._short_output_location(data.get("output_dir", ""))
            else:
                location = ""

            # Format status with visual indicators and colors
            status = job.get("status") or ""
            formatted_status = self._format_status_with_indicators(status)

            rows.append(
                [
                    job.get("job_id", ""),
                    formatted_status,
                    # job.get("progress", ""), temporarily disabled as this is a WIP feature
                    executor_info,
                    location,
                ]
            )

        # Column widths are computed on the visible text, i.e. with the ANSI
        # color codes removed, so that colored cells still line up.
        visible_lengths = [
            [len(self._strip_ansi_codes(str(cell))) for cell in row] for row in rows
        ]
        widths = [
            max(len(str(header)), max(lengths[i] for lengths in visible_lengths))
            for i, header in enumerate(headers)
        ]

        header_row = " | ".join(
            header.ljust(width) for header, width in zip(headers, widths)
        )
        print(header_row)
        print("-" * len(header_row))

        for row, lengths in zip(rows, visible_lengths):
            # Pad manually: str.ljust would count the invisible ANSI codes.
            formatted_row = [
                str(cell) + " " * (width - visible)
                for cell, width, visible in zip(row, widths, lengths)
            ]
            print(" | ".join(formatted_row))

    def _format_status_with_indicators(self, status: str) -> str:
        """Return ``status`` decorated with a Unicode indicator and a color.

        The lookup is case-insensitive. A status without a dedicated format is
        returned upper-cased in grey.
        """
        # Status mapping based on ExecutionState enum
        status_formats = {
            ExecutionState.SUCCESS.value: pu.green("✓ SUCCESS"),
            ExecutionState.FAILED.value: pu.red("✗ FAILED"),
            ExecutionState.RUNNING.value: pu.yellow("▶ RUNNING"),
            ExecutionState.PENDING.value: pu.cyan("⧗ PENDING"),
            ExecutionState.KILLED.value: pu.magenta("✗ KILLED"),
            # Additional states for error handling
            "not_found": pu.grey("? NOT FOUND"),
            "error": pu.red("✗ ERROR"),
        }

        return status_formats.get(status.lower(), pu.grey(status.upper()))

    def _strip_ansi_codes(self, text: str) -> str:
        """Remove ANSI color codes from text for length calculation."""
        import re

        ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
        return ansi_escape.sub("", text)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Version command for nemo-evaluator-launcher."""
|
|
17
|
+
|
|
18
|
+
import importlib
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
|
|
21
|
+
from nemo_evaluator_launcher import __package_name__, __version__
|
|
22
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_versions() -> dict:
    """Collect the versions of this package and of the optional internal one.

    Returns:
        A mapping of package name to version string. It always contains this
        package. The internal package is added only when it can be imported;
        if it has no usable ``__version__`` a placeholder string is stored.

    Raises:
        Exception: Any non-``ImportError`` failure while importing the
            internal package is logged and re-raised.
    """
    internal_pkg = "nemo_evaluator_launcher_internal"
    versions = {__package_name__: __version__}
    try:
        module = importlib.import_module(internal_pkg)
        found_version = getattr(module, "__version__", None)
        versions[internal_pkg] = found_version or "available (version unknown)"
    except ImportError:
        # The internal package is optional; its absence is expected.
        pass
    except Exception as e:
        logger.error(f"nemo_evaluator_launcher_internal: error loading ({e})")
        raise

    return versions
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class Cmd:
    """Show version information for nemo-evaluator-launcher and internal packages."""

    def execute(self) -> None:
        """Print one ``<package>: <version>`` line per known package."""
        for name, ver in get_versions().items():
            print(f"{name}: {ver}")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Common utilities and configurations for nemo-evaluator-launcher."""
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Execution database module for tracking job executions."""
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import pathlib
|
|
20
|
+
import secrets
|
|
21
|
+
from dataclasses import asdict, dataclass
|
|
22
|
+
from typing import Any, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
25
|
+
|
|
26
|
+
# On-disk location of the execution database: a directory under the user's
# home and, inside it, the JSONL file holding the job records.
EXEC_DB_DIR = pathlib.Path.home().joinpath(".nemo-evaluator", "exec-db")
EXEC_DB_FILE = EXEC_DB_DIR.joinpath("exec.v1.jsonl")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def generate_invocation_id() -> str:
    """Generate a unique invocation ID as a 16-digit hex string."""
    num_random_bytes = 8  # each byte is rendered as two hex digits
    return secrets.token_hex(num_random_bytes)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def generate_job_id(invocation_id: str, index: int) -> str:
    """Build the ID of a single job inside an invocation.

    Args:
        invocation_id: The invocation group ID (16-digit hex).
        index: The job index (0-based integer).

    Returns:
        The job ID string, formatted as ``<invocation_id>.<index>``.
    """
    return "{}.{}".format(invocation_id, index)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class JobData:
    """Record describing one job execution.

    Attributes:
        invocation_id: 16-digit hex string identifying the invocation.
        job_id: Job identifier in the form ``<invocation_id>.<n>``.
        timestamp: Unix timestamp when the job was created.
        executor: Name of the executor that handled this job.
        data: Additional job-specific data as a dictionary.
        config: Configuration used to set up the job; ``None`` when absent.
    """

    # Identification
    invocation_id: str
    job_id: str

    # Creation time and the executor responsible for the job
    timestamp: float
    executor: str

    # Free-form payload and the optional configuration
    data: Dict[str, Any]
    config: Optional[Dict[str, Any]] = None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ExecutionDB:
    """Singleton class for managing execution database with invocation and job hierarchy.

    Jobs are kept in two in-memory indexes (``job_id -> JobData`` and
    ``invocation_id -> [job_id, ...]``) and persisted by appending one JSON
    record per line to ``EXEC_DB_FILE``. The indexes are class attributes, so
    they are shared by every reference to the singleton.
    """

    _instance: Optional["ExecutionDB"] = None
    _jobs: Dict[str, JobData] = {}  # job_id -> JobData
    _invocations: Dict[str, List[str]] = {}  # invocation_id -> list of job_ids

    def __new__(cls) -> "ExecutionDB":
        """Return the single shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self) -> None:
        """Prepare the database directory and load stored jobs, once only."""
        # __init__ runs on every ExecutionDB() call; the flag makes the
        # directory creation and file load happen only the first time.
        if not hasattr(self, "_initialized"):
            self._ensure_db_dir()
            self._load_existing_jobs()
            self._initialized = True

    def _ensure_db_dir(self) -> None:
        """Create the database directory (and its parents) if it is missing."""
        EXEC_DB_DIR.mkdir(parents=True, exist_ok=True)

    def _index_job(self, job: JobData) -> None:
        """Register ``job`` in the in-memory indexes without duplicating IDs."""
        if job.job_id:
            self._jobs[job.job_id] = job
        job_ids = self._invocations.setdefault(job.invocation_id, [])
        if job.job_id and job.job_id not in job_ids:
            job_ids.append(job.job_id)

    def _load_existing_jobs(self) -> None:
        """Populate the in-memory indexes from the JSONL database file.

        Blank lines, lines that are not valid JSON, lines that are not JSON
        objects, and records lacking ``invocation_id``, ``job_id`` or
        ``executor`` are skipped. When the same ``job_id`` occurs several
        times, the last record wins and the ID is listed once per invocation.
        """
        if not EXEC_DB_FILE.exists():
            return
        try:
            with open(EXEC_DB_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as e:
                        logger.warning("Failed to parse JSONL line", error=str(e))
                        continue
                    if not isinstance(record, dict):
                        # Valid JSON, but not a job record (e.g. a list).
                        logger.warning(
                            "Skipping JSONL line that is not a JSON object",
                            record_type=type(record).__name__,
                        )
                        continue
                    invocation_id = record.get("invocation_id")
                    job_id = record.get("job_id")
                    executor = record.get("executor")
                    if invocation_id and job_id and executor:
                        self._index_job(
                            JobData(
                                invocation_id=invocation_id,
                                job_id=job_id,
                                timestamp=record.get("timestamp", 0.0),
                                executor=executor,
                                data=record.get("data", {}),
                                config=record.get("config", {}),
                            )
                        )
        except OSError as e:
            logger.warning("Failed to load existing jobs", error=str(e))

    def write_job(self, job: JobData) -> None:
        """Register ``job`` in memory and append it to the database file.

        Args:
            job: The job record to store.

        Raises:
            OSError: If the record cannot be appended to the database file.
                The in-memory indexes have already been updated at that point.
        """
        self._index_job(job)
        record = asdict(job)
        try:
            with open(EXEC_DB_FILE, "a", encoding="utf-8") as f:
                f.write(json.dumps(record) + "\n")
            logger.info(
                "Job written to execution database",
                invocation_id=job.invocation_id,
                job_id=job.job_id,
                executor=job.executor,
            )
        except OSError as e:
            logger.error(
                "Failed to write job to database",
                invocation_id=job.invocation_id,
                job_id=job.job_id,
                error=str(e),
            )
            raise

    def _resolve_invocation_id(self, short_id: str) -> Optional[str]:
        """Resolve a short invocation ID to the full one.

        Args:
            short_id: Partial or full invocation ID.

        Returns:
            Full invocation ID if found uniquely, None if not found.

        Raises:
            ValueError: If the short_id matches multiple invocation IDs.
        """
        if not short_id:
            return None

        short_id = short_id.lower()

        # NOTE(agronskiy): this is a non-optimized implementation that assumes small amount
        # of jobs in ExecDB(), a typical scenario. Speeding up would involve building a
        # prefix tree when loading invocations/jobs.
        matches = [
            inv_id
            for inv_id in self._invocations
            if inv_id.lower().startswith(short_id)
        ]

        if len(matches) > 1:
            raise ValueError(f"Ambiguous invocation ID '{short_id}': matches {matches}")
        return matches[0] if matches else None

    def _resolve_job_id(self, short_job_id: str) -> Optional[str]:
        """Resolve a short job ID to the full one.

        Args:
            short_job_id: Partial or full job ID.

        Returns:
            Full job ID if found uniquely, None if not found.

        Raises:
            ValueError: If the short_job_id matches multiple job IDs.
        """
        if not short_job_id:
            return None

        # Normalize to lowercase for case-insensitive matching
        short_job_id = short_job_id.lower()

        if "." in short_job_id:
            short_inv_id, _, job_index = short_job_id.partition(".")

            # Resolve the invocation part
            full_inv_id = self._resolve_invocation_id(short_inv_id)
            if full_inv_id:
                candidate_job_id = f"{full_inv_id}.{job_index}"
                if candidate_job_id in self._jobs:
                    return candidate_job_id

        # NOTE(agronskiy): unfortunately, due to legacy, there exist usecases where
        # job_id is the same format as invocation_id
        candidate_job_id = self._resolve_invocation_id(short_job_id)
        if candidate_job_id and candidate_job_id in self._jobs:
            return candidate_job_id

        return None

    def get_job(self, job_id: str) -> Optional[JobData]:
        """Get job by full or partial job ID.

        Args:
            job_id: Full or partial job ID.

        Returns:
            JobData if found, None otherwise.

        Raises:
            ValueError: If the job_id matches multiple jobs.
        """
        resolved_id = self._resolve_job_id(job_id)
        if resolved_id:
            return self._jobs.get(resolved_id)

        return None

    def get_jobs(self, invocation_id: str) -> Dict[str, JobData]:
        """Get all jobs for a full or partial invocation ID.

        Args:
            invocation_id: Full or partial invocation ID.

        Returns:
            Dictionary mapping job_id to JobData for all jobs in the invocation.

        Raises:
            ValueError: If the invocation_id matches multiple invocations.
        """
        resolved_inv_id = self._resolve_invocation_id(invocation_id)
        if not resolved_inv_id:
            return {}

        job_ids = self._invocations.get(resolved_inv_id, [])
        return {
            job_id: self._jobs[job_id] for job_id in job_ids if job_id in self._jobs
        }

    def get_invocation_jobs(self, invocation_id: str) -> List[str]:
        """Get job IDs for a full or partial invocation ID.

        Args:
            invocation_id: Full or partial invocation ID.

        Returns:
            A new list with the job IDs of the invocation; modifying it does
            not affect the database.

        Raises:
            ValueError: If the invocation_id matches multiple invocations.
        """
        resolved_inv_id = self._resolve_invocation_id(invocation_id)
        if not resolved_inv_id:
            return []
        return list(self._invocations.get(resolved_inv_id, []))

    def get_all_jobs(self) -> Dict[str, JobData]:
        """Return a copy of all jobs in the execution DB."""
        return dict(self._jobs)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
# Instantiate the singleton at import time so that the database directory is
# created and any previously recorded jobs are loaded before first use.
_DB = ExecutionDB()
|