nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nemo-evaluator-launcher might be problematic.
- nemo_evaluator_launcher/__init__.py +65 -0
- nemo_evaluator_launcher/api/__init__.py +24 -0
- nemo_evaluator_launcher/api/functional.py +641 -0
- nemo_evaluator_launcher/api/types.py +89 -0
- nemo_evaluator_launcher/api/utils.py +19 -0
- nemo_evaluator_launcher/cli/__init__.py +15 -0
- nemo_evaluator_launcher/cli/export.py +148 -0
- nemo_evaluator_launcher/cli/info.py +117 -0
- nemo_evaluator_launcher/cli/kill.py +39 -0
- nemo_evaluator_launcher/cli/ls_runs.py +113 -0
- nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
- nemo_evaluator_launcher/cli/main.py +136 -0
- nemo_evaluator_launcher/cli/run.py +135 -0
- nemo_evaluator_launcher/cli/status.py +118 -0
- nemo_evaluator_launcher/cli/version.py +52 -0
- nemo_evaluator_launcher/common/__init__.py +16 -0
- nemo_evaluator_launcher/common/execdb.py +189 -0
- nemo_evaluator_launcher/common/helpers.py +157 -0
- nemo_evaluator_launcher/common/logging_utils.py +349 -0
- nemo_evaluator_launcher/common/mapping.py +310 -0
- nemo_evaluator_launcher/configs/__init__.py +15 -0
- nemo_evaluator_launcher/configs/default.yaml +28 -0
- nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
- nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
- nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
- nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
- nemo_evaluator_launcher/executors/__init__.py +22 -0
- nemo_evaluator_launcher/executors/base.py +97 -0
- nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
- nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
- nemo_evaluator_launcher/executors/local/__init__.py +15 -0
- nemo_evaluator_launcher/executors/local/executor.py +491 -0
- nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
- nemo_evaluator_launcher/executors/registry.py +38 -0
- nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
- nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
- nemo_evaluator_launcher/exporters/__init__.py +36 -0
- nemo_evaluator_launcher/exporters/base.py +112 -0
- nemo_evaluator_launcher/exporters/gsheets.py +391 -0
- nemo_evaluator_launcher/exporters/local.py +488 -0
- nemo_evaluator_launcher/exporters/mlflow.py +448 -0
- nemo_evaluator_launcher/exporters/registry.py +40 -0
- nemo_evaluator_launcher/exporters/utils.py +669 -0
- nemo_evaluator_launcher/exporters/wandb.py +376 -0
- nemo_evaluator_launcher/package_info.py +35 -0
- nemo_evaluator_launcher/resources/mapping.toml +344 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
- nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
nemo_evaluator_launcher/executors/local/executor.py
@@ -0,0 +1,491 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Local executor implementation for nemo-evaluator-launcher.
+
+Handles running evaluation jobs locally using shell scripts and Docker containers.
+"""
+
+import copy
+import os
+import pathlib
+import platform
+import shlex
+import subprocess
+import time
+from typing import List, Optional
+
+import jinja2
+import yaml
+from omegaconf import DictConfig, OmegaConf
+
+from nemo_evaluator_launcher.common.execdb import (
+    ExecutionDB,
+    JobData,
+    generate_invocation_id,
+    generate_job_id,
+)
+from nemo_evaluator_launcher.common.helpers import (
+    get_eval_factory_command,
+    get_eval_factory_dataset_size_from_run_config,
+    get_timestamp_string,
+)
+from nemo_evaluator_launcher.common.mapping import (
+    get_task_from_mapping,
+    load_tasks_mapping,
+)
+from nemo_evaluator_launcher.executors.base import (
+    BaseExecutor,
+    ExecutionState,
+    ExecutionStatus,
+)
+from nemo_evaluator_launcher.executors.registry import register_executor
+
+
+@register_executor("local")
+class LocalExecutor(BaseExecutor):
+    @classmethod
+    def execute_eval(cls, cfg: DictConfig, dry_run: bool = False) -> str:
+        """Run evaluation jobs locally using the provided configuration.
+
+        Args:
+            cfg: The configuration object for the evaluation run.
+            dry_run: If True, prepare scripts and save them without execution.
+
+        Returns:
+            str: The invocation ID for the evaluation run.
+
+        Raises:
+            NotImplementedError: If deployment is not 'none'.
+            RuntimeError: If the run script fails.
+        """
+        if cfg.deployment.type != "none":
+            raise NotImplementedError(
+                f"type {cfg.deployment.type} is not implemented -- add deployment support"
+            )
+
+        # Generate invocation ID for this evaluation run
+        invocation_id = generate_invocation_id()
+
+        output_dir = pathlib.Path(cfg.execution.output_dir).absolute() / (
+            get_timestamp_string(include_microseconds=False) + "-" + invocation_id
+        )
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        tasks_mapping = load_tasks_mapping()
+        evaluation_tasks = []
+        job_ids = []
+
+        eval_template = jinja2.Template(
+            open(pathlib.Path(__file__).parent / "run.template.sh", "r").read()
+        )
+
+        execution_mode = cfg.execution.get("mode", "parallel")
+        if execution_mode == "parallel":
+            is_execution_mode_sequential = False
+        elif execution_mode == "sequential":
+            is_execution_mode_sequential = True
+        else:
+            raise ValueError(
+                "unknown execution mode: {}. Choose one of {}".format(
+                    repr(execution_mode), ["parallel", "sequential"]
+                )
+            )
+
+        for idx, task in enumerate(cfg.evaluation.tasks):
+            task_definition = get_task_from_mapping(task.name, tasks_mapping)
+
+            # Create job ID as <invocation_id>.<n>
+            job_id = generate_job_id(invocation_id, idx)
+            job_ids.append(job_id)
+            container_name = f"{task.name}-{get_timestamp_string()}"
+
+            # collect all env vars
+            env_vars = copy.deepcopy(dict(cfg.evaluation.get("env_vars", {})))
+            env_vars.update(task.get("env_vars", {}))
+            if cfg.target.api_endpoint.api_key_name:
+                assert "API_KEY" not in env_vars
+                env_vars["API_KEY"] = cfg.target.api_endpoint.api_key_name
+
+            # check if the environment variables are set
+            for env_var in env_vars.values():
+                if os.getenv(env_var) is None:
+                    raise ValueError(
+                        f"Trying to pass an unset environment variable {env_var}."
+                    )
+
+            # check if required env vars are defined:
+            for required_env_var in task_definition.get("required_env_vars", []):
+                if required_env_var not in env_vars.keys():
+                    raise ValueError(
+                        f"{task.name} task requires environment variable {required_env_var}."
+                        " Specify it in the task subconfig in the 'env_vars' dict as the following"
+                        f" pair {required_env_var}: YOUR_ENV_VAR_NAME"
+                    )
+
+            # format env_vars for a template
+            env_vars = [
+                f"{env_var_dst}=${env_var_src}"
+                for env_var_dst, env_var_src in env_vars.items()
+            ]
+
+            eval_image = task_definition["container"]
+            if "container" in task:
+                eval_image = task["container"]
+
+            task_output_dir = output_dir / task.name
+            task_output_dir.mkdir(parents=True, exist_ok=True)
+            evaluation_task = {
+                "name": task.name,
+                "job_id": job_id,
+                "eval_image": eval_image,
+                "container_name": container_name,
+                "env_vars": env_vars,
+                "output_dir": task_output_dir,
+                "eval_factory_command": get_eval_factory_command(
+                    cfg, task, task_definition
+                ),
+            }
+            evaluation_tasks.append(evaluation_task)
+
+            # Check if auto-export is enabled by presence of destination(s)
+            auto_export_config = cfg.execution.get("auto_export", {})
+            auto_export_destinations = auto_export_config.get("destinations", [])
+
+            run_sh_content = (
+                eval_template.render(
+                    evaluation_tasks=[evaluation_task],
+                    auto_export_destinations=auto_export_destinations,
+                ).rstrip("\n")
+                + "\n"
+            )
+
+            (task_output_dir / "run.sh").write_text(run_sh_content)
+
+        run_all_sequentially_sh_content = (
+            eval_template.render(
+                evaluation_tasks=evaluation_tasks,
+                auto_export_destinations=auto_export_destinations,
+            ).rstrip("\n")
+            + "\n"
+        )
+        (output_dir / "run_all.sequential.sh").write_text(
+            run_all_sequentially_sh_content
+        )
+
+        # Save launched jobs metadata
+        db = ExecutionDB()
+        for job_id, task, evaluation_task in zip(
+            job_ids, cfg.evaluation.tasks, evaluation_tasks
+        ):
+            db.write_job(
+                job=JobData(
+                    invocation_id=invocation_id,
+                    job_id=job_id,
+                    timestamp=time.time(),
+                    executor="local",
+                    data={
+                        "output_dir": str(evaluation_task["output_dir"]),
+                        "container": evaluation_task["container_name"],
+                        "eval_image": evaluation_task["eval_image"],
+                    },
+                    config=OmegaConf.to_object(cfg),
+                )
+            )
+
+        if dry_run:
+            print("\n\n=============================================\n\n")
+            print(f"DRY RUN: Scripts prepared and saved to {output_dir}")
+            if is_execution_mode_sequential:
+                print(
+                    "\n\n =========== Main script | run_all.sequential.sh ===================== \n\n"
+                )
+                with open(output_dir / "run_all.sequential.sh", "r") as f:
+                    print(f.read())
+            else:
+                for idx, task in enumerate(cfg.evaluation.tasks):
+                    task_output_dir = output_dir / task.name
+                    print(
+                        f"\n\n =========== Task script | {task.name}/run.sh ===================== \n\n"
+                    )
+                    with open(task_output_dir / "run.sh", "r") as f:
+                        print(f.read())
+            print("\nTo execute, run without --dry-run")
+            return invocation_id
+
+        # Launch bash scripts with Popen for non-blocking execution.
+        # To ensure subprocess continues after python exits:
+        # - on Unix-like systems, to fully detach the subprocess
+        #   so it does not die when Python exits, pass start_new_session=True;
+        # - on Windows use creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
+        os_name = platform.system()
+        if is_execution_mode_sequential:
+            if os_name == "Windows":
+                subprocess.Popen(
+                    shlex.split("bash run_all.sequential.sh"),
+                    cwd=output_dir,
+                    creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
+                )
+            else:
+                subprocess.Popen(
+                    shlex.split("bash run_all.sequential.sh"),
+                    cwd=output_dir,
+                    start_new_session=True,
+                )
+        else:
+            for task in cfg.evaluation.tasks:
+                if os_name == "Windows":
+                    subprocess.Popen(
+                        shlex.split("bash run.sh"),
+                        cwd=output_dir / task.name,
+                        creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
+                    )
+                else:
+                    subprocess.Popen(
+                        shlex.split("bash run.sh"),
+                        cwd=output_dir / task.name,
+                        start_new_session=True,
+                    )
+
+        print("\nCommands for real-time monitoring:")
+        for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
+            log_file = evaluation_task["output_dir"] / "logs" / "stdout.log"
+            print(f" tail -f {log_file}")
+
+        print("\nFollow all logs for this invocation:")
+        print(f" tail -f {output_dir}/*/logs/stdout.log")
+
+        return invocation_id
+
+    @staticmethod
+    def get_status(id: str) -> List[ExecutionStatus]:
+        """Get the status of a specific job or all jobs in an invocation group.
+
+        Args:
+            id: Unique job identifier or invocation identifier.
+
+        Returns:
+            List containing the execution status for the job(s).
+        """
+        db = ExecutionDB()
+
+        # If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
+        if len(id) == 8 and "." not in id:
+            jobs = db.get_jobs(id)
+            statuses: List[ExecutionStatus] = []
+            for job_id, _ in jobs.items():
+                statuses.extend(LocalExecutor.get_status(job_id))
+            return statuses
+
+        # Otherwise, treat as job_id
+        job_data = db.get_job(id)
+        if job_data is None:
+            return []
+        if job_data.executor != "local":
+            return []
+
+        output_dir = pathlib.Path(job_data.data.get("output_dir", ""))
+        if not output_dir.exists():
+            return [ExecutionStatus(id=id, state=ExecutionState.PENDING)]
+
+        artifacts_dir = output_dir / "artifacts"
+        progress = _get_progress(artifacts_dir)
+
+        logs_dir = output_dir / "logs"
+        if not logs_dir.exists():
+            return [
+                ExecutionStatus(
+                    id=id,
+                    state=ExecutionState.PENDING,
+                    progress=dict(progress=progress),
+                )
+            ]
+
+        # Check if job was killed
+        if job_data.data.get("killed", False):
+            return [
+                ExecutionStatus(
+                    id=id, state=ExecutionState.KILLED, progress=dict(progress=progress)
+                )
+            ]
+
+        stage_files = {
+            "pre_start": logs_dir / "stage.pre-start",
+            "running": logs_dir / "stage.running",
+            "exit": logs_dir / "stage.exit",
+        }
+
+        if stage_files["exit"].exists():
+            try:
+                content = stage_files["exit"].read_text().strip()
+                if " " in content:
+                    timestamp, exit_code_str = content.rsplit(" ", 1)
+                    exit_code = int(exit_code_str)
+                    if exit_code == 0:
+                        return [
+                            ExecutionStatus(
+                                id=id,
+                                state=ExecutionState.SUCCESS,
+                                progress=dict(progress=progress),
+                            )
+                        ]
+                    else:
+                        return [
+                            ExecutionStatus(
+                                id=id,
+                                state=ExecutionState.FAILED,
+                                progress=dict(progress=progress),
+                            )
+                        ]
+                else:
+                    return [
+                        ExecutionStatus(
+                            id=id,
+                            state=ExecutionState.FAILED,
+                            progress=dict(progress=progress),
+                        )
+                    ]
+            except (ValueError, OSError):
+                return [
+                    ExecutionStatus(
+                        id=id,
+                        state=ExecutionState.FAILED,
+                        progress=dict(progress=progress),
+                    )
+                ]
+        elif stage_files["running"].exists():
+            return [
+                ExecutionStatus(
+                    id=id,
+                    state=ExecutionState.RUNNING,
+                    progress=dict(progress=progress),
+                )
+            ]
+        elif stage_files["pre_start"].exists():
+            return [
+                ExecutionStatus(
+                    id=id,
+                    state=ExecutionState.PENDING,
+                    progress=dict(progress=progress),
+                )
+            ]
+
+        return [
+            ExecutionStatus(
+                id=id, state=ExecutionState.PENDING, progress=dict(progress=progress)
+            )
+        ]
+
+    @staticmethod
+    def kill_job(job_id: str) -> None:
+        """Kill a local job by stopping its Docker container and related processes.
+
+        Args:
+            job_id: The job ID to kill.
+
+        Raises:
+            ValueError: If job is not found or invalid.
+            RuntimeError: If Docker container cannot be stopped.
+        """
+        db = ExecutionDB()
+        job_data = db.get_job(job_id)
+
+        if job_data is None:
+            raise ValueError(f"Job {job_id} not found")
+
+        if job_data.executor != "local":
+            raise ValueError(
+                f"Job {job_id} is not a local job (executor: {job_data.executor})"
+            )
+
+        # Get container name from database
+        container_name = job_data.data.get("container")
+        if not container_name:
+            raise ValueError(f"No container name found for job {job_id}")
+
+        killed_something = False
+
+        # First, try to stop the Docker container if it's running
+        result = subprocess.run(
+            shlex.split(f"docker stop {container_name}"),
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        if result.returncode == 0:
+            killed_something = True
+        # Don't raise error if container doesn't exist (might be still pulling)
+
+        # Find and kill Docker processes for this container
+        result = subprocess.run(
+            shlex.split(f"pkill -f 'docker run.*{container_name}'"),
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        if result.returncode == 0:
+            killed_something = True
+
+        # Mark job as killed in database if we killed something
+        if killed_something:
+            job_data.data["killed"] = True
+            db.write_job(job_data)
+        else:
+            raise RuntimeError(
+                f"Could not find or kill job {job_id} (container: {container_name})"
+            )
+
+
+def _get_progress(artifacts_dir: pathlib.Path) -> Optional[float]:
+    """Get the progress of a local job.
+
+    Args:
+        artifacts_dir: The directory containing the evaluation artifacts.
+
+    Returns:
+        The progress of the job as a float between 0 and 1.
+    """
+    progress_filepath = artifacts_dir / "progress"
+    if not progress_filepath.exists():
+        return None
+    progress_str = progress_filepath.read_text().strip()
+    try:
+        processed_samples = int(progress_str)
+    except ValueError:
+        return None
+
+    dataset_size = _get_dataset_size(artifacts_dir)
+    if dataset_size is not None:
+        progress = processed_samples / dataset_size
+    else:
+        # NOTE(dfridman): if we don't know the dataset size, report the number of processed samples
+        progress = processed_samples
+    return progress
+
+
+def _get_dataset_size(artifacts_dir: pathlib.Path) -> Optional[int]:
+    """Get the dataset size for a benchmark.
+
+    Args:
+        artifacts_dir: The directory containing the evaluation artifacts.
+
+    Returns:
+        The dataset size for the benchmark.
+    """
+    run_config = artifacts_dir / "run_config.yml"
+    if not run_config.exists():
+        return None
+    run_config = yaml.safe_load(run_config.read_text())
+    return get_eval_factory_dataset_size_from_run_config(run_config)
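For orientation, the sketch below (not part of the package) shows how the LocalExecutor API above could be driven directly with an OmegaConf config. The config keys mirror only the attributes this file reads (deployment.type, execution.*, evaluation.tasks, target.api_endpoint.api_key_name); the task name, output path, and env var name are placeholders, and a real run will need whatever additional fields get_eval_factory_command expects, normally supplied by the shipped Hydra configs.

# Illustrative only: a dry-run invocation of LocalExecutor with a hand-built config.
# Assumptions: the task name exists in the tasks mapping, the MY_API_KEY env var is
# set, and the output path is writable. Real usage goes through the CLI and configs/.
from omegaconf import OmegaConf

from nemo_evaluator_launcher.executors.local.executor import LocalExecutor

cfg = OmegaConf.create(
    {
        "deployment": {"type": "none"},  # anything else raises NotImplementedError
        "execution": {
            "output_dir": "/tmp/nemo-eval-results",  # placeholder path
            "mode": "parallel",                      # or "sequential"
            "auto_export": {"destinations": []},     # non-empty enables auto-export
        },
        "target": {"api_endpoint": {"api_key_name": "MY_API_KEY"}},  # env var holding the key
        "evaluation": {
            "env_vars": {},
            "tasks": [{"name": "some-task-from-the-mapping", "env_vars": {}}],
        },
    }
)

# dry_run=True only renders run.sh / run_all.sequential.sh under output_dir.
invocation_id = LocalExecutor.execute_eval(cfg, dry_run=True)

# The same invocation id (or a <invocation_id>.<n> job id) works for polling.
for status in LocalExecutor.get_status(invocation_id):
    print(status.id, status.state)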
nemo_evaluator_launcher/executors/local/run.template.sh
@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# check if docker exists
+command -v docker >/dev/null 2>&1 || { echo 'docker not found'; exit 1; }
+
+{% for task in evaluation_tasks %}
+# {{ task.job_id }} {{ task.name }}
+
+task_dir="{{ task.output_dir }}"
+artifacts_dir="$task_dir/artifacts"
+logs_dir="$task_dir/logs"
+
+mkdir -m 777 -p "$task_dir"
+mkdir -m 777 -p "$artifacts_dir"
+mkdir -m 777 -p "$logs_dir"
+
+# Create pre-start stage file
+echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.pre-start"
+
+# Docker run with eval factory command
+(
+  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$logs_dir/stage.running"
+  docker run --rm --shm-size=100g \
+    --name {{ task.container_name }} \
+    --volume "$artifacts_dir":/results \
+    {% for env_var in task.env_vars -%}
+    -e {{ env_var }} \
+    {% endfor -%}
+    {{ task.eval_image }} \
+    bash -c '
+      {{ task.eval_factory_command }} ;
+      exit_code=$?
+      chmod 777 -R /results;
+      if [ "$exit_code" -ne 0 ]; then
+        echo "The evaluation container failed with exit code $exit_code" >&2;
+        exit "$exit_code";
+      fi;
+      echo "Container completed successfully" >&2;
+      exit 0;
+    ' > "$logs_dir/stdout.log" 2>&1
+  exit_code=$?
+  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) $exit_code" > "$logs_dir/stage.exit"
+) >> "$logs_dir/stdout.log" 2>&1
+
+
+{% if auto_export_destinations %}
+# Monitor job completion and auto-export
+(
+  # Give it a moment to ensure file is fully written
+  sleep 1
+
+  exit_code=$(tail -1 "$logs_dir/stage.exit" | cut -d' ' -f2)
+  if [ "$exit_code" = "0" ]; then
+    # Log auto-export activity to task logs
+    echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} completed successfully. Starting auto-export..." >> "$logs_dir/stdout.log"
+
+    {% for dest in auto_export_destinations %}
+    echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Exporting job {{ task.job_id }} to {{ dest }}..." >> "$logs_dir/stdout.log"
+    nemo-evaluator-launcher export {{ task.job_id }} --dest {{ dest }} >> "$logs_dir/stdout.log" 2>&1
+    if [ $? -eq 0 ]; then
+      echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Export to {{ dest }} completed successfully" >> "$logs_dir/stdout.log"
+    else
+      echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Export to {{ dest }} failed" >> "$logs_dir/stdout.log"
+    fi
+    {% endfor %}
+
+    echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Auto-export completed for job {{ task.job_id }}" >> "$logs_dir/stdout.log"
+  else
+    echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) Job {{ task.job_id }} failed with exit code $exit_code. Skipping auto-export." >> "$logs_dir/stdout.log"
+  fi
+)
+
+{% endif %}
+{% endfor %}
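The executor renders this template with jinja2.Template(...).render(evaluation_tasks=..., auto_export_destinations=...), so a minimal sketch like the one below can be used to inspect the generated run.sh without touching Docker. All values in the task dict are illustrative placeholders; only the key names are taken from executor.py above.

# Illustrative only: render run.template.sh standalone to preview the script.
import pathlib

import jinja2

from nemo_evaluator_launcher.executors import local as local_executor_pkg

# The template ships next to the local executor module inside the wheel.
template_path = pathlib.Path(local_executor_pkg.__file__).parent / "run.template.sh"
template = jinja2.Template(template_path.read_text())

evaluation_task = {
    "name": "example-task",                        # placeholder task name
    "job_id": "abcd1234.0",                        # <invocation_id>.<index>
    "eval_image": "example.registry/eval:latest",  # placeholder container image
    "container_name": "example-task-20250101-000000",
    "env_vars": ["API_KEY=$MY_API_KEY"],           # rendered as `-e API_KEY=$MY_API_KEY`
    "output_dir": "/tmp/results/example-task",     # mounted at /results in the container
    "eval_factory_command": "echo 'eval command goes here'",
}

script = template.render(
    evaluation_tasks=[evaluation_task],
    auto_export_destinations=[],  # non-empty list enables the auto-export block
)
print(script)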
nemo_evaluator_launcher/executors/registry.py
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from typing import Callable, Type
+
+from nemo_evaluator_launcher.executors.base import BaseExecutor
+
+_EXECUTOR_REGISTRY: dict[str, Type[BaseExecutor]] = {}
+
+
+def register_executor(
+    executor_name: str,
+) -> Callable[[Type[BaseExecutor]], Type[BaseExecutor]]:
+    def wrapper(executor_cls: Type[BaseExecutor]) -> Type[BaseExecutor]:
+        _EXECUTOR_REGISTRY[executor_name] = executor_cls
+        return executor_cls
+
+    return wrapper
+
+
+def get_executor(executor_name: str) -> Type[BaseExecutor]:
+    if executor_name not in _EXECUTOR_REGISTRY:
+        raise ValueError(
+            f"Executor {executor_name} not found. Available executors: {list(_EXECUTOR_REGISTRY.keys())}"
+        )
+    return _EXECUTOR_REGISTRY[executor_name]
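The registry above is a plain decorator-based lookup table: a class is registered under a name (as LocalExecutor does with "local") and resolved later by name. A brief hedged sketch, assuming BaseExecutor imposes no extra registration hooks; DummyExecutor is purely illustrative and is never instantiated.

# Illustrative only: registering and resolving an executor by name.
from nemo_evaluator_launcher.executors.base import BaseExecutor
from nemo_evaluator_launcher.executors.registry import get_executor, register_executor


@register_executor("dummy")
class DummyExecutor(BaseExecutor):
    """Placeholder class used only to show how registration works."""


executor_cls = get_executor("dummy")  # -> DummyExecutor
# get_executor("local") returns LocalExecutor once its module has been imported;
# an unknown name raises ValueError listing the registered executors.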
nemo_evaluator_launcher/executors/slurm/__init__.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#