nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/__init__.py +15 -1
- nemo_evaluator_launcher/api/functional.py +188 -27
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/export.py +131 -12
- nemo_evaluator_launcher/cli/info.py +477 -82
- nemo_evaluator_launcher/cli/kill.py +5 -3
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_runs.py +31 -10
- nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
- nemo_evaluator_launcher/cli/main.py +101 -5
- nemo_evaluator_launcher/cli/run.py +153 -30
- nemo_evaluator_launcher/cli/status.py +49 -5
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/execdb.py +121 -27
- nemo_evaluator_launcher/common/helpers.py +213 -33
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/printing_utils.py +100 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
- nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- nemo_evaluator_launcher/executors/base.py +54 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
- nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
- nemo_evaluator_launcher/executors/local/executor.py +492 -56
- nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
- nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +30 -16
- nemo_evaluator_launcher/exporters/mlflow.py +245 -74
- nemo_evaluator_launcher/exporters/utils.py +139 -184
- nemo_evaluator_launcher/exporters/wandb.py +157 -43
- nemo_evaluator_launcher/package_info.py +6 -3
- nemo_evaluator_launcher/resources/mapping.toml +56 -15
- nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
- nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
@@ -23,9 +23,11 @@ import os
 import pathlib
 import platform
 import shlex
+import shutil
 import subprocess
 import time
-
+import warnings
+from typing import Iterator, List, Optional, Tuple, Union

 import jinja2
 import yaml
@@ -38,14 +40,19 @@ from nemo_evaluator_launcher.common.execdb import (
     generate_job_id,
 )
 from nemo_evaluator_launcher.common.helpers import (
+    get_api_key_name,
+    get_endpoint_url,
     get_eval_factory_command,
     get_eval_factory_dataset_size_from_run_config,
+    get_health_url,
     get_timestamp_string,
 )
+from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
     get_task_from_mapping,
     load_tasks_mapping,
 )
+from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
 from nemo_evaluator_launcher.executors.base import (
     BaseExecutor,
     ExecutionState,
@@ -68,12 +75,13 @@ class LocalExecutor(BaseExecutor):
             str: The invocation ID for the evaluation run.

         Raises:
-            NotImplementedError: If deployment is not 'none'.
             RuntimeError: If the run script fails.
         """
-        if
-
-
+        # Check if docker is available (skip in dry_run mode)
+        if not dry_run and shutil.which("docker") is None:
+            raise RuntimeError(
+                "Docker is not installed or not in PATH. "
+                "Please install Docker to run local evaluations."
             )

         # Generate invocation ID for this evaluation run
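The hunk above swaps the old deployment guard for a Docker preflight check. A minimal standalone sketch of the same probe, assuming a hypothetical helper name (`ensure_docker_available` is not part of the package):

import shutil


def ensure_docker_available(dry_run: bool = False) -> None:
    # shutil.which returns the resolved path of the executable, or None if absent
    if not dry_run and shutil.which("docker") is None:
        raise RuntimeError(
            "Docker is not installed or not in PATH. "
            "Please install Docker to run local evaluations."
        )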
@@ -88,12 +96,16 @@ class LocalExecutor(BaseExecutor):
         evaluation_tasks = []
         job_ids = []

-
+        run_template = jinja2.Template(
             open(pathlib.Path(__file__).parent / "run.template.sh", "r").read()
         )

         execution_mode = cfg.execution.get("mode", "parallel")
         if execution_mode == "parallel":
+            if cfg.deployment.type != "none":
+                raise ValueError(
+                    f"Execution mode 'parallel' is not supported with deployment type: {cfg.deployment.type}. Use 'sequential' instead."
+                )
             is_execution_mode_sequential = False
         elif execution_mode == "sequential":
             is_execution_mode_sequential = True
@@ -104,20 +116,76 @@ class LocalExecutor(BaseExecutor):
             )
         )

+        # Will accumulate if any task contains unsafe commands.
+        is_potentially_unsafe = False
+
+        deployment = None
+
         for idx, task in enumerate(cfg.evaluation.tasks):
+            timestamp = get_timestamp_string()
             task_definition = get_task_from_mapping(task.name, tasks_mapping)

+            if cfg.deployment.type != "none":
+                # container name
+                server_container_name = f"server-{task.name}-{timestamp}"
+
+                # health_url
+                health_url = get_health_url(
+                    cfg, get_endpoint_url(cfg, task, task_definition["endpoint_type"])
+                )
+
+                # mounts
+                deployment_mounts_list = []
+                if checkpoint_path := cfg.deployment.get("checkpoint_path"):
+                    deployment_mounts_list.append(f"{checkpoint_path}:/checkpoint:ro")
+                if cache_path := cfg.deployment.get("cache_path"):
+                    deployment_mounts_list.append(f"{cache_path}:/cache")
+                for source_mnt, target_mnt in (
+                    cfg.execution.get("mounts", {}).get("deployment", {}).items()
+                ):
+                    deployment_mounts_list.append(f"{source_mnt}:{target_mnt}")
+
+                # env vars
+                deployment_env_vars = cfg.execution.get("env_vars", {}).get(
+                    "deployment", {}
+                )
+
+                if cfg.deployment.get("env_vars"):
+                    warnings.warn(
+                        "cfg.deployment.env_vars will be deprecated in future versions. "
+                        "Use cfg.execution.env_vars.deployment instead.",
+                        category=DeprecationWarning,
+                        stacklevel=2,
+                    )
+                    deployment_env_vars.update(cfg.deployment["env_vars"])
+
+                command = cfg.deployment.command
+                deployment_extra_docker_args = cfg.execution.get(
+                    "extra_docker_args", ""
+                )
+
+                deployment = {
+                    "container_name": server_container_name,
+                    "image": cfg.deployment.image,
+                    "command": command,
+                    "mounts": deployment_mounts_list,
+                    "env_vars": [f"{k}={v}" for k, v in deployment_env_vars.items()],
+                    "health_url": health_url,
+                    "port": cfg.deployment.port,
+                    "extra_docker_args": deployment_extra_docker_args,
+                }
+
             # Create job ID as <invocation_id>.<n>
             job_id = generate_job_id(invocation_id, idx)
             job_ids.append(job_id)
-
+            client_container_name = f"client-{task.name}-{timestamp}"

             # collect all env vars
             env_vars = copy.deepcopy(dict(cfg.evaluation.get("env_vars", {})))
             env_vars.update(task.get("env_vars", {}))
-            if cfg
+            if api_key_name := get_api_key_name(cfg):
                 assert "API_KEY" not in env_vars
-                env_vars["API_KEY"] =
+                env_vars["API_KEY"] = api_key_name

             # check if the environment variables are set
             for env_var in env_vars.values():
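For context on the per-task `deployment` dict assembled above: the run script template later consumes it to start the model-serving container. A rough sketch of its shape with placeholder values (the concrete image, command, mounts, and port come from the user's deployment/execution config, not from this diff):

deployment = {
    "container_name": "server-<task>-<timestamp>",
    "image": "<cfg.deployment.image>",
    "command": "<cfg.deployment.command>",
    "mounts": ["<checkpoint_path>:/checkpoint:ro", "<cache_path>:/cache"],
    "env_vars": ["KEY=value"],
    "health_url": "<derived via get_health_url(...)>",
    "port": 8000,  # placeholder; taken from cfg.deployment.port
    "extra_docker_args": "",
}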
@@ -126,8 +194,11 @@ class LocalExecutor(BaseExecutor):
                     f"Trying to pass an unset environment variable {env_var}."
                 )

-            # check if required env vars are defined:
+            # check if required env vars are defined (excluding NEMO_EVALUATOR_DATASET_DIR which is handled separately):
             for required_env_var in task_definition.get("required_env_vars", []):
+                # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic below
+                if required_env_var == "NEMO_EVALUATOR_DATASET_DIR":
+                    continue
                 if required_env_var not in env_vars.keys():
                     raise ValueError(
                         f"{task.name} task requires environment variable {required_env_var}."
@@ -135,28 +206,70 @@ class LocalExecutor(BaseExecutor):
                         f" pair {required_env_var}: YOUR_ENV_VAR_NAME"
                     )

+            # Handle dataset directory mounting if NEMO_EVALUATOR_DATASET_DIR is required
+            dataset_mount_host = None
+            dataset_mount_container = None
+            dataset_env_var_value = None
+            if "NEMO_EVALUATOR_DATASET_DIR" in task_definition.get(
+                "required_env_vars", []
+            ):
+                # Get dataset directory from task config
+                if "dataset_dir" in task:
+                    dataset_mount_host = task["dataset_dir"]
+                else:
+                    raise ValueError(
+                        f"{task.name} task requires a dataset_dir to be specified. "
+                        f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
+                    )
+                # Get container mount path (default to /datasets if not specified)
+                dataset_mount_container = task.get("dataset_mount_path", "/datasets")
+                # Set NEMO_EVALUATOR_DATASET_DIR to the container mount path
+                dataset_env_var_value = dataset_mount_container
+
             # format env_vars for a template
-
+            env_vars_list = [
                 f"{env_var_dst}=${env_var_src}"
                 for env_var_dst, env_var_src in env_vars.items()
             ]

+            # Add dataset env var if needed (directly with value, not from host env)
+            if dataset_env_var_value:
+                env_vars_list.append(
+                    f"NEMO_EVALUATOR_DATASET_DIR={dataset_env_var_value}"
+                )
+
             eval_image = task_definition["container"]
             if "container" in task:
                 eval_image = task["container"]

             task_output_dir = output_dir / task.name
             task_output_dir.mkdir(parents=True, exist_ok=True)
+            eval_factory_command_struct = get_eval_factory_command(
+                cfg, task, task_definition
+            )
+            eval_factory_command = eval_factory_command_struct.cmd
+            # The debug comment for placing into the script and easy debug. Reason
+            # (see `CmdAndReadableComment`) is the current way of passing the command
+            # is base64-encoded config `echo`-ed into file.
+            # TODO(agronskiy): cleaner way is to encode everything with base64, not
+            # some parts (like ef_config.yaml) and just output as logs somewhere.
+            eval_factory_command_debug_comment = eval_factory_command_struct.debug
+            is_potentially_unsafe = (
+                is_potentially_unsafe
+                or eval_factory_command_struct.is_potentially_unsafe
+            )
             evaluation_task = {
+                "deployment": deployment,
                 "name": task.name,
                 "job_id": job_id,
                 "eval_image": eval_image,
-                "
-                "env_vars":
+                "client_container_name": client_container_name,
+                "env_vars": env_vars_list,
                 "output_dir": task_output_dir,
-                "eval_factory_command":
-
-
+                "eval_factory_command": eval_factory_command,
+                "eval_factory_command_debug_comment": eval_factory_command_debug_comment,
+                "dataset_mount_host": dataset_mount_host,
+                "dataset_mount_container": dataset_mount_container,
             }
             evaluation_tasks.append(evaluation_task)

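Two different mechanisms feed environment variables into the client container in the loop above: regular entries are emitted as `DST=$SRC`, so the value is resolved from the host environment when run.sh executes, while `NEMO_EVALUATOR_DATASET_DIR` is appended with a literal value (the in-container mount path). A small sketch of that distinction, with made-up variable names:

env_vars = {"API_KEY": "MY_PROVIDER_KEY"}  # destination name -> host env var name (example)
env_vars_list = [f"{dst}=${src}" for dst, src in env_vars.items()]
# -> ["API_KEY=$MY_PROVIDER_KEY"], expanded by bash when the script runs

dataset_env_var_value = "/datasets"  # container-side mount path
env_vars_list.append(f"NEMO_EVALUATOR_DATASET_DIR={dataset_env_var_value}")
# -> literal value, independent of the host environment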
@@ -164,10 +277,13 @@ class LocalExecutor(BaseExecutor):
             auto_export_config = cfg.execution.get("auto_export", {})
             auto_export_destinations = auto_export_config.get("destinations", [])

+            extra_docker_args = cfg.execution.get("extra_docker_args", "")
+
             run_sh_content = (
-
+                run_template.render(
                     evaluation_tasks=[evaluation_task],
                     auto_export_destinations=auto_export_destinations,
+                    extra_docker_args=extra_docker_args,
                 ).rstrip("\n")
                 + "\n"
             )
@@ -175,9 +291,10 @@ class LocalExecutor(BaseExecutor):
             (task_output_dir / "run.sh").write_text(run_sh_content)

         run_all_sequentially_sh_content = (
-
+            run_template.render(
                 evaluation_tasks=evaluation_tasks,
                 auto_export_destinations=auto_export_destinations,
+                extra_docker_args=extra_docker_args,
             ).rstrip("\n")
             + "\n"
         )
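Both scripts above come from rendering the same run.template.sh Jinja2 template, once per task and once with the full task list; `extra_docker_args` is simply passed through as an extra template variable. A toy render call showing the mechanism (the template string here is invented for illustration and is not the real run.template.sh):

import jinja2

toy_template = jinja2.Template(
    "docker run {{ extra_docker_args }} {{ task.eval_image }}\n"
)
print(toy_template.render(task={"eval_image": "example/image:latest"}, extra_docker_args="--rm"))
# -> docker run --rm example/image:latest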
@@ -185,6 +302,57 @@ class LocalExecutor(BaseExecutor):
             run_all_sequentially_sh_content
         )

+        if dry_run:
+            print(bold("\n\n=============================================\n\n"))
+            print(bold(cyan(f"DRY RUN: Scripts prepared and saved to {output_dir}")))
+            if is_execution_mode_sequential:
+                print(
+                    cyan(
+                        "\n\n=========== Main script | run_all.sequential.sh =====================\n\n"
+                    )
+                )
+
+                with open(output_dir / "run_all.sequential.sh", "r") as f:
+                    print(grey(f.read()))
+            else:
+                for idx, task in enumerate(cfg.evaluation.tasks):
+                    task_output_dir = output_dir / task.name
+                    print(
+                        cyan(
+                            f"\n\n=========== Task script | {task.name}/run.sh =====================\n\n"
+                        )
+                    )
+                    with open(task_output_dir / "run.sh", "r") as f:
+                        print(grey(f.read()))
+            print(bold("\nTo execute, run without --dry-run"))
+
+            if is_potentially_unsafe:
+                print(
+                    red(
+                        "\nFound `pre_cmd` which carries security risk. When running without --dry-run "
+                        "make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1"
+                    )
+                )
+            return invocation_id
+
+        if is_potentially_unsafe:
+            if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
+                logger.warning(
+                    "Found non-empty task commands (e.g. `pre_cmd`) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is set, proceeding with caution."
+                )
+
+            else:
+                logger.error(
+                    "Found non-empty task commands (e.g. `pre_cmd`) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is not set. This might carry security risk and unstable environments. "
+                    "To continue, make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1.",
+                )
+                raise AttributeError(
+                    "Untrusted command found in config, make sure you trust and "
+                    "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
+                )
+
         # Save launched jobs metadata
         db = ExecutionDB()
         for job_id, task, evaluation_task in zip(
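The new `is_potentially_unsafe` handling refuses to launch when a task injects extra shell (e.g. `pre_cmd`) unless the user exports NEMO_EVALUATOR_TRUST_PRE_CMD=1. A reduced sketch of that gate, assuming a hypothetical helper name (`check_pre_cmd_trust` is not part of the package):

import os


def check_pre_cmd_trust(is_potentially_unsafe: bool) -> None:
    # Nothing to do when no task carries user-supplied commands
    if not is_potentially_unsafe:
        return
    if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
        return  # explicit opt-in; the executor only logs a warning in this case
    raise AttributeError(
        "Untrusted command found in config, make sure you trust and "
        "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
    )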
@@ -198,74 +366,67 @@ class LocalExecutor(BaseExecutor):
                 executor="local",
                 data={
                     "output_dir": str(evaluation_task["output_dir"]),
-                    "container": evaluation_task["
+                    "container": evaluation_task["client_container_name"],
                     "eval_image": evaluation_task["eval_image"],
                 },
                 config=OmegaConf.to_object(cfg),
             )
         )

-        if dry_run:
-            print("\n\n=============================================\n\n")
-            print(f"DRY RUN: Scripts prepared and saved to {output_dir}")
-            if is_execution_mode_sequential:
-                print(
-                    "\n\n =========== Main script | run_all.sequential.sh ===================== \n\n"
-                )
-                with open(output_dir / "run_all.sequential.sh", "r") as f:
-                    print(f.read())
-            else:
-                for idx, task in enumerate(cfg.evaluation.tasks):
-                    task_output_dir = output_dir / task.name
-                    print(
-                        f"\n\n =========== Task script | {task.name}/run.sh ===================== \n\n"
-                    )
-                    with open(task_output_dir / "run.sh", "r") as f:
-                        print(f.read())
-            print("\nTo execute, run without --dry-run")
-            return invocation_id
-
         # Launch bash scripts with Popen for non-blocking execution.
         # To ensure subprocess continues after python exits:
         # - on Unix-like systems, to fully detach the subprocess
         #   so it does not die when Python exits, pass start_new_session=True;
-        # - on
+        # - on Windows use creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
         os_name = platform.system()
+        processes = []
+
         if is_execution_mode_sequential:
             if os_name == "Windows":
-                subprocess.Popen(
+                proc = subprocess.Popen(
                     shlex.split("bash run_all.sequential.sh"),
                     cwd=output_dir,
                     creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
                 )
             else:
-                subprocess.Popen(
+                proc = subprocess.Popen(
                     shlex.split("bash run_all.sequential.sh"),
                     cwd=output_dir,
                     start_new_session=True,
                 )
+            processes.append(("run_all.sequential.sh", proc, output_dir))
         else:
             for task in cfg.evaluation.tasks:
                 if os_name == "Windows":
-                    subprocess.Popen(
+                    proc = subprocess.Popen(
                         shlex.split("bash run.sh"),
                         cwd=output_dir / task.name,
                         creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
                     )
                 else:
-                    subprocess.Popen(
+                    proc = subprocess.Popen(
                         shlex.split("bash run.sh"),
                         cwd=output_dir / task.name,
                         start_new_session=True,
                     )
+                processes.append((task.name, proc, output_dir / task.name))
+
+        # Wait briefly and check if bash scripts exited immediately (which means error)
+        time.sleep(0.3)

-
+        for name, proc, work_dir in processes:
+            exit_code = proc.poll()
+            if exit_code is not None and exit_code != 0:
+                error_msg = f"Script for {name} exited with code {exit_code}"
+                raise RuntimeError(f"Job startup failed | {error_msg}")
+
+        print(bold(cyan("\nCommands for real-time monitoring:")))
         for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
-
-            print(f"
+            print(f"\n Job {job_id} ({evaluation_task['name']}):")
+            print(f" nemo-evaluator-launcher logs {job_id}")

-        print("\nFollow all logs for this invocation:")
-        print(f"
+        print(bold(cyan("\nFollow all logs for this invocation:")))
+        print(f" nemo-evaluator-launcher logs {invocation_id}")

         return invocation_id

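The launch path now keeps each Popen handle, waits roughly 0.3 s, and polls it so that a script which dies immediately is reported instead of silently producing no job. A minimal standalone sketch of that pattern (the command here is a stand-in; the executor itself runs `bash run.sh` in the task output directory):

import shlex
import subprocess
import time

proc = subprocess.Popen(
    shlex.split('bash -c "sleep 1"'),  # stand-in for "bash run.sh"
    start_new_session=True,  # detach so the child survives the parent exiting (POSIX)
)
time.sleep(0.3)  # give the script a moment to fail fast
exit_code = proc.poll()  # None means it is still running
if exit_code is not None and exit_code != 0:
    raise RuntimeError(f"Job startup failed | script exited with code {exit_code}")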
@@ -281,8 +442,8 @@ class LocalExecutor(BaseExecutor):
         """
         db = ExecutionDB()

-        # If id looks like an invocation_id (
-        if
+        # If id looks like an invocation_id (no dot), get all jobs for it
+        if "." not in id:
             jobs = db.get_jobs(id)
             statuses: List[ExecutionStatus] = []
             for job_id, _ in jobs.items():
@@ -390,10 +551,10 @@ class LocalExecutor(BaseExecutor):

     @staticmethod
     def kill_job(job_id: str) -> None:
-        """Kill a local job
+        """Kill a local job.

         Args:
-            job_id: The job ID to kill.
+            job_id: The job ID (e.g., abc123.0) to kill.

         Raises:
             ValueError: If job is not found or invalid.
@@ -438,15 +599,290 @@ class LocalExecutor(BaseExecutor):
         if result.returncode == 0:
             killed_something = True

-        #
+        # If we successfully killed something, mark as killed
         if killed_something:
             job_data.data["killed"] = True
             db.write_job(job_data)
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # If nothing was killed, check if this is a pending job
+        status_list = LocalExecutor.get_status(job_id)
+        if status_list and status_list[0].state == ExecutionState.PENDING:
+            # For pending jobs, mark as killed even though there's nothing to kill yet
+            job_data.data["killed"] = True
+            db.write_job(job_data)
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # Use common helper to get informative error message based on job status
+        current_status = status_list[0].state if status_list else None
+        error_msg = LocalExecutor.get_kill_failure_message(
+            job_id, f"container: {container_name}", current_status
+        )
+        raise RuntimeError(error_msg)
+
+    @staticmethod
+    def stream_logs(
+        id: Union[str, List[str]], executor_name: Optional[str] = None
+    ) -> Iterator[Tuple[str, str, str]]:
+        """Stream logs from a job or invocation group.
+
+        Args:
+            id: Unique job identifier, invocation identifier, or list of job IDs to stream simultaneously.
+
+        Yields:
+            Tuple[str, str, str]: Tuples of (job_id, task_name, log_line) for each log line.
+                Empty lines are yielded as empty strings.
+        """
+        db = ExecutionDB()
+
+        # Handle list of job IDs for simultaneous streaming
+        if isinstance(id, list):
+            # Collect all jobs from the list of job IDs
+            jobs = {}
+            for job_id in id:
+                job_data = db.get_job(job_id)
+                if job_data is None or job_data.executor != "local":
+                    continue
+                jobs[job_id] = job_data
+            if not jobs:
+                return
+        # If id looks like an invocation_id (no dot), get all jobs for it
+        elif "." not in id:
+            jobs = db.get_jobs(id)
+            if not jobs:
+                return
         else:
-
-
+            # Otherwise, treat as job_id
+            job_data = db.get_job(id)
+            if job_data is None or job_data.executor != "local":
+                return
+            jobs = {id: job_data}
+
+        # Collect log file paths and metadata
+        log_files = []
+
+        for job_id, job_data in jobs.items():
+            output_dir = pathlib.Path(job_data.data.get("output_dir", ""))
+            if not output_dir:
+                continue
+
+            # Get task name from config
+            task_name = LocalExecutor._extract_task_name(job_data, job_id)
+
+            log_file_path = output_dir / "logs" / "client_stdout.log"
+
+            log_files.append(
+                {
+                    "job_id": job_id,
+                    "task_name": task_name,
+                    "path": log_file_path,
+                    "file_handle": None,
+                    "position": 0,
+                }
             )

+        if not log_files:
+            return
+
+        # Track which files we've seen before (for tail behavior)
+        file_seen_before = {}
+
+        # Open files that exist, keep track of which ones we're waiting for
+        # First, yield the last 15 lines from existing files
+        for log_info in log_files:
+            if log_info["path"].exists():
+                file_seen_before[log_info["path"]] = True
+                # Read and yield last 15 lines
+                last_lines = LocalExecutor._read_last_n_lines(log_info["path"], 15)
+                for line in last_lines:
+                    yield (
+                        log_info["job_id"],
+                        log_info["task_name"],
+                        line,
+                    )
+                try:
+                    log_info["file_handle"] = open(
+                        log_info["path"], "r", encoding="utf-8", errors="replace"
+                    )
+                    # Seek to end if file already exists (tail behavior)
+                    log_info["file_handle"].seek(0, 2)
+                    log_info["position"] = log_info["file_handle"].tell()
+                except Exception as e:
+                    logger.error(f"Could not open {log_info['path']}: {e}")
+            else:
+                file_seen_before[log_info["path"]] = False
+
+        try:
+            while True:
+                any_activity = False
+
+                for log_info in log_files:
+                    # Try to open file if it doesn't exist yet
+                    if log_info["file_handle"] is None:
+                        if log_info["path"].exists():
+                            try:
+                                # If file was just created, read last 15 lines first
+                                if not file_seen_before.get(log_info["path"], False):
+                                    last_lines = LocalExecutor._read_last_n_lines(
+                                        log_info["path"], 15
+                                    )
+                                    for line in last_lines:
+                                        yield (
+                                            log_info["job_id"],
+                                            log_info["task_name"],
+                                            line,
+                                        )
+                                    file_seen_before[log_info["path"]] = True
+
+                                log_info["file_handle"] = open(
+                                    log_info["path"],
+                                    "r",
+                                    encoding="utf-8",
+                                    errors="replace",
+                                )
+                                # Seek to end for tail behavior
+                                log_info["file_handle"].seek(0, 2)
+                                log_info["position"] = log_info["file_handle"].tell()
+                            except Exception as e:
+                                logger.error(f"Could not open {log_info['path']}: {e}")
+                                continue
+
+                    # Read new lines from file
+                    if log_info["file_handle"] is not None:
+                        try:
+                            # Check if file has grown
+                            current_size = log_info["path"].stat().st_size
+                            if current_size > log_info["position"]:
+                                log_info["file_handle"].seek(log_info["position"])
+                                new_lines = log_info["file_handle"].readlines()
+                                log_info["position"] = log_info["file_handle"].tell()
+
+                                # Yield new lines
+                                for line in new_lines:
+                                    line_stripped = line.rstrip("\n\r")
+                                    yield (
+                                        log_info["job_id"],
+                                        log_info["task_name"],
+                                        line_stripped,
+                                    )
+                                any_activity = True
+                        except (OSError, IOError) as e:
+                            # File might have been deleted or moved
+                            # Don't log error for every check, only on first error
+                            if log_info.get("error_printed", False) is False:
+                                logger.error(f"Error reading {log_info['path']}: {e}")
+                                log_info["error_printed"] = True
+                            log_info["file_handle"] = None
+                        except Exception:
+                            # Reset error flag if we successfully read again
+                            log_info["error_printed"] = False
+
+                # If no activity, sleep briefly to avoid busy waiting
+                if not any_activity:
+                    time.sleep(0.1)
+
+        except KeyboardInterrupt:
+            # Clean exit on Ctrl+C
+            pass
+        finally:
+            # Close all file handles
+            for log_info in log_files:
+                if log_info["file_handle"] is not None:
+                    try:
+                        log_info["file_handle"].close()
+                    except Exception:
+                        pass
+
+    @staticmethod
+    def _read_last_n_lines(file_path: pathlib.Path, n: int) -> List[str]:
+        """Read the last N lines from a file efficiently.
+
+        Args:
+            file_path: Path to the file to read from.
+            n: Number of lines to read from the end.
+
+        Returns:
+            List of the last N lines (or fewer if file has fewer lines).
+        """
+        try:
+            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+                # Read all lines
+                all_lines = f.readlines()
+                # Return last n lines, stripping newlines
+                return [line.rstrip("\n\r") for line in all_lines[-n:]]
+        except Exception as e:
+            logger.warning(f"Could not read last {n} lines from {file_path}: {e}")
+            return []
+
+    @staticmethod
+    def _extract_task_name(job_data: JobData, job_id: str) -> str:
+        """Extract task name from job data config.
+
+        Args:
+            job_data: JobData object containing config.
+            job_id: Job ID for error reporting.
+
+        Returns:
+            Task name string.
+        """
+        config = job_data.config or {}
+        evaluation = config.get("evaluation", {})
+        tasks = evaluation.get("tasks", [])
+
+        # Find the task that matches this job
+        # For job_id like "15b9f667.0", index is 0
+        try:
+            if "." in job_id:
+                index = int(job_id.split(".")[1])
+                if len(tasks) > 0 and index >= len(tasks):
+                    raise AttributeError(
+                        f"Job task index {job_id} is larger than number of tasks {len(tasks)} in invocation"
+                    )
+                # If index is valid and tasks exist, return the task name
+                if len(tasks) > 0 and index < len(tasks):
+                    return tasks[index].get("name", "unknown")
+        except (ValueError, IndexError):
+            pass
+
+        # Fallback: try to get task name from output_dir
+        # output_dir typically ends with task name
+        output_dir = job_data.data.get("output_dir", "")
+        if output_dir:
+            parts = pathlib.Path(output_dir).parts
+            if parts:
+                return parts[-1]
+
+        return "unknown"
+
+    @staticmethod
+    def _add_to_killed_jobs(invocation_id: str, job_id: str) -> None:
+        """Add a job ID to the killed jobs file for this invocation.
+
+        Args:
+            invocation_id: The invocation ID.
+            job_id: The job ID to mark as killed.
+        """
+        db = ExecutionDB()
+        jobs = db.get_jobs(invocation_id)
+        if not jobs:
+            return
+
+        # Get invocation output directory from any job's output_dir
+        first_job_data = next(iter(jobs.values()))
+        job_output_dir = pathlib.Path(first_job_data.data.get("output_dir", ""))
+        if not job_output_dir.exists():
+            return
+
+        # Invocation dir is parent of job output dir
+        invocation_dir = job_output_dir.parent
+        killed_jobs_file = invocation_dir / "killed_jobs.txt"
+
+        # Append job_id to file
+        with open(killed_jobs_file, "a") as f:
+            f.write(f"{job_id}\n")
+

 def _get_progress(artifacts_dir: pathlib.Path) -> Optional[float]:
     """Get the progress of a local job.