nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nemo_evaluator_launcher/__init__.py +15 -1
- nemo_evaluator_launcher/api/functional.py +188 -27
- nemo_evaluator_launcher/api/types.py +9 -0
- nemo_evaluator_launcher/cli/export.py +131 -12
- nemo_evaluator_launcher/cli/info.py +477 -82
- nemo_evaluator_launcher/cli/kill.py +5 -3
- nemo_evaluator_launcher/cli/logs.py +102 -0
- nemo_evaluator_launcher/cli/ls_runs.py +31 -10
- nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
- nemo_evaluator_launcher/cli/main.py +101 -5
- nemo_evaluator_launcher/cli/run.py +153 -30
- nemo_evaluator_launcher/cli/status.py +49 -5
- nemo_evaluator_launcher/cli/version.py +26 -23
- nemo_evaluator_launcher/common/execdb.py +121 -27
- nemo_evaluator_launcher/common/helpers.py +213 -33
- nemo_evaluator_launcher/common/logging_utils.py +16 -5
- nemo_evaluator_launcher/common/printing_utils.py +100 -0
- nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
- nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
- nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
- nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
- nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
- nemo_evaluator_launcher/executors/base.py +54 -1
- nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
- nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
- nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
- nemo_evaluator_launcher/executors/local/executor.py +492 -56
- nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
- nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
- nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- nemo_evaluator_launcher/exporters/base.py +9 -0
- nemo_evaluator_launcher/exporters/gsheets.py +27 -9
- nemo_evaluator_launcher/exporters/local.py +30 -16
- nemo_evaluator_launcher/exporters/mlflow.py +245 -74
- nemo_evaluator_launcher/exporters/utils.py +139 -184
- nemo_evaluator_launcher/exporters/wandb.py +157 -43
- nemo_evaluator_launcher/package_info.py +6 -3
- nemo_evaluator_launcher/resources/mapping.toml +56 -15
- nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
- nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
- nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
- nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
|
@@ -19,6 +19,7 @@ Handles Lepton endpoint creation, management, and health checks.
|
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
21
|
import json
|
|
22
|
+
import shlex
|
|
22
23
|
import subprocess
|
|
23
24
|
import time
|
|
24
25
|
from pathlib import Path
|
|
@@ -27,6 +28,7 @@ from typing import Any, Dict, Optional
|
|
|
27
28
|
# Import lepton dependencies
|
|
28
29
|
from omegaconf import DictConfig
|
|
29
30
|
|
|
31
|
+
from nemo_evaluator_launcher.common.helpers import _str_to_echo_command
|
|
30
32
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
31
33
|
|
|
32
34
|
|
|
@@ -235,6 +237,8 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, An
|
|
|
235
237
|
Returns:
|
|
236
238
|
Container specification for Lepton.
|
|
237
239
|
"""
|
|
240
|
+
# Extract pre_cmd from deployment_cfg
|
|
241
|
+
pre_cmd: str = deployment_cfg.get("pre_cmd") or ""
|
|
238
242
|
container_spec = {
|
|
239
243
|
"image": deployment_cfg.image,
|
|
240
244
|
"ports": [{"container_port": deployment_cfg.port}],
|
|
@@ -258,6 +262,18 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, An
|
|
|
258
262
|
if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
|
|
259
263
|
command_parts.extend(deployment_cfg.extra_args.split())
|
|
260
264
|
|
|
265
|
+
# Wrap with pre_cmd if provided
|
|
266
|
+
if pre_cmd:
|
|
267
|
+
create_pre_script_cmd = _str_to_echo_command(
|
|
268
|
+
pre_cmd, filename="deployment_pre_cmd.sh"
|
|
269
|
+
)
|
|
270
|
+
original_cmd = " ".join(shlex.quote(str(c)) for c in command_parts)
|
|
271
|
+
command_parts = [
|
|
272
|
+
"/bin/bash",
|
|
273
|
+
"-c",
|
|
274
|
+
f"{create_pre_script_cmd.cmd} && source deployment_pre_cmd.sh && exec {original_cmd}",
|
|
275
|
+
]
|
|
276
|
+
|
|
261
277
|
container_spec["command"] = command_parts
|
|
262
278
|
|
|
263
279
|
elif deployment_cfg.type == "sglang":
|
|
@@ -278,12 +294,31 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, An
|
|
|
278
294
|
if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
|
|
279
295
|
command_parts.extend(deployment_cfg.extra_args.split())
|
|
280
296
|
|
|
297
|
+
# Wrap with pre_cmd if provided
|
|
298
|
+
if pre_cmd:
|
|
299
|
+
create_pre_script_cmd = _str_to_echo_command(
|
|
300
|
+
pre_cmd, filename="deployment_pre_cmd.sh"
|
|
301
|
+
)
|
|
302
|
+
original_cmd = " ".join(shlex.quote(str(c)) for c in command_parts)
|
|
303
|
+
command_parts = [
|
|
304
|
+
"/bin/bash",
|
|
305
|
+
"-c",
|
|
306
|
+
f"{create_pre_script_cmd.cmd} && source deployment_pre_cmd.sh && exec {original_cmd}",
|
|
307
|
+
]
|
|
308
|
+
|
|
281
309
|
container_spec["command"] = command_parts
|
|
282
310
|
|
|
283
311
|
elif deployment_cfg.type == "nim":
|
|
284
312
|
# NIM containers use their default entrypoint - no custom command needed
|
|
285
313
|
# Configuration is handled via environment variables
|
|
286
|
-
|
|
314
|
+
# pre_cmd is not supported for NIM deployments
|
|
315
|
+
if pre_cmd:
|
|
316
|
+
logger.error(
|
|
317
|
+
"pre_cmd is not supported for NIM deployments",
|
|
318
|
+
deployment_type="nim",
|
|
319
|
+
pre_cmd=pre_cmd,
|
|
320
|
+
)
|
|
321
|
+
raise ValueError("pre_cmd is not supported for NIM deployments")
|
|
287
322
|
|
|
288
323
|
return container_spec
|
|
289
324
|
|
|
@@ -428,14 +463,34 @@ def create_lepton_endpoint(cfg: DictConfig, endpoint_name: str) -> bool:
|
|
|
428
463
|
print(f"✅ Successfully created Lepton endpoint: {endpoint_name}")
|
|
429
464
|
return True
|
|
430
465
|
else:
|
|
431
|
-
|
|
466
|
+
error_msg = result.stderr.strip() if result.stderr else ""
|
|
467
|
+
output_msg = result.stdout.strip() if result.stdout else ""
|
|
468
|
+
print(
|
|
469
|
+
f"✗ Failed to create Lepton endpoint | Endpoint: {endpoint_name} | Return code: {result.returncode}"
|
|
470
|
+
)
|
|
471
|
+
if error_msg:
|
|
472
|
+
print(f" stderr: {error_msg}")
|
|
473
|
+
if output_msg:
|
|
474
|
+
print(f" stdout: {output_msg}")
|
|
432
475
|
return False
|
|
433
476
|
|
|
434
|
-
except subprocess.TimeoutExpired:
|
|
435
|
-
print(
|
|
477
|
+
except subprocess.TimeoutExpired as e:
|
|
478
|
+
print(
|
|
479
|
+
f"✗ Timeout creating Lepton endpoint | Endpoint: {endpoint_name} | Timeout: 300s"
|
|
480
|
+
)
|
|
481
|
+
if hasattr(e, "stderr") and e.stderr:
|
|
482
|
+
print(f" stderr: {e.stderr}")
|
|
483
|
+
if hasattr(e, "stdout") and e.stdout:
|
|
484
|
+
print(f" stdout: {e.stdout}")
|
|
436
485
|
return False
|
|
437
486
|
except subprocess.CalledProcessError as e:
|
|
438
|
-
print(
|
|
487
|
+
print(
|
|
488
|
+
f"✗ Error creating Lepton endpoint | Endpoint: {endpoint_name} | Error: {e}"
|
|
489
|
+
)
|
|
490
|
+
if hasattr(e, "stderr") and e.stderr:
|
|
491
|
+
print(f" stderr: {e.stderr}")
|
|
492
|
+
if hasattr(e, "stdout") and e.stdout:
|
|
493
|
+
print(f" stdout: {e.stdout}")
|
|
439
494
|
return False
|
|
440
495
|
finally:
|
|
441
496
|
# Clean up temporary file
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
Handles deployment and evaluation using Lepton endpoints with NIM containers.
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
|
+
import os
|
|
21
22
|
import time
|
|
22
23
|
from pathlib import Path
|
|
23
24
|
from typing import List
|
|
@@ -36,6 +37,7 @@ from nemo_evaluator_launcher.common.mapping import (
|
|
|
36
37
|
get_task_from_mapping,
|
|
37
38
|
load_tasks_mapping,
|
|
38
39
|
)
|
|
40
|
+
from nemo_evaluator_launcher.common.printing_utils import red
|
|
39
41
|
from nemo_evaluator_launcher.executors.base import (
|
|
40
42
|
BaseExecutor,
|
|
41
43
|
ExecutionState,
|
|
@@ -78,9 +80,75 @@ class LeptonExecutor(BaseExecutor):
|
|
|
78
80
|
"LeptonExecutor supports deployment types: 'vllm', 'sglang', 'nim', 'none'"
|
|
79
81
|
)
|
|
80
82
|
|
|
83
|
+
# Load tasks mapping
|
|
84
|
+
tasks_mapping = load_tasks_mapping()
|
|
85
|
+
job_ids = []
|
|
86
|
+
lepton_job_names = []
|
|
87
|
+
endpoint_names = [] # Track multiple endpoints
|
|
88
|
+
db = ExecutionDB()
|
|
89
|
+
|
|
81
90
|
# Generate invocation ID
|
|
82
91
|
invocation_id = generate_invocation_id()
|
|
83
92
|
|
|
93
|
+
# TODO(agronskiy): the structure of this executor differs from others,
|
|
94
|
+
# so the best place to check for unsafe commands yields a bit of duplication.
|
|
95
|
+
# We can't use the get_eval_factory_command here because the port is not yet
|
|
96
|
+
# populated.
|
|
97
|
+
# Refactor the whole thing.
|
|
98
|
+
is_potentially_unsafe = False
|
|
99
|
+
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
100
|
+
pre_cmd: str = task.get("pre_cmd") or cfg.evaluation.get("pre_cmd") or ""
|
|
101
|
+
if pre_cmd:
|
|
102
|
+
is_potentially_unsafe = True
|
|
103
|
+
break
|
|
104
|
+
|
|
105
|
+
# Check for deployment pre_cmd
|
|
106
|
+
deployment_pre_cmd: str = cfg.deployment.get("pre_cmd") or ""
|
|
107
|
+
if deployment_pre_cmd:
|
|
108
|
+
is_potentially_unsafe = True
|
|
109
|
+
|
|
110
|
+
# DRY-RUN mode
|
|
111
|
+
if dry_run:
|
|
112
|
+
output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
|
|
113
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
114
|
+
|
|
115
|
+
# Validate configuration
|
|
116
|
+
_dry_run_lepton(cfg, tasks_mapping, invocation_id=invocation_id)
|
|
117
|
+
|
|
118
|
+
if cfg.deployment.type == "none":
|
|
119
|
+
print("Using existing endpoint (deployment: none)")
|
|
120
|
+
print("using shared endpoint")
|
|
121
|
+
else:
|
|
122
|
+
print(f"with endpoint type '{cfg.deployment.type}'")
|
|
123
|
+
|
|
124
|
+
if is_potentially_unsafe:
|
|
125
|
+
print(
|
|
126
|
+
red(
|
|
127
|
+
"\nFound `pre_cmd` (evaluation or deployment) which carries security risk. When running without --dry-run "
|
|
128
|
+
"make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1"
|
|
129
|
+
)
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
return invocation_id
|
|
133
|
+
|
|
134
|
+
if is_potentially_unsafe:
|
|
135
|
+
if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
|
|
136
|
+
logger.warning(
|
|
137
|
+
"Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
|
|
138
|
+
"is set, proceeding with caution."
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
else:
|
|
142
|
+
logger.error(
|
|
143
|
+
"Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
|
|
144
|
+
"is not set. This might carry security risk and unstable environments. "
|
|
145
|
+
"To continue, make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1.",
|
|
146
|
+
)
|
|
147
|
+
raise AttributeError(
|
|
148
|
+
"Untrusted command found in config, make sure you trust and "
|
|
149
|
+
"set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
|
|
150
|
+
)
|
|
151
|
+
|
|
84
152
|
# For deployment: none, we use the existing endpoint for all tasks
|
|
85
153
|
if cfg.deployment.type == "none":
|
|
86
154
|
print("📌 Using existing endpoint (deployment: none)")
|
|
@@ -88,13 +156,6 @@ class LeptonExecutor(BaseExecutor):
|
|
|
88
156
|
print(f"✅ Using shared endpoint: {shared_endpoint_url}")
|
|
89
157
|
|
|
90
158
|
try:
|
|
91
|
-
# Load tasks mapping
|
|
92
|
-
tasks_mapping = load_tasks_mapping()
|
|
93
|
-
job_ids = []
|
|
94
|
-
lepton_job_names = []
|
|
95
|
-
endpoint_names = [] # Track multiple endpoints
|
|
96
|
-
db = ExecutionDB()
|
|
97
|
-
|
|
98
159
|
# Create local directory for outputs
|
|
99
160
|
output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
|
|
100
161
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -139,8 +200,13 @@ class LeptonExecutor(BaseExecutor):
|
|
|
139
200
|
task_index = str(idx)
|
|
140
201
|
endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
|
|
141
202
|
|
|
142
|
-
# Ensure we don't exceed 36 character limit
|
|
143
203
|
if len(endpoint_name) > 36:
|
|
204
|
+
logger.info(
|
|
205
|
+
"Lepton endpoint name will be deployed under name {task_name}",
|
|
206
|
+
task_name=task.name,
|
|
207
|
+
original=endpoint_name,
|
|
208
|
+
limit=36,
|
|
209
|
+
)
|
|
144
210
|
# Truncate task name further if needed
|
|
145
211
|
max_task_len = (
|
|
146
212
|
36
|
|
@@ -151,7 +217,19 @@ class LeptonExecutor(BaseExecutor):
|
|
|
151
217
|
) # 3 hyphens
|
|
152
218
|
short_task_name = sanitized_task_name[:max_task_len]
|
|
153
219
|
endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
|
|
220
|
+
logger.info(
|
|
221
|
+
"Lepton endpoint name is auto-generated",
|
|
222
|
+
task_name=task.name,
|
|
223
|
+
original=endpoint_name,
|
|
224
|
+
truncated=endpoint_name,
|
|
225
|
+
limit=36,
|
|
226
|
+
)
|
|
154
227
|
|
|
228
|
+
logger.info(
|
|
229
|
+
"Lepton endpoint name (auto-generated)",
|
|
230
|
+
task_name=task.name,
|
|
231
|
+
endpoint_name=endpoint_name,
|
|
232
|
+
)
|
|
155
233
|
endpoint_names.append(endpoint_name)
|
|
156
234
|
endpoint_creation_tasks.append((idx, task, endpoint_name))
|
|
157
235
|
|
|
@@ -298,20 +376,6 @@ class LeptonExecutor(BaseExecutor):
|
|
|
298
376
|
f"✅ All {len(cfg.evaluation.tasks)} endpoints created successfully!"
|
|
299
377
|
)
|
|
300
378
|
|
|
301
|
-
if dry_run:
|
|
302
|
-
print("🔍 DRY RUN: Lepton job configurations prepared")
|
|
303
|
-
print(f" - Tasks: {len(cfg.evaluation.tasks)}")
|
|
304
|
-
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
305
|
-
if cfg.deployment.type == "none":
|
|
306
|
-
print(f" - Task {idx}: {task.name} using shared endpoint")
|
|
307
|
-
else:
|
|
308
|
-
print(
|
|
309
|
-
f" - Task {idx}: {task.name} with endpoint {endpoint_names[idx]}"
|
|
310
|
-
)
|
|
311
|
-
print(f" - Output directory: {output_dir}")
|
|
312
|
-
print("\nTo submit jobs, run the executor without --dry-run")
|
|
313
|
-
return invocation_id
|
|
314
|
-
|
|
315
379
|
# ================================================================
|
|
316
380
|
# JOB SUBMISSION (Sequential, as before)
|
|
317
381
|
# ================================================================
|
|
@@ -334,8 +398,18 @@ class LeptonExecutor(BaseExecutor):
|
|
|
334
398
|
max_base_length = 36 - 1 - len(suffix) # -1 for the hyphen
|
|
335
399
|
if len(base_job_name) > max_base_length:
|
|
336
400
|
base_job_name = base_job_name[:max_base_length]
|
|
401
|
+
logger.info(
|
|
402
|
+
"Lepton job auto-generated name",
|
|
403
|
+
task_name=task.name,
|
|
404
|
+
job_name=f"{base_job_name}-{suffix}",
|
|
405
|
+
)
|
|
337
406
|
|
|
338
407
|
lepton_job_name = f"{base_job_name}-{suffix}"
|
|
408
|
+
logger.info(
|
|
409
|
+
"Lepton job name (auto-generated)",
|
|
410
|
+
task_name=task.name,
|
|
411
|
+
job_name=lepton_job_name,
|
|
412
|
+
)
|
|
339
413
|
job_ids.append(job_id)
|
|
340
414
|
lepton_job_names.append(lepton_job_name)
|
|
341
415
|
|
|
@@ -377,7 +451,12 @@ class LeptonExecutor(BaseExecutor):
|
|
|
377
451
|
cfg.target.api_endpoint.url = full_endpoint_url
|
|
378
452
|
|
|
379
453
|
# Generate command with the correct endpoint URL
|
|
380
|
-
|
|
454
|
+
eval_command_struct = get_eval_factory_command(
|
|
455
|
+
cfg, task, task_definition
|
|
456
|
+
)
|
|
457
|
+
eval_command = eval_command_struct.cmd
|
|
458
|
+
# Debug string for explainability of some base64-parts of the command
|
|
459
|
+
eval_command_debug_comment = eval_command_struct.debug
|
|
381
460
|
|
|
382
461
|
finally:
|
|
383
462
|
# Restore original URL and struct mode
|
|
@@ -402,6 +481,7 @@ class LeptonExecutor(BaseExecutor):
|
|
|
402
481
|
task_name=task.name,
|
|
403
482
|
invocation_id=invocation_id,
|
|
404
483
|
eval_command=eval_command, # Pass the fixed command
|
|
484
|
+
eval_command_debug_comment=eval_command_debug_comment,
|
|
405
485
|
)
|
|
406
486
|
|
|
407
487
|
# Prepare job command to run the launch script
|
|
@@ -456,6 +536,33 @@ class LeptonExecutor(BaseExecutor):
|
|
|
456
536
|
|
|
457
537
|
job_mounts.append(mount_dict)
|
|
458
538
|
|
|
539
|
+
# Handle dataset directory mounting if NEMO_EVALUATOR_DATASET_DIR is required
|
|
540
|
+
if "NEMO_EVALUATOR_DATASET_DIR" in task_definition.get(
|
|
541
|
+
"required_env_vars", []
|
|
542
|
+
):
|
|
543
|
+
# Get dataset directory from task config
|
|
544
|
+
if "dataset_dir" in task:
|
|
545
|
+
dataset_mount_host = task["dataset_dir"]
|
|
546
|
+
else:
|
|
547
|
+
raise ValueError(
|
|
548
|
+
f"{task.name} task requires a dataset_dir to be specified. "
|
|
549
|
+
f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
|
|
550
|
+
)
|
|
551
|
+
# Get container mount path (default to /datasets if not specified)
|
|
552
|
+
dataset_mount_container = task.get(
|
|
553
|
+
"dataset_mount_path", "/datasets"
|
|
554
|
+
)
|
|
555
|
+
# Add dataset mount to job mounts
|
|
556
|
+
# Lepton mount format: {"path": "/path/in/container", "mount_from": {"path": "/host/path"}}
|
|
557
|
+
job_mounts.append(
|
|
558
|
+
{
|
|
559
|
+
"path": dataset_mount_container,
|
|
560
|
+
"mount_from": {"path": dataset_mount_host},
|
|
561
|
+
}
|
|
562
|
+
)
|
|
563
|
+
# Add NEMO_EVALUATOR_DATASET_DIR environment variable
|
|
564
|
+
job_env_vars["NEMO_EVALUATOR_DATASET_DIR"] = dataset_mount_container
|
|
565
|
+
|
|
459
566
|
print(
|
|
460
567
|
f" - Storage: {len(job_mounts)} mount(s) with evaluation ID isolation"
|
|
461
568
|
)
|
|
@@ -482,7 +589,8 @@ class LeptonExecutor(BaseExecutor):
|
|
|
482
589
|
|
|
483
590
|
if not job_success:
|
|
484
591
|
raise RuntimeError(
|
|
485
|
-
f"Failed to submit Lepton job
|
|
592
|
+
f"Failed to submit Lepton job | Task: {task.name} | Job ID: {job_id} | "
|
|
593
|
+
f"Lepton job name: {lepton_job_name} | Error: {error_msg}"
|
|
486
594
|
)
|
|
487
595
|
|
|
488
596
|
# Store job metadata in database (with task-specific endpoint info)
|
|
@@ -504,8 +612,6 @@ class LeptonExecutor(BaseExecutor):
|
|
|
504
612
|
)
|
|
505
613
|
)
|
|
506
614
|
|
|
507
|
-
print(f"✅ Task {task.name}: Submitted evaluation job {job_id}")
|
|
508
|
-
|
|
509
615
|
# Jobs submitted successfully - return immediately (non-blocking)
|
|
510
616
|
print(
|
|
511
617
|
f"\n✅ Successfully submitted {len(lepton_job_names)} evaluation jobs to Lepton"
|
|
@@ -536,9 +642,8 @@ class LeptonExecutor(BaseExecutor):
|
|
|
536
642
|
|
|
537
643
|
return invocation_id
|
|
538
644
|
|
|
539
|
-
except Exception
|
|
645
|
+
except Exception:
|
|
540
646
|
# Clean up any created endpoints on failure
|
|
541
|
-
print(f"❌ Error during evaluation: {e}")
|
|
542
647
|
if cfg.deployment.type != "none" and "endpoint_names" in locals():
|
|
543
648
|
for endpoint_name in endpoint_names:
|
|
544
649
|
if endpoint_name:
|
|
@@ -559,7 +664,7 @@ class LeptonExecutor(BaseExecutor):
|
|
|
559
664
|
db = ExecutionDB()
|
|
560
665
|
|
|
561
666
|
# If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
|
|
562
|
-
if
|
|
667
|
+
if "." not in id:
|
|
563
668
|
return _get_statuses_for_invocation_id(id=id, db=db)
|
|
564
669
|
# Otherwise, treat as job_id
|
|
565
670
|
job_data = db.get_job(id)
|
|
@@ -577,7 +682,7 @@ class LeptonExecutor(BaseExecutor):
|
|
|
577
682
|
job_state = lepton_status.get("state", "Unknown")
|
|
578
683
|
|
|
579
684
|
# Map Lepton job states to our execution states
|
|
580
|
-
if job_state
|
|
685
|
+
if job_state in ["Succeeded", "Completed"]:
|
|
581
686
|
state = ExecutionState.SUCCESS
|
|
582
687
|
elif job_state in ["Running", "Pending", "Starting"]:
|
|
583
688
|
state = ExecutionState.RUNNING
|
|
@@ -624,76 +729,14 @@ class LeptonExecutor(BaseExecutor):
|
|
|
624
729
|
def kill_job(job_id: str) -> None:
|
|
625
730
|
"""Kill Lepton evaluation jobs and clean up endpoints.
|
|
626
731
|
|
|
627
|
-
For invocation IDs, this will kill all jobs and clean up all
|
|
628
|
-
dedicated endpoints created for the invocation.
|
|
629
|
-
|
|
630
732
|
Args:
|
|
631
|
-
job_id: The job ID
|
|
733
|
+
job_id: The job ID to kill.
|
|
632
734
|
|
|
633
735
|
Raises:
|
|
634
736
|
ValueError: If job is not found or invalid.
|
|
635
737
|
RuntimeError: If job cannot be killed.
|
|
636
738
|
"""
|
|
637
739
|
db = ExecutionDB()
|
|
638
|
-
|
|
639
|
-
# If it looks like an invocation_id, kill all jobs for that invocation
|
|
640
|
-
if len(job_id) == 8 and "." not in job_id:
|
|
641
|
-
jobs = db.get_jobs(job_id)
|
|
642
|
-
if not jobs:
|
|
643
|
-
raise ValueError(f"No jobs found for invocation {job_id}")
|
|
644
|
-
|
|
645
|
-
endpoint_names = (
|
|
646
|
-
set()
|
|
647
|
-
) # Use set to avoid duplicates (though each should be unique)
|
|
648
|
-
lepton_job_names = []
|
|
649
|
-
|
|
650
|
-
# Collect all Lepton jobs and endpoint info
|
|
651
|
-
for curr_job_data in jobs.values():
|
|
652
|
-
if curr_job_data.executor != "lepton":
|
|
653
|
-
continue
|
|
654
|
-
|
|
655
|
-
# Collect endpoint name for this job (each task may have its own)
|
|
656
|
-
endpoint_name = curr_job_data.data.get("endpoint_name")
|
|
657
|
-
if endpoint_name:
|
|
658
|
-
endpoint_names.add(endpoint_name)
|
|
659
|
-
|
|
660
|
-
lepton_job_name = curr_job_data.data.get("lepton_job_name")
|
|
661
|
-
if lepton_job_name:
|
|
662
|
-
lepton_job_names.append(lepton_job_name)
|
|
663
|
-
|
|
664
|
-
# Mark job as killed in database
|
|
665
|
-
curr_job_data.data["status"] = "killed"
|
|
666
|
-
curr_job_data.data["killed_time"] = time.time()
|
|
667
|
-
db.write_job(curr_job_data)
|
|
668
|
-
|
|
669
|
-
print(
|
|
670
|
-
f"🛑 Killing {len(lepton_job_names)} Lepton jobs for invocation {job_id}"
|
|
671
|
-
)
|
|
672
|
-
|
|
673
|
-
# Cancel all Lepton jobs
|
|
674
|
-
for lepton_job_name in lepton_job_names:
|
|
675
|
-
success = delete_lepton_job(lepton_job_name)
|
|
676
|
-
if success:
|
|
677
|
-
print(f"✅ Cancelled Lepton job: {lepton_job_name}")
|
|
678
|
-
else:
|
|
679
|
-
print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
|
|
680
|
-
|
|
681
|
-
# Clean up all dedicated endpoints
|
|
682
|
-
if endpoint_names:
|
|
683
|
-
print(f"🧹 Cleaning up {len(endpoint_names)} dedicated endpoints")
|
|
684
|
-
for endpoint_name in endpoint_names:
|
|
685
|
-
success = delete_lepton_endpoint(endpoint_name)
|
|
686
|
-
if success:
|
|
687
|
-
print(f"✅ Cleaned up endpoint: {endpoint_name}")
|
|
688
|
-
else:
|
|
689
|
-
print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
|
|
690
|
-
else:
|
|
691
|
-
print("📌 No dedicated endpoints to clean up (using shared endpoint)")
|
|
692
|
-
|
|
693
|
-
print(f"🛑 Killed all resources for invocation {job_id}")
|
|
694
|
-
return
|
|
695
|
-
|
|
696
|
-
# Otherwise, treat as individual job_id
|
|
697
740
|
job_data = db.get_job(job_id)
|
|
698
741
|
if job_data is None:
|
|
699
742
|
raise ValueError(f"Job {job_id} not found")
|
|
@@ -705,17 +748,25 @@ class LeptonExecutor(BaseExecutor):
|
|
|
705
748
|
|
|
706
749
|
# Cancel the specific Lepton job
|
|
707
750
|
lepton_job_name = job_data.data.get("lepton_job_name")
|
|
751
|
+
|
|
708
752
|
if lepton_job_name:
|
|
709
|
-
|
|
710
|
-
if
|
|
753
|
+
cancel_success = delete_lepton_job(lepton_job_name)
|
|
754
|
+
if cancel_success:
|
|
711
755
|
print(f"✅ Cancelled Lepton job: {lepton_job_name}")
|
|
756
|
+
# Mark job as killed in database
|
|
757
|
+
job_data.data["status"] = "killed"
|
|
758
|
+
job_data.data["killed_time"] = time.time()
|
|
759
|
+
db.write_job(job_data)
|
|
712
760
|
else:
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
761
|
+
# Use common helper to get informative error message based on job status
|
|
762
|
+
status_list = LeptonExecutor.get_status(job_id)
|
|
763
|
+
current_status = status_list[0].state if status_list else None
|
|
764
|
+
error_msg = LeptonExecutor.get_kill_failure_message(
|
|
765
|
+
job_id, f"lepton_job: {lepton_job_name}", current_status
|
|
766
|
+
)
|
|
767
|
+
raise RuntimeError(error_msg)
|
|
768
|
+
else:
|
|
769
|
+
raise ValueError(f"No Lepton job name found for job {job_id}")
|
|
719
770
|
|
|
720
771
|
print(f"🛑 Killed Lepton job {job_id}")
|
|
721
772
|
|
|
@@ -761,6 +812,7 @@ def _create_evaluation_launch_script(
|
|
|
761
812
|
task_name: str,
|
|
762
813
|
invocation_id: str,
|
|
763
814
|
eval_command: str,
|
|
815
|
+
eval_command_debug_comment: str,
|
|
764
816
|
) -> str:
|
|
765
817
|
"""Create bash script for running evaluation in Lepton job container.
|
|
766
818
|
|
|
@@ -774,6 +826,7 @@ def _create_evaluation_launch_script(
|
|
|
774
826
|
task_name: Name of the evaluation task.
|
|
775
827
|
invocation_id: Unique invocation identifier.
|
|
776
828
|
eval_command: The evaluation command with correct endpoint URL.
|
|
829
|
+
eval_command_debug_comment: Debug comment to place into the script for easier debugging.
|
|
777
830
|
|
|
778
831
|
Returns:
|
|
779
832
|
String containing the bash launch script.
|
|
@@ -806,6 +859,8 @@ echo "Invocation ID: {invocation_id}"
|
|
|
806
859
|
echo "Endpoint URL: {endpoint_url}"
|
|
807
860
|
echo "Command: {eval_command_modified}"
|
|
808
861
|
|
|
862
|
+
{eval_command_debug_comment}
|
|
863
|
+
|
|
809
864
|
# Execute the evaluation with proper error handling
|
|
810
865
|
set +e
|
|
811
866
|
{eval_command_modified}
|
|
@@ -829,6 +884,90 @@ exit 0
|
|
|
829
884
|
return script
|
|
830
885
|
|
|
831
886
|
|
|
887
|
+
def _dry_run_lepton(
|
|
888
|
+
cfg: DictConfig, tasks_mapping: dict, invocation_id: str | None = None
|
|
889
|
+
) -> None:
|
|
890
|
+
print("DRY RUN: Lepton job configurations prepared")
|
|
891
|
+
try:
|
|
892
|
+
# validate tasks
|
|
893
|
+
for task in cfg.evaluation.tasks:
|
|
894
|
+
get_task_from_mapping(task.name, tasks_mapping)
|
|
895
|
+
|
|
896
|
+
# nice-to-have checks (existing endpoint URL or endpoints mapping)
|
|
897
|
+
if getattr(cfg.deployment, "type", None) == "none":
|
|
898
|
+
tgt = getattr(cfg, "target", {})
|
|
899
|
+
api = (
|
|
900
|
+
tgt.get("api_endpoint")
|
|
901
|
+
if isinstance(tgt, dict)
|
|
902
|
+
else getattr(tgt, "api_endpoint", None)
|
|
903
|
+
) or {}
|
|
904
|
+
url = api.get("url") if isinstance(api, dict) else getattr(api, "url", None)
|
|
905
|
+
if not url or not str(url).strip():
|
|
906
|
+
raise ValueError(
|
|
907
|
+
"target.api_endpoint.url must be set when deployment.type == 'none'"
|
|
908
|
+
)
|
|
909
|
+
else:
|
|
910
|
+
endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
|
|
911
|
+
for task in cfg.evaluation.tasks:
|
|
912
|
+
td = get_task_from_mapping(task.name, tasks_mapping)
|
|
913
|
+
etype = td.get("endpoint_type")
|
|
914
|
+
if etype not in endpoints_cfg:
|
|
915
|
+
raise ValueError(
|
|
916
|
+
f"deployment.endpoints missing path for endpoint_type '{etype}' (task '{task.name}')"
|
|
917
|
+
)
|
|
918
|
+
path = endpoints_cfg.get(etype)
|
|
919
|
+
if not isinstance(path, str) or not path.startswith("/"):
|
|
920
|
+
raise ValueError(
|
|
921
|
+
f"deployment.endpoints['{etype}'] must be a non-empty path starting with '/'"
|
|
922
|
+
)
|
|
923
|
+
|
|
924
|
+
# lepton env var presence (reference-level)
|
|
925
|
+
tasks_cfg = getattr(cfg.execution, "lepton_platform", {}).get("tasks", {}) or {}
|
|
926
|
+
lepton_env_vars = tasks_cfg.get("env_vars", {}) or {}
|
|
927
|
+
api_key_name = getattr(
|
|
928
|
+
getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
|
|
929
|
+
)
|
|
930
|
+
for task in cfg.evaluation.tasks:
|
|
931
|
+
td = get_task_from_mapping(task.name, tasks_mapping)
|
|
932
|
+
required = td.get("required_env_vars", []) or []
|
|
933
|
+
for var in required:
|
|
934
|
+
# Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic
|
|
935
|
+
if var == "NEMO_EVALUATOR_DATASET_DIR":
|
|
936
|
+
if "dataset_dir" not in task:
|
|
937
|
+
raise ValueError(
|
|
938
|
+
f"Task '{task.name}' requires dataset_dir to be specified. "
|
|
939
|
+
f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
|
|
940
|
+
)
|
|
941
|
+
continue
|
|
942
|
+
if var == "API_KEY":
|
|
943
|
+
if not (("API_KEY" in lepton_env_vars) or bool(api_key_name)):
|
|
944
|
+
raise ValueError(
|
|
945
|
+
f"Task '{task.name}' requires API_KEY: set execution.lepton_platform.tasks.env_vars.API_KEY "
|
|
946
|
+
"or target.api_endpoint.api_key_name"
|
|
947
|
+
)
|
|
948
|
+
else:
|
|
949
|
+
if var not in lepton_env_vars:
|
|
950
|
+
raise ValueError(
|
|
951
|
+
f"Task '{task.name}' requires {var}: set it under execution.lepton_platform.tasks.env_vars"
|
|
952
|
+
)
|
|
953
|
+
|
|
954
|
+
# success (use realized output directory if invocation_id is available)
|
|
955
|
+
preview_output_dir = (
|
|
956
|
+
Path(cfg.execution.output_dir).absolute() / invocation_id
|
|
957
|
+
if invocation_id
|
|
958
|
+
else Path(cfg.execution.output_dir).absolute() / "<invocation_id>"
|
|
959
|
+
)
|
|
960
|
+
print(f" - Tasks: {len(cfg.evaluation.tasks)}")
|
|
961
|
+
for idx, task in enumerate(cfg.evaluation.tasks):
|
|
962
|
+
print(f" - Task {idx}: {task.name}")
|
|
963
|
+
print(f" - Output directory: {preview_output_dir}")
|
|
964
|
+
print("\nTo run evaluation, execute run command without --dry-run")
|
|
965
|
+
except Exception as e:
|
|
966
|
+
print(f"❌ Configuration invalid: {e}")
|
|
967
|
+
logger.error("Lepton dry-run validation failed", error=str(e))
|
|
968
|
+
return
|
|
969
|
+
|
|
970
|
+
|
|
832
971
|
def _get_statuses_for_invocation_id(id: str, db: ExecutionDB) -> List[ExecutionStatus]:
|
|
833
972
|
"""Helper method that returns statuses if id is the invocation id"""
|
|
834
973
|
jobs = db.get_jobs(id)
|
|
@@ -23,13 +23,6 @@ import subprocess
|
|
|
23
23
|
import time
|
|
24
24
|
from typing import Any, List, Union
|
|
25
25
|
|
|
26
|
-
from leptonai.api.v1.types.affinity import LeptonResourceAffinity
|
|
27
|
-
from leptonai.api.v1.types.common import LeptonVisibility, Metadata
|
|
28
|
-
from leptonai.api.v1.types.deployment import EnvVar, LeptonContainer, Mount
|
|
29
|
-
from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
|
|
30
|
-
|
|
31
|
-
# Import lepton dependencies
|
|
32
|
-
from leptonai.api.v2.client import APIClient
|
|
33
26
|
from omegaconf import DictConfig
|
|
34
27
|
|
|
35
28
|
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
@@ -92,6 +85,18 @@ def _create_lepton_job_api(
|
|
|
92
85
|
) -> tuple[bool, str]:
|
|
93
86
|
"""Create Lepton job using API client (preferred method)."""
|
|
94
87
|
try:
|
|
88
|
+
# Import leptonai dependencies locally
|
|
89
|
+
from leptonai.api.v1.types.affinity import LeptonResourceAffinity
|
|
90
|
+
from leptonai.api.v1.types.common import LeptonVisibility, Metadata
|
|
91
|
+
from leptonai.api.v1.types.deployment import (
|
|
92
|
+
EnvValue,
|
|
93
|
+
EnvVar,
|
|
94
|
+
LeptonContainer,
|
|
95
|
+
Mount,
|
|
96
|
+
)
|
|
97
|
+
from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
|
|
98
|
+
from leptonai.api.v2.client import APIClient
|
|
99
|
+
|
|
95
100
|
client = APIClient()
|
|
96
101
|
|
|
97
102
|
# Prepare environment variables (support both direct values and secret references)
|
|
@@ -99,12 +104,8 @@ def _create_lepton_job_api(
|
|
|
99
104
|
if env_vars:
|
|
100
105
|
for key, value in env_vars.items():
|
|
101
106
|
# Handle both regular dicts and OmegaConf objects
|
|
102
|
-
from omegaconf import DictConfig
|
|
103
|
-
|
|
104
107
|
if isinstance(value, (dict, DictConfig)) and "value_from" in value:
|
|
105
108
|
# Secret reference: {value_from: {secret_name_ref: "secret_name"}}
|
|
106
|
-
from leptonai.api.v1.types.deployment import EnvValue
|
|
107
|
-
|
|
108
109
|
# Convert OmegaConf to dict if needed
|
|
109
110
|
value_dict = dict(value) if isinstance(value, DictConfig) else value
|
|
110
111
|
env_var = EnvVar(
|
|
@@ -203,6 +204,9 @@ def get_lepton_job_status(job_name_or_id: str) -> dict[Any, Any] | None:
|
|
|
203
204
|
def _get_lepton_job_status_api(job_name_or_id: str) -> dict[Any, Any] | None:
|
|
204
205
|
"""Get job status using API client (preferred method)."""
|
|
205
206
|
try:
|
|
207
|
+
# Import leptonai dependencies locally
|
|
208
|
+
from leptonai.api.v2.client import APIClient
|
|
209
|
+
|
|
206
210
|
client = APIClient()
|
|
207
211
|
|
|
208
212
|
# Try to get job by ID first, then by name
|