nemo-evaluator-launcher 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)
  1. nemo_evaluator_launcher/__init__.py +79 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +698 -0
  4. nemo_evaluator_launcher/api/types.py +98 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +267 -0
  8. nemo_evaluator_launcher/cli/info.py +512 -0
  9. nemo_evaluator_launcher/cli/kill.py +41 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +134 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
  12. nemo_evaluator_launcher/cli/main.py +226 -0
  13. nemo_evaluator_launcher/cli/run.py +200 -0
  14. nemo_evaluator_launcher/cli/status.py +164 -0
  15. nemo_evaluator_launcher/cli/version.py +55 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +283 -0
  18. nemo_evaluator_launcher/common/helpers.py +366 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +357 -0
  20. nemo_evaluator_launcher/common/mapping.py +295 -0
  21. nemo_evaluator_launcher/common/printing_utils.py +93 -0
  22. nemo_evaluator_launcher/configs/__init__.py +15 -0
  23. nemo_evaluator_launcher/configs/default.yaml +28 -0
  24. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  25. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  26. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  27. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  28. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
  29. nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
  30. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  31. nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
  32. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
  33. nemo_evaluator_launcher/executors/__init__.py +22 -0
  34. nemo_evaluator_launcher/executors/base.py +120 -0
  35. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  36. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
  37. nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
  38. nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
  39. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  40. nemo_evaluator_launcher/executors/local/executor.py +605 -0
  41. nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
  42. nemo_evaluator_launcher/executors/registry.py +38 -0
  43. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  44. nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
  45. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  46. nemo_evaluator_launcher/exporters/base.py +121 -0
  47. nemo_evaluator_launcher/exporters/gsheets.py +409 -0
  48. nemo_evaluator_launcher/exporters/local.py +502 -0
  49. nemo_evaluator_launcher/exporters/mlflow.py +619 -0
  50. nemo_evaluator_launcher/exporters/registry.py +40 -0
  51. nemo_evaluator_launcher/exporters/utils.py +624 -0
  52. nemo_evaluator_launcher/exporters/wandb.py +490 -0
  53. nemo_evaluator_launcher/package_info.py +38 -0
  54. nemo_evaluator_launcher/resources/mapping.toml +380 -0
  55. nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
  56. nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
  57. nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
  58. nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
  59. nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
  60. nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
nemo_evaluator_launcher/executors/lepton/executor.py @@ -0,0 +1,1004 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ """Lepton executor implementation for nemo-evaluator-launcher.
17
+
18
+ Handles endpoint deployment (vLLM, SGLang, NIM, or an existing endpoint) and evaluation jobs on Lepton.
19
+ """
20
+
21
+ import os
22
+ import time
23
+ from pathlib import Path
24
+ from typing import List
25
+
26
+ from omegaconf import DictConfig
27
+
28
+ from nemo_evaluator_launcher.common.execdb import (
29
+ ExecutionDB,
30
+ JobData,
31
+ generate_invocation_id,
32
+ generate_job_id,
33
+ )
34
+ from nemo_evaluator_launcher.common.helpers import get_eval_factory_command
35
+ from nemo_evaluator_launcher.common.logging_utils import logger
36
+ from nemo_evaluator_launcher.common.mapping import (
37
+ get_task_from_mapping,
38
+ load_tasks_mapping,
39
+ )
40
+ from nemo_evaluator_launcher.common.printing_utils import red
41
+ from nemo_evaluator_launcher.executors.base import (
42
+ BaseExecutor,
43
+ ExecutionState,
44
+ ExecutionStatus,
45
+ )
46
+ from nemo_evaluator_launcher.executors.registry import register_executor
47
+
48
+ from .deployment_helpers import (
49
+ create_lepton_endpoint,
50
+ delete_lepton_endpoint,
51
+ get_lepton_endpoint_status,
52
+ get_lepton_endpoint_url,
53
+ wait_for_lepton_endpoint_ready,
54
+ )
55
+ from .job_helpers import create_lepton_job, delete_lepton_job, get_lepton_job_status
56
+
57
+
58
+ @register_executor("lepton")
59
+ class LeptonExecutor(BaseExecutor):
60
+ @staticmethod
61
+ def execute_eval(cfg: DictConfig, dry_run: bool = False) -> str:
62
+ """Deploy dedicated endpoints for each task on Lepton and run evaluation jobs.
63
+
64
+ For better resource isolation and parallel execution, each evaluation task
65
+ gets its own dedicated endpoint deployment of the same model.
66
+
67
+ Args:
68
+ cfg: The configuration object for the evaluation run.
69
+ dry_run: If True, prepare job configurations without submission.
70
+
71
+ Returns:
72
+ str: The invocation ID for the evaluation run.
73
+
74
+ Raises:
75
+ ValueError: If deployment configuration is invalid.
76
+ RuntimeError: If endpoint deployment or evaluation fails.
77
+ """
78
+ if cfg.deployment.type not in ["vllm", "sglang", "nim", "none"]:
79
+ raise ValueError(
80
+ "LeptonExecutor supports deployment types: 'vllm', 'sglang', 'nim', 'none'"
81
+ )
82
+
83
+ # Load tasks mapping
84
+ tasks_mapping = load_tasks_mapping()
85
+ job_ids = []
86
+ lepton_job_names = []
87
+ endpoint_names = [] # Track multiple endpoints
88
+ db = ExecutionDB()
89
+
90
+ # Generate invocation ID
91
+ invocation_id = generate_invocation_id()
92
+
93
+ # TODO(agronskiy): the structure of this executor differs from others,
94
+ # so the best place to check for unsafe commands yields a bit of duplication.
95
+ # We can't use the get_eval_factory_command here because the port is not yet
96
+ # populated.
97
+ # Refactor the whole thing.
98
+ is_potentially_unsafe = False
99
+ for idx, task in enumerate(cfg.evaluation.tasks):
100
+ pre_cmd: str = task.get("pre_cmd") or cfg.evaluation.get("pre_cmd") or ""
101
+ if pre_cmd:
102
+ is_potentially_unsafe = True
103
+ break
104
+
105
+ # DRY-RUN mode
106
+ if dry_run:
107
+ output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
108
+ output_dir.mkdir(parents=True, exist_ok=True)
109
+
110
+ # Validate configuration
111
+ _dry_run_lepton(cfg, tasks_mapping, invocation_id=invocation_id)
112
+
113
+ if cfg.deployment.type == "none":
114
+ print("Using existing endpoint (deployment: none)")
115
+ print("using shared endpoint")
116
+ else:
117
+ print(f"with endpoint type '{cfg.deployment.type}'")
118
+
119
+ if is_potentially_unsafe:
120
+ print(
121
+ red(
122
+ "\nFound `pre_cmd` which carries security risk. When running without --dry-run "
123
+ "make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1"
124
+ )
125
+ )
126
+
127
+ return invocation_id
128
+
129
+ if is_potentially_unsafe:
130
+ if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
131
+ logger.warning(
132
+ "Found non-empty task commands (e.g. `pre_cmd`) and NEMO_EVALUATOR_TRUST_PRE_CMD "
133
+ "is set, proceeding with caution."
134
+ )
135
+
136
+ else:
137
+ logger.error(
138
+ "Found non-empty task commands (e.g. `pre_cmd`) and NEMO_EVALUATOR_TRUST_PRE_CMD "
139
+ "is not set. This might carry security risk and unstable environments. "
140
+ "To continue, make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1.",
141
+ )
142
+ raise AttributeError(
143
+ "Untrusted command found in config, make sure you trust and "
144
+ "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
145
+ )
146
+
147
+ # For deployment: none, we use the existing endpoint for all tasks
148
+ if cfg.deployment.type == "none":
149
+ print("📌 Using existing endpoint (deployment: none)")
150
+ shared_endpoint_url = cfg.target.api_endpoint.url
151
+ print(f"✅ Using shared endpoint: {shared_endpoint_url}")
152
+
153
+ try:
154
+ # Create local directory for outputs
155
+ output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
156
+ output_dir.mkdir(parents=True, exist_ok=True)
157
+
158
+ print(
159
+ f"🚀 Processing {len(cfg.evaluation.tasks)} evaluation tasks with dedicated endpoints..."
160
+ )
161
+
162
+ # For deployment: none, skip endpoint creation
163
+ if cfg.deployment.type == "none":
164
+ print("📌 Skipping endpoint creation (using existing endpoint)")
165
+ task_endpoints = {}
166
+ for idx, task in enumerate(cfg.evaluation.tasks):
167
+ task_endpoints[idx] = {
168
+ "name": None,
169
+ "url": shared_endpoint_url,
170
+ "full_url": shared_endpoint_url,
171
+ }
172
+ else:
173
+ # ================================================================
174
+ # PARALLEL ENDPOINT DEPLOYMENT
175
+ # ================================================================
176
+ print(
177
+ f"🚀 Creating {len(cfg.evaluation.tasks)} endpoints in parallel..."
178
+ )
179
+
180
+ import queue
181
+ import threading
182
+
183
+ # Generate short endpoint names for all tasks
184
+ task_endpoints = {}
185
+ endpoint_creation_tasks = []
186
+
187
+ for idx, task in enumerate(cfg.evaluation.tasks):
188
+ # Create shorter endpoint names: e.g., "nim-gpqa-0-abc123"
189
+ sanitized_task_name = task.name.replace("_", "-").lower()
190
+ if sanitized_task_name.count(".") > 0:
191
+ sanitized_task_name = sanitized_task_name.split(".")[-1]
192
+ # Take only first 6 chars of task name to keep it short (leaving room for index)
193
+ short_task_name = sanitized_task_name[:6]
194
+ short_invocation = invocation_id[:6]
195
+ task_index = str(idx)
196
+ endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
197
+
198
+ if len(endpoint_name) > 36:
199
+ logger.info(
200
+ "Lepton endpoint name will be deployed under name {task_name}",
201
+ task_name=task.name,
202
+ original=endpoint_name,
203
+ limit=36,
204
+ )
205
+ # Truncate task name further if needed
206
+ max_task_len = (
207
+ 36
208
+ - len(cfg.deployment.type)
209
+ - len(task_index)
210
+ - len(short_invocation)
211
+ - 3
212
+ ) # 3 hyphens
213
+ short_task_name = sanitized_task_name[:max_task_len]
214
+ endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
215
+ logger.info(
216
+ "Lepton endpoint name is auto-generated",
217
+ task_name=task.name,
218
+ original=endpoint_name,
219
+ truncated=endpoint_name,
220
+ limit=36,
221
+ )
222
+
223
+ logger.info(
224
+ "Lepton endpoint name (auto-generated)",
225
+ task_name=task.name,
226
+ endpoint_name=endpoint_name,
227
+ )
228
+ endpoint_names.append(endpoint_name)
229
+ endpoint_creation_tasks.append((idx, task, endpoint_name))
230
+
231
+ # Thread function to create a single endpoint
232
+ def create_endpoint_worker(
233
+ task_info: tuple[int, "DictConfig", str], result_queue: queue.Queue
234
+ ) -> None:
235
+ try:
236
+ idx, task, endpoint_name = task_info
237
+ print(f"🚀 Task {task.name}: Creating endpoint {endpoint_name}")
238
+
239
+ # Create Lepton endpoint
240
+ if not create_lepton_endpoint(cfg, endpoint_name):
241
+ result_queue.put(
242
+ (
243
+ idx,
244
+ False,
245
+ f"Failed to create endpoint {endpoint_name}",
246
+ None,
247
+ None,
248
+ )
249
+ )
250
+ return
251
+
252
+ # Wait for endpoint to be ready
253
+ print(
254
+ f"⏳ Task {task.name}: Waiting for endpoint {endpoint_name} to be ready..."
255
+ )
256
+ # Get timeout from config, default to 600 seconds if not set
257
+ endpoint_timeout = (
258
+ cfg.execution.get("lepton_platform", {})
259
+ .get("deployment", {})
260
+ .get("endpoint_readiness_timeout", 600)
261
+ )
262
+ if not wait_for_lepton_endpoint_ready(
263
+ endpoint_name, timeout=endpoint_timeout
264
+ ):
265
+ result_queue.put(
266
+ (
267
+ idx,
268
+ False,
269
+ f"Endpoint {endpoint_name} failed to become ready",
270
+ None,
271
+ None,
272
+ )
273
+ )
274
+ return
275
+
276
+ # Get endpoint URL
277
+ endpoint_url = get_lepton_endpoint_url(endpoint_name)
278
+ if not endpoint_url:
279
+ result_queue.put(
280
+ (
281
+ idx,
282
+ False,
283
+ f"Could not get URL for endpoint {endpoint_name}",
284
+ None,
285
+ None,
286
+ )
287
+ )
288
+ return
289
+
290
+ # Construct the full endpoint URL
291
+ task_definition = get_task_from_mapping(
292
+ task.name, tasks_mapping
293
+ )
294
+ task_endpoint_type = task_definition["endpoint_type"]
295
+ endpoint_path = cfg.deployment.endpoints[task_endpoint_type]
296
+ full_endpoint_url = f"{endpoint_url.rstrip('/')}{endpoint_path}"
297
+
298
+ print(
299
+ f"✅ Task {task.name}: Endpoint {endpoint_name} ready at {endpoint_url}"
300
+ )
301
+ result_queue.put(
302
+ (
303
+ idx,
304
+ True,
305
+ None,
306
+ endpoint_name,
307
+ endpoint_url,
308
+ full_endpoint_url,
309
+ )
310
+ )
311
+
312
+ except Exception as e:
313
+ result_queue.put(
314
+ (
315
+ idx,
316
+ False,
317
+ f"Exception creating endpoint: {e}",
318
+ None,
319
+ None,
320
+ )
321
+ )
322
+
323
+ # Create and start threads for parallel endpoint creation
324
+ result_queue: queue.Queue = queue.Queue()
325
+ threads = []
326
+
327
+ for task_info in endpoint_creation_tasks:
328
+ thread = threading.Thread(
329
+ target=create_endpoint_worker, args=(task_info, result_queue)
330
+ )
331
+ thread.start()
332
+ threads.append(thread)
333
+
334
+ # Wait for all threads to complete and collect results
335
+ for thread in threads:
336
+ thread.join()
337
+
338
+ # Process results
339
+ failed_endpoints = []
340
+ for _ in range(len(endpoint_creation_tasks)):
341
+ try:
342
+ result = result_queue.get_nowait()
343
+ idx = result[0]
344
+ success = result[1]
345
+
346
+ if success:
347
+ _, _, _, endpoint_name, endpoint_url, full_endpoint_url = (
348
+ result
349
+ )
350
+ task_endpoints[idx] = {
351
+ "name": endpoint_name,
352
+ "url": endpoint_url,
353
+ "full_url": full_endpoint_url,
354
+ }
355
+ else:
356
+ error_msg = result[2]
357
+ failed_endpoints.append((idx, error_msg))
358
+ except queue.Empty:
359
+ break
360
+
361
+ # Check if any endpoints failed
362
+ if failed_endpoints:
363
+ error_details = "; ".join(
364
+ [f"Task {idx}: {msg}" for idx, msg in failed_endpoints]
365
+ )
366
+ raise RuntimeError(
367
+ f"Failed to create {len(failed_endpoints)} endpoints: {error_details}"
368
+ )
369
+
370
+ print(
371
+ f"✅ All {len(cfg.evaluation.tasks)} endpoints created successfully!"
372
+ )
373
+
374
+ # ================================================================
375
+ # JOB SUBMISSION (Sequential, as before)
376
+ # ================================================================
377
+ print(f"📝 Submitting {len(cfg.evaluation.tasks)} evaluation jobs...")
378
+
379
+ # Submit each evaluation task as a Lepton job
380
+ for idx, task in enumerate(cfg.evaluation.tasks):
381
+ task_definition = get_task_from_mapping(task.name, tasks_mapping)
382
+
383
+ # Create job ID and Lepton job name (max 36 chars)
384
+ job_id = generate_job_id(invocation_id, idx)
385
+ # Sanitized task name for RFC 1123 compliance (no underscores, lowercase)
386
+ sanitized_task_name = task.name.replace("_", "-").lower()
387
+ if sanitized_task_name.count(".") > 0:
388
+ sanitized_task_name = sanitized_task_name.split(".")[-1]
389
+ base_job_name = f"eval-{invocation_id[:6]}-{sanitized_task_name}"
390
+ suffix = str(idx)
391
+
392
+ # Ensure job name length is within 36 character limit
393
+ max_base_length = 36 - 1 - len(suffix) # -1 for the hyphen
394
+ if len(base_job_name) > max_base_length:
395
+ base_job_name = base_job_name[:max_base_length]
396
+ logger.info(
397
+ "Lepton job auto-generated name",
398
+ task_name=task.name,
399
+ job_name=f"{base_job_name}-{suffix}",
400
+ )
401
+
402
+ lepton_job_name = f"{base_job_name}-{suffix}"
403
+ logger.info(
404
+ "Lepton job name (auto-generated)",
405
+ task_name=task.name,
406
+ job_name=lepton_job_name,
407
+ )
408
+ job_ids.append(job_id)
409
+ lepton_job_names.append(lepton_job_name)
410
+
411
+ # Create task output directory (for result collection)
412
+ task_output_dir = output_dir / task.name
413
+ task_output_dir.mkdir(parents=True, exist_ok=True)
414
+
415
+ # Determine evaluation image
416
+ eval_image = task_definition["container"]
417
+ if "container" in task:
418
+ eval_image = task["container"]
419
+
420
+ # Get endpoint info for this task
421
+ endpoint_info = task_endpoints[idx]
422
+ endpoint_name = endpoint_info["name"]
423
+ endpoint_url = endpoint_info["url"]
424
+ full_endpoint_url = endpoint_info["full_url"]
425
+
426
+ # Temporarily set the target URL for this specific task
427
+ from omegaconf import OmegaConf
428
+
429
+ # Temporarily disable struct mode to allow URL modification
430
+ was_struct = OmegaConf.is_struct(cfg)
431
+ if was_struct:
432
+ OmegaConf.set_struct(cfg, False)
433
+
434
+ # Save original URL
435
+ original_url = getattr(
436
+ cfg.get("target", {}).get("api_endpoint", {}), "url", None
437
+ )
438
+
439
+ try:
440
+ # Ensure target structure exists and set the task-specific URL
441
+ if "target" not in cfg:
442
+ cfg.target = OmegaConf.create({})
443
+ if "api_endpoint" not in cfg.target:
444
+ cfg.target.api_endpoint = OmegaConf.create({})
445
+
446
+ cfg.target.api_endpoint.url = full_endpoint_url
447
+
448
+ # Generate command with the correct endpoint URL
449
+ eval_command_struct = get_eval_factory_command(
450
+ cfg, task, task_definition
451
+ )
452
+ eval_command = eval_command_struct.cmd
453
+ # Debug string explaining the base64-encoded parts of the command
454
+ eval_command_debug_comment = eval_command_struct.debug
455
+
456
+ finally:
457
+ # Restore original URL and struct mode
458
+ if original_url is not None:
459
+ cfg.target.api_endpoint.url = original_url
460
+ elif (
461
+ "target" in cfg
462
+ and "api_endpoint" in cfg.target
463
+ and "url" in cfg.target.api_endpoint
464
+ ):
465
+ del cfg.target.api_endpoint.url
466
+
467
+ if was_struct:
468
+ OmegaConf.set_struct(cfg, True)
469
+
470
+ # Create evaluation launch script
471
+ launch_script = _create_evaluation_launch_script(
472
+ cfg=cfg,
473
+ task=task,
474
+ task_definition=task_definition,
475
+ endpoint_url=full_endpoint_url,
476
+ task_name=task.name,
477
+ invocation_id=invocation_id,
478
+ eval_command=eval_command, # Pass the fixed command
479
+ eval_command_debug_comment=eval_command_debug_comment,
480
+ )
481
+
482
+ # Prepare job command to run the launch script
483
+ container_command = [
484
+ "/bin/bash",
485
+ "-c",
486
+ f"echo '{launch_script}' > /tmp/launch_script.sh && chmod +x /tmp/launch_script.sh && bash /tmp/launch_script.sh",
487
+ ]
488
+
489
+ # Get evaluation job settings from configuration
490
+ eval_settings = getattr(cfg.execution, "evaluation_tasks", {})
491
+ eval_resource_shape = eval_settings.get("resource_shape", "cpu.small")
492
+ eval_timeout = eval_settings.get("timeout", 3600)
493
+ use_shared_storage = eval_settings.get("use_shared_storage", True)
494
+
495
+ # Get environment variables for the job
496
+ task_config = cfg.execution.lepton_platform.tasks
497
+ node_group = task_config.get("node_group", "default")
498
+
499
+ # Import DictConfig for both env vars and mounts processing
500
+ from omegaconf import DictConfig
501
+
502
+ # Priority: lepton_platform.tasks.env_vars over cfg.execution.env_var_names
503
+ job_env_vars = {}
504
+
505
+ # Get env vars from lepton_platform config
506
+ lepton_env_vars = task_config.get("env_vars", {})
507
+ for key, value in lepton_env_vars.items():
508
+ if isinstance(value, (dict, DictConfig)):
509
+ # Convert DictConfig to dict to prevent stringification
510
+ job_env_vars[key] = dict(value)
511
+ else:
512
+ job_env_vars[key] = value
513
+
514
+ # Get mounts configuration and add invocation ID for isolation
515
+ job_mounts = []
516
+ original_mounts = task_config.get("mounts", [])
517
+
518
+ for mount in original_mounts:
519
+ # Create a copy of the mount with invocation ID added to path
520
+ mount_dict = (
521
+ dict(mount) if isinstance(mount, DictConfig) else mount.copy()
522
+ )
523
+
524
+ # Add invocation ID to the path for evaluation isolation
525
+ if "path" in mount_dict:
526
+ original_path = mount_dict["path"]
527
+ # Add invocation ID subdirectory: /shared/nemo-evaluator-launcher-workspace/abc12345
528
+ mount_dict["path"] = (
529
+ f"{original_path.rstrip('/')}/{invocation_id}"
530
+ )
531
+
532
+ job_mounts.append(mount_dict)
533
+
534
+ print(
535
+ f" - Storage: {len(job_mounts)} mount(s) with evaluation ID isolation"
536
+ )
537
+
538
+ # Get image pull secrets
539
+ image_pull_secrets = task_config.get("image_pull_secrets", [])
540
+
541
+ # Submit the evaluation job to Lepton
542
+ print(f"📝 Task {task.name}: Submitting job {lepton_job_name}")
543
+ print(f" - Endpoint: {endpoint_name if endpoint_name else 'shared'}")
544
+ print(f" - Resource: {eval_resource_shape}")
545
+
546
+ job_success, error_msg = create_lepton_job(
547
+ job_name=lepton_job_name,
548
+ container_image=eval_image,
549
+ command=container_command,
550
+ resource_shape=eval_resource_shape,
551
+ env_vars=job_env_vars,
552
+ mounts=job_mounts,
553
+ timeout=eval_timeout,
554
+ node_group=node_group,
555
+ image_pull_secrets=image_pull_secrets,
556
+ )
557
+
558
+ if not job_success:
559
+ raise RuntimeError(
560
+ f"Failed to submit Lepton job | Task: {task.name} | Job ID: {job_id} | "
561
+ f"Lepton job name: {lepton_job_name} | Error: {error_msg}"
562
+ )
563
+
564
+ # Store job metadata in database (with task-specific endpoint info)
565
+ db.write_job(
566
+ job=JobData(
567
+ invocation_id=invocation_id,
568
+ job_id=job_id,
569
+ timestamp=time.time(),
570
+ executor="lepton",
571
+ data={
572
+ "endpoint_name": endpoint_name, # Task-specific endpoint (or None for shared)
573
+ "endpoint_url": endpoint_url, # Task-specific URL (or shared)
574
+ "lepton_job_name": lepton_job_name,
575
+ "output_dir": str(task_output_dir),
576
+ "task_name": task.name,
577
+ "status": "submitted",
578
+ },
579
+ config=OmegaConf.to_object(cfg), # type: ignore[arg-type]
580
+ )
581
+ )
582
+
583
+ # Jobs submitted successfully - return immediately (non-blocking)
584
+ print(
585
+ f"\n✅ Successfully submitted {len(lepton_job_names)} evaluation jobs to Lepton"
586
+ )
587
+ print(
588
+ " Each task running against its own dedicated endpoint for isolation"
589
+ )
590
+
591
+ print(f"\n📋 Invocation ID: {invocation_id}")
592
+ print(f"🔍 Check status: nemo-evaluator-launcher status {invocation_id}")
593
+ print(f"📋 Monitor logs: nemo-evaluator-launcher logs {invocation_id}")
594
+
595
+ if cfg.deployment.type != "none":
596
+ print(f"🔗 Deployed {len(endpoint_names)} dedicated endpoints:")
597
+ for i, endpoint_name in enumerate(endpoint_names):
598
+ task_name = cfg.evaluation.tasks[i].name
599
+ print(f" - {task_name}: {endpoint_name}")
600
+ print(
601
+ f"⚠️ Remember to clean up endpoints when done: nemo-evaluator-launcher kill {invocation_id}"
602
+ )
603
+ else:
604
+ print(f"📌 All tasks using shared endpoint: {shared_endpoint_url}")
605
+
606
+ print(f"📊 Evaluation results will be saved to: {output_dir}")
607
+
608
+ # Note: Jobs will continue running on Lepton infrastructure
609
+ # Status can be checked using nemo-evaluator-launcher status command
610
+
611
+ return invocation_id
612
+
613
+ except Exception:
614
+ # Clean up any created endpoints on failure
615
+ if cfg.deployment.type != "none" and "endpoint_names" in locals():
616
+ for endpoint_name in endpoint_names:
617
+ if endpoint_name:
618
+ print(f"🧹 Cleaning up endpoint: {endpoint_name}")
619
+ delete_lepton_endpoint(endpoint_name)
620
+ raise
621
+
622
+ @staticmethod
623
+ def get_status(id: str) -> List[ExecutionStatus]:
624
+ """Get the status of Lepton evaluation jobs and endpoints.
625
+
626
+ Args:
627
+ id: Unique job identifier or invocation identifier.
628
+
629
+ Returns:
630
+ List containing the execution status for the job(s) and endpoint(s).
631
+ """
632
+ db = ExecutionDB()
633
+
634
+ # If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
635
+ if "." not in id:
636
+ return _get_statuses_for_invocation_id(id=id, db=db)
637
+ # Otherwise, treat as job_id
638
+ job_data = db.get_job(id)
639
+ if job_data is None:
640
+ return []
641
+ if job_data.executor != "lepton":
642
+ return []
643
+
644
+ # Check if this job has a Lepton job associated with it
645
+ lepton_job_name = job_data.data.get("lepton_job_name")
646
+ if lepton_job_name:
647
+ # Get live status from Lepton
648
+ lepton_status = get_lepton_job_status(lepton_job_name)
649
+ if lepton_status:
650
+ job_state = lepton_status.get("state", "Unknown")
651
+
652
+ # Map Lepton job states to our execution states
653
+ if job_state in ["Succeeded", "Completed"]:
654
+ state = ExecutionState.SUCCESS
655
+ elif job_state in ["Running", "Pending", "Starting"]:
656
+ state = ExecutionState.RUNNING
657
+ elif job_state in ["Failed", "Cancelled"]:
658
+ state = ExecutionState.FAILED
659
+ else:
660
+ state = ExecutionState.PENDING
661
+
662
+ progress_info = {
663
+ "type": "evaluation_job",
664
+ "task_name": job_data.data.get("task_name", "unknown"),
665
+ "lepton_job_name": lepton_job_name,
666
+ "lepton_state": job_state,
667
+ "start_time": lepton_status.get("start_time"),
668
+ "end_time": lepton_status.get("end_time"),
669
+ "endpoint_name": job_data.data.get("endpoint_name", "shared"),
670
+ }
671
+
672
+ return [ExecutionStatus(id=id, state=state, progress=progress_info)]
673
+
674
+ # Fallback to stored status
675
+ job_status = job_data.data.get("status", "unknown")
676
+
677
+ if job_status in ["running", "submitted"]:
678
+ state = ExecutionState.RUNNING
679
+ elif job_status in ["succeeded", "completed"]:
680
+ state = ExecutionState.SUCCESS
681
+ elif job_status in ["failed", "cancelled"]:
682
+ state = ExecutionState.FAILED
683
+ else:
684
+ state = ExecutionState.PENDING
685
+
686
+ progress_info = {
687
+ "type": "evaluation_job",
688
+ "task_name": job_data.data.get("task_name", "unknown"),
689
+ "status": job_status,
690
+ "lepton_job_name": job_data.data.get("lepton_job_name"),
691
+ "endpoint_name": job_data.data.get("endpoint_name", "shared"),
692
+ }
693
+
694
+ return [ExecutionStatus(id=id, state=state, progress=progress_info)]
695
+
696
+ @staticmethod
697
+ def kill_job(job_id: str) -> None:
698
+ """Kill Lepton evaluation jobs and clean up endpoints.
699
+
700
+ Args:
701
+ job_id: The job ID to kill.
702
+
703
+ Raises:
704
+ ValueError: If job is not found or invalid.
705
+ RuntimeError: If job cannot be killed.
706
+ """
707
+ db = ExecutionDB()
708
+ job_data = db.get_job(job_id)
709
+ if job_data is None:
710
+ raise ValueError(f"Job {job_id} not found")
711
+
712
+ if job_data.executor != "lepton":
713
+ raise ValueError(
714
+ f"Job {job_id} is not a Lepton job (executor: {job_data.executor})"
715
+ )
716
+
717
+ # Cancel the specific Lepton job
718
+ lepton_job_name = job_data.data.get("lepton_job_name")
719
+
720
+ if lepton_job_name:
721
+ cancel_success = delete_lepton_job(lepton_job_name)
722
+ if cancel_success:
723
+ print(f"✅ Cancelled Lepton job: {lepton_job_name}")
724
+ # Mark job as killed in database
725
+ job_data.data["status"] = "killed"
726
+ job_data.data["killed_time"] = time.time()
727
+ db.write_job(job_data)
728
+ else:
729
+ # Use common helper to get informative error message based on job status
730
+ status_list = LeptonExecutor.get_status(job_id)
731
+ current_status = status_list[0].state if status_list else None
732
+ error_msg = LeptonExecutor.get_kill_failure_message(
733
+ job_id, f"lepton_job: {lepton_job_name}", current_status
734
+ )
735
+ raise RuntimeError(error_msg)
736
+ else:
737
+ raise ValueError(f"No Lepton job name found for job {job_id}")
738
+
739
+ print(f"🛑 Killed Lepton job {job_id}")
740
+
741
+ # For individual jobs, also clean up the dedicated endpoint for this task
742
+ # Check if this was the last job using this specific endpoint
743
+ endpoint_name = job_data.data.get("endpoint_name")
744
+ if endpoint_name:
745
+ # Check if any other jobs are still using this endpoint
746
+ jobs = db.get_jobs(job_data.invocation_id)
747
+ other_jobs_using_endpoint = [
748
+ j
749
+ for j in jobs.values()
750
+ if (
751
+ j.data.get("endpoint_name") == endpoint_name
752
+ and j.data.get("status")
753
+ not in ["killed", "failed", "succeeded", "cancelled"]
754
+ and j.job_id != job_id
755
+ )
756
+ ]
757
+
758
+ if not other_jobs_using_endpoint:
759
+ print(
760
+ f"🧹 No other jobs using endpoint {endpoint_name}, cleaning up..."
761
+ )
762
+ success = delete_lepton_endpoint(endpoint_name)
763
+ if success:
764
+ print(f"✅ Cleaned up endpoint: {endpoint_name}")
765
+ else:
766
+ print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
767
+ else:
768
+ print(
769
+ f"📌 Keeping endpoint {endpoint_name} (still used by {len(other_jobs_using_endpoint)} other jobs)"
770
+ )
771
+ else:
772
+ print("📌 No dedicated endpoint to clean up for this job")
773
+
774
+
775
+ def _create_evaluation_launch_script(
776
+ cfg: DictConfig,
777
+ task: DictConfig,
778
+ task_definition: dict,
779
+ endpoint_url: str,
780
+ task_name: str,
781
+ invocation_id: str,
782
+ eval_command: str,
783
+ eval_command_debug_comment: str,
784
+ ) -> str:
785
+ """Create bash script for running evaluation in Lepton job container.
786
+
787
+ Based on the proven approach from the old implementation.
788
+
789
+ Args:
790
+ cfg: The configuration object.
791
+ task: The evaluation task configuration.
792
+ task_definition: Task definition from mapping.
793
+ endpoint_url: URL of the deployed Lepton endpoint.
794
+ task_name: Name of the evaluation task.
795
+ invocation_id: Unique invocation identifier.
796
+ eval_command: The evaluation command with correct endpoint URL.
797
+ eval_command_debug_comment: Debug comment embedded in the script to aid debugging.
798
+
799
+ Returns:
800
+ String containing the bash launch script.
801
+ """
802
+ # Use the provided eval_command (already has correct endpoint URL)
803
+
804
+ # Construct output directory path
805
+ output_dir = f"{cfg.execution.output_dir}/{task_name}"
806
+
807
+ # Replace the output directory in the evaluation command
808
+ eval_command_modified = eval_command.replace(
809
+ "--output_dir /results", f"--output_dir {output_dir}"
810
+ )
811
+
812
+ # Create the launch script (based on old implementation)
813
+ script = f"""#!/bin/bash
814
+ set -e
815
+
816
+ # Create output directory structure
817
+ mkdir -p {output_dir}/artifacts
818
+ mkdir -p {output_dir}/logs
819
+
820
+ # Create stage files for status tracking
821
+ echo "started" > {output_dir}/logs/stage.pre-start
822
+ echo "running" > {output_dir}/logs/stage.running
823
+
824
+ # Log evaluation details
825
+ echo "Starting evaluation for task: {task_name}"
826
+ echo "Invocation ID: {invocation_id}"
827
+ echo "Endpoint URL: {endpoint_url}"
828
+ echo "Command: {eval_command_modified}"
829
+
830
+ {eval_command_debug_comment}
831
+
832
+ # Execute the evaluation with proper error handling
833
+ set +e
834
+ {eval_command_modified}
835
+ exit_code=$?
836
+
837
+ # Set proper permissions
838
+ chmod 777 -R {output_dir} 2>/dev/null || true
839
+
840
+ # Record completion status
841
+ echo "exit_code: $exit_code" > {output_dir}/logs/stage.exit
842
+
843
+ if [ "$exit_code" -ne 0 ]; then
844
+ echo "Evaluation failed with exit code $exit_code" >&2
845
+ exit "$exit_code"
846
+ fi
847
+
848
+ echo "Evaluation completed successfully"
849
+ exit 0
850
+ """
851
+
852
+ return script
853
+
854
+
855
+ def _dry_run_lepton(
856
+ cfg: DictConfig, tasks_mapping: dict, invocation_id: str | None = None
857
+ ) -> None:
858
+ print("DRY RUN: Lepton job configurations prepared")
859
+ try:
860
+ # validate tasks
861
+ for task in cfg.evaluation.tasks:
862
+ get_task_from_mapping(task.name, tasks_mapping)
863
+
864
+ # nice-to-have checks (existing endpoint URL or endpoints mapping)
865
+ if getattr(cfg.deployment, "type", None) == "none":
866
+ tgt = getattr(cfg, "target", {})
867
+ api = (
868
+ tgt.get("api_endpoint")
869
+ if isinstance(tgt, dict)
870
+ else getattr(tgt, "api_endpoint", None)
871
+ ) or {}
872
+ url = api.get("url") if isinstance(api, dict) else getattr(api, "url", None)
873
+ if not url or not str(url).strip():
874
+ raise ValueError(
875
+ "target.api_endpoint.url must be set when deployment.type == 'none'"
876
+ )
877
+ else:
878
+ endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
879
+ for task in cfg.evaluation.tasks:
880
+ td = get_task_from_mapping(task.name, tasks_mapping)
881
+ etype = td.get("endpoint_type")
882
+ if etype not in endpoints_cfg:
883
+ raise ValueError(
884
+ f"deployment.endpoints missing path for endpoint_type '{etype}' (task '{task.name}')"
885
+ )
886
+ path = endpoints_cfg.get(etype)
887
+ if not isinstance(path, str) or not path.startswith("/"):
888
+ raise ValueError(
889
+ f"deployment.endpoints['{etype}'] must be a non-empty path starting with '/'"
890
+ )
891
+
892
+ # lepton env var presence (reference-level)
893
+ tasks_cfg = getattr(cfg.execution, "lepton_platform", {}).get("tasks", {}) or {}
894
+ lepton_env_vars = tasks_cfg.get("env_vars", {}) or {}
895
+ api_key_name = getattr(
896
+ getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
897
+ )
898
+ for task in cfg.evaluation.tasks:
899
+ td = get_task_from_mapping(task.name, tasks_mapping)
900
+ required = td.get("required_env_vars", []) or []
901
+ for var in required:
902
+ if var == "API_KEY":
903
+ if not (("API_KEY" in lepton_env_vars) or bool(api_key_name)):
904
+ raise ValueError(
905
+ f"Task '{task.name}' requires API_KEY: set execution.lepton_platform.tasks.env_vars.API_KEY "
906
+ "or target.api_endpoint.api_key_name"
907
+ )
908
+ else:
909
+ if var not in lepton_env_vars:
910
+ raise ValueError(
911
+ f"Task '{task.name}' requires {var}: set it under execution.lepton_platform.tasks.env_vars"
912
+ )
913
+
914
+ # success (use realized output directory if invocation_id is available)
915
+ preview_output_dir = (
916
+ Path(cfg.execution.output_dir).absolute() / invocation_id
917
+ if invocation_id
918
+ else Path(cfg.execution.output_dir).absolute() / "<invocation_id>"
919
+ )
920
+ print(f" - Tasks: {len(cfg.evaluation.tasks)}")
921
+ for idx, task in enumerate(cfg.evaluation.tasks):
922
+ print(f" - Task {idx}: {task.name}")
923
+ print(f" - Output directory: {preview_output_dir}")
924
+ print("\nTo run evaluation, execute run command without --dry-run")
925
+ except Exception as e:
926
+ print(f"❌ Configuration invalid: {e}")
927
+ logger.error("Lepton dry-run validation failed", error=str(e))
928
+ return
929
+
930
+
931
+ def _get_statuses_for_invocation_id(id: str, db: ExecutionDB) -> List[ExecutionStatus]:
932
+ """Helper method that returns statuses if id is the invocation id"""
933
+ jobs = db.get_jobs(id)
934
+ statuses: List[ExecutionStatus] = []
935
+
936
+ # Get status for all endpoints (each task may have its own)
937
+ endpoint_names = set()
938
+ for job_data in jobs.values():
939
+ endpoint_name = job_data.data.get("endpoint_name")
940
+ if endpoint_name:
941
+ endpoint_names.add(endpoint_name)
942
+
943
+ # Show status for each unique endpoint
944
+ for endpoint_name in endpoint_names:
945
+ endpoint_status = get_lepton_endpoint_status(endpoint_name)
946
+ if not endpoint_status:
947
+ logger.warning(
948
+ "Could not get Lepton endpoint statuses",
949
+ endpoint_name=endpoint_name,
950
+ )
951
+ continue  # skip this endpoint but keep reporting the others
952
+
953
+ endpoint_state = endpoint_status.get("state", "Unknown")
954
+ if endpoint_state == "Ready":
955
+ state = ExecutionState.SUCCESS
956
+ elif endpoint_state in ["Starting", "Pending"]:
957
+ state = ExecutionState.RUNNING
958
+ else:
959
+ state = ExecutionState.FAILED
960
+
961
+ # Find which task(s) use this endpoint
962
+ using_tasks = [
963
+ job_data.data.get("task_name", "unknown")
964
+ for job_data in jobs.values()
965
+ if job_data.data.get("endpoint_name") == endpoint_name
966
+ ]
967
+
968
+ statuses.append(
969
+ ExecutionStatus(
970
+ id=f"{id}-endpoint-{endpoint_name}",
971
+ state=state,
972
+ progress={
973
+ "type": "endpoint",
974
+ "name": endpoint_name,
975
+ "state": endpoint_state,
976
+ "url": endpoint_status.get("endpoint", {}).get("external_endpoint"),
977
+ "tasks": using_tasks,
978
+ },
979
+ )
980
+ )
981
+
982
+ # If no dedicated endpoints, note that shared endpoint is being used
983
+ if not endpoint_names:
984
+ statuses.append(
985
+ ExecutionStatus(
986
+ id=f"{id}-endpoint-shared",
987
+ state=ExecutionState.SUCCESS,
988
+ progress={
989
+ "type": "endpoint",
990
+ "name": "shared",
991
+ "state": "Using existing endpoint",
992
+ "url": "external",
993
+ "tasks": [
994
+ job_data.data.get("task_name", "unknown")
995
+ for job_data in jobs.values()
996
+ ],
997
+ },
998
+ )
999
+ )
1000
+
1001
+ # Get individual job statuses
1002
+ for job_id, job_data in jobs.items():
1003
+ statuses.extend(LeptonExecutor.get_status(job_id))
1004
+ return statuses
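
For orientation, below is a minimal sketch of how the LeptonExecutor shown above could be driven programmatically. It is illustrative only: the config keys mirror the ones read by execute_eval and _dry_run_lepton (deployment.type, target.api_endpoint, execution.output_dir, execution.lepton_platform.tasks, evaluation.tasks), but the concrete values (the task name "mmlu", the endpoint URL, the output path, the API key name) are placeholder assumptions, and a real submission additionally requires the task to exist in resources/mapping.toml plus configured Lepton credentials.

# Illustrative sketch only; values below are placeholders, not taken from the package.
from omegaconf import OmegaConf

from nemo_evaluator_launcher.executors.lepton.executor import LeptonExecutor

cfg = OmegaConf.create(
    {
        # "none" reuses an existing endpoint instead of deploying one per task.
        "deployment": {"type": "none"},
        "target": {
            "api_endpoint": {
                "url": "https://my-endpoint.example.com/v1/chat/completions",
                "api_key_name": "MY_API_KEY",
            }
        },
        "execution": {
            "output_dir": "/tmp/nemo-evaluator-launcher",
            "lepton_platform": {"tasks": {"env_vars": {}}},
        },
        "evaluation": {"tasks": [{"name": "mmlu"}]},
    }
)

# Validate the configuration without creating endpoints or submitting jobs.
invocation_id = LeptonExecutor.execute_eval(cfg, dry_run=True)

# After a real (non-dry-run) submission, the same id drives status checks and cleanup:
# LeptonExecutor.get_status(invocation_id)
# LeptonExecutor.kill_job(job_id)  # per-job ids are recorded in the ExecutionDB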