nemo-evaluator-launcher 0.1.0rc2__py3-none-any.whl

This diff shows the content of this publicly released package version as it appears in its public registry. It is provided for informational purposes only.

This version of nemo-evaluator-launcher has been flagged as a potentially problematic release.

Files changed (57)
  1. nemo_evaluator_launcher/__init__.py +65 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +641 -0
  4. nemo_evaluator_launcher/api/types.py +89 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +148 -0
  8. nemo_evaluator_launcher/cli/info.py +117 -0
  9. nemo_evaluator_launcher/cli/kill.py +39 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +113 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +34 -0
  12. nemo_evaluator_launcher/cli/main.py +136 -0
  13. nemo_evaluator_launcher/cli/run.py +135 -0
  14. nemo_evaluator_launcher/cli/status.py +118 -0
  15. nemo_evaluator_launcher/cli/version.py +52 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +189 -0
  18. nemo_evaluator_launcher/common/helpers.py +157 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +349 -0
  20. nemo_evaluator_launcher/common/mapping.py +310 -0
  21. nemo_evaluator_launcher/configs/__init__.py +15 -0
  22. nemo_evaluator_launcher/configs/default.yaml +28 -0
  23. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  24. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  25. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  26. nemo_evaluator_launcher/configs/deployment/vllm.yaml +41 -0
  27. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  28. nemo_evaluator_launcher/configs/execution/local.yaml +17 -0
  29. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +33 -0
  30. nemo_evaluator_launcher/executors/__init__.py +22 -0
  31. nemo_evaluator_launcher/executors/base.py +97 -0
  32. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  33. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +589 -0
  34. nemo_evaluator_launcher/executors/lepton/executor.py +905 -0
  35. nemo_evaluator_launcher/executors/lepton/job_helpers.py +394 -0
  36. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  37. nemo_evaluator_launcher/executors/local/executor.py +491 -0
  38. nemo_evaluator_launcher/executors/local/run.template.sh +88 -0
  39. nemo_evaluator_launcher/executors/registry.py +38 -0
  40. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  41. nemo_evaluator_launcher/executors/slurm/executor.py +982 -0
  42. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  43. nemo_evaluator_launcher/exporters/base.py +112 -0
  44. nemo_evaluator_launcher/exporters/gsheets.py +391 -0
  45. nemo_evaluator_launcher/exporters/local.py +488 -0
  46. nemo_evaluator_launcher/exporters/mlflow.py +448 -0
  47. nemo_evaluator_launcher/exporters/registry.py +40 -0
  48. nemo_evaluator_launcher/exporters/utils.py +669 -0
  49. nemo_evaluator_launcher/exporters/wandb.py +376 -0
  50. nemo_evaluator_launcher/package_info.py +35 -0
  51. nemo_evaluator_launcher/resources/mapping.toml +344 -0
  52. nemo_evaluator_launcher-0.1.0rc2.dist-info/METADATA +35 -0
  53. nemo_evaluator_launcher-0.1.0rc2.dist-info/RECORD +57 -0
  54. nemo_evaluator_launcher-0.1.0rc2.dist-info/WHEEL +5 -0
  55. nemo_evaluator_launcher-0.1.0rc2.dist-info/entry_points.txt +3 -0
  56. nemo_evaluator_launcher-0.1.0rc2.dist-info/licenses/LICENSE +451 -0
  57. nemo_evaluator_launcher-0.1.0rc2.dist-info/top_level.txt +1 -0
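Of the additions above, the largest single file, nemo_evaluator_launcher/executors/lepton/executor.py (+905 lines), is reproduced in full below. The executor itself hints at how submitted runs are managed: after submission it prints follow-up commands for the package's console-script entry point. Assuming those printed hints correspond to the CLI modules listed above (cli/status.py, cli/kill.py), lifecycle management would look roughly like:

  nemo-evaluator-launcher status <invocation_id>   # check job and endpoint states
  nemo-evaluator-launcher kill <invocation_id>     # cancel jobs and delete dedicated endpoints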
nemo_evaluator_launcher/executors/lepton/executor.py
@@ -0,0 +1,905 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ """Lepton executor implementation for nemo-evaluator-launcher.
+
+ Handles deployment and evaluation using Lepton endpoints with NIM containers.
+ """
+
+ import time
+ from pathlib import Path
+ from typing import List
+
+ from omegaconf import DictConfig
+
+ from nemo_evaluator_launcher.common.execdb import (
+     ExecutionDB,
+     JobData,
+     generate_invocation_id,
+     generate_job_id,
+ )
+ from nemo_evaluator_launcher.common.helpers import get_eval_factory_command
+ from nemo_evaluator_launcher.common.logging_utils import logger
+ from nemo_evaluator_launcher.common.mapping import (
+     get_task_from_mapping,
+     load_tasks_mapping,
+ )
+ from nemo_evaluator_launcher.executors.base import (
+     BaseExecutor,
+     ExecutionState,
+     ExecutionStatus,
+ )
+ from nemo_evaluator_launcher.executors.registry import register_executor
+
+ from .deployment_helpers import (
+     create_lepton_endpoint,
+     delete_lepton_endpoint,
+     get_lepton_endpoint_status,
+     get_lepton_endpoint_url,
+     wait_for_lepton_endpoint_ready,
+ )
+ from .job_helpers import create_lepton_job, delete_lepton_job, get_lepton_job_status
+
+
+ @register_executor("lepton")
+ class LeptonExecutor(BaseExecutor):
+     @staticmethod
+     def execute_eval(cfg: DictConfig, dry_run: bool = False) -> str:
+         """Deploy dedicated endpoints for each task on Lepton and run evaluation jobs.
+
+         For better resource isolation and parallel execution, each evaluation task
+         gets its own dedicated endpoint deployment of the same model.
+
+         Args:
+             cfg: The configuration object for the evaluation run.
+             dry_run: If True, prepare job configurations without submission.
+
+         Returns:
+             str: The invocation ID for the evaluation run.
+
+         Raises:
+             ValueError: If deployment configuration is invalid.
+             RuntimeError: If endpoint deployment or evaluation fails.
+         """
+         if cfg.deployment.type not in ["vllm", "sglang", "nim", "none"]:
+             raise ValueError(
+                 "LeptonExecutor supports deployment types: 'vllm', 'sglang', 'nim', 'none'"
+             )
+
+         # Generate invocation ID
+         invocation_id = generate_invocation_id()
+
+         # For deployment: none, we use the existing endpoint for all tasks
+         if cfg.deployment.type == "none":
+             print("📌 Using existing endpoint (deployment: none)")
+             shared_endpoint_url = cfg.target.api_endpoint.url
+             print(f"✅ Using shared endpoint: {shared_endpoint_url}")
+
+         try:
+             # Load tasks mapping
+             tasks_mapping = load_tasks_mapping()
+             job_ids = []
+             lepton_job_names = []
+             endpoint_names = []  # Track multiple endpoints
+             db = ExecutionDB()
+
+             # Create local directory for outputs
+             output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
+             output_dir.mkdir(parents=True, exist_ok=True)
+
+             print(
+                 f"🚀 Processing {len(cfg.evaluation.tasks)} evaluation tasks with dedicated endpoints..."
+             )
+
+             # For deployment: none, skip endpoint creation
+             if cfg.deployment.type == "none":
+                 print("📌 Skipping endpoint creation (using existing endpoint)")
+                 task_endpoints = {}
+                 for idx, task in enumerate(cfg.evaluation.tasks):
+                     task_endpoints[idx] = {
+                         "name": None,
+                         "url": shared_endpoint_url,
+                         "full_url": shared_endpoint_url,
+                     }
+             else:
+                 # ================================================================
+                 # PARALLEL ENDPOINT DEPLOYMENT
+                 # ================================================================
+                 print(
+                     f"🚀 Creating {len(cfg.evaluation.tasks)} endpoints in parallel..."
+                 )
+
+                 import queue
+                 import threading
+
+                 # Generate short endpoint names for all tasks
+                 task_endpoints = {}
+                 endpoint_creation_tasks = []
+
+                 for idx, task in enumerate(cfg.evaluation.tasks):
+                     # Create shorter endpoint names: e.g., "nim-gpqa-0-abc123"
+                     sanitized_task_name = task.name.replace("_", "-").lower()
+                     if sanitized_task_name.count(".") > 0:
+                         sanitized_task_name = sanitized_task_name.split(".")[-1]
+                     # Take only first 6 chars of task name to keep it short (leaving room for index)
+                     short_task_name = sanitized_task_name[:6]
+                     short_invocation = invocation_id[:6]
+                     task_index = str(idx)
+                     endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
+
+                     # Ensure we don't exceed 36 character limit
+                     if len(endpoint_name) > 36:
+                         # Truncate task name further if needed
+                         max_task_len = (
+                             36
+                             - len(cfg.deployment.type)
+                             - len(task_index)
+                             - len(short_invocation)
+                             - 3
+                         )  # 3 hyphens
+                         short_task_name = sanitized_task_name[:max_task_len]
+                         endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
+
+                     endpoint_names.append(endpoint_name)
+                     endpoint_creation_tasks.append((idx, task, endpoint_name))
+
+                 # Thread function to create a single endpoint
+                 def create_endpoint_worker(
+                     task_info: tuple[int, "DictConfig", str], result_queue: queue.Queue
+                 ) -> None:
+                     try:
+                         idx, task, endpoint_name = task_info
+                         print(f"🚀 Task {task.name}: Creating endpoint {endpoint_name}")
+
+                         # Create Lepton endpoint
+                         if not create_lepton_endpoint(cfg, endpoint_name):
+                             result_queue.put(
+                                 (
+                                     idx,
+                                     False,
+                                     f"Failed to create endpoint {endpoint_name}",
+                                     None,
+                                     None,
+                                 )
+                             )
+                             return
+
+                         # Wait for endpoint to be ready
+                         print(
+                             f"⏳ Task {task.name}: Waiting for endpoint {endpoint_name} to be ready..."
+                         )
+                         # Get timeout from config, default to 600 seconds if not set
+                         endpoint_timeout = (
+                             cfg.execution.get("lepton_platform", {})
+                             .get("deployment", {})
+                             .get("endpoint_readiness_timeout", 600)
+                         )
+                         if not wait_for_lepton_endpoint_ready(
+                             endpoint_name, timeout=endpoint_timeout
+                         ):
+                             result_queue.put(
+                                 (
+                                     idx,
+                                     False,
+                                     f"Endpoint {endpoint_name} failed to become ready",
+                                     None,
+                                     None,
+                                 )
+                             )
+                             return
+
+                         # Get endpoint URL
+                         endpoint_url = get_lepton_endpoint_url(endpoint_name)
+                         if not endpoint_url:
+                             result_queue.put(
+                                 (
+                                     idx,
+                                     False,
+                                     f"Could not get URL for endpoint {endpoint_name}",
+                                     None,
+                                     None,
+                                 )
+                             )
+                             return
+
+                         # Construct the full endpoint URL
+                         task_definition = get_task_from_mapping(
+                             task.name, tasks_mapping
+                         )
+                         task_endpoint_type = task_definition["endpoint_type"]
+                         endpoint_path = cfg.deployment.endpoints[task_endpoint_type]
+                         full_endpoint_url = f"{endpoint_url.rstrip('/')}{endpoint_path}"
+
+                         print(
+                             f"✅ Task {task.name}: Endpoint {endpoint_name} ready at {endpoint_url}"
+                         )
+                         result_queue.put(
+                             (
+                                 idx,
+                                 True,
+                                 None,
+                                 endpoint_name,
+                                 endpoint_url,
+                                 full_endpoint_url,
+                             )
+                         )
+
+                     except Exception as e:
+                         result_queue.put(
+                             (
+                                 idx,
+                                 False,
+                                 f"Exception creating endpoint: {e}",
+                                 None,
+                                 None,
+                             )
+                         )
+
+                 # Create and start threads for parallel endpoint creation
+                 result_queue: queue.Queue = queue.Queue()
+                 threads = []
+
+                 for task_info in endpoint_creation_tasks:
+                     thread = threading.Thread(
+                         target=create_endpoint_worker, args=(task_info, result_queue)
+                     )
+                     thread.start()
+                     threads.append(thread)
+
+                 # Wait for all threads to complete and collect results
+                 for thread in threads:
+                     thread.join()
+
+                 # Process results
+                 failed_endpoints = []
+                 for _ in range(len(endpoint_creation_tasks)):
+                     try:
+                         result = result_queue.get_nowait()
+                         idx = result[0]
+                         success = result[1]
+
+                         if success:
+                             _, _, _, endpoint_name, endpoint_url, full_endpoint_url = (
+                                 result
+                             )
+                             task_endpoints[idx] = {
+                                 "name": endpoint_name,
+                                 "url": endpoint_url,
+                                 "full_url": full_endpoint_url,
+                             }
+                         else:
+                             error_msg = result[2]
+                             failed_endpoints.append((idx, error_msg))
+                     except queue.Empty:
+                         break
+
+                 # Check if any endpoints failed
+                 if failed_endpoints:
+                     error_details = "; ".join(
+                         [f"Task {idx}: {msg}" for idx, msg in failed_endpoints]
+                     )
+                     raise RuntimeError(
+                         f"Failed to create {len(failed_endpoints)} endpoints: {error_details}"
+                     )
+
+                 print(
+                     f"✅ All {len(cfg.evaluation.tasks)} endpoints created successfully!"
+                 )
+
+             if dry_run:
+                 print("🔍 DRY RUN: Lepton job configurations prepared")
+                 print(f" - Tasks: {len(cfg.evaluation.tasks)}")
+                 for idx, task in enumerate(cfg.evaluation.tasks):
+                     if cfg.deployment.type == "none":
+                         print(f" - Task {idx}: {task.name} using shared endpoint")
+                     else:
+                         print(
+                             f" - Task {idx}: {task.name} with endpoint {endpoint_names[idx]}"
+                         )
+                 print(f" - Output directory: {output_dir}")
+                 print("\nTo submit jobs, run the executor without --dry-run")
+                 return invocation_id
+
+             # ================================================================
+             # JOB SUBMISSION (Sequential, as before)
+             # ================================================================
+             print(f"📝 Submitting {len(cfg.evaluation.tasks)} evaluation jobs...")
+
+             # Submit each evaluation task as a Lepton job
+             for idx, task in enumerate(cfg.evaluation.tasks):
+                 task_definition = get_task_from_mapping(task.name, tasks_mapping)
+
+                 # Create job ID and Lepton job name (max 36 chars)
+                 job_id = generate_job_id(invocation_id, idx)
+                 # Sanitized task name for RFC 1123 compliance (no underscores, lowercase)
+                 sanitized_task_name = task.name.replace("_", "-").lower()
+                 if sanitized_task_name.count(".") > 0:
+                     sanitized_task_name = sanitized_task_name.split(".")[-1]
+                 base_job_name = f"eval-{invocation_id[:6]}-{sanitized_task_name}"
+                 suffix = str(idx)
+
+                 # Ensure job name length is within 36 character limit
+                 max_base_length = 36 - 1 - len(suffix)  # -1 for the hyphen
+                 if len(base_job_name) > max_base_length:
+                     base_job_name = base_job_name[:max_base_length]
+
+                 lepton_job_name = f"{base_job_name}-{suffix}"
+                 job_ids.append(job_id)
+                 lepton_job_names.append(lepton_job_name)
+
+                 # Create task output directory (for result collection)
+                 task_output_dir = output_dir / task.name
+                 task_output_dir.mkdir(parents=True, exist_ok=True)
+
+                 # Determine evaluation image
+                 eval_image = task_definition["container"]
+                 if "container" in task:
+                     eval_image = task["container"]
+
+                 # Get endpoint info for this task
+                 endpoint_info = task_endpoints[idx]
+                 endpoint_name = endpoint_info["name"]
+                 endpoint_url = endpoint_info["url"]
+                 full_endpoint_url = endpoint_info["full_url"]
+
+                 # Temporarily set the target URL for this specific task
+                 from omegaconf import OmegaConf
+
+                 # Temporarily disable struct mode to allow URL modification
+                 was_struct = OmegaConf.is_struct(cfg)
+                 if was_struct:
+                     OmegaConf.set_struct(cfg, False)
+
+                 # Save original URL
+                 original_url = getattr(
+                     cfg.get("target", {}).get("api_endpoint", {}), "url", None
+                 )
+
+                 try:
+                     # Ensure target structure exists and set the task-specific URL
+                     if "target" not in cfg:
+                         cfg.target = OmegaConf.create({})
+                     if "api_endpoint" not in cfg.target:
+                         cfg.target.api_endpoint = OmegaConf.create({})
+
+                     cfg.target.api_endpoint.url = full_endpoint_url
+
+                     # Generate command with the correct endpoint URL
+                     eval_command = get_eval_factory_command(cfg, task, task_definition)
+
+                 finally:
+                     # Restore original URL and struct mode
+                     if original_url is not None:
+                         cfg.target.api_endpoint.url = original_url
+                     elif (
+                         "target" in cfg
+                         and "api_endpoint" in cfg.target
+                         and "url" in cfg.target.api_endpoint
+                     ):
+                         del cfg.target.api_endpoint.url
+
+                     if was_struct:
+                         OmegaConf.set_struct(cfg, True)
+
+                 # Create evaluation launch script
+                 launch_script = _create_evaluation_launch_script(
+                     cfg=cfg,
+                     task=task,
+                     task_definition=task_definition,
+                     endpoint_url=full_endpoint_url,
+                     task_name=task.name,
+                     invocation_id=invocation_id,
+                     eval_command=eval_command,  # Pass the fixed command
+                 )
+
+                 # Prepare job command to run the launch script
+                 container_command = [
+                     "/bin/bash",
+                     "-c",
+                     f"echo '{launch_script}' > /tmp/launch_script.sh && chmod +x /tmp/launch_script.sh && bash /tmp/launch_script.sh",
+                 ]
+
+                 # Get evaluation job settings from configuration
+                 eval_settings = getattr(cfg.execution, "evaluation_tasks", {})
+                 eval_resource_shape = eval_settings.get("resource_shape", "cpu.small")
+                 eval_timeout = eval_settings.get("timeout", 3600)
+                 use_shared_storage = eval_settings.get("use_shared_storage", True)
+
+                 # Get environment variables for the job
+                 task_config = cfg.execution.lepton_platform.tasks
+                 node_group = task_config.get("node_group", "default")
+
+                 # Import DictConfig for both env vars and mounts processing
+                 from omegaconf import DictConfig
+
+                 # Priority: lepton_platform.tasks.env_vars over cfg.execution.env_var_names
+                 job_env_vars = {}
+
+                 # Get env vars from lepton_platform config
+                 lepton_env_vars = task_config.get("env_vars", {})
+                 for key, value in lepton_env_vars.items():
+                     if isinstance(value, (dict, DictConfig)):
+                         # Convert DictConfig to dict to prevent stringification
+                         job_env_vars[key] = dict(value)
+                     else:
+                         job_env_vars[key] = value
+
+                 # Get mounts configuration and add invocation ID for isolation
+                 job_mounts = []
+                 original_mounts = task_config.get("mounts", [])
+
+                 for mount in original_mounts:
+                     # Create a copy of the mount with invocation ID added to path
+                     mount_dict = (
+                         dict(mount) if isinstance(mount, DictConfig) else mount.copy()
+                     )
+
+                     # Add invocation ID to the path for evaluation isolation
+                     if "path" in mount_dict:
+                         original_path = mount_dict["path"]
+                         # Add invocation ID subdirectory: /shared/nemo-evaluator-launcher-workspace/abc12345
+                         mount_dict["path"] = (
+                             f"{original_path.rstrip('/')}/{invocation_id}"
+                         )
+
+                     job_mounts.append(mount_dict)
+
+                 print(
+                     f" - Storage: {len(job_mounts)} mount(s) with evaluation ID isolation"
+                 )
+
+                 # Get image pull secrets
+                 image_pull_secrets = task_config.get("image_pull_secrets", [])
+
+                 # Submit the evaluation job to Lepton
+                 print(f"📝 Task {task.name}: Submitting job {lepton_job_name}")
+                 print(f" - Endpoint: {endpoint_name if endpoint_name else 'shared'}")
+                 print(f" - Resource: {eval_resource_shape}")
+
+                 job_success, error_msg = create_lepton_job(
+                     job_name=lepton_job_name,
+                     container_image=eval_image,
+                     command=container_command,
+                     resource_shape=eval_resource_shape,
+                     env_vars=job_env_vars,
+                     mounts=job_mounts,
+                     timeout=eval_timeout,
+                     node_group=node_group,
+                     image_pull_secrets=image_pull_secrets,
+                 )
+
+                 if not job_success:
+                     raise RuntimeError(
+                         f"Failed to submit Lepton job for task: {task.name}. Error: {error_msg}"
+                     )
+
+                 # Store job metadata in database (with task-specific endpoint info)
+                 db.write_job(
+                     job=JobData(
+                         invocation_id=invocation_id,
+                         job_id=job_id,
+                         timestamp=time.time(),
+                         executor="lepton",
+                         data={
+                             "endpoint_name": endpoint_name,  # Task-specific endpoint (or None for shared)
+                             "endpoint_url": endpoint_url,  # Task-specific URL (or shared)
+                             "lepton_job_name": lepton_job_name,
+                             "output_dir": str(task_output_dir),
+                             "task_name": task.name,
+                             "status": "submitted",
+                         },
+                         config=OmegaConf.to_object(cfg),  # type: ignore[arg-type]
+                     )
+                 )
+
+                 print(f"✅ Task {task.name}: Submitted evaluation job {job_id}")
+
+             # Jobs submitted successfully - return immediately (non-blocking)
+             print(
+                 f"\n✅ Successfully submitted {len(lepton_job_names)} evaluation jobs to Lepton"
+             )
+             print(
+                 " Each task running against its own dedicated endpoint for isolation"
+             )
+
+             print(f"\n📋 Invocation ID: {invocation_id}")
+             print(f"🔍 Check status: nemo-evaluator-launcher status {invocation_id}")
+             print(f"📋 Monitor logs: nemo-evaluator-launcher logs {invocation_id}")
+
+             if cfg.deployment.type != "none":
+                 print(f"🔗 Deployed {len(endpoint_names)} dedicated endpoints:")
+                 for i, endpoint_name in enumerate(endpoint_names):
+                     task_name = cfg.evaluation.tasks[i].name
+                     print(f" - {task_name}: {endpoint_name}")
+                 print(
+                     f"⚠️ Remember to clean up endpoints when done: nemo-evaluator-launcher kill {invocation_id}"
+                 )
+             else:
+                 print(f"📌 All tasks using shared endpoint: {shared_endpoint_url}")
+
+             print(f"📊 Evaluation results will be saved to: {output_dir}")
+
+             # Note: Jobs will continue running on Lepton infrastructure
+             # Status can be checked using nemo-evaluator-launcher status command
+
+             return invocation_id
+
+         except Exception as e:
+             # Clean up any created endpoints on failure
+             print(f"❌ Error during evaluation: {e}")
+             if cfg.deployment.type != "none" and "endpoint_names" in locals():
+                 for endpoint_name in endpoint_names:
+                     if endpoint_name:
+                         print(f"🧹 Cleaning up endpoint: {endpoint_name}")
+                         delete_lepton_endpoint(endpoint_name)
+             raise
+
+     @staticmethod
+     def get_status(id: str) -> List[ExecutionStatus]:
+         """Get the status of Lepton evaluation jobs and endpoints.
+
+         Args:
+             id: Unique job identifier or invocation identifier.
+
+         Returns:
+             List containing the execution status for the job(s) and endpoint(s).
+         """
+         db = ExecutionDB()
+
+         # If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
+         if len(id) == 8 and "." not in id:
+             return _get_statuses_for_invocation_id(id=id, db=db)
+         # Otherwise, treat as job_id
+         job_data = db.get_job(id)
+         if job_data is None:
+             return []
+         if job_data.executor != "lepton":
+             return []
+
+         # Check if this job has a Lepton job associated with it
+         lepton_job_name = job_data.data.get("lepton_job_name")
+         if lepton_job_name:
+             # Get live status from Lepton
+             lepton_status = get_lepton_job_status(lepton_job_name)
+             if lepton_status:
+                 job_state = lepton_status.get("state", "Unknown")
+
+                 # Map Lepton job states to our execution states
+                 if job_state == "Succeeded":
+                     state = ExecutionState.SUCCESS
+                 elif job_state in ["Running", "Pending", "Starting"]:
+                     state = ExecutionState.RUNNING
+                 elif job_state in ["Failed", "Cancelled"]:
+                     state = ExecutionState.FAILED
+                 else:
+                     state = ExecutionState.PENDING
+
+                 progress_info = {
+                     "type": "evaluation_job",
+                     "task_name": job_data.data.get("task_name", "unknown"),
+                     "lepton_job_name": lepton_job_name,
+                     "lepton_state": job_state,
+                     "start_time": lepton_status.get("start_time"),
+                     "end_time": lepton_status.get("end_time"),
+                     "endpoint_name": job_data.data.get("endpoint_name", "shared"),
+                 }
+
+                 return [ExecutionStatus(id=id, state=state, progress=progress_info)]
+
+         # Fallback to stored status
+         job_status = job_data.data.get("status", "unknown")
+
+         if job_status in ["running", "submitted"]:
+             state = ExecutionState.RUNNING
+         elif job_status in ["succeeded", "completed"]:
+             state = ExecutionState.SUCCESS
+         elif job_status in ["failed", "cancelled"]:
+             state = ExecutionState.FAILED
+         else:
+             state = ExecutionState.PENDING
+
+         progress_info = {
+             "type": "evaluation_job",
+             "task_name": job_data.data.get("task_name", "unknown"),
+             "status": job_status,
+             "lepton_job_name": job_data.data.get("lepton_job_name"),
+             "endpoint_name": job_data.data.get("endpoint_name", "shared"),
+         }
+
+         return [ExecutionStatus(id=id, state=state, progress=progress_info)]
+
+     @staticmethod
+     def kill_job(job_id: str) -> None:
+         """Kill Lepton evaluation jobs and clean up endpoints.
+
+         For invocation IDs, this will kill all jobs and clean up all
+         dedicated endpoints created for the invocation.
+
+         Args:
+             job_id: The job ID or invocation ID to kill.
+
+         Raises:
+             ValueError: If job is not found or invalid.
+             RuntimeError: If job cannot be killed.
+         """
+         db = ExecutionDB()
+
+         # If it looks like an invocation_id, kill all jobs for that invocation
+         if len(job_id) == 8 and "." not in job_id:
+             jobs = db.get_jobs(job_id)
+             if not jobs:
+                 raise ValueError(f"No jobs found for invocation {job_id}")
+
+             endpoint_names = (
+                 set()
+             )  # Use set to avoid duplicates (though each should be unique)
+             lepton_job_names = []
+
+             # Collect all Lepton jobs and endpoint info
+             for curr_job_data in jobs.values():
+                 if curr_job_data.executor != "lepton":
+                     continue
+
+                 # Collect endpoint name for this job (each task may have its own)
+                 endpoint_name = curr_job_data.data.get("endpoint_name")
+                 if endpoint_name:
+                     endpoint_names.add(endpoint_name)
+
+                 lepton_job_name = curr_job_data.data.get("lepton_job_name")
+                 if lepton_job_name:
+                     lepton_job_names.append(lepton_job_name)
+
+                 # Mark job as killed in database
+                 curr_job_data.data["status"] = "killed"
+                 curr_job_data.data["killed_time"] = time.time()
+                 db.write_job(curr_job_data)
+
+             print(
+                 f"🛑 Killing {len(lepton_job_names)} Lepton jobs for invocation {job_id}"
+             )
+
+             # Cancel all Lepton jobs
+             for lepton_job_name in lepton_job_names:
+                 success = delete_lepton_job(lepton_job_name)
+                 if success:
+                     print(f"✅ Cancelled Lepton job: {lepton_job_name}")
+                 else:
+                     print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
+
+             # Clean up all dedicated endpoints
+             if endpoint_names:
+                 print(f"🧹 Cleaning up {len(endpoint_names)} dedicated endpoints")
+                 for endpoint_name in endpoint_names:
+                     success = delete_lepton_endpoint(endpoint_name)
+                     if success:
+                         print(f"✅ Cleaned up endpoint: {endpoint_name}")
+                     else:
+                         print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
+             else:
+                 print("📌 No dedicated endpoints to clean up (using shared endpoint)")
+
+             print(f"🛑 Killed all resources for invocation {job_id}")
+             return
+
+         # Otherwise, treat as individual job_id
+         job_data = db.get_job(job_id)
+         if job_data is None:
+             raise ValueError(f"Job {job_id} not found")
+
+         if job_data.executor != "lepton":
+             raise ValueError(
+                 f"Job {job_id} is not a Lepton job (executor: {job_data.executor})"
+             )
+
+         # Cancel the specific Lepton job
+         lepton_job_name = job_data.data.get("lepton_job_name")
+         if lepton_job_name:
+             success = delete_lepton_job(lepton_job_name)
+             if success:
+                 print(f"✅ Cancelled Lepton job: {lepton_job_name}")
+             else:
+                 print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
+
+         # Mark job as killed in database
+         job_data.data["status"] = "killed"
+         job_data.data["killed_time"] = time.time()
+         db.write_job(job_data)
+
+         print(f"🛑 Killed Lepton job {job_id}")
+
+         # For individual jobs, also clean up the dedicated endpoint for this task
+         # Check if this was the last job using this specific endpoint
+         endpoint_name = job_data.data.get("endpoint_name")
+         if endpoint_name:
+             # Check if any other jobs are still using this endpoint
+             jobs = db.get_jobs(job_data.invocation_id)
+             other_jobs_using_endpoint = [
+                 j
+                 for j in jobs.values()
+                 if (
+                     j.data.get("endpoint_name") == endpoint_name
+                     and j.data.get("status")
+                     not in ["killed", "failed", "succeeded", "cancelled"]
+                     and j.job_id != job_id
+                 )
+             ]
+
+             if not other_jobs_using_endpoint:
+                 print(
+                     f"🧹 No other jobs using endpoint {endpoint_name}, cleaning up..."
+                 )
+                 success = delete_lepton_endpoint(endpoint_name)
+                 if success:
+                     print(f"✅ Cleaned up endpoint: {endpoint_name}")
+                 else:
+                     print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
+             else:
+                 print(
+                     f"📌 Keeping endpoint {endpoint_name} (still used by {len(other_jobs_using_endpoint)} other jobs)"
+                 )
+         else:
+             print("📌 No dedicated endpoint to clean up for this job")
+
+
+ def _create_evaluation_launch_script(
+     cfg: DictConfig,
+     task: DictConfig,
+     task_definition: dict,
+     endpoint_url: str,
+     task_name: str,
+     invocation_id: str,
+     eval_command: str,
+ ) -> str:
+     """Create bash script for running evaluation in Lepton job container.
+
+     Based on the proven approach from the old implementation.
+
+     Args:
+         cfg: The configuration object.
+         task: The evaluation task configuration.
+         task_definition: Task definition from mapping.
+         endpoint_url: URL of the deployed Lepton endpoint.
+         task_name: Name of the evaluation task.
+         invocation_id: Unique invocation identifier.
+         eval_command: The evaluation command with correct endpoint URL.
+
+     Returns:
+         String containing the bash launch script.
+     """
+     # Use the provided eval_command (already has correct endpoint URL)
+
+     # Construct output directory path
+     output_dir = f"{cfg.execution.output_dir}/{task_name}"
+
+     # Replace the output directory in the evaluation command
+     eval_command_modified = eval_command.replace(
+         "--output_dir /results", f"--output_dir {output_dir}"
+     )
+
+     # Create the launch script (based on old implementation)
+     script = f"""#!/bin/bash
+ set -e
+
+ # Create output directory structure
+ mkdir -p {output_dir}/artifacts
+ mkdir -p {output_dir}/logs
+
+ # Create stage files for status tracking
+ echo "started" > {output_dir}/logs/stage.pre-start
+ echo "running" > {output_dir}/logs/stage.running
+
+ # Log evaluation details
+ echo "Starting evaluation for task: {task_name}"
+ echo "Invocation ID: {invocation_id}"
+ echo "Endpoint URL: {endpoint_url}"
+ echo "Command: {eval_command_modified}"
+
+ # Execute the evaluation with proper error handling
+ set +e
+ {eval_command_modified}
+ exit_code=$?
+
+ # Set proper permissions
+ chmod 777 -R {output_dir} 2>/dev/null || true
+
+ # Record completion status
+ echo "exit_code: $exit_code" > {output_dir}/logs/stage.exit
+
+ if [ "$exit_code" -ne 0 ]; then
+     echo "Evaluation failed with exit code $exit_code" >&2
+     exit "$exit_code"
+ fi
+
+ echo "Evaluation completed successfully"
+ exit 0
+ """
+
+     return script
+
+
+ def _get_statuses_for_invocation_id(id: str, db: ExecutionDB) -> List[ExecutionStatus]:
+     """Helper method that returns statuses if id is the invocation id"""
+     jobs = db.get_jobs(id)
+     statuses: List[ExecutionStatus] = []
+
+     # Get status for all endpoints (each task may have its own)
+     endpoint_names = set()
+     for job_data in jobs.values():
+         endpoint_name = job_data.data.get("endpoint_name")
+         if endpoint_name:
+             endpoint_names.add(endpoint_name)
+
+     # Show status for each unique endpoint
+     for endpoint_name in endpoint_names:
+         endpoint_status = get_lepton_endpoint_status(endpoint_name)
+         if not endpoint_status:
+             logger.warning(
+                 "Could not get Lepton endpoint statuses",
+                 endpoint_name=endpoint_name,
+             )
+             return statuses
+
+         endpoint_state = endpoint_status.get("state", "Unknown")
+         if endpoint_state == "Ready":
+             state = ExecutionState.SUCCESS
+         elif endpoint_state in ["Starting", "Pending"]:
+             state = ExecutionState.RUNNING
+         else:
+             state = ExecutionState.FAILED
+
+         # Find which task(s) use this endpoint
+         using_tasks = [
+             job_data.data.get("task_name", "unknown")
+             for job_data in jobs.values()
+             if job_data.data.get("endpoint_name") == endpoint_name
+         ]
+
+         statuses.append(
+             ExecutionStatus(
+                 id=f"{id}-endpoint-{endpoint_name}",
+                 state=state,
+                 progress={
+                     "type": "endpoint",
+                     "name": endpoint_name,
+                     "state": endpoint_state,
+                     "url": endpoint_status.get("endpoint", {}).get("external_endpoint"),
+                     "tasks": using_tasks,
+                 },
+             )
+         )
+
+     # If no dedicated endpoints, note that shared endpoint is being used
+     if not endpoint_names:
+         statuses.append(
+             ExecutionStatus(
+                 id=f"{id}-endpoint-shared",
+                 state=ExecutionState.SUCCESS,
+                 progress={
+                     "type": "endpoint",
+                     "name": "shared",
+                     "state": "Using existing endpoint",
+                     "url": "external",
+                     "tasks": [
+                         job_data.data.get("task_name", "unknown")
+                         for job_data in jobs.values()
+                     ],
+                 },
+             )
+         )
+
+     # Get individual job statuses
+     for job_id, job_data in jobs.items():
+         statuses.extend(LeptonExecutor.get_status(job_id))
+     return statuses
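
A note on the pattern used in execute_eval above: endpoint creation is fanned out to one thread per evaluation task and results are funneled back through a queue.Queue, so a single failed endpoint aborts the run before any evaluation jobs are submitted. The following is a minimal, self-contained sketch of that pattern only; create_endpoint is a hypothetical stand-in, not part of the package's API.

import queue
import threading
from typing import Dict, List, Tuple

def create_endpoint(name: str) -> str:
    # Hypothetical stand-in for create_lepton_endpoint +
    # wait_for_lepton_endpoint_ready + get_lepton_endpoint_url.
    return f"https://{name}.example.invalid"

def _worker(idx: int, name: str, results: queue.Queue) -> None:
    # Report success or failure through the queue instead of raising in the thread.
    try:
        url = create_endpoint(name)
        results.put((idx, True, "", url))
    except Exception as exc:
        results.put((idx, False, str(exc), ""))

def create_endpoints_in_parallel(names: List[str]) -> Dict[int, str]:
    results: queue.Queue = queue.Queue()
    threads = [
        threading.Thread(target=_worker, args=(i, name, results))
        for i, name in enumerate(names)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    endpoints: Dict[int, str] = {}
    failures: List[Tuple[int, str]] = []
    while not results.empty():
        idx, ok, err, url = results.get_nowait()
        if ok:
            endpoints[idx] = url
        else:
            failures.append((idx, err))
    if failures:
        # Mirror the executor's behavior: abort the whole run if any endpoint failed.
        raise RuntimeError(f"Failed to create {len(failures)} endpoints: {failures}")
    return endpoints

if __name__ == "__main__":
    print(create_endpoints_in_parallel(["vllm-gpqa-0-abc123", "vllm-mmlu-1-abc123"]))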