nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
nemo_evaluator_launcher/executors/lepton/deployment_helpers.py

@@ -19,6 +19,7 @@ Handles Lepton endpoint creation, management, and health checks.
 """
 
 import json
+import shlex
 import subprocess
 import time
 from pathlib import Path
@@ -27,6 +28,7 @@ from typing import Any, Dict, Optional
 # Import lepton dependencies
 from omegaconf import DictConfig
 
+from nemo_evaluator_launcher.common.helpers import _str_to_echo_command
 from nemo_evaluator_launcher.common.logging_utils import logger
 
 
@@ -235,6 +237,8 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, Any]:
     Returns:
         Container specification for Lepton.
     """
+    # Extract pre_cmd from deployment_cfg
+    pre_cmd: str = deployment_cfg.get("pre_cmd") or ""
     container_spec = {
         "image": deployment_cfg.image,
         "ports": [{"container_port": deployment_cfg.port}],
@@ -258,6 +262,18 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, Any]:
         if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
             command_parts.extend(deployment_cfg.extra_args.split())
 
+        # Wrap with pre_cmd if provided
+        if pre_cmd:
+            create_pre_script_cmd = _str_to_echo_command(
+                pre_cmd, filename="deployment_pre_cmd.sh"
+            )
+            original_cmd = " ".join(shlex.quote(str(c)) for c in command_parts)
+            command_parts = [
+                "/bin/bash",
+                "-c",
+                f"{create_pre_script_cmd.cmd} && source deployment_pre_cmd.sh && exec {original_cmd}",
+            ]
+
         container_spec["command"] = command_parts
 
     elif deployment_cfg.type == "sglang":
@@ -278,12 +294,31 @@ def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, Any]:
         if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
             command_parts.extend(deployment_cfg.extra_args.split())
 
+        # Wrap with pre_cmd if provided
+        if pre_cmd:
+            create_pre_script_cmd = _str_to_echo_command(
+                pre_cmd, filename="deployment_pre_cmd.sh"
+            )
+            original_cmd = " ".join(shlex.quote(str(c)) for c in command_parts)
+            command_parts = [
+                "/bin/bash",
+                "-c",
+                f"{create_pre_script_cmd.cmd} && source deployment_pre_cmd.sh && exec {original_cmd}",
+            ]
+
         container_spec["command"] = command_parts
 
     elif deployment_cfg.type == "nim":
         # NIM containers use their default entrypoint - no custom command needed
         # Configuration is handled via environment variables
-        pass
+        # pre_cmd is not supported for NIM deployments
+        if pre_cmd:
+            logger.error(
+                "pre_cmd is not supported for NIM deployments",
+                deployment_type="nim",
+                pre_cmd=pre_cmd,
+            )
+            raise ValueError("pre_cmd is not supported for NIM deployments")
 
     return container_spec
 
@@ -428,14 +463,34 @@ def create_lepton_endpoint(cfg: DictConfig, endpoint_name: str) -> bool:
             print(f"✅ Successfully created Lepton endpoint: {endpoint_name}")
             return True
         else:
-            print(f"❌ Failed to create Lepton endpoint: {result.stderr}")
+            error_msg = result.stderr.strip() if result.stderr else ""
+            output_msg = result.stdout.strip() if result.stdout else ""
+            print(
+                f"✗ Failed to create Lepton endpoint | Endpoint: {endpoint_name} | Return code: {result.returncode}"
+            )
+            if error_msg:
+                print(f" stderr: {error_msg}")
+            if output_msg:
+                print(f" stdout: {output_msg}")
             return False
 
-    except subprocess.TimeoutExpired:
-        print(f"❌ Timeout creating Lepton endpoint: {endpoint_name}")
+    except subprocess.TimeoutExpired as e:
+        print(
+            f"✗ Timeout creating Lepton endpoint | Endpoint: {endpoint_name} | Timeout: 300s"
+        )
+        if hasattr(e, "stderr") and e.stderr:
+            print(f" stderr: {e.stderr}")
+        if hasattr(e, "stdout") and e.stdout:
+            print(f" stdout: {e.stdout}")
         return False
     except subprocess.CalledProcessError as e:
-        print(f"❌ Error creating Lepton endpoint: {e}")
+        print(
+            f"✗ Error creating Lepton endpoint | Endpoint: {endpoint_name} | Error: {e}"
+        )
+        if hasattr(e, "stderr") and e.stderr:
+            print(f" stderr: {e.stderr}")
+        if hasattr(e, "stdout") and e.stdout:
+            print(f" stdout: {e.stdout}")
        return False
     finally:
         # Clean up temporary file
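For reference, a minimal sketch of how the pre_cmd wrapping added to _create_inference_container_spec composes the final container command. make_pre_script_cmd below is a hypothetical stand-in for the launcher's _str_to_echo_command helper (its implementation is not shown in this diff), and the server command and pre_cmd values are made-up examples:

import base64
import shlex

def make_pre_script_cmd(pre_cmd: str, filename: str) -> str:
    # Hypothetical stand-in: encode the user-provided pre_cmd so arbitrary quoting
    # survives the shell, and emit a command that materializes it as a script file.
    encoded = base64.b64encode(pre_cmd.encode()).decode()
    return f"echo {encoded} | base64 -d > {shlex.quote(filename)}"

server_cmd = ["python3", "-m", "my_inference_server", "--port", "8000"]  # example only
pre_cmd = "pip install some-extra-dependency"                            # example only

create_script = make_pre_script_cmd(pre_cmd, "deployment_pre_cmd.sh")
original_cmd = " ".join(shlex.quote(c) for c in server_cmd)
# Same shape as the wrapped command_parts in the hunks above:
wrapped = ["/bin/bash", "-c", f"{create_script} && source deployment_pre_cmd.sh && exec {original_cmd}"]
print(wrapped)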
nemo_evaluator_launcher/executors/lepton/executor.py

@@ -18,6 +18,7 @@
 Handles deployment and evaluation using Lepton endpoints with NIM containers.
 """
 
+import os
 import time
 from pathlib import Path
 from typing import List
@@ -36,6 +37,7 @@ from nemo_evaluator_launcher.common.mapping import (
     get_task_from_mapping,
     load_tasks_mapping,
 )
+from nemo_evaluator_launcher.common.printing_utils import red
 from nemo_evaluator_launcher.executors.base import (
     BaseExecutor,
     ExecutionState,
@@ -78,9 +80,75 @@ class LeptonExecutor(BaseExecutor):
                 "LeptonExecutor supports deployment types: 'vllm', 'sglang', 'nim', 'none'"
             )
 
+        # Load tasks mapping
+        tasks_mapping = load_tasks_mapping()
+        job_ids = []
+        lepton_job_names = []
+        endpoint_names = []  # Track multiple endpoints
+        db = ExecutionDB()
+
         # Generate invocation ID
         invocation_id = generate_invocation_id()
 
+        # TODO(agronskiy): the structure of this executor differs from others,
+        # so the best place to check for unsafe commands yelids a bit of duplication.
+        # We can't use the get_eval_factory_command here because the port is not yet
+        # populated.
+        # Refactor the whole thing.
+        is_potentially_unsafe = False
+        for idx, task in enumerate(cfg.evaluation.tasks):
+            pre_cmd: str = task.get("pre_cmd") or cfg.evaluation.get("pre_cmd") or ""
+            if pre_cmd:
+                is_potentially_unsafe = True
+                break
+
+        # Check for deployment pre_cmd
+        deployment_pre_cmd: str = cfg.deployment.get("pre_cmd") or ""
+        if deployment_pre_cmd:
+            is_potentially_unsafe = True
+
+        # DRY-RUN mode
+        if dry_run:
+            output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            # Validate configuration
+            _dry_run_lepton(cfg, tasks_mapping, invocation_id=invocation_id)
+
+            if cfg.deployment.type == "none":
+                print("Using existing endpoint (deployment: none)")
+                print("using shared endpoint")
+            else:
+                print(f"with endpoint type '{cfg.deployment.type}'")
+
+            if is_potentially_unsafe:
+                print(
+                    red(
+                        "\nFound `pre_cmd` (evaluation or deployment) which carries security risk. When running without --dry-run "
+                        "make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1"
+                    )
+                )
+
+            return invocation_id
+
+        if is_potentially_unsafe:
+            if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
+                logger.warning(
+                    "Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is set, proceeding with caution."
+                )
+
+            else:
+                logger.error(
+                    "Found non-empty commands (e.g. `pre_cmd` in evaluation or deployment) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is not set. This might carry security risk and unstable environments. "
+                    "To continue, make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1.",
+                )
+                raise AttributeError(
+                    "Untrusted command found in config, make sure you trust and "
+                    "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
+                )
+
         # For deployment: none, we use the existing endpoint for all tasks
         if cfg.deployment.type == "none":
             print("📌 Using existing endpoint (deployment: none)")
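A minimal sketch of the NEMO_EVALUATOR_TRUST_PRE_CMD gate added above, assuming only what the hunk shows (the function name and boolean argument here are illustrative, not the launcher's API):

import os

def ensure_pre_cmd_trusted(has_pre_cmd: bool) -> None:
    # Illustrative gate: refuse to run an untrusted pre_cmd unless explicitly opted in.
    if not has_pre_cmd:
        return
    if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
        return  # user explicitly opted in
    raise AttributeError(
        "Untrusted command found in config, make sure you trust and "
        "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
    )

# Opt in (only when the pre_cmd content is trusted):
# os.environ["NEMO_EVALUATOR_TRUST_PRE_CMD"] = "1"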
@@ -88,13 +156,6 @@ class LeptonExecutor(BaseExecutor):
             print(f"✅ Using shared endpoint: {shared_endpoint_url}")
 
         try:
-            # Load tasks mapping
-            tasks_mapping = load_tasks_mapping()
-            job_ids = []
-            lepton_job_names = []
-            endpoint_names = []  # Track multiple endpoints
-            db = ExecutionDB()
-
             # Create local directory for outputs
             output_dir = Path(cfg.execution.output_dir).absolute() / invocation_id
             output_dir.mkdir(parents=True, exist_ok=True)
@@ -139,8 +200,13 @@ class LeptonExecutor(BaseExecutor):
                 task_index = str(idx)
                 endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
 
-                # Ensure we don't exceed 36 character limit
                 if len(endpoint_name) > 36:
+                    logger.info(
+                        "Lepton endpoint name will be deployed under name {task_name}",
+                        task_name=task.name,
+                        original=endpoint_name,
+                        limit=36,
+                    )
                     # Truncate task name further if needed
                     max_task_len = (
                         36
@@ -151,7 +217,19 @@ class LeptonExecutor(BaseExecutor):
                     )  # 3 hyphens
                     short_task_name = sanitized_task_name[:max_task_len]
                     endpoint_name = f"{cfg.deployment.type}-{short_task_name}-{task_index}-{short_invocation}"
+                    logger.info(
+                        "Lepton endpoint name is auto-generated",
+                        task_name=task.name,
+                        original=endpoint_name,
+                        truncated=endpoint_name,
+                        limit=36,
+                    )
 
+                logger.info(
+                    "Lepton endpoint name (auto-generated)",
+                    task_name=task.name,
+                    endpoint_name=endpoint_name,
+                )
                 endpoint_names.append(endpoint_name)
                 endpoint_creation_tasks.append((idx, task, endpoint_name))
 
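Illustrative sketch of the 36-character endpoint-name budget applied above; the exact expression for max_task_len is elided in the diff, so the arithmetic here is an assumption based on the visible components and the "3 hyphens" comment (all values are examples):

deployment_type = "vllm"
sanitized_task_name = "mmlu-pro-instruct-long-task-name"
task_index = "0"
short_invocation = "a1b2c3"

endpoint_name = f"{deployment_type}-{sanitized_task_name}-{task_index}-{short_invocation}"
if len(endpoint_name) > 36:
    # Assumed budget: 36 minus the other components and the three joining hyphens.
    max_task_len = 36 - len(deployment_type) - len(task_index) - len(short_invocation) - 3
    short_task_name = sanitized_task_name[:max_task_len]
    endpoint_name = f"{deployment_type}-{short_task_name}-{task_index}-{short_invocation}"
print(endpoint_name, len(endpoint_name))  # fits within the 36-character limit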
@@ -298,20 +376,6 @@ class LeptonExecutor(BaseExecutor):
                     f"✅ All {len(cfg.evaluation.tasks)} endpoints created successfully!"
                 )
 
-            if dry_run:
-                print("🔍 DRY RUN: Lepton job configurations prepared")
-                print(f" - Tasks: {len(cfg.evaluation.tasks)}")
-                for idx, task in enumerate(cfg.evaluation.tasks):
-                    if cfg.deployment.type == "none":
-                        print(f" - Task {idx}: {task.name} using shared endpoint")
-                    else:
-                        print(
-                            f" - Task {idx}: {task.name} with endpoint {endpoint_names[idx]}"
-                        )
-                print(f" - Output directory: {output_dir}")
-                print("\nTo submit jobs, run the executor without --dry-run")
-                return invocation_id
-
             # ================================================================
             # JOB SUBMISSION (Sequential, as before)
             # ================================================================
@@ -334,8 +398,18 @@ class LeptonExecutor(BaseExecutor):
                 max_base_length = 36 - 1 - len(suffix)  # -1 for the hyphen
                 if len(base_job_name) > max_base_length:
                     base_job_name = base_job_name[:max_base_length]
+                    logger.info(
+                        "Lepton job auto-generated name",
+                        task_name=task.name,
+                        job_name=f"{base_job_name}-{suffix}",
+                    )
 
                 lepton_job_name = f"{base_job_name}-{suffix}"
+                logger.info(
+                    "Lepton job name (auto-generated)",
+                    task_name=task.name,
+                    job_name=lepton_job_name,
+                )
                 job_ids.append(job_id)
                 lepton_job_names.append(lepton_job_name)
 
@@ -377,7 +451,12 @@ class LeptonExecutor(BaseExecutor):
                     cfg.target.api_endpoint.url = full_endpoint_url
 
                     # Generate command with the correct endpoint URL
-                    eval_command = get_eval_factory_command(cfg, task, task_definition)
+                    eval_command_struct = get_eval_factory_command(
+                        cfg, task, task_definition
+                    )
+                    eval_command = eval_command_struct.cmd
+                    # Debug string for explainability of some base64-parts of the command
+                    eval_command_debug_comment = eval_command_struct.debug
 
                 finally:
                     # Restore original URL and struct mode
@@ -402,6 +481,7 @@ class LeptonExecutor(BaseExecutor):
                     task_name=task.name,
                     invocation_id=invocation_id,
                     eval_command=eval_command,  # Pass the fixed command
+                    eval_command_debug_comment=eval_command_debug_comment,
                 )
 
                 # Prepare job command to run the launch script
@@ -456,6 +536,33 @@ class LeptonExecutor(BaseExecutor):
 
                     job_mounts.append(mount_dict)
 
+                # Handle dataset directory mounting if NEMO_EVALUATOR_DATASET_DIR is required
+                if "NEMO_EVALUATOR_DATASET_DIR" in task_definition.get(
+                    "required_env_vars", []
+                ):
+                    # Get dataset directory from task config
+                    if "dataset_dir" in task:
+                        dataset_mount_host = task["dataset_dir"]
+                    else:
+                        raise ValueError(
+                            f"{task.name} task requires a dataset_dir to be specified. "
+                            f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
+                        )
+                    # Get container mount path (default to /datasets if not specified)
+                    dataset_mount_container = task.get(
+                        "dataset_mount_path", "/datasets"
+                    )
+                    # Add dataset mount to job mounts
+                    # Lepton mount format: {"path": "/path/in/container", "mount_from": {"path": "/host/path"}}
+                    job_mounts.append(
+                        {
+                            "path": dataset_mount_container,
+                            "mount_from": {"path": dataset_mount_host},
+                        }
+                    )
+                    # Add NEMO_EVALUATOR_DATASET_DIR environment variable
+                    job_env_vars["NEMO_EVALUATOR_DATASET_DIR"] = dataset_mount_container
+
                 print(
                     f" - Storage: {len(job_mounts)} mount(s) with evaluation ID isolation"
                 )
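For a task that needs a local dataset, the hunk above produces a mount entry and environment variable of this shape (the task entry and paths below are made-up examples):

task = {"name": "my-task", "dataset_dir": "/data/my_dataset"}  # hypothetical task config entry

dataset_mount_host = task["dataset_dir"]
dataset_mount_container = task.get("dataset_mount_path", "/datasets")

mount = {
    "path": dataset_mount_container,             # path inside the job container
    "mount_from": {"path": dataset_mount_host},  # host-side dataset location
}
env = {"NEMO_EVALUATOR_DATASET_DIR": dataset_mount_container}
print(mount, env)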
@@ -482,7 +589,8 @@ class LeptonExecutor(BaseExecutor):
 
                 if not job_success:
                     raise RuntimeError(
-                        f"Failed to submit Lepton job for task: {task.name}. Error: {error_msg}"
+                        f"Failed to submit Lepton job | Task: {task.name} | Job ID: {job_id} | "
+                        f"Lepton job name: {lepton_job_name} | Error: {error_msg}"
                     )
 
                 # Store job metadata in database (with task-specific endpoint info)
@@ -504,8 +612,6 @@ class LeptonExecutor(BaseExecutor):
                     )
                 )
 
-                print(f"✅ Task {task.name}: Submitted evaluation job {job_id}")
-
             # Jobs submitted successfully - return immediately (non-blocking)
             print(
                 f"\n✅ Successfully submitted {len(lepton_job_names)} evaluation jobs to Lepton"
@@ -536,9 +642,8 @@ class LeptonExecutor(BaseExecutor):
 
             return invocation_id
 
-        except Exception as e:
+        except Exception:
             # Clean up any created endpoints on failure
-            print(f"❌ Error during evaluation: {e}")
             if cfg.deployment.type != "none" and "endpoint_names" in locals():
                 for endpoint_name in endpoint_names:
                     if endpoint_name:
@@ -559,7 +664,7 @@ class LeptonExecutor(BaseExecutor):
         db = ExecutionDB()
 
         # If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
-        if len(id) == 8 and "." not in id:
+        if "." not in id:
             return _get_statuses_for_invocation_id(id=id, db=db)
         # Otherwise, treat as job_id
         job_data = db.get_job(id)
@@ -577,7 +682,7 @@ class LeptonExecutor(BaseExecutor):
             job_state = lepton_status.get("state", "Unknown")
 
             # Map Lepton job states to our execution states
-            if job_state == "Succeeded":
+            if job_state in ["Succeeded", "Completed"]:
                 state = ExecutionState.SUCCESS
             elif job_state in ["Running", "Pending", "Starting"]:
                 state = ExecutionState.RUNNING
@@ -624,76 +729,14 @@ class LeptonExecutor(BaseExecutor):
     def kill_job(job_id: str) -> None:
         """Kill Lepton evaluation jobs and clean up endpoints.
 
-        For invocation IDs, this will kill all jobs and clean up all
-        dedicated endpoints created for the invocation.
-
         Args:
-            job_id: The job ID or invocation ID to kill.
+            job_id: The job ID to kill.
 
         Raises:
             ValueError: If job is not found or invalid.
             RuntimeError: If job cannot be killed.
         """
         db = ExecutionDB()
-
-        # If it looks like an invocation_id, kill all jobs for that invocation
-        if len(job_id) == 8 and "." not in job_id:
-            jobs = db.get_jobs(job_id)
-            if not jobs:
-                raise ValueError(f"No jobs found for invocation {job_id}")
-
-            endpoint_names = (
-                set()
-            )  # Use set to avoid duplicates (though each should be unique)
-            lepton_job_names = []
-
-            # Collect all Lepton jobs and endpoint info
-            for curr_job_data in jobs.values():
-                if curr_job_data.executor != "lepton":
-                    continue
-
-                # Collect endpoint name for this job (each task may have its own)
-                endpoint_name = curr_job_data.data.get("endpoint_name")
-                if endpoint_name:
-                    endpoint_names.add(endpoint_name)
-
-                lepton_job_name = curr_job_data.data.get("lepton_job_name")
-                if lepton_job_name:
-                    lepton_job_names.append(lepton_job_name)
-
-                # Mark job as killed in database
-                curr_job_data.data["status"] = "killed"
-                curr_job_data.data["killed_time"] = time.time()
-                db.write_job(curr_job_data)
-
-            print(
-                f"🛑 Killing {len(lepton_job_names)} Lepton jobs for invocation {job_id}"
-            )
-
-            # Cancel all Lepton jobs
-            for lepton_job_name in lepton_job_names:
-                success = delete_lepton_job(lepton_job_name)
-                if success:
-                    print(f"✅ Cancelled Lepton job: {lepton_job_name}")
-                else:
-                    print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
-
-            # Clean up all dedicated endpoints
-            if endpoint_names:
-                print(f"🧹 Cleaning up {len(endpoint_names)} dedicated endpoints")
-                for endpoint_name in endpoint_names:
-                    success = delete_lepton_endpoint(endpoint_name)
-                    if success:
-                        print(f"✅ Cleaned up endpoint: {endpoint_name}")
-                    else:
-                        print(f"⚠️ Failed to cleanup endpoint: {endpoint_name}")
-            else:
-                print("📌 No dedicated endpoints to clean up (using shared endpoint)")
-
-            print(f"🛑 Killed all resources for invocation {job_id}")
-            return
-
-        # Otherwise, treat as individual job_id
         job_data = db.get_job(job_id)
         if job_data is None:
             raise ValueError(f"Job {job_id} not found")
@@ -705,17 +748,25 @@ class LeptonExecutor(BaseExecutor):
 
         # Cancel the specific Lepton job
         lepton_job_name = job_data.data.get("lepton_job_name")
+
         if lepton_job_name:
-            success = delete_lepton_job(lepton_job_name)
-            if success:
+            cancel_success = delete_lepton_job(lepton_job_name)
+            if cancel_success:
                 print(f"✅ Cancelled Lepton job: {lepton_job_name}")
+                # Mark job as killed in database
+                job_data.data["status"] = "killed"
+                job_data.data["killed_time"] = time.time()
+                db.write_job(job_data)
             else:
-                print(f"⚠️ Failed to cancel Lepton job: {lepton_job_name}")
-
-        # Mark job as killed in database
-        job_data.data["status"] = "killed"
-        job_data.data["killed_time"] = time.time()
-        db.write_job(job_data)
+                # Use common helper to get informative error message based on job status
+                status_list = LeptonExecutor.get_status(job_id)
+                current_status = status_list[0].state if status_list else None
+                error_msg = LeptonExecutor.get_kill_failure_message(
+                    job_id, f"lepton_job: {lepton_job_name}", current_status
+                )
+                raise RuntimeError(error_msg)
+        else:
+            raise ValueError(f"No Lepton job name found for job {job_id}")
 
         print(f"🛑 Killed Lepton job {job_id}")
 
@@ -761,6 +812,7 @@ def _create_evaluation_launch_script(
     task_name: str,
     invocation_id: str,
     eval_command: str,
+    eval_command_debug_comment: str,
 ) -> str:
     """Create bash script for running evaluation in Lepton job container.
 
@@ -774,6 +826,7 @@ def _create_evaluation_launch_script(
         task_name: Name of the evaluation task.
         invocation_id: Unique invocation identifier.
         eval_command: The evaluation command with correct endpoint URL.
+        eval_command_debug_comment: The debug comment for placing into the script and easy debug
 
     Returns:
         String containing the bash launch script.
@@ -806,6 +859,8 @@ echo "Invocation ID: {invocation_id}"
 echo "Endpoint URL: {endpoint_url}"
 echo "Command: {eval_command_modified}"
 
+{eval_command_debug_comment}
+
 # Execute the evaluation with proper error handling
 set +e
 {eval_command_modified}
@@ -829,6 +884,90 @@ exit 0
     return script
 
 
+def _dry_run_lepton(
+    cfg: DictConfig, tasks_mapping: dict, invocation_id: str | None = None
+) -> None:
+    print("DRY RUN: Lepton job configurations prepared")
+    try:
+        # validate tasks
+        for task in cfg.evaluation.tasks:
+            get_task_from_mapping(task.name, tasks_mapping)
+
+        # nice-to-have checks (existing endpoint URL or endpoints mapping)
+        if getattr(cfg.deployment, "type", None) == "none":
+            tgt = getattr(cfg, "target", {})
+            api = (
+                tgt.get("api_endpoint")
+                if isinstance(tgt, dict)
+                else getattr(tgt, "api_endpoint", None)
+            ) or {}
+            url = api.get("url") if isinstance(api, dict) else getattr(api, "url", None)
+            if not url or not str(url).strip():
+                raise ValueError(
+                    "target.api_endpoint.url must be set when deployment.type == 'none'"
+                )
+        else:
+            endpoints_cfg = getattr(cfg.deployment, "endpoints", {}) or {}
+            for task in cfg.evaluation.tasks:
+                td = get_task_from_mapping(task.name, tasks_mapping)
+                etype = td.get("endpoint_type")
+                if etype not in endpoints_cfg:
+                    raise ValueError(
+                        f"deployment.endpoints missing path for endpoint_type '{etype}' (task '{task.name}')"
+                    )
+                path = endpoints_cfg.get(etype)
+                if not isinstance(path, str) or not path.startswith("/"):
+                    raise ValueError(
+                        f"deployment.endpoints['{etype}'] must be a non-empty path starting with '/'"
+                    )
+
+        # lepton env var presence (reference-level)
+        tasks_cfg = getattr(cfg.execution, "lepton_platform", {}).get("tasks", {}) or {}
+        lepton_env_vars = tasks_cfg.get("env_vars", {}) or {}
+        api_key_name = getattr(
+            getattr(cfg, "target", {}).get("api_endpoint", {}), "api_key_name", None
+        )
+        for task in cfg.evaluation.tasks:
+            td = get_task_from_mapping(task.name, tasks_mapping)
+            required = td.get("required_env_vars", []) or []
+            for var in required:
+                # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic
+                if var == "NEMO_EVALUATOR_DATASET_DIR":
+                    if "dataset_dir" not in task:
+                        raise ValueError(
+                            f"Task '{task.name}' requires dataset_dir to be specified. "
+                            f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
+                        )
+                    continue
+                if var == "API_KEY":
+                    if not (("API_KEY" in lepton_env_vars) or bool(api_key_name)):
+                        raise ValueError(
+                            f"Task '{task.name}' requires API_KEY: set execution.lepton_platform.tasks.env_vars.API_KEY "
+                            "or target.api_endpoint.api_key_name"
+                        )
+                else:
+                    if var not in lepton_env_vars:
+                        raise ValueError(
+                            f"Task '{task.name}' requires {var}: set it under execution.lepton_platform.tasks.env_vars"
+                        )
+
+        # success (use realized output directory if invocation_id is available)
+        preview_output_dir = (
+            Path(cfg.execution.output_dir).absolute() / invocation_id
+            if invocation_id
+            else Path(cfg.execution.output_dir).absolute() / "<invocation_id>"
+        )
+        print(f" - Tasks: {len(cfg.evaluation.tasks)}")
+        for idx, task in enumerate(cfg.evaluation.tasks):
+            print(f" - Task {idx}: {task.name}")
+        print(f" - Output directory: {preview_output_dir}")
+        print("\nTo run evaluation, execute run command without --dry-run")
+    except Exception as e:
+        print(f"❌ Configuration invalid: {e}")
+        logger.error("Lepton dry-run validation failed", error=str(e))
+        return
+
+
 def _get_statuses_for_invocation_id(id: str, db: ExecutionDB) -> List[ExecutionStatus]:
     """Helper method that returns statuses if id is the invocation id"""
     jobs = db.get_jobs(id)
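The endpoints mapping validated in _dry_run_lepton above is a simple endpoint_type-to-URL-path dictionary; an illustrative shape is shown below (the types and paths are examples, not the launcher's defaults):

endpoints_cfg = {
    "chat": "/v1/chat/completions",   # example endpoint_type and path
    "completions": "/v1/completions",
}
# Each value must be a non-empty string starting with "/", as checked above.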
nemo_evaluator_launcher/executors/lepton/job_helpers.py

@@ -23,13 +23,6 @@ import subprocess
 import time
 from typing import Any, List, Union
 
-from leptonai.api.v1.types.affinity import LeptonResourceAffinity
-from leptonai.api.v1.types.common import LeptonVisibility, Metadata
-from leptonai.api.v1.types.deployment import EnvVar, LeptonContainer, Mount
-from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
-
-# Import lepton dependencies
-from leptonai.api.v2.client import APIClient
 from omegaconf import DictConfig
 
 from nemo_evaluator_launcher.common.logging_utils import logger
@@ -92,6 +85,18 @@ def _create_lepton_job_api(
 ) -> tuple[bool, str]:
     """Create Lepton job using API client (preferred method)."""
     try:
+        # Import leptonai dependencies locally
+        from leptonai.api.v1.types.affinity import LeptonResourceAffinity
+        from leptonai.api.v1.types.common import LeptonVisibility, Metadata
+        from leptonai.api.v1.types.deployment import (
+            EnvValue,
+            EnvVar,
+            LeptonContainer,
+            Mount,
+        )
+        from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec
+        from leptonai.api.v2.client import APIClient
+
         client = APIClient()
 
         # Prepare environment variables (support both direct values and secret references)
@@ -99,12 +104,8 @@ def _create_lepton_job_api(
         if env_vars:
             for key, value in env_vars.items():
                 # Handle both regular dicts and OmegaConf objects
-                from omegaconf import DictConfig
-
                 if isinstance(value, (dict, DictConfig)) and "value_from" in value:
                     # Secret reference: {value_from: {secret_name_ref: "secret_name"}}
-                    from leptonai.api.v1.types.deployment import EnvValue
-
                     # Convert OmegaConf to dict if needed
                     value_dict = dict(value) if isinstance(value, DictConfig) else value
                     env_var = EnvVar(
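An illustrative env_vars mapping in the shape handled above, with one secret reference and one direct value (the variable and secret names are made-up examples):

env_vars = {
    "HF_TOKEN": {"value_from": {"secret_name_ref": "my-hf-token-secret"}},  # secret reference
    "EXTRA_FLAG": "1",                                                      # direct value
}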
@@ -203,6 +204,9 @@ def get_lepton_job_status(job_name_or_id: str) -> dict[Any, Any] | None:
 def _get_lepton_job_status_api(job_name_or_id: str) -> dict[Any, Any] | None:
     """Get job status using API client (preferred method)."""
     try:
+        # Import leptonai dependencies locally
+        from leptonai.api.v2.client import APIClient
+
         client = APIClient()
 
         # Try to get job by ID first, then by name