nemo-evaluator-launcher 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Advisory: this release of nemo-evaluator-launcher has been flagged as potentially problematic.

Files changed (60):
  1. nemo_evaluator_launcher/__init__.py +79 -0
  2. nemo_evaluator_launcher/api/__init__.py +24 -0
  3. nemo_evaluator_launcher/api/functional.py +698 -0
  4. nemo_evaluator_launcher/api/types.py +98 -0
  5. nemo_evaluator_launcher/api/utils.py +19 -0
  6. nemo_evaluator_launcher/cli/__init__.py +15 -0
  7. nemo_evaluator_launcher/cli/export.py +267 -0
  8. nemo_evaluator_launcher/cli/info.py +512 -0
  9. nemo_evaluator_launcher/cli/kill.py +41 -0
  10. nemo_evaluator_launcher/cli/ls_runs.py +134 -0
  11. nemo_evaluator_launcher/cli/ls_tasks.py +136 -0
  12. nemo_evaluator_launcher/cli/main.py +226 -0
  13. nemo_evaluator_launcher/cli/run.py +200 -0
  14. nemo_evaluator_launcher/cli/status.py +164 -0
  15. nemo_evaluator_launcher/cli/version.py +55 -0
  16. nemo_evaluator_launcher/common/__init__.py +16 -0
  17. nemo_evaluator_launcher/common/execdb.py +283 -0
  18. nemo_evaluator_launcher/common/helpers.py +366 -0
  19. nemo_evaluator_launcher/common/logging_utils.py +357 -0
  20. nemo_evaluator_launcher/common/mapping.py +295 -0
  21. nemo_evaluator_launcher/common/printing_utils.py +93 -0
  22. nemo_evaluator_launcher/configs/__init__.py +15 -0
  23. nemo_evaluator_launcher/configs/default.yaml +28 -0
  24. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  25. nemo_evaluator_launcher/configs/deployment/nim.yaml +32 -0
  26. nemo_evaluator_launcher/configs/deployment/none.yaml +16 -0
  27. nemo_evaluator_launcher/configs/deployment/sglang.yaml +38 -0
  28. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +24 -0
  29. nemo_evaluator_launcher/configs/deployment/vllm.yaml +42 -0
  30. nemo_evaluator_launcher/configs/execution/lepton/default.yaml +92 -0
  31. nemo_evaluator_launcher/configs/execution/local.yaml +19 -0
  32. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +34 -0
  33. nemo_evaluator_launcher/executors/__init__.py +22 -0
  34. nemo_evaluator_launcher/executors/base.py +120 -0
  35. nemo_evaluator_launcher/executors/lepton/__init__.py +16 -0
  36. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +609 -0
  37. nemo_evaluator_launcher/executors/lepton/executor.py +1004 -0
  38. nemo_evaluator_launcher/executors/lepton/job_helpers.py +398 -0
  39. nemo_evaluator_launcher/executors/local/__init__.py +15 -0
  40. nemo_evaluator_launcher/executors/local/executor.py +605 -0
  41. nemo_evaluator_launcher/executors/local/run.template.sh +103 -0
  42. nemo_evaluator_launcher/executors/registry.py +38 -0
  43. nemo_evaluator_launcher/executors/slurm/__init__.py +15 -0
  44. nemo_evaluator_launcher/executors/slurm/executor.py +1147 -0
  45. nemo_evaluator_launcher/exporters/__init__.py +36 -0
  46. nemo_evaluator_launcher/exporters/base.py +121 -0
  47. nemo_evaluator_launcher/exporters/gsheets.py +409 -0
  48. nemo_evaluator_launcher/exporters/local.py +502 -0
  49. nemo_evaluator_launcher/exporters/mlflow.py +619 -0
  50. nemo_evaluator_launcher/exporters/registry.py +40 -0
  51. nemo_evaluator_launcher/exporters/utils.py +624 -0
  52. nemo_evaluator_launcher/exporters/wandb.py +490 -0
  53. nemo_evaluator_launcher/package_info.py +38 -0
  54. nemo_evaluator_launcher/resources/mapping.toml +380 -0
  55. nemo_evaluator_launcher-0.1.28.dist-info/METADATA +494 -0
  56. nemo_evaluator_launcher-0.1.28.dist-info/RECORD +60 -0
  57. nemo_evaluator_launcher-0.1.28.dist-info/WHEEL +5 -0
  58. nemo_evaluator_launcher-0.1.28.dist-info/entry_points.txt +3 -0
  59. nemo_evaluator_launcher-0.1.28.dist-info/licenses/LICENSE +451 -0
  60. nemo_evaluator_launcher-0.1.28.dist-info/top_level.txt +1 -0
nemo_evaluator_launcher/executors/lepton/deployment_helpers.py
@@ -0,0 +1,609 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Lepton deployment helper functions for nemo-evaluator-launcher.

Handles Lepton endpoint creation, management, and health checks.
"""

import json
import subprocess
import time
from pathlib import Path
from typing import Any, Dict, Optional

# Import lepton dependencies
from omegaconf import DictConfig

from nemo_evaluator_launcher.common.logging_utils import logger

def deep_merge(base: Dict[Any, Any], override: Dict[Any, Any]) -> Dict[Any, Any]:
    """Deep merge two dictionaries, with override taking precedence."""
    result = base.copy()

    for key, value in override.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = deep_merge(result[key], value)
        else:
            result[key] = value

    return result

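# Illustrative sketch (not part of the original file): `deep_merge` recurses only
# when both sides hold dicts, so scalar and list values from `override` replace
# the base value wholesale:
#
#   >>> deep_merge({"a": {"x": 1, "y": 2}, "b": [1]}, {"a": {"y": 3}, "b": [2]})
#   {'a': {'x': 1, 'y': 3}, 'b': [2]}
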
def replace_placeholders(data: Any, replacements: Dict[str, str]) -> Any:
    """Replace placeholders in the data structure."""

    def replace_in_obj(obj: Any) -> Any:
        if isinstance(obj, dict):
            return {k: replace_in_obj(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [replace_in_obj(item) for item in obj]
        elif isinstance(obj, str):
            result = obj
            for placeholder, value in replacements.items():
                result = result.replace(f"{{{{{placeholder}}}}}", value)
            return result
        else:
            return obj

    return replace_in_obj(data)

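# Illustrative sketch (not part of the original file): placeholders use the
# double-brace form "{{NAME}}" (the escaped f-string above renders exactly that):
#
#   >>> replace_placeholders(
#   ...     {"path": "/shared/{{MODEL_CACHE_NAME}}"},
#   ...     {"MODEL_CACHE_NAME": "llama-3-1-8b-instruct"},
#   ... )
#   {'path': '/shared/llama-3-1-8b-instruct'}
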
def generate_lepton_spec(cfg: DictConfig) -> Dict[str, Any]:
    """Generate a Lepton endpoint specification from nemo-evaluator-launcher configuration.

    This function creates a layered configuration by merging:
    1. Platform defaults (from execution.lepton_platform.platform_defaults)
    2. Environment settings (from execution.lepton_platform)
    3. Inference engine config (from deployment.* - vllm/sglang settings)
    4. Lepton platform config (from deployment.lepton_config - Lepton-specific settings)

    Args:
        cfg: The nemo-evaluator-launcher configuration object containing all settings.

    Returns:
        Dict containing the Lepton endpoint specification.
    """

    # Step 1: Start with platform defaults from execution config
    platform_defaults = {}
    if hasattr(cfg, "execution") and hasattr(cfg.execution, "lepton_platform"):
        deployment_config = cfg.execution.lepton_platform.get("deployment", {})
        platform_defaults = deployment_config.get("platform_defaults", {})

    base_config = deep_merge({}, platform_defaults)

    # Step 2: Apply deployment-specific settings from execution config
    if hasattr(cfg, "execution") and hasattr(cfg.execution, "lepton_platform"):
        lepton_platform = cfg.execution.lepton_platform
        deployment_config = lepton_platform.get("deployment", {})

        # Add deployment node group as affinity constraint
        deployment_node_group = deployment_config.get("node_group")
        if deployment_node_group:
            if not base_config.get("resource_requirement"):
                base_config["resource_requirement"] = {}
            base_config["resource_requirement"]["affinity"] = {
                "allowed_dedicated_node_groups": [deployment_node_group]
            }

        # Add queue config from platform defaults
        platform_defaults = deployment_config.get("platform_defaults", {})
        if platform_defaults.get("queue_config"):
            base_config["queue_config"] = platform_defaults.get("queue_config")

    # Step 3: Get Lepton-specific config from deployment.lepton_config
    if not hasattr(cfg.deployment, "lepton_config"):
        raise ValueError(
            "deployment.lepton_config is required when using Lepton executor"
        )

    lepton_config = cfg.deployment.lepton_config

    # Step 4: Convert inference engine config to container spec
    container_spec = _create_inference_container_spec(cfg.deployment)

    # Step 5: Apply Lepton platform deployment configurations
    deployment_config = {
        "resource_requirement": {
            **base_config.get("resource_requirement", {}),
            "resource_shape": lepton_config.resource_shape,
            "min_replicas": lepton_config.min_replicas,
            "max_replicas": lepton_config.max_replicas,
        },
        "auto_scaler": lepton_config.auto_scaler,
        "container": container_spec,
        "envs": [],
    }

    # Add health check configuration if provided
    if hasattr(lepton_config, "health") and lepton_config.health:
        deployment_config["health"] = lepton_config.health

    # Merge deployment config into base config
    final_config = deep_merge(base_config, deployment_config)

    # Step 6: Add environment variables from lepton_config
    if hasattr(lepton_config, "envs") and lepton_config.envs:
        from omegaconf import DictConfig

        for key, value in lepton_config.envs.items():
            env_var: Dict[str, Any] = {"name": key}

            # Support both direct values and secret references
            if isinstance(value, (dict, DictConfig)) and "value_from" in value:
                # Secret reference: {value_from: {secret_name_ref: "secret_name"}}
                env_var["value_from"] = dict(value["value_from"])
            else:
                # Direct value: "direct_value"
                env_var["value"] = str(value)

            final_config["envs"].append(env_var)

    # Step 6b: Auto-populate environment variables from deployment parameters
    _add_deployment_derived_envs(final_config["envs"], cfg.deployment)

    # Step 7: Add mounts with intelligent path construction
    if hasattr(lepton_config, "mounts") and lepton_config.mounts.enabled:
        # Get storage source from task config mounts (since mounts are shared
        # between tasks and deployments)
        storage_source = "node-nfs:lepton-shared-fs"  # default
        if hasattr(cfg, "execution") and hasattr(cfg.execution, "lepton_platform"):
            task_config = cfg.execution.lepton_platform.get("tasks", {})
            task_mounts = task_config.get("mounts", [])
            if task_mounts:
                storage_source = task_mounts[0].get("from", storage_source)

        final_config["mounts"] = [
            {
                "path": lepton_config.mounts.cache_path,
                "from": storage_source,
                "mount_path": lepton_config.mounts.mount_path,
                "mount_options": {},
            }
        ]

    # Step 8: Extract image_pull_secrets to top level (required by Lepton API)
    if "image_pull_secrets" in final_config:
        image_pull_secrets = final_config["image_pull_secrets"]
        # Convert OmegaConf ListConfig to regular Python list
        from omegaconf import ListConfig

        if isinstance(image_pull_secrets, (list, ListConfig)):
            final_config["image_pull_secrets"] = list(image_pull_secrets)
        else:
            # Remove invalid image_pull_secrets
            final_config.pop("image_pull_secrets", None)

    # Step 9: Add API tokens if provided (supports both single and multiple tokens)
    if hasattr(lepton_config, "api_tokens") and lepton_config.api_tokens:
        from omegaconf import DictConfig

        api_tokens_list = []

        for token_config in lepton_config.api_tokens:
            token_var: Dict[str, Any] = {}

            # Support both direct values and secret references
            if isinstance(token_config, (dict, DictConfig)):
                if "value" in token_config:
                    # Direct value: {value: "token_string"}
                    token_var["value"] = str(token_config["value"])
                elif "value_from" in token_config:
                    # Secret reference: {value_from: {secret_name_ref: "secret_name"}}
                    token_var["value_from"] = dict(token_config["value_from"])
            else:
                # Simple string value
                token_var["value"] = str(token_config)

            api_tokens_list.append(token_var)

        final_config["api_tokens"] = api_tokens_list

    # Backward compatibility: support legacy single api_token
    elif hasattr(lepton_config, "api_token") and lepton_config.api_token:
        final_config["api_tokens"] = [{"value": lepton_config.api_token}]

    # Step 10: Replace placeholders
    replacements = {
        "MODEL_CACHE_NAME": _generate_model_cache_name(cfg.deployment.image)
    }
    final_config_with_replacements: Dict[str, Any] = replace_placeholders(
        final_config, replacements
    )

    return final_config_with_replacements

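# Illustrative sketch (not part of the original file): with placeholder values,
# the merged spec produced by the steps above is shaped roughly like
#
#   {
#       "resource_requirement": {
#           "resource_shape": "<shape>",
#           "min_replicas": 1,
#           "max_replicas": 1,
#           "affinity": {"allowed_dedicated_node_groups": ["<node-group>"]},
#       },
#       "auto_scaler": {...},
#       "container": {"image": "<image>", "ports": [{"container_port": 8000}],
#                     "command": ["vllm", "serve", ...]},
#       "envs": [{"name": "SERVED_MODEL_NAME", "value": "<name>"}, ...],
#       "api_tokens": [{"value": "<token>"}],
#   }
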
def _create_inference_container_spec(deployment_cfg: DictConfig) -> Dict[str, Any]:
    """Create container specification from inference engine config (vLLM/SGLang/NIM).

    Args:
        deployment_cfg: Deployment configuration containing vLLM/SGLang/NIM settings.

    Returns:
        Container specification for Lepton.
    """
    container_spec = {
        "image": deployment_cfg.image,
        "ports": [{"container_port": deployment_cfg.port}],
    }

    # Generate command based on deployment type
    if deployment_cfg.type == "vllm":
        # Convert vLLM command template to actual command
        command_parts = [
            "vllm",
            "serve",
            deployment_cfg.checkpoint_path,
            f"--tensor-parallel-size={deployment_cfg.tensor_parallel_size}",
            f"--pipeline-parallel-size={deployment_cfg.pipeline_parallel_size}",
            f"--data-parallel-size={deployment_cfg.data_parallel_size}",
            f"--port={deployment_cfg.port}",
            f"--served-model-name={deployment_cfg.served_model_name}",
        ]

        # Add extra args if provided
        if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
            command_parts.extend(deployment_cfg.extra_args.split())

        container_spec["command"] = command_parts

    elif deployment_cfg.type == "sglang":
        # Convert SGLang command template to actual command
        command_parts = [
            "python3",
            "-m",
            "sglang.launch_server",
            f"--model-path={deployment_cfg.checkpoint_path}",
            "--host=0.0.0.0",
            f"--port={deployment_cfg.port}",
            f"--served-model-name={deployment_cfg.served_model_name}",
            f"--tp={deployment_cfg.tensor_parallel_size}",
            f"--dp={deployment_cfg.data_parallel_size}",
        ]

        # Add extra args if provided
        if hasattr(deployment_cfg, "extra_args") and deployment_cfg.extra_args:
            command_parts.extend(deployment_cfg.extra_args.split())

        container_spec["command"] = command_parts

    elif deployment_cfg.type == "nim":
        # NIM containers use their default entrypoint - no custom command needed.
        # Configuration is handled via environment variables.
        pass

    return container_spec

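# Illustrative sketch (not part of the original file): for a hypothetical "vllm"
# deployment with checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", all
# parallel sizes set to 1, port=8000 and served_model_name="llama-3.1-8b-instruct",
# the generated command is:
#
#   ["vllm", "serve", "meta-llama/Llama-3.1-8B-Instruct",
#    "--tensor-parallel-size=1", "--pipeline-parallel-size=1",
#    "--data-parallel-size=1", "--port=8000",
#    "--served-model-name=llama-3.1-8b-instruct"]
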
def _add_deployment_derived_envs(envs_list: list, deployment_cfg: DictConfig) -> None:
    """Add environment variables derived from deployment configuration.

    Args:
        envs_list: List to append environment variables to.
        deployment_cfg: Deployment configuration to derive from.
    """
    deployment_type = deployment_cfg.type

    # Common environment variables for all deployment types
    if (
        hasattr(deployment_cfg, "served_model_name")
        and deployment_cfg.served_model_name
    ):
        envs_list.append(
            {"name": "SERVED_MODEL_NAME", "value": deployment_cfg.served_model_name}
        )

    if hasattr(deployment_cfg, "port") and deployment_cfg.port:
        envs_list.append({"name": "MODEL_PORT", "value": str(deployment_cfg.port)})

    # Deployment-specific environment variables
    if deployment_type == "vllm":
        if (
            hasattr(deployment_cfg, "checkpoint_path")
            and deployment_cfg.checkpoint_path
        ):
            envs_list.append(
                {"name": "MODEL_PATH", "value": deployment_cfg.checkpoint_path}
            )
        if (
            hasattr(deployment_cfg, "tensor_parallel_size")
            and deployment_cfg.tensor_parallel_size
        ):
            envs_list.append(
                {
                    "name": "TENSOR_PARALLEL_SIZE",
                    "value": str(deployment_cfg.tensor_parallel_size),
                }
            )

    elif deployment_type == "sglang":
        if (
            hasattr(deployment_cfg, "checkpoint_path")
            and deployment_cfg.checkpoint_path
        ):
            envs_list.append(
                {"name": "MODEL_PATH", "value": deployment_cfg.checkpoint_path}
            )
        if (
            hasattr(deployment_cfg, "tensor_parallel_size")
            and deployment_cfg.tensor_parallel_size
        ):
            envs_list.append(
                {
                    "name": "TENSOR_PARALLEL_SIZE",
                    "value": str(deployment_cfg.tensor_parallel_size),
                }
            )

    elif deployment_type == "nim":
        # NIM-specific derived environment variables
        if (
            hasattr(deployment_cfg, "served_model_name")
            and deployment_cfg.served_model_name
        ):
            envs_list.append(
                {"name": "NIM_MODEL_NAME", "value": deployment_cfg.served_model_name}
            )

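# Illustrative sketch (not part of the original file): for a hypothetical "nim"
# deployment with served_model_name="llama-3.1-8b-instruct" and port=8000, the
# function appends:
#
#   [{"name": "SERVED_MODEL_NAME", "value": "llama-3.1-8b-instruct"},
#    {"name": "MODEL_PORT", "value": "8000"},
#    {"name": "NIM_MODEL_NAME", "value": "llama-3.1-8b-instruct"}]
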
def _generate_model_cache_name(image: str) -> str:
    """Generate a cache directory name from the container image.

    Args:
        image: Container image string like 'nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6'

    Returns:
        Clean cache name like 'llama-3-1-8b-instruct'
    """
    # Extract model name from image path
    if "/" in image:
        model_part = image.split("/")[-1]  # Get 'llama-3.1-8b-instruct:1.8.6'
    else:
        model_part = image

    # Remove version tag
    if ":" in model_part:
        model_part = model_part.split(":")[0]  # Get 'llama-3.1-8b-instruct'

    # Replace dots with dashes for filesystem compatibility
    return model_part.replace(".", "-")

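# Illustrative sketch (not part of the original file), using the docstring's
# own example image:
#
#   >>> _generate_model_cache_name("nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6")
#   'llama-3-1-8b-instruct'
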
def create_lepton_endpoint(cfg: DictConfig, endpoint_name: str) -> bool:
    """Create a Lepton endpoint using the lep CLI.

    Args:
        cfg: The nemo-evaluator-launcher configuration object.
        endpoint_name: Name for the endpoint.

    Returns:
        True if endpoint creation succeeded, False otherwise.
    """
    spec = generate_lepton_spec(cfg)

    # Convert OmegaConf objects to regular Python objects for JSON serialization
    from omegaconf import DictConfig, ListConfig

    def convert_to_json_serializable(obj: Any) -> Any:
        """Recursively convert OmegaConf objects to regular Python objects."""
        if isinstance(obj, (DictConfig, dict)):
            return {k: convert_to_json_serializable(v) for k, v in obj.items()}
        elif isinstance(obj, (ListConfig, list)):
            return [convert_to_json_serializable(item) for item in obj]
        else:
            return obj

    json_spec = convert_to_json_serializable(spec)

    # Write spec to temporary file
    import tempfile

    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump(json_spec, f, indent=2)
        spec_file = f.name

    try:
        # Create endpoint using lep CLI
        result = subprocess.run(
            ["lep", "endpoint", "create", "--file", spec_file, "--name", endpoint_name],
            capture_output=True,
            text=True,
            timeout=300,
        )

        if result.returncode == 0:
            print(f"✅ Successfully created Lepton endpoint: {endpoint_name}")
            return True
        else:
            error_msg = result.stderr.strip() if result.stderr else ""
            output_msg = result.stdout.strip() if result.stdout else ""
            print(
                f"✗ Failed to create Lepton endpoint | Endpoint: {endpoint_name} | Return code: {result.returncode}"
            )
            if error_msg:
                print(f"  stderr: {error_msg}")
            if output_msg:
                print(f"  stdout: {output_msg}")
            return False

    except subprocess.TimeoutExpired as e:
        print(
            f"✗ Timeout creating Lepton endpoint | Endpoint: {endpoint_name} | Timeout: 300s"
        )
        if hasattr(e, "stderr") and e.stderr:
            print(f"  stderr: {e.stderr}")
        if hasattr(e, "stdout") and e.stdout:
            print(f"  stdout: {e.stdout}")
        return False
    except subprocess.CalledProcessError as e:
        print(
            f"✗ Error creating Lepton endpoint | Endpoint: {endpoint_name} | Error: {e}"
        )
        if hasattr(e, "stderr") and e.stderr:
            print(f"  stderr: {e.stderr}")
        if hasattr(e, "stdout") and e.stdout:
            print(f"  stdout: {e.stdout}")
        return False
    finally:
        # Clean up temporary file
        Path(spec_file).unlink(missing_ok=True)

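# Illustrative sketch (not part of the original file): the subprocess call above
# amounts to writing the spec JSON to a temporary file and running, by hand,
#
#   lep endpoint create --file /tmp/<spec>.json --name <endpoint-name>
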
def delete_lepton_endpoint(endpoint_name: str) -> bool:
    """Delete a Lepton endpoint.

    Args:
        endpoint_name: Name of the endpoint to delete.

    Returns:
        True if deletion succeeded, False otherwise.
    """
    try:
        result = subprocess.run(
            ["lep", "endpoint", "remove", "--name", endpoint_name],
            capture_output=True,
            text=True,
            timeout=60,
        )

        return result.returncode == 0
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
        return False

def get_lepton_endpoint_status(endpoint_name: str) -> Optional[Dict[str, Any]]:
    """Get the status of a Lepton endpoint.

    Args:
        endpoint_name: Name of the endpoint.

    Returns:
        Status dict if endpoint exists, None otherwise. See
        https://github.com/leptonai/leptonai/blob/7de93b95357126da1e86fa99f54f9a769d5d2646/leptonai/api/v1/types/deployment.py#L338
        for the definition.
    """
    try:
        # TODO(agronskiy): why not use Python API?
        cmd = ["lep", "endpoint", "get", "--name", endpoint_name]
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=30,
        )

        if result.returncode != 0:
            return None

        endpoint_info = json.loads(result.stdout)
        status = endpoint_info.get("status", {})
        if isinstance(status, dict):
            return status
        logger.error(
            "Running lep command returned a non-dict status",
            cmd=cmd,
            status=status,
        )
        return None

    except (
        subprocess.TimeoutExpired,
        subprocess.CalledProcessError,
        json.JSONDecodeError,
    ):
        return None

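# Illustrative sketch (not part of the original file): the callers below rely
# only on a couple of fields of the returned dict, roughly shaped like
#
#   {"state": "Ready", "endpoint": {"external_endpoint": "https://..."}, ...}
#
# (see the linked leptonai type for the authoritative schema).
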
def wait_for_lepton_endpoint_ready(endpoint_name: str, timeout: int = 600) -> bool:
    """Wait for a Lepton endpoint to become ready.

    Args:
        endpoint_name: Name of the endpoint.
        timeout: Maximum time to wait in seconds.

    Returns:
        True if the endpoint becomes ready; False if it enters a failed/error
        state or the timeout elapses.
    """
    start_time = time.time()
    while time.time() - start_time < timeout:
        status = get_lepton_endpoint_status(endpoint_name)

        # `get_lepton_endpoint_status` might return `None` if
        # e.g. there was a network error, see definition.
        if status is not None:
            state = status.get("state", "").lower()
            if state == "ready":
                logger.info(
                    "Lepton endpoint is ready",
                    endpoint_name=endpoint_name,
                )

                return True
            elif state in ["failed", "error"]:
                return False

        logger.debug(
            "Waiting for lepton endpoint",
            endpoint_name=endpoint_name,
            timeout=timeout,
            time_delta=time.time() - start_time,
            curr_status=status,
        )
        time.sleep(10)

    logger.error(
        "Timeout waiting for lepton endpoint",
        endpoint_name=endpoint_name,
        timeout=timeout,
    )
    return False

def get_lepton_endpoint_url(endpoint_name: str) -> Optional[str]:
    """Get the URL of a Lepton endpoint.

    Args:
        endpoint_name: Name of the endpoint.

    Returns:
        Endpoint URL if available, None otherwise.
    """
    try:
        result = subprocess.run(
            ["lep", "endpoint", "get", "--name", endpoint_name],
            capture_output=True,
            text=True,
            timeout=30,
        )

        if result.returncode == 0:
            endpoint_info = json.loads(result.stdout)
            status = endpoint_info.get("status", {})
            endpoint = status.get("endpoint", {})
            external_endpoint = endpoint.get("external_endpoint")
            # Ensure we return a proper string type or None
            if isinstance(external_endpoint, str):
                return external_endpoint
            else:
                return None
        else:
            return None
    except (
        subprocess.TimeoutExpired,
        subprocess.CalledProcessError,
        json.JSONDecodeError,
    ):
        return None
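

# Illustrative end-to-end sketch (not part of the original file), wiring the
# helpers above together; `cfg` stands for a hypothetical resolved launcher
# config with `deployment.lepton_config` set:
#
#   if create_lepton_endpoint(cfg, "my-eval-endpoint"):
#       try:
#           if wait_for_lepton_endpoint_ready("my-eval-endpoint", timeout=600):
#               url = get_lepton_endpoint_url("my-eval-endpoint")
#               ...  # point evaluation jobs at `url`
#       finally:
#           delete_lepton_endpoint("my-eval-endpoint")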