flytekitplugins-dgxc-lepton 1.16.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: flytekitplugins-dgxc-lepton
3
+ Version: 1.16.6
4
+ Summary: DGXC Lepton Flytekit plugin for inference endpoints
5
+ Author: Anshul Jindal
6
+ Author-email: ansjindal@nvidia.com
7
+ Classifier: Intended Audience :: Science/Research
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Topic :: Software Development
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: flytekit<2.0.0,>=1.9.1
20
+ Requires-Dist: leptonai
21
+ Dynamic: author
22
+ Dynamic: author-email
23
+ Dynamic: classifier
24
+ Dynamic: description
25
+ Dynamic: description-content-type
26
+ Dynamic: requires-dist
27
+ Dynamic: requires-python
28
+ Dynamic: summary
29
+
30
+ Flytekit DGXC Lepton Plugin - AI inference endpoints using Lepton AI infrastructure
@@ -0,0 +1,375 @@
1
+ # Flytekit DGXC Lepton Plugin
2
+
3
+ A professional Flytekit plugin that enables seamless deployment and management of AI inference endpoints using Lepton AI infrastructure within Flyte workflows.
4
+
5
+ ## Overview
6
+
7
+ This plugin provides:
8
+ - **Unified Task API** for deployment and management of Lepton AI endpoints
9
+ - **Type-safe configuration** with consolidated dataclasses and IDE support
10
+ - **Multiple endpoint engines**: VLLM, SGLang, NIM, and custom containers
11
+ - **Unified configuration classes** for scaling, environment, and mounts
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install flytekitplugins-dgxc-lepton
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ```python
22
+ from flytekit import workflow
23
+ from flytekitplugins.dgxc_lepton import (
24
+ lepton_endpoint_deployment_task, lepton_endpoint_deletion_task, LeptonEndpointConfig,
25
+ EndpointEngineConfig, EnvironmentConfig, ScalingConfig
26
+ )
27
+
28
+ @workflow
29
+ def inference_workflow() -> str:
30
+ """Deploy Llama model using VLLM and return endpoint URL."""
31
+
32
+ # Complete configuration in one place
33
+ config = LeptonEndpointConfig(
34
+ endpoint_name="my-llama-endpoint",
35
+ resource_shape="gpu.1xh200",
36
+ node_group="your-node-group",
37
+ endpoint_config=EndpointEngineConfig.vllm(
38
+ checkpoint_path="meta-llama/Llama-3.1-8B-Instruct",
39
+ served_model_name="llama-3.1-8b-instruct",
40
+ ),
41
+ environment=EnvironmentConfig.create(
42
+ LOG_LEVEL="INFO",
43
+ secrets={"HF_TOKEN": "hf-secret"}
44
+ ),
45
+ scaling=ScalingConfig.traffic(min_replicas=1, max_replicas=2),
46
+ )
47
+
48
+ # Deploy endpoint and return URL
49
+ return lepton_endpoint_deployment_task(config=config)
50
+ ```
51
+
52
+ ## API Reference
53
+
54
+ ### Core Components
55
+
56
+ #### `lepton_endpoint_deployment_task(config: LeptonEndpointConfig, ...) -> str`
57
+ Main function for Lepton AI endpoint deployment.
58
+
59
+ **Parameters:**
60
+ - `config`: Complete endpoint configuration
61
+ - `task_name`: Optional custom task name
62
+
63
+ **Returns:**
64
+ - Endpoint URL for successful deployment
65
+
66
+ #### `lepton_endpoint_deletion_task(endpoint_name: str, ...) -> str`
67
+ Function for Lepton AI endpoint deletion.
68
+
69
+ **Parameters:**
70
+ - `endpoint_name`: Name of the endpoint to delete
71
+ - `task_name`: Optional custom task name
72
+
73
+ **Returns:**
74
+ - Success message confirming deletion
75
+
76
+ #### `LeptonEndpointConfig`
77
+ Unified configuration for all Lepton endpoint operations.
78
+
79
+ **Required Fields:**
80
+ - `endpoint_name`: Name of the endpoint
81
+ - `resource_shape`: Hardware resource specification (e.g., "gpu.1xh200")
82
+ - `node_group`: Target node group for deployment
83
+ - `endpoint_config`: Engine-specific configuration
84
+
85
+ **Optional Fields:**
86
+ - `scaling`: Auto-scaling configuration
87
+ - `environment`: Environment variables and secrets
88
+ - `mounts`: Storage mount configurations
89
+ - `api_token`/`api_token_secret`: Authentication
90
+ - `image_pull_secrets`: Container registry secrets
91
+ - `endpoint_readiness_timeout`: Deployment timeout
92
+
93
+ ### Endpoint Engine Configuration
94
+
95
+ #### `EndpointEngineConfig`
96
+ Unified configuration for different inference engines.
97
+
98
+
99
+ ##### VLLM Engine
100
+ ```python
101
+ EndpointEngineConfig.vllm(
102
+ image="vllm/vllm-openai:latest",
103
+ checkpoint_path="meta-llama/Llama-3.1-8B-Instruct",
104
+ served_model_name="default-model",
105
+ tensor_parallel_size=1,
106
+ pipeline_parallel_size=1,
107
+ data_parallel_size=1,
108
+ extra_args="--max-model-len 4096",
109
+ port=8000
110
+ )
111
+ ```
112
+
113
+ ##### SGLang Engine
114
+ ```python
115
+ EndpointEngineConfig.sglang(
116
+ image="lmsysorg/sglang:latest",
117
+ checkpoint_path="meta-llama/Llama-3.1-8B-Instruct",
118
+ tensor_parallel_size=1,
119
+ data_parallel_size=1,
120
+ extra_args="--context-length 4096",
121
+ port=30000
122
+ )
123
+ ```
124
+
125
+ ##### NVIDIA NIM
126
+ ```python
127
+ EndpointEngineConfig.nim(
128
+ image="nvcr.io/nim/nvidia/llama-3_3-nemotron-super-49b-v1_5:latest",
129
+ port=8000
130
+ )
131
+ ```
132
+
133
+ ##### Custom Container
134
+ ```python
135
+ EndpointEngineConfig.custom(
136
+ image="python:3.11-slim",
137
+ command=["/bin/bash", "-c", "python3 -m http.server 8080"],
138
+ port=8080
139
+ )
140
+ ```
141
+
142
+ ### Scaling Configuration
143
+
144
+ #### `ScalingConfig`
145
+ Unified auto-scaling configuration that enforces a single scaling strategy.
146
+
147
+
148
+ ##### Traffic-based Scaling
149
+ ```python
150
+ ScalingConfig.traffic(
151
+ min_replicas=1,
152
+ max_replicas=5,
153
+ timeout=1800 # Scale down after 30 min of no traffic
154
+ )
155
+ ```
156
+
157
+ ##### GPU Utilization Scaling
158
+ ```python
159
+ ScalingConfig.gpu(
160
+ target_utilization=80, # Target 80% GPU utilization
161
+ min_replicas=1,
162
+ max_replicas=10
163
+ )
164
+ ```
165
+
166
+ ##### QPM (Queries Per Minute) Scaling
167
+ ```python
168
+ ScalingConfig.qpm(
169
+ target_qpm=100.5, # Target queries per minute
170
+ min_replicas=2,
171
+ max_replicas=8
172
+ )
173
+ ```
174
+
175
+ ### Environment Configuration
176
+
177
+ #### `EnvironmentConfig`
178
+ Unified configuration for environment variables and secrets.
179
+
180
+ **Factory Methods:**
181
+
182
+ ##### Environment Variables Only
183
+ ```python
184
+ EnvironmentConfig.from_env(
185
+ LOG_LEVEL="DEBUG",
186
+ MODEL_PATH="/models",
187
+ CUDA_VISIBLE_DEVICES="0,1"
188
+ )
189
+ ```
190
+
191
+ ##### Secrets Only
192
+ ```python
193
+ EnvironmentConfig.from_secrets(
194
+ HF_TOKEN="hf-secret",
195
+ NGC_API_KEY="ngc-secret"
196
+ )
197
+ ```
198
+
199
+ ##### Mixed Configuration
200
+ ```python
201
+ EnvironmentConfig.create(
202
+ LOG_LEVEL="INFO",
203
+ MODEL_PATH="/models",
204
+ secrets={
205
+ "HF_TOKEN": "hf-secret",
206
+ "NGC_API_KEY": "ngc-secret"
207
+ }
208
+ )
209
+ ```
210
+
211
+ ### Mount Configuration
212
+
213
+ #### `MountReader`
214
+ Simplified NFS mount configuration.
215
+
216
+ ```python
217
+ MountReader.node_nfs(
218
+ ("/shared-storage/models", "/opt/models"),
219
+ ("/shared-storage/data", "/opt/data"),
220
+ ("/shared-storage/logs", "/opt/logs", False), # Disabled mount
221
+ storage_name="production-nfs" # Custom storage name
222
+ )
223
+ ```
224
+
225
+ ## Complete Examples
226
+
227
+ ### VLLM Deployment with Auto-scaling
228
+
229
+ ```python
230
+ from flytekit import workflow
231
+ from flytekitplugins.dgxc_lepton import (
232
+ lepton_endpoint_deployment_task, LeptonEndpointConfig,
233
+ EndpointEngineConfig, EnvironmentConfig, ScalingConfig, MountReader
234
+ )
235
+
236
+ @workflow
237
+ def deploy_vllm_with_scaling() -> str:
238
+ """Deploy VLLM with traffic-based auto-scaling."""
239
+
240
+ config = LeptonEndpointConfig(
241
+ endpoint_name="vllm-llama-3.1-8b",
242
+ resource_shape="gpu.1xh200",
243
+ node_group="inference-nodes",
244
+ endpoint_config=EndpointEngineConfig.vllm(
245
+ checkpoint_path="meta-llama/Llama-3.1-8B-Instruct",
246
+ served_model_name="llama-3.1-8b-instruct",
247
+ tensor_parallel_size=1,
248
+ extra_args="--max-model-len 8192 --enable-chunked-prefill"
249
+ ),
250
+ environment=EnvironmentConfig.create(
251
+ LOG_LEVEL="INFO",
252
+ CUDA_VISIBLE_DEVICES="0",
253
+ secrets={"HF_TOKEN": "hf-secret"}
254
+ ),
255
+ scaling=ScalingConfig.traffic(
256
+ min_replicas=1,
257
+ max_replicas=3,
258
+ timeout=1800
259
+ ),
260
+ mounts=MountReader.node_nfs(
261
+ ("/shared-storage/models", "/opt/models"),
262
+ ("/shared-storage/cache", "/root/.cache")
263
+ ),
264
+ api_token_secret="lepton-api-token",
265
+ image_pull_secrets=["hf-secret"],
266
+ endpoint_readiness_timeout=600
267
+ )
268
+
269
+ return lepton_endpoint_deployment_task(config=config)
270
+ ```
271
+
272
+ ### NIM Deployment with QPM Scaling
273
+
274
+ ```python
275
+ @workflow
276
+ def deploy_nim_with_qpm_scaling() -> str:
277
+ """Deploy NVIDIA NIM with QPM-based scaling."""
278
+
279
+ config = LeptonEndpointConfig(
280
+ endpoint_name="nemotron-super-reasoning",
281
+ resource_shape="gpu.1xh200",
282
+ node_group="nim-nodes",
283
+ endpoint_config=EndpointEngineConfig.nim(
284
+ image="nvcr.io/nim/nvidia/llama-3_3-nemotron-super-49b-v1_5:latest"
285
+ ),
286
+ environment=EnvironmentConfig.create(
287
+ OMPI_ALLOW_RUN_AS_ROOT="1",
288
+ secrets={"NGC_API_KEY": "ngc-secret"}
289
+ ),
290
+ scaling=ScalingConfig.qpm(
291
+ target_qpm=2.5,
292
+ min_replicas=1,
293
+ max_replicas=3
294
+ ),
295
+ image_pull_secrets=["ngc-secret"],
296
+ api_token="UNIQUE_ENDPOINT_TOKEN"
297
+ )
298
+
299
+ return lepton_endpoint_deployment_task(config=config)
300
+ ```
301
+
302
+ ### Custom Container Deployment
303
+
304
+ ```python
305
+ @workflow
306
+ def deploy_custom_service() -> str:
307
+ """Deploy custom inference service."""
308
+
309
+ config = LeptonEndpointConfig(
310
+ endpoint_name="custom-inference-api",
311
+ resource_shape="cpu.large",
312
+ node_group="cpu-nodes",
313
+ endpoint_config=EndpointEngineConfig.custom(
314
+ image="my-registry/inference-api:v1.0",
315
+ command=["python", "app.py"],
316
+ port=8080
317
+ ),
318
+ environment=EnvironmentConfig.from_env(
319
+ LOG_LEVEL="DEBUG",
320
+ API_VERSION="v1",
321
+ WORKERS="4"
322
+ ),
323
+ scaling=ScalingConfig.gpu(
324
+ target_utilization=70,
325
+ min_replicas=2,
326
+ max_replicas=6
327
+ )
328
+ )
329
+
330
+ return lepton_endpoint_deployment_task(config=config)
331
+ ```
332
+ ## Configuration Requirements
333
+
334
+ Replace the placeholder values used in the examples above with your actual values:
335
+ - `node_group` (e.g. `"your-node-group"`, `"inference-nodes"`): Your Kubernetes node group for GPU workloads
336
+ - `image_pull_secrets` (e.g. `"ngc-secret"`): Your NGC registry pull secret name
337
+ - `/shared-storage/*`: Your shared storage paths for model caching
338
+ - `NGC_API_KEY` (e.g. `"ngc-secret"`): Your NGC API key secret name
339
+ - `HF_TOKEN` (e.g. `"hf-secret"`): Your HuggingFace token secret name
340
+
341
+ ## Monitoring & Debugging
342
+
343
+ ```bash
344
+ # Monitor connector logs
345
+ kubectl logs -n flyte deployment/lepton-connector --follow
346
+
347
+ # Check Lepton console (URLs auto-generated in Flyte execution view)
348
+
349
+ # List recent executions
350
+ pyflyte get executions -p flytesnacks -d development --limit 5
351
+ ```
352
+ ## Development
353
+
354
+ ### Running Tests
355
+
356
+ ```bash
357
+ pytest tests/test_lepton.py -v
358
+ ```
359
+
360
+ ### Plugin Registration
361
+
362
+ The plugin automatically registers with Flytekit's dynamic plugin loading system:
363
+
364
+ ```python
365
+ # Automatic registration enables this usage pattern
366
+ task = LeptonEndpointDeploymentTask(config=config)
367
+ ```
368
+
369
+ ## Support
370
+
371
+ For issues, questions, or contributions, please refer to the Flytekit documentation and Lepton AI platform documentation.
372
+
373
+ ## License
374
+
375
+ This plugin follows the same license as Flytekit.
@@ -0,0 +1,56 @@
1
+ """
2
+ .. currentmodule:: flytekitplugins.dgxc_lepton
3
+
4
+ This package contains things that are useful when extending Flytekit for Lepton AI integration.
5
+
6
+ .. autosummary::
7
+ :template: custom.rst
8
+ :toctree: generated/
9
+
10
+ lepton_endpoint_deployment_task
11
+ lepton_endpoint_deletion_task
12
+ LeptonEndpointConfig
13
+ LeptonEndpointDeploymentTask
14
+ LeptonEndpointDeletionTask
15
+ EndpointType
16
+ EnvironmentConfig
17
+ MountReader
18
+ ScalingConfig
19
+ ScalingType
20
+ EndpointEngineConfig
21
+ """
22
+
23
+ # Clean imports with consolidated classes
24
+ # Import connector module to trigger connector registration (connectors are not part of public API)
25
+ from . import connector # noqa: F401
26
+ from .config import (
27
+ EndpointEngineConfig,
28
+ EndpointType,
29
+ EnvironmentConfig,
30
+ LeptonEndpointConfig,
31
+ MountReader,
32
+ ScalingConfig,
33
+ ScalingType,
34
+ )
35
+ from .task import (
36
+ LeptonEndpointDeletionTask,
37
+ LeptonEndpointDeploymentTask,
38
+ lepton_endpoint_deletion_task,
39
+ lepton_endpoint_deployment_task,
40
+ )
41
+
42
+ __all__ = [
43
+ # Task API
44
+ "lepton_endpoint_deployment_task",
45
+ "lepton_endpoint_deletion_task",
46
+ "LeptonEndpointConfig",
47
+ "LeptonEndpointDeploymentTask",
48
+ "LeptonEndpointDeletionTask",
49
+ "EndpointType",
50
+ # Configuration classes
51
+ "EnvironmentConfig",
52
+ "MountReader",
53
+ "ScalingConfig",
54
+ "ScalingType",
55
+ "EndpointEngineConfig",
56
+ ]