tetra-rp 0.20.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. tetra_rp-0.20.0/PKG-INFO +1144 -0
  2. tetra_rp-0.20.0/README.md +1122 -0
  3. tetra_rp-0.20.0/pyproject.toml +126 -0
  4. tetra_rp-0.20.0/setup.cfg +4 -0
  5. tetra_rp-0.20.0/src/tetra_rp/__init__.py +124 -0
  6. tetra_rp-0.20.0/src/tetra_rp/cli/__init__.py +0 -0
  7. tetra_rp-0.20.0/src/tetra_rp/cli/commands/__init__.py +1 -0
  8. tetra_rp-0.20.0/src/tetra_rp/cli/commands/apps.py +143 -0
  9. tetra_rp-0.20.0/src/tetra_rp/cli/commands/build.py +1043 -0
  10. tetra_rp-0.20.0/src/tetra_rp/cli/commands/build_utils/__init__.py +1 -0
  11. tetra_rp-0.20.0/src/tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
  12. tetra_rp-0.20.0/src/tetra_rp/cli/commands/build_utils/lb_handler_generator.py +361 -0
  13. tetra_rp-0.20.0/src/tetra_rp/cli/commands/build_utils/manifest.py +170 -0
  14. tetra_rp-0.20.0/src/tetra_rp/cli/commands/build_utils/scanner.py +393 -0
  15. tetra_rp-0.20.0/src/tetra_rp/cli/commands/deploy.py +469 -0
  16. tetra_rp-0.20.0/src/tetra_rp/cli/commands/init.py +119 -0
  17. tetra_rp-0.20.0/src/tetra_rp/cli/commands/resource.py +108 -0
  18. tetra_rp-0.20.0/src/tetra_rp/cli/commands/run.py +280 -0
  19. tetra_rp-0.20.0/src/tetra_rp/cli/commands/test_mothership.py +457 -0
  20. tetra_rp-0.20.0/src/tetra_rp/cli/commands/undeploy.py +533 -0
  21. tetra_rp-0.20.0/src/tetra_rp/cli/main.py +97 -0
  22. tetra_rp-0.20.0/src/tetra_rp/cli/utils/__init__.py +1 -0
  23. tetra_rp-0.20.0/src/tetra_rp/cli/utils/app.py +15 -0
  24. tetra_rp-0.20.0/src/tetra_rp/cli/utils/conda.py +127 -0
  25. tetra_rp-0.20.0/src/tetra_rp/cli/utils/deployment.py +157 -0
  26. tetra_rp-0.20.0/src/tetra_rp/cli/utils/ignore.py +143 -0
  27. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton.py +184 -0
  28. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
  29. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
  30. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
  31. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/README.md +263 -0
  32. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/main.py +44 -0
  33. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
  34. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
  35. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +20 -0
  36. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +38 -0
  37. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +20 -0
  38. tetra_rp-0.20.0/src/tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +62 -0
  39. tetra_rp-0.20.0/src/tetra_rp/client.py +174 -0
  40. tetra_rp-0.20.0/src/tetra_rp/config.py +29 -0
  41. tetra_rp-0.20.0/src/tetra_rp/core/__init__.py +0 -0
  42. tetra_rp-0.20.0/src/tetra_rp/core/api/__init__.py +6 -0
  43. tetra_rp-0.20.0/src/tetra_rp/core/api/runpod.py +855 -0
  44. tetra_rp-0.20.0/src/tetra_rp/core/deployment.py +232 -0
  45. tetra_rp-0.20.0/src/tetra_rp/core/discovery.py +425 -0
  46. tetra_rp-0.20.0/src/tetra_rp/core/exceptions.py +50 -0
  47. tetra_rp-0.20.0/src/tetra_rp/core/resources/__init__.py +53 -0
  48. tetra_rp-0.20.0/src/tetra_rp/core/resources/app.py +687 -0
  49. tetra_rp-0.20.0/src/tetra_rp/core/resources/base.py +182 -0
  50. tetra_rp-0.20.0/src/tetra_rp/core/resources/cloud.py +4 -0
  51. tetra_rp-0.20.0/src/tetra_rp/core/resources/constants.py +10 -0
  52. tetra_rp-0.20.0/src/tetra_rp/core/resources/cpu.py +146 -0
  53. tetra_rp-0.20.0/src/tetra_rp/core/resources/environment.py +41 -0
  54. tetra_rp-0.20.0/src/tetra_rp/core/resources/gpu.py +68 -0
  55. tetra_rp-0.20.0/src/tetra_rp/core/resources/live_serverless.py +169 -0
  56. tetra_rp-0.20.0/src/tetra_rp/core/resources/load_balancer_sls_resource.py +433 -0
  57. tetra_rp-0.20.0/src/tetra_rp/core/resources/network_volume.py +193 -0
  58. tetra_rp-0.20.0/src/tetra_rp/core/resources/resource_manager.py +464 -0
  59. tetra_rp-0.20.0/src/tetra_rp/core/resources/serverless.py +730 -0
  60. tetra_rp-0.20.0/src/tetra_rp/core/resources/serverless_cpu.py +209 -0
  61. tetra_rp-0.20.0/src/tetra_rp/core/resources/template.py +107 -0
  62. tetra_rp-0.20.0/src/tetra_rp/core/resources/utils.py +50 -0
  63. tetra_rp-0.20.0/src/tetra_rp/core/utils/__init__.py +0 -0
  64. tetra_rp-0.20.0/src/tetra_rp/core/utils/backoff.py +43 -0
  65. tetra_rp-0.20.0/src/tetra_rp/core/utils/constants.py +10 -0
  66. tetra_rp-0.20.0/src/tetra_rp/core/utils/file_lock.py +260 -0
  67. tetra_rp-0.20.0/src/tetra_rp/core/utils/http.py +67 -0
  68. tetra_rp-0.20.0/src/tetra_rp/core/utils/json.py +33 -0
  69. tetra_rp-0.20.0/src/tetra_rp/core/utils/lru_cache.py +75 -0
  70. tetra_rp-0.20.0/src/tetra_rp/core/utils/singleton.py +42 -0
  71. tetra_rp-0.20.0/src/tetra_rp/core/validation.py +44 -0
  72. tetra_rp-0.20.0/src/tetra_rp/execute_class.py +301 -0
  73. tetra_rp-0.20.0/src/tetra_rp/logger.py +34 -0
  74. tetra_rp-0.20.0/src/tetra_rp/protos/__init__.py +0 -0
  75. tetra_rp-0.20.0/src/tetra_rp/protos/remote_execution.py +146 -0
  76. tetra_rp-0.20.0/src/tetra_rp/runtime/__init__.py +1 -0
  77. tetra_rp-0.20.0/src/tetra_rp/runtime/config.py +12 -0
  78. tetra_rp-0.20.0/src/tetra_rp/runtime/exceptions.py +49 -0
  79. tetra_rp-0.20.0/src/tetra_rp/runtime/generic_handler.py +206 -0
  80. tetra_rp-0.20.0/src/tetra_rp/runtime/lb_handler.py +239 -0
  81. tetra_rp-0.20.0/src/tetra_rp/runtime/manifest_client.py +141 -0
  82. tetra_rp-0.20.0/src/tetra_rp/runtime/manifest_fetcher.py +192 -0
  83. tetra_rp-0.20.0/src/tetra_rp/runtime/models.py +73 -0
  84. tetra_rp-0.20.0/src/tetra_rp/runtime/mothership_provisioner.py +462 -0
  85. tetra_rp-0.20.0/src/tetra_rp/runtime/production_wrapper.py +264 -0
  86. tetra_rp-0.20.0/src/tetra_rp/runtime/serialization.py +124 -0
  87. tetra_rp-0.20.0/src/tetra_rp/runtime/service_registry.py +316 -0
  88. tetra_rp-0.20.0/src/tetra_rp/runtime/state_manager_client.py +248 -0
  89. tetra_rp-0.20.0/src/tetra_rp/stubs/__init__.py +5 -0
  90. tetra_rp-0.20.0/src/tetra_rp/stubs/live_serverless.py +151 -0
  91. tetra_rp-0.20.0/src/tetra_rp/stubs/load_balancer_sls.py +357 -0
  92. tetra_rp-0.20.0/src/tetra_rp/stubs/registry.py +211 -0
  93. tetra_rp-0.20.0/src/tetra_rp/stubs/serverless.py +30 -0
  94. tetra_rp-0.20.0/src/tetra_rp.egg-info/PKG-INFO +1144 -0
  95. tetra_rp-0.20.0/src/tetra_rp.egg-info/SOURCES.txt +97 -0
  96. tetra_rp-0.20.0/src/tetra_rp.egg-info/dependency_links.txt +1 -0
  97. tetra_rp-0.20.0/src/tetra_rp.egg-info/entry_points.txt +2 -0
  98. tetra_rp-0.20.0/src/tetra_rp.egg-info/requires.txt +11 -0
  99. tetra_rp-0.20.0/src/tetra_rp.egg-info/top_level.txt +1 -0
@@ -0,0 +1,1144 @@
1
+ Metadata-Version: 2.4
2
+ Name: tetra_rp
3
+ Version: 0.20.0
4
+ Summary: A Python library for distributed inference and serving of machine learning models
5
+ Author-email: Marut Pandya <pandyamarut@gmail.com>, Patrick Rachford <prachford@icloud.com>, Dean Quinanola <dean.quinanola@runpod.io>
6
+ License: MIT
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: <3.14,>=3.9
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: cloudpickle>=3.1.1
14
+ Requires-Dist: runpod
15
+ Requires-Dist: python-dotenv>=1.0.0
16
+ Requires-Dist: pydantic>=2.0.0
17
+ Requires-Dist: rich>=14.0.0
18
+ Requires-Dist: typer>=0.12.0
19
+ Requires-Dist: questionary>=2.0.0
20
+ Requires-Dist: pathspec>=0.11.0
21
+ Requires-Dist: tomli>=2.0.0; python_version < "3.11"
22
+
23
+ # Flash: Serverless computing for AI workloads
24
+
25
+ Runpod Flash is a Python SDK that streamlines the development and deployment of AI workflows on Runpod's [Serverless infrastructure](http://docs.runpod.io/serverless/overview). Write Python functions locally, and Flash handles the infrastructure, provisioning GPUs and CPUs, managing dependencies, and transferring data, allowing you to focus on building AI applications.
26
+
27
+ You can find a repository of prebuilt Flash examples at [runpod/flash-examples](https://github.com/runpod/flash-examples).
28
+
29
+ > [!NOTE]
30
+ > **New feature - Consolidated template management:** `PodTemplate` overrides now seamlessly integrate with `ServerlessResource` defaults, providing more consistent resource configuration and reducing deployment complexity.
31
+
32
+ ## Table of contents
33
+
34
+ - [Overview](#overview)
35
+ - [Get started](#get-started)
36
+ - [Create Flash API endpoints](#create-flash-api-endpoints)
37
+ - [Key concepts](#key-concepts)
38
+ - [How it works](#how-it-works)
39
+ - [Advanced features](#advanced-features)
40
+ - [Configuration](#configuration)
41
+ - [Workflow examples](#workflow-examples)
42
+ - [Use cases](#use-cases)
43
+ - [Limitations](#limitations)
44
+ - [Contributing](#contributing)
45
+ - [Troubleshooting](#troubleshooting)
46
+
47
+ ## Overview
48
+
49
+ There are two basic modes for using Flash. You can:
50
+
51
+ - Build and run standalone Python scripts using the `@remote` decorator.
52
+ - Create Flash API endpoints with FastAPI (using the same script syntax).
53
+
54
+ Follow the steps in the next section to install Flash and create your first script before learning how to [create Flash API endpoints](#create-flash-api-endpoints).
55
+
56
+ To learn more about how Flash works, see [Key concepts](#key-concepts).
57
+
58
+ ## Get started
59
+
60
+ Before you can use Flash, you'll need:
61
+
62
+ - Python 3.9 (or higher) installed on your local machine.
63
+ - A Runpod account with an API key ([sign up here](https://runpod.io/console)).
64
+ - Basic knowledge of Python and async programming.
65
+
66
+ ### Step 1: Install Flash
67
+
68
+ ```bash
69
+ pip install tetra_rp
70
+ ```
71
+
72
+ ### Step 2: Set your API key
73
+
74
+ Generate an API key from the [Runpod account settings](https://docs.runpod.io/get-started/api-keys) page and set it as an environment variable:
75
+
76
+ ```bash
77
+ export RUNPOD_API_KEY=[YOUR_API_KEY]
78
+ ```
79
+
80
+ Or save it in a `.env` file in your project directory:
81
+
82
+ ```bash
83
+ echo "RUNPOD_API_KEY=[YOUR_API_KEY]" > .env
84
+ ```
85
+
86
+ ### Step 3: Create your first Flash function
87
+
88
+ Add the following code to a new Python file:
89
+
90
+ ```python
91
+ import asyncio
92
+ from tetra_rp import remote, LiveServerless
93
+ from dotenv import load_dotenv
94
+
95
+ # Uncomment if using a .env file
96
+ # load_dotenv()
97
+
98
+ # Configure GPU resources
99
+ gpu_config = LiveServerless(name="flash-quickstart")
100
+
101
+ @remote(
102
+ resource_config=gpu_config,
103
+ dependencies=["torch", "numpy"]
104
+ )
105
+ def gpu_compute(data):
106
+ import torch
107
+ import numpy as np
108
+
109
+ # This runs on a GPU in Runpod's cloud
110
+ tensor = torch.tensor(data, device="cuda")
111
+ result = tensor.sum().item()
112
+
113
+ return {
114
+ "result": result,
115
+ "device": torch.cuda.get_device_name(0)
116
+ }
117
+
118
+ async def main():
119
+ # This runs locally
120
+ result = await gpu_compute([1, 2, 3, 4, 5])
121
+ print(f"Sum: {result['result']}")
122
+ print(f"Computed on: {result['device']}")
123
+
124
+ if __name__ == "__main__":
125
+ asyncio.run(main())
126
+ ```
127
+
128
+ Run the example:
129
+
130
+ ```bash
131
+ python your_script.py
132
+ ```
133
+
134
+ The first time you run the script, it takes significantly longer than subsequent runs (about one minute for the first run vs. one second for later runs), because your endpoint must be initialized.
135
+
136
+ When it's finished, you should see output similar to this:
137
+
138
+ ```bash
139
+ 2025-11-19 12:35:15,109 | INFO | Created endpoint: rb50waqznmn2kg - flash-quickstart-fb
140
+ 2025-11-19 12:35:15,112 | INFO | URL: https://console.runpod.io/serverless/user/endpoint/rb50waqznmn2kg
141
+ 2025-11-19 12:35:15,114 | INFO | LiveServerless:rb50waqznmn2kg | API /run
142
+ 2025-11-19 12:35:15,655 | INFO | LiveServerless:rb50waqznmn2kg | Started Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2
143
+ 2025-11-19 12:35:15,762 | INFO | Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2 | Status: IN_QUEUE
144
+ 2025-11-19 12:35:16,301 | INFO | Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2 | .
145
+ 2025-11-19 12:35:17,756 | INFO | Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2 | ..
146
+ 2025-11-19 12:35:22,610 | INFO | Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2 | ...
147
+ 2025-11-19 12:35:37,163 | INFO | Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2 | ....
148
+ 2025-11-19 12:35:59,248 | INFO | Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2 | .....
149
+ 2025-11-19 12:36:09,983 | INFO | Job:b0b341e7-e460-4305-9acd-fc2dfd1bd65c-u2 | Status: COMPLETED
150
+ 2025-11-19 12:36:10,068 | INFO | Worker:icmkdgnrmdf8gz | Delay Time: 51842 ms
151
+ 2025-11-19 12:36:10,068 | INFO | Worker:icmkdgnrmdf8gz | Execution Time: 1533 ms
152
+ 2025-11-19 17:36:07,485 | INFO | Installing Python dependencies: ['torch', 'numpy']
153
+ Sum: 15
154
+ Computed on: NVIDIA GeForce RTX 4090
155
+ ```
156
+
157
+ ## Create Flash API endpoints
158
+
159
+ > [!NOTE]
160
+ > **Flash API endpoints are currently only available for local testing:** Using `flash run` will start the API server on your local machine. Future updates will add the ability to build and deploy API servers for production deployments.
161
+
162
+ You can use Flash to deploy and serve API endpoints that compute responses using GPU and CPU Serverless workers. These endpoints run scripts using the same Python `@remote` decorators [demonstrated above](#get-started).
163
+
164
+ ### Step 1: Initialize a new project
165
+
166
+ Use the `flash init` command to generate a structured project template with a preconfigured FastAPI application entry point.
167
+
168
+ Run this command to initialize a new project directory:
169
+
170
+ ```bash
171
+ flash init my_project
172
+ ```
173
+
174
+ You can also initialize your current directory:
175
+
+ ```bash
176
+ flash init
177
+ ```
178
+
179
+ ### Step 2: Explore the project template
180
+
181
+ This is the structure of the project template created by `flash init`:
182
+
183
+ ```txt
184
+ my_project/
185
+ ├── main.py # FastAPI application entry point
186
+ ├── workers/
187
+ │ ├── gpu/ # GPU worker example
188
+ │ │ ├── __init__.py # FastAPI router
189
+ │ │ └── endpoint.py # GPU script @remote decorated function
190
+ │ └── cpu/ # CPU worker example
191
+ │ ├── __init__.py # FastAPI router
192
+ │ └── endpoint.py # CPU script with @remote decorated function
193
+ ├── .env # Environment variable template
194
+ ├── .gitignore # Git ignore patterns
195
+ ├── .flashignore # Flash deployment ignore patterns
196
+ ├── requirements.txt # Python dependencies
197
+ └── README.md # Project documentation
198
+ ```
199
+
200
+ This template includes:
201
+
202
+ - A FastAPI application entry point and routers.
203
+ - Templates for Python dependencies, `.env`, `.gitignore`, etc.
204
+ - Flash scripts (`endpoint.py`) for both GPU and CPU workers, which include:
205
+ - Pre-configured worker scaling limits using the `LiveServerless()` object.
206
+ - A `@remote` decorated function that returns a response from a worker.
207
+
208
+ When you start the FastAPI server, it creates API endpoints at `/gpu/hello` and `/cpu/hello`, which call the remote functions defined in their respective `endpoint.py` files.
209
+
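+ For orientation, here's a minimal sketch of how a worker script fits this structure. It is illustrative only; the names, defaults, and function body below are assumptions rather than the generated template's exact code:
+
+ ```python
+ # workers/gpu/endpoint.py (illustrative sketch, not the template verbatim)
+ from tetra_rp import remote, LiveServerless
+
+ # Pre-configured worker scaling limits
+ gpu_config = LiveServerless(name="my-project-gpu", workersMax=2)
+
+ @remote(resource_config=gpu_config, dependencies=["torch"])
+ def hello(message: str):
+     import torch
+
+     # Runs on a GPU worker in Runpod's cloud
+     return {"echo": message, "cuda_available": torch.cuda.is_available()}
+ ```
+
+ The matching `__init__.py` wraps this function in a FastAPI router (for example, a `POST /hello` route that awaits `hello(...)`), and `main.py` includes that router so the route is served at `/gpu/hello`.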
210
+ ### Step 3: Install Python dependencies
211
+
212
+ After initializing the project, navigate into the project directory:
213
+
214
+ ```bash
215
+ cd my_project
216
+ ```
217
+
218
+ Install required dependencies:
219
+
220
+ ```bash
221
+ pip install -r requirements.txt
222
+ ```
223
+
224
+ ### Step 4: Configure your API key
225
+
226
+ Open the `.env` template file in a text editor and add your [Runpod API key](https://docs.runpod.io/get-started/api-keys):
227
+
228
+ ```bash
229
+ # Use your text editor of choice, e.g.
230
+ cursor .env
231
+ ```
232
+
233
+ Remove the `#` symbol from the beginning of the `RUNPOD_API_KEY` line and replace `your_api_key_here` with your actual Runpod API key:
234
+
235
+ ```txt
236
+ RUNPOD_API_KEY=your_api_key_here
237
+ # FLASH_HOST=localhost
238
+ # FLASH_PORT=8888
239
+ # LOG_LEVEL=INFO
240
+ ```
241
+
242
+ Save the file and close it.
243
+
244
+ ### Step 5: Start the local API server
245
+
246
+ Use `flash run` to start the API server:
247
+
248
+ ```bash
249
+ flash run
250
+ ```
251
+
252
+ Open a new terminal tab or window and test your GPU API using cURL:
253
+
254
+ ```bash
255
+ curl -X POST http://localhost:8888/gpu/hello \
256
+ -H "Content-Type: application/json" \
257
+ -d '{"message": "Hello from the GPU!"}'
258
+ ```
259
+
260
+ If you switch back to the terminal tab where you used `flash run`, you'll see the details of the job's progress.
261
+
262
+ ### Faster testing with auto-provisioning
263
+
264
+ For development with multiple endpoints, use `--auto-provision` to deploy all resources before testing:
265
+
266
+ ```bash
267
+ flash run --auto-provision
268
+ ```
269
+
270
+ This eliminates cold-start delays by provisioning all serverless endpoints upfront. Endpoints are cached and reused across server restarts, making subsequent runs much faster. Resources are identified by name, so the same endpoint won't be re-deployed if configuration hasn't changed.
271
+
272
+ ### Step 6: Open the API explorer
273
+
274
+ Besides starting the API server, `flash run` also starts an interactive API explorer. Point your web browser at [http://localhost:8888/docs](http://localhost:8888/docs) to explore the API.
275
+
276
+ To run remote functions in the explorer:
277
+
278
+ 1. Expand one of the functions under **GPU Workers** or **CPU Workers**.
279
+ 2. Click **Try it out** and then **Execute**.
280
+
281
+ You'll get a response from your workers right in the explorer.
282
+
283
+ ### Step 7: Customize your API
284
+
285
+ To customize your API endpoint and functionality:
286
+
287
+ 1. Add/edit remote functions in your `endpoint.py` files.
288
+ 2. Test the scripts individually by running `python endpoint.py`.
289
+ 3. Configure your FastAPI routers by editing the `__init__.py` files.
290
+ 4. Add any new endpoints to your `main.py` file.
291
+
292
+ ## Key concepts
293
+
294
+ ### Remote functions
295
+
296
+ The Flash `@remote` decorator marks functions for execution on Runpod's infrastructure. Everything inside the decorated function runs remotely, while code outside runs locally.
297
+
298
+ ```python
299
+ @remote(resource_config=config, dependencies=["pandas"])
300
+ def process_data(data):
301
+ # This code runs remotely
302
+ import pandas as pd
303
+ df = pd.DataFrame(data)
304
+ return df.describe().to_dict()
305
+
306
+ async def main():
307
+ # This code runs locally
308
+ result = await process_data(my_data)
309
+ ```
310
+
311
+ ### Resource configuration
312
+
313
+ Flash provides fine-grained control over hardware allocation through configuration objects:
314
+
315
+ ```python
316
+ from tetra_rp import LiveServerless, GpuGroup, CpuInstanceType, PodTemplate
317
+
318
+ # GPU configuration
319
+ gpu_config = LiveServerless(
320
+ name="ml-inference",
321
+ gpus=[GpuGroup.AMPERE_80], # A100 80GB
322
+ workersMax=5,
323
+ template=PodTemplate(containerDiskInGb=100) # Extra disk space
324
+ )
325
+
326
+ # CPU configuration
327
+ cpu_config = LiveServerless(
328
+ name="data-processor",
329
+ instanceIds=[CpuInstanceType.CPU5C_4_16], # 4 vCPU, 16GB RAM
330
+ workersMax=3
331
+ )
332
+ ```
333
+
334
+ ### Dependency management
335
+
336
+ Specify Python packages in the decorator, and Flash installs them automatically:
337
+
338
+ ```python
339
+ @remote(
340
+ resource_config=gpu_config,
341
+ dependencies=["transformers==4.36.0", "torch", "pillow"]
342
+ )
343
+ def generate_image(prompt):
344
+ # Import inside the function
345
+ from transformers import pipeline
346
+ import torch
347
+ from PIL import Image
348
+
349
+ # Your code here
350
+ ```
351
+
352
+ ### Parallel execution
353
+
354
+ Run multiple remote functions concurrently using Python's async capabilities:
355
+
356
+ ```python
357
+ # Process multiple items in parallel
358
+ results = await asyncio.gather(
359
+ process_item(item1),
360
+ process_item(item2),
361
+ process_item(item3)
362
+ )
363
+ ```
364
+
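+ As a self-contained sketch of this pattern, the example below fans a batch of inputs out to a remote function and gathers the results. The resource name and the body of `process_item` are placeholders:
+
+ ```python
+ import asyncio
+ from tetra_rp import remote, LiveServerless
+
+ batch_config = LiveServerless(name="parallel-batch-example")  # illustrative name
+
+ @remote(resource_config=batch_config, dependencies=["numpy"])
+ def process_item(item):
+     import numpy as np
+
+     # Placeholder workload; each call may land on a different worker
+     return float(np.sum(np.array(item)))
+
+ async def main():
+     items = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+     # Dispatch every item concurrently and wait for all results
+     results = await asyncio.gather(*(process_item(item) for item in items))
+     print(results)
+
+ if __name__ == "__main__":
+     asyncio.run(main())
+ ```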
365
+ ### Load-balanced endpoints with HTTP routing
366
+
367
+ For API endpoints requiring low-latency HTTP access with direct routing, use load-balanced endpoints:
368
+
369
+ ```python
370
+ from tetra_rp import LiveLoadBalancer, remote
371
+
372
+ api = LiveLoadBalancer(name="api-service")
373
+
374
+ @remote(api, method="POST", path="/api/process")
375
+ async def process_data(x: int, y: int):
376
+ return {"result": x + y}
377
+
378
+ @remote(api, method="GET", path="/api/health")
379
+ def health_check():
380
+ return {"status": "ok"}
381
+
382
+ # Call functions directly
383
+ result = await process_data(5, 3) # → {"result": 8}
384
+ ```
385
+
386
+ **Key differences from queue-based endpoints:**
387
+ - **Direct HTTP routing** - Requests routed directly to workers, no queue
388
+ - **Lower latency** - No queuing overhead
389
+ - **Custom HTTP methods** - GET, POST, PUT, DELETE, PATCH support
390
+ - **No automatic retries** - Users handle errors directly (see the error-handling sketch below)
391
+
392
+ Load-balanced endpoints are ideal for REST APIs, webhooks, and real-time services. Queue-based endpoints are better for batch processing and fault-tolerant workflows.
393
+
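+ Because there are no automatic retries, wrap load-balanced calls in your own error handling. Here is a minimal sketch, reusing `process_data` from the example above; the exception type and backoff policy are your choice, and nothing here is prescribed by the SDK:
+
+ ```python
+ import asyncio
+
+ async def call_with_retry(attempts: int = 3):
+     for attempt in range(1, attempts + 1):
+         try:
+             # process_data is the load-balanced @remote function defined above
+             return await process_data(5, 3)
+         except Exception:  # narrow this to the errors you actually expect
+             if attempt == attempts:
+                 raise
+             await asyncio.sleep(2 ** attempt)  # simple exponential backoff
+ ```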
394
+ For detailed information:
395
+ - **User guide:** [Using @remote with Load-Balanced Endpoints](docs/Using_Remote_With_LoadBalancer.md)
396
+ - **Runtime architecture:** [LoadBalancer Runtime Architecture](docs/LoadBalancer_Runtime_Architecture.md) - details on deployment, request flows, and execution
397
+
398
+ ## How it works
399
+
400
+ Flash orchestrates workflow execution through a sophisticated multi-step process:
401
+
402
+ 1. **Function identification**: The `@remote` decorator marks functions for remote execution, enabling Flash to distinguish between local and remote operations.
403
+ 2. **Dependency analysis**: Flash automatically analyzes function dependencies to construct an optimal execution order, ensuring data flows correctly between sequential and parallel operations.
404
+ 3. **Resource provisioning and execution**: For each remote function, Flash:
405
+ - Dynamically provisions endpoint and worker resources on Runpod's infrastructure.
406
+ - Serializes and securely transfers input data to the remote worker.
407
+ - Executes the function on the remote infrastructure with the specified GPU or CPU resources.
408
+ - Returns results to your local environment for further processing.
409
+ 4. **Data orchestration**: Results flow seamlessly between functions according to your local Python code structure, maintaining the same programming model whether functions run locally or remotely.
410
+
411
+
412
+ ## Advanced features
413
+
414
+ ### Custom Docker images
415
+
416
+ `LiveServerless` resources use a fixed Docker image that's optimized for the Flash runtime and supports full remote code execution. For specialized environments that require a custom Docker image, use `ServerlessEndpoint` or `CpuServerlessEndpoint`:
417
+
418
+ ```python
419
+ from tetra_rp import ServerlessEndpoint, GpuGroup
420
+
421
+ custom_gpu = ServerlessEndpoint(
422
+ name="custom-ml-env",
423
+ imageName="pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime",
424
+ gpus=[GpuGroup.AMPERE_80]
425
+ )
426
+ ```
427
+
428
+ Unlike `LiveServerless`, these endpoints only support dictionary payloads in the form of `{"input": {...}}` (similar to a traditional [Serverless endpoint request](https://docs.runpod.io/serverless/endpoints/send-requests)), and cannot execute arbitrary Python functions remotely.
429
+
430
+ ### Persistent storage with network volumes
431
+
432
+ Attach [network volumes](https://docs.runpod.io/storage/network-volumes) for persistent storage across workers and endpoints:
433
+
434
+ ```python
435
+ config = LiveServerless(
436
+ name="model-server",
437
+ networkVolumeId="vol_abc123", # Your volume ID
438
+ template=PodTemplate(containerDiskInGb=100)
439
+ )
440
+ ```
441
+
442
+ ### Environment variables
443
+
444
+ Pass configuration to remote functions:
445
+
446
+ ```python
447
+ config = LiveServerless(
448
+ name="api-worker",
449
+ env={"HF_TOKEN": "your_token", "MODEL_ID": "gpt2"}
450
+ )
451
+ ```
452
+
453
+ Environment variables are excluded from configuration hashing, which means changing environment values won't trigger endpoint recreation. This allows different processes to load environment variables from `.env` files without causing false drift detection. Only structural changes (like GPU type, image, or template modifications) trigger endpoint updates.
454
+
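+ For example, the same configuration can read secrets from the local environment on every run without triggering a redeploy, since only structural fields affect the configuration hash. (Reading `HF_TOKEN` from `os.environ` below is purely illustrative.)
+
+ ```python
+ import os
+ from tetra_rp import LiveServerless
+
+ config = LiveServerless(
+     name="api-worker",
+     # Changing this value between runs does not recreate the endpoint...
+     env={"HF_TOKEN": os.environ.get("HF_TOKEN", "")},
+     # ...but changing structural fields such as gpus or template would.
+ )
+ ```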
455
+ ### Build process and handler generation
456
+
457
+ Flash uses a sophisticated build process to package your application for deployment. Understanding how handlers are generated helps you debug issues and optimize your deployments.
458
+
459
+ #### How Flash builds your application
460
+
461
+ When you run `flash build`, the following happens:
462
+
463
+ 1. **Discovery**: Flash scans your code for `@remote` decorated functions
464
+ 2. **Grouping**: Functions are grouped by their `resource_config`
465
+ 3. **Handler Generation**: For each resource config, Flash generates a lightweight handler file
466
+ 4. **Manifest Creation**: A `flash_manifest.json` file maps functions to their endpoints
467
+ 5. **Dependency Installation**: Python packages are installed with Linux x86_64 compatibility
468
+ 6. **Packaging**: Everything is bundled into `archive.tar.gz` for deployment
469
+
470
+ #### Cross-platform builds
471
+
472
+ Flash automatically handles cross-platform builds, ensuring your deployments work correctly regardless of your development platform:
473
+
474
+ - **Automatic Platform Targeting**: Dependencies are installed for Linux x86_64 (RunPod's serverless platform), even when building on macOS or Windows
475
+ - **Python Version Matching**: The build uses your current Python version to ensure package compatibility
476
+ - **Binary Wheel Enforcement**: Only pre-built binary wheels are used, preventing platform-specific compilation issues
477
+
478
+ This means you can build on macOS ARM64, Windows, or any other platform, and the resulting package will run correctly on RunPod serverless.
479
+
480
+ #### Handler architecture
481
+
482
+ Flash uses a factory pattern for handlers to eliminate code duplication:
483
+
484
+ ```python
485
+ # Generated handler (handler_gpu_config.py)
486
+ from tetra_rp.runtime.generic_handler import create_handler
487
+ from workers.gpu import process_data
488
+
489
+ FUNCTION_REGISTRY = {
490
+ "process_data": process_data,
491
+ }
492
+
493
+ handler = create_handler(FUNCTION_REGISTRY)
494
+ ```
495
+
496
+ This approach provides:
497
+ - **Single source of truth**: All handler logic in one place
498
+ - **Easier maintenance**: Bug fixes don't require rebuilding projects
499
+
500
+ #### Cross-endpoint function calls
501
+
502
+ Flash enables functions on different endpoints to call each other. The runtime automatically discovers endpoints using the manifest and routes calls appropriately:
503
+
504
+ ```python
505
+ # CPU endpoint function
506
+ @remote(resource_config=cpu_config)
507
+ def preprocess(data):
508
+ return clean_data
509
+
510
+ # GPU endpoint function
511
+ @remote(resource_config=gpu_config)
512
+ async def inference(data):
513
+ # Can call CPU endpoint function
514
+ clean = preprocess(data)
515
+ return result
516
+ ```
517
+
518
+ The runtime wrapper handles service discovery and routing automatically.
519
+
520
+ #### Build artifacts
521
+
522
+ After `flash build` completes:
523
+ - `.flash/.build/`: Temporary build directory (removed unless `--keep-build`)
524
+ - `.flash/archive.tar.gz`: Deployment package
525
+ - `.flash/flash_manifest.json`: Service discovery configuration
526
+
527
+ For more details on the handler architecture, see [docs/Runtime_Generic_Handler.md](docs/Runtime_Generic_Handler.md).
528
+
529
+ For information on load-balanced endpoints (required for Mothership and HTTP services), see [docs/Load_Balancer_Endpoints.md](docs/Load_Balancer_Endpoints.md).
530
+
531
+ #### Troubleshooting build issues
532
+
533
+ **No @remote functions found:**
534
+ - Ensure your functions are decorated with `@remote(resource_config)`
535
+ - Check that Python files are not excluded by `.gitignore` or `.flashignore`
536
+ - Verify function decorators have valid syntax
537
+
538
+ **Handler generation failed:**
539
+ - Check for syntax errors in your Python files (these will be logged)
540
+ - Verify all imports in your worker modules are available
541
+ - Ensure resource config variables (e.g., `gpu_config`) are defined before functions reference them
542
+ - Use `--keep-build` to inspect generated handler files in `.flash/.build/`
543
+
544
+ **Build succeeded but deployment failed:**
545
+ - Verify all function imports work in the deployment environment
546
+ - Check that environment variables required by your functions are available
547
+ - Review the generated `flash_manifest.json` for correct function mappings
548
+
549
+ **Dependency installation failed:**
550
+ - If a package doesn't have pre-built Linux x86_64 wheels, the build will fail with an error
551
+ - For newer Python versions (3.13+), some packages may require manylinux_2_27 or higher
552
+ - Ensure you have standard pip installed (`python -m ensurepip --upgrade`) for best compatibility
553
+ - uv pip has known issues with newer manylinux tags - standard pip is recommended
554
+ - Check PyPI to verify the package supports your Python version on Linux
555
+
556
+ #### Managing bundle size
557
+
558
+ RunPod serverless has a **500MB deployment limit**. Exceeding this limit will cause deployment failures.
559
+
560
+ Use `--exclude` to skip packages already in your worker-tetra Docker image:
561
+
562
+ ```bash
563
+ # For GPU deployments (PyTorch pre-installed)
564
+ flash build --exclude torch,torchvision,torchaudio
565
+
566
+ # Check your resource config to determine which base image you're using
567
+ ```
568
+
569
+ **Which packages to exclude depends on your resource config:**
570
+ - **GPU resources** → PyTorch images have torch/torchvision/torchaudio pre-installed
571
+ - **CPU resources** → Python slim images have NO ML frameworks pre-installed
572
+ - **Load-balanced** → Same as above, depends on GPU vs CPU variant
573
+
574
+ See [worker-tetra](https://github.com/runpod-workers/worker-tetra) for base image details.
575
+
576
+ ## Configuration
577
+
578
+ ### GPU configuration parameters
579
+
580
+ The following parameters can be used with `LiveServerless` (full remote code execution) and `ServerlessEndpoint` (dictionary payload only) to configure your Runpod GPU endpoints:
581
+
582
+ | Parameter | Description | Default | Example Values |
583
+ |--------------------|-------------------------------------------------|---------------|-------------------------------------|
584
+ | `name` | (Required) Name for your endpoint | `""` | `"stable-diffusion-server"` |
585
+ | `gpus` | GPU pool IDs that can be used by workers | `[GpuGroup.ANY]` | `[GpuGroup.ADA_24]` for RTX 4090 |
586
+ | `gpuCount` | Number of GPUs per worker | 1 | 1, 2, 4 |
587
+ | `workersMin` | Minimum number of workers | 0 | Set to 1 for persistence |
588
+ | `workersMax` | Maximum number of workers | 3 | Higher for more concurrency |
589
+ | `idleTimeout` | Minutes before scaling down | 5 | 10, 30, 60 |
590
+ | `env` | Environment variables | `None` | `{"HF_TOKEN": "xyz"}` |
591
+ | `networkVolumeId` | Persistent storage ID | `None` | `"vol_abc123"` |
592
+ | `executionTimeoutMs` | Max execution time (ms) | 0 (no limit) | 600000 (10 min) |
593
+ | `scalerType` | Scaling strategy | `QUEUE_DELAY` | `REQUEST_COUNT` |
594
+ | `scalerValue` | Scaling parameter value | 4 | 1-10 range typical |
595
+ | `locations` | Preferred datacenter locations | `None` | `"us-east,eu-central"` |
596
+ | `imageName` | Custom Docker image (`ServerlessEndpoint` only) | Fixed for LiveServerless | `"pytorch/pytorch:latest"`, `"my-registry/custom:v1.0"` |
597
+
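+ As a quick illustration, the sketch below combines several of these parameters in a single configuration. The values are examples drawn from the table, not recommendations:
+
+ ```python
+ from tetra_rp import LiveServerless, GpuGroup
+
+ inference_config = LiveServerless(
+     name="stable-diffusion-server",
+     gpus=[GpuGroup.ADA_24],        # RTX 4090 pool
+     workersMax=3,
+     idleTimeout=10,                # minutes before scaling down
+     executionTimeoutMs=600000,     # 10-minute cap per request
+     env={"HF_TOKEN": "xyz"},
+     networkVolumeId="vol_abc123",  # optional persistent storage
+ )
+ ```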
598
+ ### CPU configuration parameters
599
+
600
+ The same GPU configuration parameters above apply to `LiveServerless` (full remote code execution) and `CpuServerlessEndpoint` (dictionary payload only), with these additional CPU-specific parameters:
601
+
602
+ | Parameter | Description | Default | Example Values |
603
+ |--------------------|-------------------------------------------------|---------------|-------------------------------------|
604
+ | `instanceIds` | CPU Instance Types (forces a CPU endpoint type) | `None` | `[CpuInstanceType.CPU5C_2_4]` |
605
+ | `imageName` | Custom Docker image (`CpuServerlessEndpoint` only) | Fixed for `LiveServerless` | `"python:3.11-slim"`, `"my-registry/custom:v1.0"` |
606
+
607
+ ### Resource class comparison
608
+
609
+ | Feature | LiveServerless | ServerlessEndpoint | CpuServerlessEndpoint |
610
+ |---------|----------------|-------------------|----------------------|
611
+ | **Remote code execution** | ✅ Full Python function execution | ❌ Dictionary payload only | ❌ Dictionary payload only |
612
+ | **Custom Docker images** | ❌ Fixed optimized images | ✅ Any Docker image | ✅ Any Docker image |
613
+ | **Use case** | Dynamic remote functions | Traditional API endpoints | Traditional CPU endpoints |
614
+ | **Function returns** | Any Python object | Dictionary only | Dictionary only |
615
+ | **@remote decorator** | Full functionality | Limited to payload passing | Limited to payload passing |
616
+
617
+ ### Available GPU types
618
+
619
+ Some common GPU groups available through `GpuGroup`:
620
+
621
+ - `GpuGroup.ANY` - Any available GPU (default)
622
+ - `GpuGroup.ADA_24` - NVIDIA GeForce RTX 4090
623
+ - `GpuGroup.AMPERE_80` - NVIDIA A100 80GB
624
+ - `GpuGroup.AMPERE_48` - NVIDIA A40, RTX A6000
625
+ - `GpuGroup.AMPERE_24` - NVIDIA RTX A5000, L4, RTX 3090
626
+
627
+
628
+ ### Available CPU instance types
629
+
630
+ - `CpuInstanceType.CPU3G_1_4` - (cpu3g-1-4) 3rd gen general purpose, 1 vCPU, 4GB RAM
631
+ - `CpuInstanceType.CPU3G_2_8` - (cpu3g-2-8) 3rd gen general purpose, 2 vCPU, 8GB RAM
632
+ - `CpuInstanceType.CPU3G_4_16` - (cpu3g-4-16) 3rd gen general purpose, 4 vCPU, 16GB RAM
633
+ - `CpuInstanceType.CPU3G_8_32` - (cpu3g-8-32) 3rd gen general purpose, 8 vCPU, 32GB RAM
634
+ - `CpuInstanceType.CPU3C_1_2` - (cpu3c-1-2) 3rd gen compute-optimized, 1 vCPU, 2GB RAM
635
+ - `CpuInstanceType.CPU3C_2_4` - (cpu3c-2-4) 3rd gen compute-optimized, 2 vCPU, 4GB RAM
636
+ - `CpuInstanceType.CPU3C_4_8` - (cpu3c-4-8) 3rd gen compute-optimized, 4 vCPU, 8GB RAM
637
+ - `CpuInstanceType.CPU3C_8_16` - (cpu3c-8-16) 3rd gen compute-optimized, 8 vCPU, 16GB RAM
638
+ - `CpuInstanceType.CPU5C_1_2` - (cpu5c-1-2) 5th gen compute-optimized, 1 vCPU, 2GB RAM
639
+ - `CpuInstanceType.CPU5C_2_4` - (cpu5c-2-4) 5th gen compute-optimized, 2 vCPU, 4GB RAM
640
+ - `CpuInstanceType.CPU5C_4_8` - (cpu5c-4-8) 5th gen compute-optimized, 4 vCPU, 8GB RAM
641
+ - `CpuInstanceType.CPU5C_8_16` - (cpu5c-8-16) 5th gen compute-optimized, 8 vCPU, 16GB RAM
642
+
643
+ ## Workflow examples
644
+
645
+ ### Basic GPU workflow
646
+
647
+ ```python
648
+ import asyncio
649
+ from tetra_rp import remote, LiveServerless
650
+
651
+ # Simple GPU configuration
652
+ gpu_config = LiveServerless(name="example-gpu-server")
653
+
654
+ @remote(
655
+ resource_config=gpu_config,
656
+ dependencies=["torch", "numpy"]
657
+ )
658
+ def gpu_compute(data):
659
+ import torch
660
+ import numpy as np
661
+
662
+ # Convert to tensor and perform computation on GPU
663
+ tensor = torch.tensor(data, device="cuda")
664
+ result = tensor.sum().item()
665
+
666
+ # Get GPU info
667
+ gpu_info = torch.cuda.get_device_properties(0)
668
+
669
+ return {
670
+ "result": result,
671
+ "gpu_name": gpu_info.name,
672
+ "cuda_version": torch.version.cuda
673
+ }
674
+
675
+ async def main():
676
+ result = await gpu_compute([1, 2, 3, 4, 5])
677
+ print(f"Result: {result['result']}")
678
+ print(f"Computed on: {result['gpu_name']} with CUDA {result['cuda_version']}")
679
+
680
+ if __name__ == "__main__":
681
+ asyncio.run(main())
682
+ ```
683
+
684
+ ### Advanced GPU workflow with template configuration
685
+
686
+ ```python
687
+ import asyncio
688
+ from tetra_rp import remote, LiveServerless, GpuGroup, PodTemplate
689
+ import base64
690
+
691
+ # Advanced GPU configuration with consolidated template overrides
692
+ sd_config = LiveServerless(
693
+ gpus=[GpuGroup.AMPERE_80], # A100 80GB GPUs
694
+ name="example_image_gen_server",
695
+ template=PodTemplate(containerDiskInGb=100), # Large disk for models
696
+ workersMax=3,
697
+ idleTimeout=10
698
+ )
699
+
700
+ @remote(
701
+ resource_config=sd_config,
702
+ dependencies=["diffusers", "transformers", "torch", "accelerate", "safetensors"]
703
+ )
704
+ def generate_image(prompt, width=512, height=512):
705
+ import torch
706
+ from diffusers import StableDiffusionPipeline
707
+ import io
708
+ import base64
709
+
710
+ # Load pipeline (benefits from large container disk)
711
+ pipeline = StableDiffusionPipeline.from_pretrained(
712
+ "runwayml/stable-diffusion-v1-5",
713
+ torch_dtype=torch.float16
714
+ )
715
+ pipeline = pipeline.to("cuda")
716
+
717
+ # Generate image
718
+ image = pipeline(prompt=prompt, width=width, height=height).images[0]
719
+
720
+ # Convert to base64 for return
721
+ buffered = io.BytesIO()
722
+ image.save(buffered, format="PNG")
723
+ img_str = base64.b64encode(buffered.getvalue()).decode()
724
+
725
+ return {"image": img_str, "prompt": prompt}
726
+
727
+ async def main():
728
+ result = await generate_image("A serene mountain landscape at sunset")
729
+ print(f"Generated image for: {result['prompt']}")
730
+ # Save image locally if needed
731
+ # img_data = base64.b64decode(result["image"])
732
+ # with open("output.png", "wb") as f:
733
+ # f.write(img_data)
734
+
735
+ if __name__ == "__main__":
736
+ asyncio.run(main())
737
+ ```
738
+
739
+ ### Basic CPU workflow
740
+
741
+ ```python
742
+ import asyncio
743
+ from tetra_rp import remote, LiveServerless, CpuInstanceType
744
+
745
+ # Simple CPU configuration
746
+ cpu_config = LiveServerless(
747
+ name="example-cpu-server",
748
+ instanceIds=[CpuInstanceType.CPU5G_2_8], # 2 vCPU, 8GB RAM
749
+ )
750
+
751
+ @remote(
752
+ resource_config=cpu_config,
753
+ dependencies=["pandas", "numpy"]
754
+ )
755
+ def cpu_data_processing(data):
756
+ import pandas as pd
757
+ import numpy as np
758
+ import platform
759
+
760
+ # Process data using CPU
761
+ df = pd.DataFrame(data)
762
+
763
+ return {
764
+ "row_count": len(df),
765
+ "column_count": len(df.columns) if not df.empty else 0,
766
+ "mean_values": df.select_dtypes(include=[np.number]).mean().to_dict(),
767
+ "system_info": platform.processor(),
768
+ "platform": platform.platform()
769
+ }
770
+
771
+ async def main():
772
+ sample_data = [
773
+ {"name": "Alice", "age": 30, "score": 85},
774
+ {"name": "Bob", "age": 25, "score": 92},
775
+ {"name": "Charlie", "age": 35, "score": 78}
776
+ ]
777
+
778
+ result = await cpu_data_processing(sample_data)
779
+ print(f"Processed {result['row_count']} rows on {result['platform']}")
780
+ print(f"Mean values: {result['mean_values']}")
781
+
782
+ if __name__ == "__main__":
783
+ asyncio.run(main())
784
+ ```
785
+
786
+ ### Advanced CPU workflow with template configuration
787
+
788
+ ```python
789
+ import asyncio
790
+ import numpy as np
791
+ from tetra_rp import remote, LiveServerless, CpuInstanceType, PodTemplate
792
+
793
+ # Advanced CPU configuration with template overrides
794
+ data_processing_config = LiveServerless(
795
+ name="advanced-cpu-processor",
796
+ instanceIds=[CpuInstanceType.CPU5C_4_16, CpuInstanceType.CPU3C_4_8], # Fallback options
797
+ template=PodTemplate(
798
+ containerDiskInGb=20, # Extra disk space for data processing
799
+ env=[{"key": "PYTHONPATH", "value": "/workspace"}] # Custom environment
800
+ ),
801
+ workersMax=5,
802
+ idleTimeout=15,
803
+ env={"PROCESSING_MODE": "batch", "DEBUG": "false"} # Additional env vars
804
+ )
805
+
806
+ @remote(
807
+ resource_config=data_processing_config,
808
+ dependencies=["pandas", "numpy", "scipy", "scikit-learn"]
809
+ )
810
+ def advanced_data_analysis(dataset, analysis_type="full"):
811
+ import pandas as pd
812
+ import numpy as np
813
+ from sklearn.preprocessing import StandardScaler
814
+ from sklearn.decomposition import PCA
815
+ import platform
816
+
817
+ # Create DataFrame
818
+ df = pd.DataFrame(dataset)
819
+
820
+ # Perform analysis based on type
821
+ results = {
822
+ "platform": platform.platform(),
823
+ "dataset_shape": df.shape,
824
+ "memory_usage": df.memory_usage(deep=True).sum()
825
+ }
826
+
827
+ if analysis_type == "full":
828
+ # Advanced statistical analysis
829
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
830
+ if len(numeric_cols) > 0:
831
+ # Standardize data
832
+ scaler = StandardScaler()
833
+ scaled_data = scaler.fit_transform(df[numeric_cols])
834
+
835
+ # PCA analysis
836
+ pca = PCA(n_components=min(len(numeric_cols), 3))
837
+ pca_result = pca.fit_transform(scaled_data)
838
+
839
+ results.update({
840
+ "correlation_matrix": df[numeric_cols].corr().to_dict(),
841
+ "pca_explained_variance": pca.explained_variance_ratio_.tolist(),
842
+ "pca_shape": pca_result.shape
843
+ })
844
+
845
+ return results
846
+
847
+ async def main():
848
+ # Generate sample dataset
849
+ sample_data = [
850
+ {"feature1": np.random.randn(), "feature2": np.random.randn(),
851
+ "feature3": np.random.randn(), "category": f"cat_{i%3}"}
852
+ for i in range(1000)
853
+ ]
854
+
855
+ result = await advanced_data_analysis(sample_data, "full")
856
+ print(f"Processed dataset with shape: {result['dataset_shape']}")
857
+ print(f"Memory usage: {result['memory_usage']} bytes")
858
+ print(f"PCA explained variance: {result.get('pca_explained_variance', 'N/A')}")
859
+
860
+ if __name__ == "__main__":
861
+ asyncio.run(main())
862
+ ```
863
+
864
+ ### Hybrid GPU/CPU workflow
865
+
866
+ ```python
867
+ import asyncio
+ import numpy as np
868
+ from tetra_rp import remote, LiveServerless, GpuGroup, CpuInstanceType, PodTemplate
869
+
870
+ # GPU configuration for model inference
871
+ gpu_config = LiveServerless(
872
+ name="ml-inference-gpu",
873
+ gpus=[GpuGroup.AMPERE_24], # RTX 3090/A5000
874
+ template=PodTemplate(containerDiskInGb=50), # Space for models
875
+ workersMax=2
876
+ )
877
+
878
+ # CPU configuration for data preprocessing
879
+ cpu_config = LiveServerless(
880
+ name="data-preprocessor",
881
+ instanceIds=[CpuInstanceType.CPU5C_4_16], # 4 vCPU, 16GB RAM
882
+ template=PodTemplate(
883
+ containerDiskInGb=30,
884
+ env=[{"key": "NUMPY_NUM_THREADS", "value": "4"}]
885
+ ),
886
+ workersMax=3
887
+ )
888
+
889
+ @remote(
890
+ resource_config=cpu_config,
891
+ dependencies=["pandas", "numpy", "scikit-learn"]
892
+ )
893
+ def preprocess_data(raw_data):
894
+ import pandas as pd
895
+ import numpy as np
896
+ from sklearn.preprocessing import StandardScaler
897
+
898
+ # Data cleaning and preprocessing
899
+ df = pd.DataFrame(raw_data)
900
+
901
+ # Handle missing values
902
+ df = df.fillna(df.mean(numeric_only=True))
903
+
904
+ # Normalize numeric features
905
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
906
+ if len(numeric_cols) > 0:
907
+ scaler = StandardScaler()
908
+ df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
909
+
910
+ return {
911
+ "processed_data": df.to_dict('records'),
912
+ "shape": df.shape,
913
+ "columns": list(df.columns)
914
+ }
915
+
916
+ @remote(
917
+ resource_config=gpu_config,
918
+ dependencies=["torch", "transformers", "numpy"]
919
+ )
920
+ def run_inference(processed_data):
921
+ import torch
922
+ import numpy as np
923
+
924
+ # Simulate ML model inference on GPU
925
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
926
+
927
+ # Convert to tensor
928
+ data_array = np.array([list(item.values()) for item in processed_data["processed_data"]])
929
+ tensor = torch.tensor(data_array, dtype=torch.float32).to(device)
930
+
931
+ # Simple neural network simulation
932
+ with torch.no_grad():
933
+ # Simulate model computation
934
+ result = torch.nn.functional.softmax(tensor.mean(dim=1), dim=0)
935
+ predictions = result.cpu().numpy().tolist()
936
+
937
+ return {
938
+ "predictions": predictions,
939
+ "device_used": str(device),
940
+ "input_shape": tensor.shape
941
+ }
942
+
943
+ async def ml_pipeline(raw_dataset):
944
+ """Complete ML pipeline: CPU preprocessing -> GPU inference"""
945
+ print("Step 1: Preprocessing data on CPU...")
946
+ preprocessed = await preprocess_data(raw_dataset)
947
+ print(f"Preprocessed data shape: {preprocessed['shape']}")
948
+
949
+ print("Step 2: Running inference on GPU...")
950
+ results = await run_inference(preprocessed)
951
+ print(f"Inference completed on: {results['device_used']}")
952
+
953
+ return {
954
+ "preprocessing": preprocessed,
955
+ "inference": results
956
+ }
957
+
958
+ async def main():
959
+ # Sample dataset
960
+ raw_data = [
961
+ {"feature1": np.random.randn(), "feature2": np.random.randn(),
962
+ "feature3": np.random.randn(), "label": i % 2}
963
+ for i in range(100)
964
+ ]
965
+
966
+ # Run the complete pipeline
967
+ results = await ml_pipeline(raw_data)
968
+
969
+ print("\nPipeline Results:")
970
+ print(f"Data processed: {results['preprocessing']['shape']}")
971
+ print(f"Predictions generated: {len(results['inference']['predictions'])}")
972
+ print(f"GPU device: {results['inference']['device_used']}")
973
+
974
+ if __name__ == "__main__":
975
+ asyncio.run(main())
976
+ ```
977
+
978
+ ### Multi-stage ML pipeline example
979
+
980
+ ```python
981
+ import os
982
+ import asyncio
983
+ from tetra_rp import remote, LiveServerless
984
+
985
+ # Configure Runpod resources
986
+ runpod_config = LiveServerless(name="multi-stage-pipeline-server")
987
+
988
+ # Feature extraction on GPU
989
+ @remote(
990
+ resource_config=runpod_config,
991
+ dependencies=["torch", "transformers"]
992
+ )
993
+ def extract_features(texts):
994
+ import torch
995
+ from transformers import AutoTokenizer, AutoModel
996
+
997
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
998
+ model = AutoModel.from_pretrained("bert-base-uncased")
999
+ model.to("cuda")
1000
+
1001
+ features = []
1002
+ for text in texts:
1003
+ inputs = tokenizer(text, return_tensors="pt").to("cuda")
1004
+ with torch.no_grad():
1005
+ outputs = model(**inputs)
1006
+ features.append(outputs.last_hidden_state[:, 0].cpu().numpy().tolist()[0])
1007
+
1008
+ return features
1009
+
1010
+ # Classification on GPU
1011
+ @remote(
1012
+ resource_config=runpod_config,
1013
+ dependencies=["torch", "sklearn"]
1014
+ )
1015
+ def classify(features, labels=None):
1016
+ import torch
1017
+ import numpy as np
1018
+ from sklearn.linear_model import LogisticRegression
1019
+
1020
+ features_np = np.array(features[1:] if labels is None and isinstance(features, list) and len(features)>0 and isinstance(features[0], dict) else features)
1021
+
1022
+ if labels is not None:
1023
+ labels_np = np.array(labels)
1024
+ classifier = LogisticRegression()
1025
+ classifier.fit(features_np, labels_np)
1026
+
1027
+ coefficients = {
1028
+ "coef": classifier.coef_.tolist(),
1029
+ "intercept": classifier.intercept_.tolist(),
1030
+ "classes": classifier.classes_.tolist()
1031
+ }
1032
+ return coefficients
1033
+ else:
1034
+ coefficients = features[0]
1035
+
1036
+ classifier = LogisticRegression()
1037
+ classifier.coef_ = np.array(coefficients["coef"])
1038
+ classifier.intercept_ = np.array(coefficients["intercept"])
1039
+ classifier.classes_ = np.array(coefficients["classes"])
1040
+
1041
+ # Predict
1042
+ predictions = classifier.predict(features_np)
1043
+ probabilities = classifier.predict_proba(features_np)
1044
+
1045
+ return {
1046
+ "predictions": predictions.tolist(),
1047
+ "probabilities": probabilities.tolist()
1048
+ }
1049
+
1050
+ # Complete pipeline
1051
+ async def text_classification_pipeline(train_texts, train_labels, test_texts):
1052
+ train_features = await extract_features(train_texts)
1053
+ test_features = await extract_features(test_texts)
1054
+
1055
+ model_coeffs = await classify(train_features, train_labels)
1056
+
1057
+ # For inference, pass model coefficients along with test features
1058
+ # The classify function expects a list where the first element is the model (coeffs)
1059
+ # and subsequent elements are features for prediction.
1060
+ predictions = await classify([model_coeffs] + test_features)
1061
+
1062
+ return predictions
1063
+ ```
1064
+
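+ A short usage sketch for the pipeline above (the texts and labels are made up for illustration, and `asyncio` is already imported in the snippet above):
+
+ ```python
+ # Continuing from the pipeline definition above
+ async def main():
+     train_texts = ["great product", "terrible support", "love it", "never again"]
+     train_labels = [1, 0, 1, 0]
+     test_texts = ["really great", "awful experience"]
+
+     predictions = await text_classification_pipeline(train_texts, train_labels, test_texts)
+     print(predictions["predictions"])
+
+ if __name__ == "__main__":
+     asyncio.run(main())
+ ```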
1065
+ ### More examples
1066
+
1067
+ You can find many more examples in the [flash-examples repository](https://github.com/runpod/flash-examples).
1068
+
1069
+ ## Use cases
1070
+
1071
+ Flash is well-suited for a diverse range of AI and data processing workloads:
1072
+
1073
+ - **Multi-modal AI pipelines**: Orchestrate unified workflows combining text, image, and audio models with GPU acceleration.
1074
+ - **Distributed model training**: Scale training operations across multiple GPU workers for faster model development.
1075
+ - **AI research experimentation**: Rapidly prototype and test complex model combinations without infrastructure overhead.
1076
+ - **Production inference systems**: Deploy sophisticated multi-stage inference pipelines for real-world applications.
1077
+ - **Data processing workflows**: Efficiently process large datasets using CPU workers for general computation and GPU workers for accelerated tasks.
1078
+ - **Hybrid GPU/CPU workflows**: Optimize cost and performance by combining CPU preprocessing with GPU inference.
1079
+
1080
+ ## Limitations
1081
+
1082
+ - Serverless deployments using Flash are currently restricted to the `EU-RO-1` datacenter.
1083
+ - Flash is designed primarily for local development and live-testing workflows.
1084
+ - While Flash supports provisioning traditional Serverless endpoints (non-Live endpoints), the interface for interacting with these resources will change in upcoming releases. For now, focus on using `LiveServerless` for the most stable development experience, as it provides full remote code execution without requiring custom Docker images.
1085
+ - As you work through the Flash examples repository, you'll accumulate multiple endpoints in your Runpod account. These endpoints persist until manually deleted through the Runpod console. A `flash undeploy` command is in development to streamline cleanup, but for now, regular manual deletion of unused endpoints is recommended to avoid unnecessary charges.
1086
+ - Finally, be aware of your account's maximum worker capacity limits. Flash can rapidly scale workers across multiple endpoints, and you may hit capacity constraints faster than with traditional deployment patterns. If you find yourself consistently reaching worker limits, contact Runpod support to increase your account's capacity allocation.
1087
+
1088
+ ## Contributing
1089
+
1090
+ We welcome contributions to Flash! Whether you're fixing bugs, adding features, or improving documentation, your help makes this project better.
1091
+
1092
+ ### Development setup
1093
+
1094
+ 1. Fork and clone the repository.
1095
+ 2. Set up your development environment following the project guidelines.
1096
+ 3. Make your changes following our coding standards.
1097
+ 4. Test your changes thoroughly.
1098
+ 5. Submit a pull request.
1099
+
1100
+ ### Release process
1101
+
1102
+ This project uses an automated release system built on Release Please. For detailed information about how releases work, including conventional commits, versioning, and the CI/CD pipeline, see our [Release System Documentation](RELEASE_SYSTEM.md).
1103
+
1104
+ **Quick reference for contributors:**
1105
+ - Use conventional commits: `feat:`, `fix:`, `docs:`, etc.
1106
+ - CI automatically runs quality checks on all PRs.
1107
+ - Release PRs are created automatically when changes are merged to main.
1108
+ - Releases are published to PyPI automatically when release PRs are merged.
1109
+
1110
+ ## Troubleshooting
1111
+
1112
+ ### Authentication errors
1113
+
1114
+ Verify your API key is set correctly:
1115
+
1116
+ ```bash
1117
+ echo $RUNPOD_API_KEY # Should show your key
1118
+ ```
1119
+
1120
+ ### Import errors in remote functions
1121
+
1122
+ Remember to import packages inside remote functions:
1123
+
1124
+ ```python
1125
+ @remote(dependencies=["requests"])
1126
+ def fetch_data(url):
1127
+ import requests # Import here, not at top of file
1128
+ return requests.get(url).json()
1129
+ ```
1130
+
1131
+ ### Performance optimization
1132
+
1133
+ - Set `workersMin=1` to keep workers warm and avoid cold starts (see the sketch after this list).
1134
+ - Use `idleTimeout` to balance cost and responsiveness.
1135
+ - Choose appropriate GPU types for your workload.
1136
+
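+ A minimal sketch combining these settings (the endpoint name and values are illustrative):
+
+ ```python
+ from tetra_rp import LiveServerless, GpuGroup
+
+ warm_config = LiveServerless(
+     name="low-latency-inference",
+     gpus=[GpuGroup.AMPERE_24],  # choose a GPU group suited to your workload
+     workersMin=1,               # keep one worker warm to avoid cold starts
+     workersMax=3,
+     idleTimeout=10,             # minutes before extra workers scale down
+ )
+ ```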
1137
+ ## License
1138
+
1139
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
1140
+
1141
+ <p align="center">
1142
+ <a href="https://github.com/runpod/tetra-rp">Flash</a> •
1143
+ <a href="https://runpod.io">Runpod</a>
1144
+ </p>