podstack 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- podstack/__init__.py +222 -0
- podstack/annotations.py +725 -0
- podstack/client.py +322 -0
- podstack/exceptions.py +125 -0
- podstack/execution.py +291 -0
- podstack/gpu_runner.py +1141 -0
- podstack/models.py +274 -0
- podstack/notebook.py +410 -0
- podstack/registry/__init__.py +402 -0
- podstack/registry/client.py +957 -0
- podstack/registry/exceptions.py +107 -0
- podstack/registry/experiment.py +227 -0
- podstack/registry/model.py +273 -0
- podstack/registry/model_utils.py +231 -0
- podstack-1.2.0.dist-info/METADATA +299 -0
- podstack-1.2.0.dist-info/RECORD +27 -0
- podstack-1.2.0.dist-info/WHEEL +5 -0
- podstack-1.2.0.dist-info/licenses/LICENSE +21 -0
- podstack-1.2.0.dist-info/top_level.txt +2 -0
- podstack_gpu/__init__.py +126 -0
- podstack_gpu/app.py +675 -0
- podstack_gpu/exceptions.py +35 -0
- podstack_gpu/image.py +325 -0
- podstack_gpu/runner.py +746 -0
- podstack_gpu/secret.py +189 -0
- podstack_gpu/utils.py +203 -0
- podstack_gpu/volume.py +198 -0
podstack_gpu/app.py
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
1
|
+
"""Podstack App - Application class with function decorators."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
import os
|
|
5
|
+
import inspect
|
|
6
|
+
import textwrap
|
|
7
|
+
from typing import Optional, List, Dict, Any, Callable, Union, Generator, TYPE_CHECKING
|
|
8
|
+
from functools import wraps
|
|
9
|
+
|
|
10
|
+
from .exceptions import PodstackError, ValidationError
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from .image import Image
|
|
14
|
+
from .volume import Volume
|
|
15
|
+
from .secret import Secret
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# GPU model names accepted by Function._validate (checked after the
# requested name has been upper-cased).
VALID_GPU_TYPES = [
    "A10",
    "L40",
    "L40S",
    "A100-40G",
    "A100-80G",
    "H100",
]

# GPU fraction percentages accepted by Function._validate.
VALID_FRACTIONS = [
    25,
    50,
    75,
    100,
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Function:
    """
    A GPU-accelerated function that can be invoked remotely.

    Wraps a plain callable together with the GPU settings it was declared
    with. Calling the object directly runs the wrapped callable in the
    current process; ``remote``, ``spawn``, ``map`` and ``starmap`` delegate
    to the owning ``App``.

    Example:
        @app.function(gpu="H100")
        def train(epochs: int):
            import torch
            ...

        # Call remotely
        result = train.remote(epochs=10)

        # Call locally
        result = train.local(epochs=10)
    """

    def __init__(
        self,
        func: Callable,
        app: "App",
        gpu: str = "L40S",
        count: int = 1,
        fraction: int = 100,
        timeout: int = 3600,
        memory: Optional[int] = None,
        image: Optional["Image"] = None,
        volumes: Optional[Dict[str, "Volume"]] = None,
        secrets: Optional[List["Secret"]] = None,
        retries: int = 0,
        concurrency_limit: Optional[int] = None,
    ):
        """
        Store the callable and its GPU configuration, then validate it.

        Args:
            func: The callable to wrap
            app: The App that performs remote execution for this function
            gpu: GPU type; upper-cased before validation
            count: Number of GPUs (1-8)
            fraction: GPU fraction percentage (one of VALID_FRACTIONS)
            timeout: Maximum execution time in seconds (60-86400)
            memory: Optional memory setting added to the annotation
            image: Optional image specification
            volumes: Optional mapping of mount path to Volume
            secrets: Optional list of secrets
            retries: Stored on the instance as given
            concurrency_limit: Stored on the instance as given

        Raises:
            ValidationError: If gpu, fraction, count or timeout is invalid
        """
        self._func = func
        self._app = app
        self._gpu = gpu.upper()
        self._count = count
        self._fraction = fraction
        self._timeout = timeout
        self._memory = memory
        self._image = image
        self._volumes = volumes or {}
        self._secrets = secrets or []
        self._retries = retries
        self._concurrency_limit = concurrency_limit

        # Validate
        self._validate()

        # Preserve function metadata (__name__, __doc__, __wrapped__, ...)
        wraps(func)(self)

    def _validate(self):
        """
        Validate function configuration.

        Raises:
            ValidationError: On an unknown GPU type, an unsupported fraction,
                a GPU count outside 1-8, or a timeout outside 60-86400 seconds
        """
        if self._gpu not in VALID_GPU_TYPES:
            raise ValidationError(
                f"Invalid GPU type: {self._gpu}. "
                f"Valid types: {', '.join(VALID_GPU_TYPES)}"
            )

        if self._fraction not in VALID_FRACTIONS:
            raise ValidationError(
                f"Invalid GPU fraction: {self._fraction}. "
                f"Valid fractions: {VALID_FRACTIONS}"
            )

        if self._count < 1 or self._count > 8:
            raise ValidationError("GPU count must be between 1 and 8")

        if self._timeout < 60:
            raise ValidationError("Timeout must be at least 60 seconds")

        if self._timeout > 86400:
            raise ValidationError("Timeout cannot exceed 86400 seconds (24 hours)")

    def __call__(self, *args, **kwargs):
        """Call the function locally (default behavior)."""
        return self.local(*args, **kwargs)

    def local(self, *args, **kwargs):
        """Execute the function locally without GPU."""
        return self._func(*args, **kwargs)

    def remote(self, *args, **kwargs) -> "FunctionCall":
        """
        Execute the function on a remote GPU.

        Returns:
            FunctionCall object to get results
        """
        return self._app._execute_remote(self, args, kwargs)

    def spawn(self, *args, **kwargs) -> "FunctionCall":
        """
        Spawn the function on a remote GPU without waiting.

        Returns:
            FunctionCall object to get results later
        """
        return self._app._execute_remote(self, args, kwargs, wait=False)

    def map(self, *iterables, order_outputs: bool = True, return_exceptions: bool = False):
        """
        Map the function over iterables in parallel.

        Args:
            *iterables: Input iterables to map over
            order_outputs: If True, return results in input order
            return_exceptions: If True, return exceptions instead of raising

        Returns:
            Whatever the owning App's ``_execute_map`` returns
        """
        return self._app._execute_map(
            self, iterables,
            order_outputs=order_outputs,
            return_exceptions=return_exceptions
        )

    def starmap(self, args_list: List[tuple], kwargs_list: List[dict] = None, **options):
        """
        Map the function over a list of argument tuples.

        Args:
            args_list: List of positional argument tuples
            kwargs_list: List of keyword argument dicts (optional)
            **options: Forwarded unchanged to the owning App
        """
        if not kwargs_list:
            # One independent dict per call; `[{}] * n` would alias a single
            # dict across every call.
            kwargs_list = [{} for _ in args_list]
        return self._app._execute_starmap(self, args_list, kwargs_list, **options)

    def get_source(self) -> str:
        """
        Get the source code of the function without decorators.

        The source is parsed with ``ast`` to locate the ``def`` line, so
        decorators spanning any number of lines are removed completely and
        lines inside the function body that start with ``@`` (nested
        decorators, string content) are kept. When the source cannot be
        parsed, or the interpreter reports the decorator line as the
        function's line number, a textual scan of the leading lines is used
        instead.

        Returns:
            The dedented source starting at the function definition
        """
        # Local import: only this method needs ast.
        import ast

        source = textwrap.dedent(inspect.getsource(self._func))
        lines = source.split('\n')

        try:
            tree = ast.parse(source)
        except SyntaxError:
            # e.g. the snippet could not be fully dedented
            tree = None

        if tree is not None and tree.body:
            node = tree.body[0]
            decorators = getattr(node, "decorator_list", None)
            # Trust `lineno` only when it lies past the last decorator, i.e.
            # when it really points at the definition line.
            if decorators is not None and (
                not decorators or node.lineno > decorators[-1].lineno
            ):
                return '\n'.join(lines[node.lineno - 1:])

        return '\n'.join(self._strip_leading_decorators(lines))

    @staticmethod
    def _strip_leading_decorators(lines: List[str]) -> List[str]:
        """Drop the decorator lines at the top of *lines* using a textual scan."""
        depth = 0          # cumulative count of unclosed '(' across lines
        continued = False  # previous line ended with a backslash
        for index, line in enumerate(lines):
            stripped = line.strip()
            if depth <= 0 and not continued:
                if not stripped or stripped.startswith('#'):
                    # blank/comment lines may sit between stacked decorators
                    continue
                if not stripped.startswith('@'):
                    return lines[index:]
            depth = max(depth + stripped.count('(') - stripped.count(')'), 0)
            continued = stripped.endswith('\\')
        return []

    def _build_annotation(self) -> str:
        """
        Build the GPU annotation string.

        Returns:
            A single ``#@podstack ...`` line carrying gpu, count, fraction and
            timeout, followed by ``memory=`` when memory is set and by the
            image definition's annotation when it yields one
        """
        parts = [f"#@podstack gpu={self._gpu} count={self._count} fraction={self._fraction} timeout={self._timeout}"]

        if self._memory:
            parts.append(f"memory={self._memory}")

        if self._image:
            annotation_part = self._image.definition.to_annotation()
            if annotation_part:
                parts.append(annotation_part)

        return " ".join(parts)

    @property
    def info(self) -> dict:
        """Get function metadata."""
        return {
            "name": self._func.__name__,
            "gpu": self._gpu,
            "count": self._count,
            "fraction": self._fraction,
            "timeout": self._timeout,
            "memory": self._memory,
            "image": self._image.to_dict() if self._image else None,
        }
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
class FunctionCall:
    """
    Represents an in-progress or completed function call.

    Example:
        call = train.spawn(epochs=10)
        # Do other work...
        result = call.get()  # Wait for result
        print(f"Duration: {call.gpu_seconds}s, Cost: ₹{call.cost_inr:.4f}")

        # Or stream status updates in real-time
        for update in call.stream_status():
            print(f"Status: {update.status}")
    """

    def __init__(self, execution_id: str, app: "App"):
        """
        Args:
            execution_id: Identifier of the execution this handle tracks
            app: App used to wait for, stream and cancel the execution
        """
        self._execution_id = execution_id
        self._app = app
        self._result = None
        self._done = False
        self._error = None
        self._gpu_seconds = 0.0
        self._cost_cents = 0
        self._status = "submitted"
        self._queue_position = None
        self._estimated_wait = None
        # True when a status update reported completion but the result
        # itself has not been fetched yet (see get()).
        self._result_pending = False

    @property
    def object_id(self) -> str:
        """Get the execution ID."""
        return self._execution_id

    @property
    def status(self) -> str:
        """Get current execution status."""
        return self._status

    @property
    def queue_position(self) -> Optional[int]:
        """Get queue position (if queued)."""
        return self._queue_position

    @property
    def estimated_wait_seconds(self) -> Optional[int]:
        """Get estimated wait time in seconds (if queued)."""
        return self._estimated_wait

    @property
    def gpu_seconds(self) -> float:
        """Get GPU execution time in seconds."""
        return self._gpu_seconds

    @property
    def cost_cents(self) -> int:
        """Get execution cost in cents."""
        return self._cost_cents

    @property
    def cost_inr(self) -> float:
        """Get execution cost in INR (rupees): cost_cents divided by 100."""
        return self._cost_cents / 100 if self._cost_cents else 0.0

    @property
    def cost_dollars(self) -> float:
        """Deprecated: Use cost_inr instead. Returns cost in INR for backwards compatibility."""
        return self.cost_inr

    def get(self, timeout: float = None):
        """
        Wait for the function to complete and return the result.

        A stored result is returned immediately. If completion was only
        observed through status updates, the result is fetched from the app
        first instead of returning the unset placeholder.

        Args:
            timeout: Maximum time to wait in seconds

        Returns:
            The function's return value

        Raises:
            TimeoutError: If timeout is exceeded
            ExecutionError: If the function failed
        """
        if self._done:
            if self._error:
                raise self._error
            if not self._result_pending:
                return self._result

        result = self._app._wait_for_result(self._execution_id, timeout)
        self._result = result
        self._done = True
        self._result_pending = False
        return result

    def stream_status(self, callback: Callable = None):
        """
        Stream status updates in real-time.

        Can be used as an iterator or with a callback.

        With a callback, streaming runs immediately during this call and an
        empty iterator is returned, so code that iterates the return value
        keeps working. Without a callback, a generator is returned and
        streaming starts when it is first advanced.

        Args:
            callback: Optional callback function for each update

        Returns:
            Iterator of StatusUpdate objects (empty when a callback is given)

        Example:
            # As iterator
            for update in call.stream_status():
                print(f"Status: {update.status}")

            # With callback
            call.stream_status(callback=lambda u: print(u.status))
        """
        if not callback:
            return self._iter_status()

        def _callback(update):
            self._update_from_status(update)
            callback(update)

        self._app._get_runner().stream_status(self._execution_id, _callback)
        return iter(())

    def _iter_status(self):
        """Generator behind the iterator form of stream_status()."""
        runner = self._app._get_runner()
        updates = []

        def _collect(update):
            self._update_from_status(update)
            updates.append(update)

        # Updates are gathered first and yielded once the runner call returns.
        runner.stream_status(self._execution_id, _collect)
        yield from updates

    def _update_from_status(self, update):
        """
        Update internal state from status update.

        Args:
            update: Object exposing status, queue_position,
                estimated_wait_seconds, gpu_seconds, cost_cents, is_terminal
                and error attributes
        """
        self._status = update.status
        self._queue_position = update.queue_position
        self._estimated_wait = update.estimated_wait_seconds
        if update.gpu_seconds:
            self._gpu_seconds = update.gpu_seconds
        if update.cost_cents:
            self._cost_cents = update.cost_cents
        if update.is_terminal:
            if not self._done:
                # Completion learned from a status update only: no result has
                # been stored yet, so get() still has to fetch it.
                self._result_pending = True
            self._done = True
            if update.error:
                from .exceptions import ExecutionError
                self._error = ExecutionError(update.error, self._execution_id, update.status)

    def cancel(self, reason: str = "Cancelled by user") -> bool:
        """Cancel the function call."""
        return self._app._cancel_execution(self._execution_id, reason)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class App:
    """
    Podstack Application - container for GPU functions.

    Example:
        import podstack

        app = podstack.App("my-training-app")

        @app.function(gpu="H100", image=podstack.Image.ml())
        def train(epochs: int):
            import torch
            ...

        if __name__ == "__main__":
            result = train.remote(epochs=10)
    """

    # Default API URL for the Podstack Notebooks API
    DEFAULT_API_URL = "https://cloud.podstack.ai/notebooks"

    # Statuses after which polling stops
    _TERMINAL_STATUSES = ("completed", "failed", "timeout", "cancelled")

    # Prefix of the output line that carries the JSON-encoded return value
    _RESULT_MARKER = "__RESULT__:"

    def __init__(
        self,
        name: str = None,
        token: str = None,
        project_id: str = None,
        api_url: str = None,
    ):
        """
        Initialize a Podstack App.

        Args:
            name: Application name (for organization)
            token: API token (or set PODSTACK_TOKEN env var) - supports psk_xxx platform tokens
            project_id: Project ID (or set PODSTACK_PROJECT_ID env var)
            api_url: API URL (optional, defaults to PODSTACK_API_URL env var or production URL)
        """
        self.name = name or "default"
        self._token = token or os.environ.get("PODSTACK_TOKEN")
        self._project_id = project_id or os.environ.get("PODSTACK_PROJECT_ID")
        self._api_url = (api_url or os.environ.get("PODSTACK_API_URL") or self.DEFAULT_API_URL).rstrip("/")

        self._functions: Dict[str, "Function"] = {}
        self._runner = None  # Lazy initialized

    def _get_runner(self):
        """
        Get or create the GPU runner.

        Raises:
            PodstackError: If no token or no project ID is configured
        """
        if self._runner is None:
            from .runner import GPURunner

            if not self._token:
                raise PodstackError(
                    "No API token provided. Set PODSTACK_TOKEN environment variable "
                    "or pass token to App()"
                )
            if not self._project_id:
                raise PodstackError(
                    "No project ID provided. Set PODSTACK_PROJECT_ID environment variable "
                    "or pass project_id to App()"
                )

            self._runner = GPURunner(
                token=self._token,
                project_id=self._project_id,
                api_url=self._api_url,
            )
        return self._runner

    def function(
        self,
        gpu: str = "L40S",
        count: int = 1,
        fraction: int = 100,
        timeout: int = 3600,
        memory: int = None,
        image: "Image" = None,
        volumes: Dict[str, "Volume"] = None,
        secrets: List["Secret"] = None,
        retries: int = 0,
        concurrency_limit: int = None,
    ) -> Callable[[Callable], "Function"]:
        """
        Decorator to create a GPU-accelerated function.

        The created Function is also registered on this app under the
        decorated callable's ``__name__``.

        Args:
            gpu: GPU type (A10, L40, L40S, A100-40G, A100-80G, H100)
            count: Number of GPUs (1-8)
            fraction: GPU fraction percentage (25, 50, 75, 100) - use lower fractions for cost savings
            timeout: Maximum execution time in seconds
            memory: GPU memory limit in GB
            image: Container image specification
            volumes: Mount volumes {"/path": Volume}
            secrets: List of secrets to inject
            retries: Number of retries on failure
            concurrency_limit: Max concurrent executions

        Example:
            # Full GPU for training
            @app.function(gpu="H100", image=podstack.Image.ml())
            def train(epochs: int):
                import torch
                ...

            # Fractional GPU for inference (cost-effective)
            @app.function(gpu="L40S", fraction=25, image=podstack.Image.ml())
            def inference(data):
                return model.predict(data)

            # Multiple GPUs for distributed training
            @app.function(gpu="A100-80G", count=4, image=podstack.Image.ml())
            def distributed_train():
                ...
        """
        def decorator(func: Callable) -> "Function":
            fn = Function(
                func=func,
                app=self,
                gpu=gpu,
                count=count,
                fraction=fraction,
                timeout=timeout,
                memory=memory,
                image=image,
                volumes=volumes,
                secrets=secrets,
                retries=retries,
                concurrency_limit=concurrency_limit,
            )
            self._functions[func.__name__] = fn
            return fn

        return decorator

    def cls(
        self,
        gpu: str = "L40S",
        count: int = 1,
        fraction: int = 100,
        timeout: int = 3600,
        memory: int = None,
        image: "Image" = None,
        volumes: Dict[str, "Volume"] = None,
        secrets: List["Secret"] = None,
    ):
        """
        Decorator to create a GPU-accelerated class.

        Methods decorated with @method will run on the GPU.

        The decorator stores the settings on the class as ``_podstack_config``
        and this app as ``_podstack_app``, then returns the same class.

        Example:
            @app.cls(gpu="H100", image=podstack.Image.ml())
            class ModelServer:
                def __init__(self):
                    self.model = load_model()

                @podstack.method()
                def predict(self, data):
                    return self.model(data)
        """
        def decorator(cls):
            # Store GPU config on the class
            cls._podstack_config = {
                "gpu": gpu,
                "count": count,
                "fraction": fraction,
                "timeout": timeout,
                "memory": memory,
                "image": image,
                "volumes": volumes,
                "secrets": secrets,
            }
            cls._podstack_app = self
            return cls

        return decorator

    def _execute_remote(self, fn: "Function", args: tuple, kwargs: dict, wait: bool = True) -> "FunctionCall":
        """
        Execute a function remotely.

        The function source, its annotation line and a small driver that
        decodes the JSON arguments, calls the function and prints the
        JSON-encoded result are submitted to the runner as one script.

        Args:
            fn: Function to run
            args: Positional arguments; must be JSON-serializable
            kwargs: Keyword arguments; must be JSON-serializable
            wait: Passed through to the runner; when True and the run
                succeeded, the result is parsed from the output

        Returns:
            FunctionCall for the submitted execution
        """
        import json

        runner = self._get_runner()

        # Build the code to execute
        source = fn.get_source()

        # Build argument serialization
        args_json = json.dumps(args)
        kwargs_json = json.dumps(kwargs)

        code = f'''
{fn._build_annotation()}
{source}

import json
_args = json.loads({args_json!r})
_kwargs = json.loads({kwargs_json!r})
_result = {fn._func.__name__}(*_args, **_kwargs)
print("__RESULT__:", json.dumps(_result))
'''

        # Submit
        image = fn._image
        result = runner.run(
            code,
            gpu=fn._gpu,
            count=fn._count,
            fraction=fn._fraction,
            timeout=fn._timeout,
            memory=fn._memory,
            env=image.definition.env_preset if image else None,
            pip=image.definition.pip_packages if image else None,
            wait=wait,
        )

        call = FunctionCall(result.execution_id, self)
        call._gpu_seconds = result.gpu_seconds
        call._cost_cents = result.cost_cents

        if wait and result.success:
            marker = self._RESULT_MARKER
            # If no marker line is found, the raw output is the result.
            call._result = result.output
            # The driver prints its marker as the last statement, so search
            # from the end; earlier output from the function itself cannot
            # shadow the real result line.
            for line in reversed(result.output.split('\n')):
                if line.startswith(marker):
                    call._result = json.loads(line[len(marker):].strip())
                    break
            call._done = True

        return call

    @staticmethod
    def _gather_results(calls, return_exceptions: bool) -> list:
        """Collect ``call.get()`` for each call, optionally keeping exceptions as values."""
        results = []
        for call in calls:
            try:
                results.append(call.get())
            except Exception as e:
                if not return_exceptions:
                    raise
                results.append(e)
        return results

    def _execute_map(self, fn: "Function", iterables, order_outputs: bool, return_exceptions: bool):
        """
        Execute map over iterables.

        Args:
            fn: Function to run
            iterables: Iterables zipped together to form positional arguments
            order_outputs: If True a list is returned, otherwise an iterator
                over the same list
            return_exceptions: If True, exceptions are returned in place of
                results instead of being raised
        """
        # Simple implementation - submit all and collect
        calls = [
            self._execute_remote(fn, args, {}, wait=False)
            for args in zip(*iterables)
        ]

        results = self._gather_results(calls, return_exceptions)
        return results if order_outputs else iter(results)

    def _execute_starmap(self, fn: "Function", args_list, kwargs_list, **options):
        """
        Execute starmap over argument lists.

        Args:
            fn: Function to run
            args_list: Positional argument tuples, one per call
            kwargs_list: Keyword argument dicts, paired with args_list
            **options: ``return_exceptions`` (default False) is honored the
                same way as in ``_execute_map``; other options are accepted
                and ignored

        Returns:
            List of results in submission order
        """
        calls = [
            self._execute_remote(fn, args, kwargs, wait=False)
            for args, kwargs in zip(args_list, kwargs_list)
        ]

        # Collect results
        return self._gather_results(calls, bool(options.get("return_exceptions", False)))

    def _wait_for_result(self, execution_id: str, timeout: float = None):
        """
        Wait for an execution to complete.

        Polls the runner every two seconds until a terminal status is seen.

        Args:
            execution_id: Execution to wait for
            timeout: Maximum time to wait in seconds; None or 0 waits
                indefinitely

        Returns:
            The "result" entry of the final status, if any

        Raises:
            TimeoutError: If timeout is exceeded
            ExecutionError: If the execution ended in a non-completed status
        """
        import time

        runner = self._get_runner()

        # Monotonic clock: the elapsed time is unaffected by system clock
        # adjustments.
        started = time.monotonic()
        while True:
            if timeout and (time.monotonic() - started) > timeout:
                from .exceptions import TimeoutError
                raise TimeoutError(f"Timed out waiting for {execution_id}")

            status = runner.get_status(execution_id)
            state = status["status"]
            if state in self._TERMINAL_STATUSES:
                break
            time.sleep(2)

        if state != "completed":
            from .exceptions import ExecutionError
            raise ExecutionError(
                # fall back when "error" is missing, None or empty
                status.get("error") or f"Execution {state}",
                execution_id=execution_id,
                status=state,
            )

        return status.get("result")

    def _cancel_execution(self, execution_id: str, reason: str) -> bool:
        """Cancel an execution."""
        return self._get_runner().cancel(execution_id, reason)

    @property
    def registered_functions(self) -> Dict[str, "Function"]:
        """Get all registered functions (a shallow copy of the registry)."""
        return self._functions.copy()
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def method():
    """
    Build a decorator that flags a method of an @app.cls class as GPU-executable.

    The returned decorator sets ``_is_podstack_method = True`` on the
    function it receives and hands back that same function object.

    Example:
        @app.cls(gpu="H100")
        class Model:
            @podstack.method()
            def predict(self, x):
                ...
    """
    def mark_as_method(target):
        setattr(target, "_is_podstack_method", True)
        return target

    return mark_as_method
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Podstack SDK exceptions."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class PodstackError(Exception):
    """Root of the Podstack SDK exception hierarchy."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AuthenticationError(PodstackError):
    """Raised when authentication fails because the token is invalid or expired."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ValidationError(PodstackError):
    """Raised for invalid parameters or an invalid annotation."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ExecutionError(PodstackError):
    """
    Raised when a GPU execution fails.

    The optional ``execution_id`` and ``status`` constructor arguments are
    kept as attributes of the same names.
    """

    def __init__(self, message: str, execution_id: str = None, status: str = None):
        self.status = status
        self.execution_id = execution_id
        super().__init__(message)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TimeoutError(PodstackError):
    """
    Raised when an execution times out.

    NOTE: this class shares its name with the built-in ``TimeoutError`` but
    derives from PodstackError, so ``except`` clauses naming the built-in
    do not catch it.
    """
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class InsufficientBalanceError(PodstackError):
    """Raised when the wallet balance is insufficient."""
|