podstack 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- podstack/__init__.py +222 -0
- podstack/annotations.py +725 -0
- podstack/client.py +322 -0
- podstack/exceptions.py +125 -0
- podstack/execution.py +291 -0
- podstack/gpu_runner.py +1141 -0
- podstack/models.py +274 -0
- podstack/notebook.py +410 -0
- podstack/registry/__init__.py +402 -0
- podstack/registry/client.py +957 -0
- podstack/registry/exceptions.py +107 -0
- podstack/registry/experiment.py +227 -0
- podstack/registry/model.py +273 -0
- podstack/registry/model_utils.py +231 -0
- podstack-1.2.0.dist-info/METADATA +299 -0
- podstack-1.2.0.dist-info/RECORD +27 -0
- podstack-1.2.0.dist-info/WHEEL +5 -0
- podstack-1.2.0.dist-info/licenses/LICENSE +21 -0
- podstack-1.2.0.dist-info/top_level.txt +2 -0
- podstack_gpu/__init__.py +126 -0
- podstack_gpu/app.py +675 -0
- podstack_gpu/exceptions.py +35 -0
- podstack_gpu/image.py +325 -0
- podstack_gpu/runner.py +746 -0
- podstack_gpu/secret.py +189 -0
- podstack_gpu/utils.py +203 -0
- podstack_gpu/volume.py +198 -0
podstack/annotations.py
ADDED
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Podstack Annotations
|
|
3
|
+
|
|
4
|
+
Decorators and context managers for GPU provisioning, experiment tracking,
|
|
5
|
+
and model registry in Podstack.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
import podstack
|
|
9
|
+
|
|
10
|
+
# GPU execution - actually provisions and runs on remote GPU
|
|
11
|
+
@podstack.gpu(type="L40S", fraction=100)
|
|
12
|
+
def train():
|
|
13
|
+
import torch
|
|
14
|
+
print(f"Running on: {torch.cuda.get_device_name(0)}")
|
|
15
|
+
return {"status": "done"}
|
|
16
|
+
|
|
17
|
+
result = train() # Executes on remote GPU!
|
|
18
|
+
|
|
19
|
+
# Experiment tracking
|
|
20
|
+
@podstack.experiment(name="my-experiment")
|
|
21
|
+
@podstack.run(name="training-v1")
|
|
22
|
+
def training_loop():
|
|
23
|
+
podstack.registry.log_metrics({"loss": 0.5})
|
|
24
|
+
|
|
25
|
+
# Model registration
|
|
26
|
+
@podstack.model.register(name="my-model")
|
|
27
|
+
def save_model():
|
|
28
|
+
...
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import ast
import functools
import inspect
import os
import textwrap
import time
from contextlib import contextmanager
from typing import Optional, Dict, Any, Callable, Union

from . import registry
from .gpu_runner import (
    GPURunner,
    GPUExecutionResult,
    get_runner,
    init as init_runner,
    PodstackError,
    PodstackTimeoutError,
    PodstackExecutionError,
    PodstackProvisioningError,
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Global state for configuration
# Most recent GPU configuration published by GPUConfig.__init__; read by
# RunDecorator to tag/param runs with the active GPU settings.
_current_gpu_config: Dict[str, Any] = {}
# Runtime environment name last set via environment().
_current_environment: str = "pytorch"
# Idle minutes before auto-shutdown, last set via auto_shutdown().
_auto_shutdown_minutes: int = 60
# Default for GPUConfig(remote=...) when the caller does not pass it;
# toggled via enable_remote_execution().
_remote_execution_enabled: bool = True
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def enable_remote_execution(enabled: bool = True):
    """Turn remote GPU execution on or off for decorators created afterwards."""
    global _remote_execution_enabled
    _remote_execution_enabled = enabled
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def is_remote_execution_enabled() -> bool:
    """Report whether decorators currently default to remote GPU execution."""
    return _remote_execution_enabled
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class GPUConfig:
    """
    GPU configuration decorator and context manager.

    When used as a decorator, the wrapped function's source is captured,
    stripped of its decorators, shipped to a remote GPU via the configured
    runner, executed there, and the (literal) result is parsed back out of
    the captured output. When used as a context manager, only the GPU
    configuration is recorded (in the module-level ``_current_gpu_config``)
    for nested tracking operations.
    """

    def __init__(
        self,
        type: str = "L40S",
        count: int = 1,
        allocation: int = 100,
        fraction: int = None,
        memory_gb: Optional[int] = None,
        timeout: int = 3600,
        env: str = None,
        pip: Union[str, list] = None,
        uv: Union[str, list] = None,
        conda: Union[str, list] = None,
        requirements: str = None,
        use_uv: bool = False,
        remote: bool = None
    ):
        """
        Configure GPU for execution.

        Args:
            type: GPU type (L40S, A100-40G, A100-80G, H100, A10, T4)
            count: Number of GPUs (1-8)
            allocation: Alias for fraction (for backward compatibility)
            fraction: GPU time-slice percentage (25, 50, 75, 100)
            memory_gb: Optional memory override (not used in remote execution)
            timeout: Maximum execution time in seconds
            env: Environment preset (ml, nlp, cv, audio, tabular, rl, scientific)
            pip: Pip packages - string "pkg1,pkg2" or list ["pkg1", "pkg2"]
            uv: UV packages (faster than pip) - string or list
            conda: Conda packages - string or list
            requirements: Path to requirements.txt file
            use_uv: Use uv instead of pip for all installations (faster)
            remote: Whether to execute remotely (default: True if configured)
        """
        self.type = type
        self.count = count
        # ``fraction`` wins over the legacy ``allocation`` alias when both given.
        self.fraction = fraction if fraction is not None else allocation
        self.memory_gb = memory_gb
        self.timeout = timeout
        self.env = env
        self.pip = pip
        self.uv = uv
        self.conda = conda
        self.requirements = requirements
        self.use_uv = use_uv
        self.remote = remote if remote is not None else _remote_execution_enabled

        # Publish the configuration module-wide so run/experiment tracking
        # (RunDecorator) can pick it up.
        global _current_gpu_config
        _current_gpu_config = {
            "type": type,
            "count": count,
            "fraction": self.fraction,
            "allocation": self.fraction,
            "memory_gb": memory_gb,
            "timeout": timeout,
            "env": env,
            "pip": pip,
            "uv": uv,
            "conda": conda,
            "requirements": requirements,
            "use_uv": use_uv,
        }

    @staticmethod
    def _strip_decorators(source: str) -> str:
        """Remove decorator lines from *source* so the code shipped to the
        remote side does not re-enter this wrapper (avoiding recursion).

        Handles multi-line ``@podstack.gpu(...)`` decorators by tracking
        parenthesis balance; any other decorator is assumed single-line.
        """
        clean_lines = []
        skip_decorator = False
        paren_depth = 0
        for line in source.split('\n'):
            stripped = line.strip()
            if stripped.startswith('@podstack.gpu') or stripped.startswith('@gpu'):
                skip_decorator = True
                paren_depth += line.count('(') - line.count(')')
                continue
            if skip_decorator:
                # Continue skipping until the decorator's parentheses balance.
                paren_depth += line.count('(') - line.count(')')
                if paren_depth <= 0:
                    skip_decorator = False
                continue
            # Skip other decorators too
            if stripped.startswith('@'):
                continue
            clean_lines.append(line)
        return '\n'.join(clean_lines)

    @staticmethod
    def _format_call_args(args, kwargs) -> str:
        """Render *args/**kwargs as Python source text for the remote call.

        Positional arguments must come before keyword arguments — the
        previous implementation emitted them the other way round, which
        produced a SyntaxError in the generated code whenever both kinds
        were supplied.
        """
        parts = [repr(arg) for arg in args]
        parts.extend(f"{k}={repr(v)}" for k, v in kwargs.items())
        return ", ".join(parts)

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage - executes function on remote GPU."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not self.remote:
                # Local execution - just run the function
                print(f"[Podstack] GPU Config (local): {self.type} x{self.count} @ {self.fraction}%")
                return func(*args, **kwargs)

            # Remote execution on GPU
            print(f"[Podstack] Provisioning GPU: {self.type} x{self.count} @ {self.fraction}%")

            try:
                runner = get_runner()
            except ValueError as e:
                # No runner configured: degrade gracefully to local execution.
                print(f"[Podstack] Warning: {e}")
                print(f"[Podstack] Falling back to local execution")
                return func(*args, **kwargs)

            # Get function source (fails for REPL-defined functions, builtins).
            try:
                source = textwrap.dedent(inspect.getsource(func))
            except OSError:
                print("[Podstack] Warning: Cannot get function source, running locally")
                return func(*args, **kwargs)

            source = self._strip_decorators(source)
            call_args = self._format_call_args(args, kwargs)

            # Build execution code: define the function remotely, call it,
            # and echo the repr of a non-None result behind a sentinel line.
            func_name = func.__name__
            code = f"""{source}

# Execute the function
__podstack_result__ = {func_name}({call_args})
if __podstack_result__ is not None:
    print("__PODSTACK_RESULT__")
    print(repr(__podstack_result__))
"""

            # Execute on remote GPU with improved error handling
            # Auto-enable streaming in Jupyter notebooks for real-time output
            try:
                result = runner.run(
                    code=code,
                    gpu=self.type,
                    count=self.count,
                    fraction=self.fraction,
                    timeout=self.timeout,
                    env=self.env,
                    pip=self.pip,
                    uv=self.uv,
                    conda=self.conda,
                    requirements=self.requirements,
                    use_uv=self.use_uv,
                    wait=True,
                    stream=None  # Auto-detect: True in Jupyter, False otherwise
                )
            except PodstackTimeoutError as e:
                print(f"[Podstack] ⚠️ Timeout Error:")
                print(f"  Execution ID: {e.execution_id}")
                print(f"  Timeout: {e.timeout}s")
                print(f"  Last status: {e.last_status}")
                print(f"\n  To debug, check the execution result:")
                print(f"    from podstack import get_runner")
                print(f"    result = get_runner().result('{e.execution_id}')")
                raise
            except PodstackProvisioningError as e:
                print(f"[Podstack] ⚠️ Provisioning Error:")
                print(f"  {e.message}")
                raise
            except PodstackExecutionError as e:
                print(f"[Podstack] ⚠️ Execution Error:")
                print(f"  Execution ID: {e.execution_id}")
                print(f"  Error: {e.error}")
                if e.output:
                    print(f"\n  Output (last 500 chars):\n{e.output[-500:]}")
                raise

            if not result.success:
                error_details = result.error or 'Unknown error'
                error_msg = f"GPU execution failed: {error_details}"

                # Include helpful debugging info
                if result.output:
                    # Look for Python tracebacks in output
                    if "Traceback" in result.output or "Error" in result.output:
                        error_msg += f"\n\nExecution output (last 1000 chars):\n{result.output[-1000:]}"
                    else:
                        error_msg += f"\n\nOutput preview:\n{result.output[-500:]}"

                error_msg += f"\n\nExecution ID: {result.execution_id}"
                error_msg += f"\nTo get full output: get_runner().result('{result.execution_id}')"

                raise PodstackExecutionError(
                    execution_id=result.execution_id,
                    error=error_details,
                    output=result.output
                )

            # Parse result from output: everything after the sentinel line is
            # the repr of the remote return value.
            output = result.output
            if "__PODSTACK_RESULT__" in output:
                result_part = output.split("__PODSTACK_RESULT__")[1].strip()
                if result_part:
                    first_line = result_part.split('\n')[0]
                    try:
                        # literal_eval only accepts Python literals, so remote
                        # output cannot execute arbitrary code here (eval could).
                        return ast.literal_eval(first_line)
                    except Exception as e:
                        # Non-literal repr (custom objects etc.): return raw text.
                        print(f"[Podstack] Warning: Could not parse result ({e}), returning raw output")
                        return first_line

            # Print any other output
            if output and "__PODSTACK_RESULT__" not in output:
                print(output)

            return None

        return wrapper

    def __enter__(self):
        """Context manager entry: announce the recorded configuration."""
        print(f"[Podstack] GPU Config: {self.type} x{self.count} @ {self.fraction}%")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit (no cleanup required)."""
        pass
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def gpu(
    type: str = "L40S",
    count: int = 1,
    allocation: int = 100,
    fraction: int = None,
    memory_gb: Optional[int] = None,
    timeout: int = 3600,
    env: str = None,
    pip: Union[str, list] = None,
    uv: Union[str, list] = None,
    conda: Union[str, list] = None,
    requirements: str = None,
    use_uv: bool = False,
    remote: bool = None
) -> GPUConfig:
    """
    Configure and provision a GPU for execution.

    Usable as a decorator (the wrapped function is executed on a remote GPU)
    or as a context manager (only records the configuration).

    Examples:
        # As decorator - runs on remote GPU!
        @podstack.gpu(type="L40S", fraction=100)
        def train():
            import torch
            print(f"GPU: {torch.cuda.get_device_name(0)}")
            return {"trained": True}

        result = train()  # Executes remotely

        # Declare dependencies: pip / uv / conda packages or a requirements file
        @podstack.gpu(type="L40S", pip=["transformers", "datasets"])
        def train_llm():
            ...

        @podstack.gpu(type="L40S", uv=["torch", "transformers"])
        def train_fast():
            ...

        @podstack.gpu(type="L40S", requirements="requirements.txt", use_uv=True)
        def train_fast_deps():
            ...

        @podstack.gpu(type="L40S", conda="cudatoolkit=11.8")
        def train_cuda():
            ...

        # As context manager (sets config only)
        with podstack.gpu(type="A100-80G", count=2):
            ...

    Args:
        type: GPU type (L40S, A100-40G, A100-80G, H100, A10, T4)
        count: Number of GPUs (1-8, default: 1)
        allocation: Alias for fraction (backward compatibility)
        fraction: GPU time-slice percentage (25, 50, 75, 100, default: 100)
        memory_gb: Optional memory limit in GB
        timeout: Max execution time in seconds (default: 3600)
        env: Environment preset (ml, nlp, cv, audio, tabular, rl, scientific)
        pip: Pip packages - string "pkg1,pkg2" or list ["pkg1", "pkg2"]
        uv: UV packages (faster than pip) - string or list
        conda: Conda packages - string or list
        requirements: Path to requirements.txt file
        use_uv: Use uv instead of pip for all installations (faster)
        remote: Execute remotely (default: True if configured)

    Returns:
        A GPUConfig carrying the requested settings.
    """
    settings = dict(
        type=type,
        count=count,
        allocation=allocation,
        fraction=fraction,
        memory_gb=memory_gb,
        timeout=timeout,
        env=env,
        pip=pip,
        uv=uv,
        conda=conda,
        requirements=requirements,
        use_uv=use_uv,
        remote=remote,
    )
    return GPUConfig(**settings)
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def environment(env: str = "pytorch") -> str:
    """
    Select the runtime environment and record it module-wide.

    Args:
        env: Environment name (pytorch, tensorflow, jax, huggingface, rapids, custom)

    Returns:
        The environment name that was set.
    """
    global _current_environment
    _current_environment = env
    print(f"[Podstack] Environment: {env}")
    return env
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def auto_shutdown(minutes: int = 60) -> int:
    """
    Set the idle auto-shutdown timer and record it module-wide.

    Args:
        minutes: Minutes of idle time before auto-shutdown

    Returns:
        The minutes value that was set.
    """
    global _auto_shutdown_minutes
    _auto_shutdown_minutes = minutes
    print(f"[Podstack] Auto-shutdown: {minutes} minutes")
    return minutes
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
class ExperimentDecorator:
    """Experiment tracking decorator / context manager."""

    def __init__(self, name: str, description: str = None):
        """
        Create or set an experiment.

        Args:
            name: Experiment name
            description: Optional description
        """
        self.name = name
        self.description = description
        self._experiment = None

    def _activate(self):
        # Register (or switch to) the experiment in the registry and announce it.
        self._experiment = registry.set_experiment(self.name, self.description)
        print(f"[Podstack] Experiment: {self.name} (ID: {self._experiment.id})")
        return self._experiment

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            self._activate()
            return func(*args, **kwargs)
        return wrapper

    def __enter__(self):
        """Context manager entry."""
        return self._activate()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit (no cleanup required)."""
        pass
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def experiment(name: str, description: str = None) -> ExperimentDecorator:
    """
    Create or set an experiment for tracking.

    Works as decorator or context manager.

    Examples:
        @podstack.experiment(name="my-experiment")
        def train():
            ...

        with podstack.experiment(name="my-experiment") as exp:
            print(exp.id)

    Args:
        name: Experiment name
        description: Optional description

    Returns:
        ExperimentDecorator instance
    """
    return ExperimentDecorator(name=name, description=description)
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
class RunDecorator:
    """Run tracking decorator with automatic metrics logging.

    Usable as a decorator or context manager. Wraps the call in
    ``registry.start_run()`` / ``registry.end_run()``, optionally tagging the
    run with the current GPU configuration and logging total wall-clock time.
    """

    def __init__(
        self,
        name: str = None,
        track_gpu: bool = True,
        track_time: bool = True,
        tags: Dict[str, str] = None
    ):
        """
        Start a tracked run.

        Args:
            name: Run name
            track_gpu: Auto-track GPU metrics
            track_time: Auto-track execution time
            tags: Optional tags
        """
        self.name = name
        self.track_gpu = track_gpu
        self.track_time = track_time
        # Defensive copy: __enter__ injects gpu_* keys into self.tags, and
        # that must not mutate the dict the caller handed us.
        self.tags = dict(tags) if tags else {}
        self._run = None
        self._start_time = None

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with self:
                return func(*args, **kwargs)
        return wrapper

    def __enter__(self):
        """Context manager entry: start the run, tag and log GPU config."""
        self._start_time = time.time()

        # Add GPU config to tags if available
        if _current_gpu_config and self.track_gpu:
            self.tags["gpu_type"] = _current_gpu_config.get("type", "unknown")
            self.tags["gpu_fraction"] = str(_current_gpu_config.get("fraction", 100))

        self._run = registry.start_run(name=self.name, tags=self.tags)
        print(f"[Podstack] Run started: {self.name} (ID: {self._run.id})")

        # Log GPU config as params
        if _current_gpu_config and self.track_gpu:
            registry.log_params({
                "gpu_type": _current_gpu_config.get("type"),
                "gpu_count": _current_gpu_config.get("count", 1),
                "gpu_fraction_percent": _current_gpu_config.get("fraction", 100),
            })

        return self._run

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: log elapsed time and close the run."""
        if self.track_time and self._start_time:
            elapsed = time.time() - self._start_time
            registry.log_metrics({"total_execution_time_seconds": elapsed})

        if exc_type is not None:
            registry.end_run(status="failed")
            print(f"[Podstack] Run failed: {self.name}")
        else:
            registry.end_run(status="completed")
            print(f"[Podstack] Run completed: {self.name}")
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def run(
    name: str = None,
    track_gpu: bool = True,
    track_time: bool = True,
    tags: Dict[str, str] = None
) -> RunDecorator:
    """
    Start a tracked run.

    Works as decorator or context manager.

    Examples:
        @podstack.run(name="training-v1", track_gpu=True)
        def train():
            ...

        with podstack.run(name="training-v1") as r:
            podstack.registry.log_metrics({"loss": 0.5})

    Args:
        name: Run name
        track_gpu: Auto-track GPU metrics (default: True)
        track_time: Auto-track execution time (default: True)
        tags: Optional tags dict

    Returns:
        RunDecorator instance
    """
    return RunDecorator(
        name=name,
        track_gpu=track_gpu,
        track_time=track_time,
        tags=tags,
    )
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
class ModelRegistry:
    """Model registry operations, exposed via the module-level ``model`` singleton."""

    @staticmethod
    def register(
        name: str,
        run_id: str = None,
        description: str = None,
        tags: Dict[str, str] = None
    ) -> Callable:
        """
        Decorator to register a model after function execution.

        Examples:
            @podstack.model.register(name="my-model")
            def save_model():
                torch.save(model, "model.pt")
                podstack.registry.log_artifact("model.pt", "model")

        Args:
            name: Model name
            run_id: Optional run ID (uses current run if not specified)
            description: Optional description
            tags: Optional tags
        """
        def decorator(func: Callable) -> Callable:
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                result = func(*args, **kwargs)

                # Get run_id from current run if not specified
                actual_run_id = run_id
                if actual_run_id is None:
                    client = registry._get_client()
                    if client._active_run:
                        actual_run_id = client._active_run.id

                if actual_run_id:
                    model = registry.register_model(
                        name=name,
                        run_id=actual_run_id,
                        description=description,
                        tags=tags
                    )
                    print(f"[Podstack] Model registered: {name} (ID: {model.id})")
                else:
                    print(f"[Podstack] Warning: No active run, model not registered")

                return result
            return wrapper
        return decorator

    @staticmethod
    def promote(
        name: str = None,
        version: int = 1,
        stage: str = "staging",
        comment: str = None
    ) -> Callable:
        """
        Decorator to promote a model after function execution.

        Examples:
            @podstack.model.promote(name="my-model", version=1, stage="production")
            def validate_model():
                # validation code
                pass

        Args:
            name: Model name
            version: Version number to promote
            stage: Target stage (development, staging, production, archived)
            comment: Optional comment
        """
        def decorator(func: Callable) -> Callable:
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                result = func(*args, **kwargs)

                if name:
                    # Called purely for its side effect; return value unused.
                    registry.set_model_stage(
                        model_name=name,
                        version=version,
                        stage=stage,
                        comment=comment
                    )
                    print(f"[Podstack] Model promoted: {name} v{version} -> {stage}")
                else:
                    # Mirror register(): warn instead of silently doing nothing.
                    print(f"[Podstack] Warning: No model name given, promotion skipped")

                return result
            return wrapper
        return decorator
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
# Singleton instance exposed to users as ``podstack.model`` so they can write
# ``@podstack.model.register(...)`` and ``@podstack.model.promote(...)``.
model = ModelRegistry()
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
def get_gpu_config() -> Dict[str, Any]:
    """Return a shallow-copy snapshot of the current GPU configuration."""
    return dict(_current_gpu_config)
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def get_environment() -> str:
    """Return the currently selected runtime environment name."""
    return _current_environment
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def get_auto_shutdown_minutes() -> int:
    """Return the configured idle auto-shutdown timer in minutes."""
    return _auto_shutdown_minutes
|
|
705
|
+
|
|
706
|
+
|
|
707
|
+
# Public API of this module (re-exported at the package level).
__all__ = [
    "gpu",
    "environment",
    "auto_shutdown",
    "experiment",
    "run",
    "model",
    "get_gpu_config",
    "get_environment",
    "get_auto_shutdown_minutes",
    "enable_remote_execution",
    "is_remote_execution_enabled",
    "GPUConfig",
    # Exceptions (re-exported from .gpu_runner)
    "PodstackError",
    "PodstackTimeoutError",
    "PodstackExecutionError",
    "PodstackProvisioningError",
]
|