podstack 1.3.18__tar.gz → 1.3.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {podstack-1.3.18 → podstack-1.3.21}/PKG-INFO +1 -1
- {podstack-1.3.18 → podstack-1.3.21}/podstack/annotations.py +0 -3
- {podstack-1.3.18 → podstack-1.3.21}/podstack/gpu_runner.py +147 -53
- {podstack-1.3.18 → podstack-1.3.21}/podstack/registry/__init__.py +11 -2
- {podstack-1.3.18 → podstack-1.3.21}/podstack.egg-info/PKG-INFO +1 -1
- {podstack-1.3.18 → podstack-1.3.21}/pyproject.toml +1 -1
- {podstack-1.3.18 → podstack-1.3.21}/LICENSE +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/README.md +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/__init__.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/client.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/exceptions.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/execution.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/models.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/notebook.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/registry/autolog.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/registry/client.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/registry/exceptions.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/registry/experiment.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/registry/model.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack/registry/model_utils.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack.egg-info/SOURCES.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack.egg-info/dependency_links.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack.egg-info/requires.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack.egg-info/top_level.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/__init__.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/app.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/exceptions.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/image.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/runner.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/secret.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/utils.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/podstack_gpu/volume.py +0 -0
- {podstack-1.3.18 → podstack-1.3.21}/setup.cfg +0 -0
|
@@ -158,9 +158,6 @@ class GPUConfig:
|
|
|
158
158
|
print(f"[Podstack] GPU Config (local): {self.type} x{self.count} @ {self.fraction}%")
|
|
159
159
|
return func(*args, **kwargs)
|
|
160
160
|
|
|
161
|
-
# Remote execution on GPU
|
|
162
|
-
print(f"[Podstack] Provisioning GPU: {self.type} x{self.count} @ {self.fraction}%")
|
|
163
|
-
|
|
164
161
|
try:
|
|
165
162
|
runner = get_runner()
|
|
166
163
|
except ValueError as e:
|
|
@@ -18,6 +18,99 @@ import httpx
|
|
|
18
18
|
# Configure logging
|
|
19
19
|
logger = logging.getLogger("podstack.gpu_runner")
|
|
20
20
|
|
|
21
|
+
SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LiveDisplay:
|
|
25
|
+
"""Animated phase display for GPU runner lifecycle in Jupyter + terminal."""
|
|
26
|
+
|
|
27
|
+
PHASES = {
|
|
28
|
+
"pending": ("🔍", "Searching for GPU..."),
|
|
29
|
+
"queued": ("📋", "Queued — waiting for available GPU..."),
|
|
30
|
+
"provisioning": ("🚀", "Allocating GPU pod..."),
|
|
31
|
+
"running": None, # No spinner — logs stream directly
|
|
32
|
+
}
|
|
33
|
+
CHECKMARKS = {
|
|
34
|
+
"pending": "🔍 Submitted",
|
|
35
|
+
"queued": "📋 In queue",
|
|
36
|
+
"provisioning": "✓ GPU pod provisioning...",
|
|
37
|
+
"running": "✓ Pod ready — logging live output:",
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
def __init__(self, gpu_type: str, gpu_count: int, fraction: int):
|
|
41
|
+
self._gpu_type = gpu_type
|
|
42
|
+
self._gpu_count = gpu_count
|
|
43
|
+
self._fraction = fraction
|
|
44
|
+
self._phase = None
|
|
45
|
+
self._spinner_thread: Optional[threading.Thread] = None
|
|
46
|
+
self._stop_evt = threading.Event()
|
|
47
|
+
self._logs_started = False
|
|
48
|
+
|
|
49
|
+
def set_phase(self, status: str, extra: str = ""):
|
|
50
|
+
"""Transition to a new lifecycle phase."""
|
|
51
|
+
self._stop_spinner()
|
|
52
|
+
if self._phase and self._phase != status:
|
|
53
|
+
label = self.CHECKMARKS.get(self._phase, f"✓ {self._phase}")
|
|
54
|
+
self._print(f"\r[Podstack] {label}{' ' * 30}\n")
|
|
55
|
+
self._phase = status
|
|
56
|
+
if status == "running":
|
|
57
|
+
if not self._logs_started:
|
|
58
|
+
self._logs_started = True
|
|
59
|
+
self._print(f"[Podstack] ─── Live Logs ({self._gpu_type} x{self._gpu_count}) ───\n\n")
|
|
60
|
+
else:
|
|
61
|
+
phase_info = self.PHASES.get(status)
|
|
62
|
+
if phase_info:
|
|
63
|
+
_, label = phase_info
|
|
64
|
+
if extra:
|
|
65
|
+
label = f"{label} {extra}"
|
|
66
|
+
self._start_spinner(f"[Podstack] {{spinner}} {label}")
|
|
67
|
+
|
|
68
|
+
def log(self, content: str, output_type: str = "stdout"):
|
|
69
|
+
"""Write a log line from the pod (indented, real-time)."""
|
|
70
|
+
if output_type == "stderr":
|
|
71
|
+
sys.stderr.write(f" {content}" if not content.startswith(" ") else content)
|
|
72
|
+
sys.stderr.flush()
|
|
73
|
+
else:
|
|
74
|
+
sys.stdout.write(f" {content}" if not content.startswith(" ") else content)
|
|
75
|
+
sys.stdout.flush()
|
|
76
|
+
|
|
77
|
+
def complete(self, success: bool, gpu_seconds: float, cost_paise: int, error: str = None):
|
|
78
|
+
"""Print final summary line."""
|
|
79
|
+
self._stop_spinner()
|
|
80
|
+
if self._logs_started:
|
|
81
|
+
self._print(f"\n[Podstack] ─────────────────────────────────────\n")
|
|
82
|
+
if success:
|
|
83
|
+
cost_str = f"₹{cost_paise/100:.2f}" if cost_paise else ""
|
|
84
|
+
self._print(f"[Podstack] ✓ Completed in {gpu_seconds:.1f}s | {self._gpu_type} x{self._gpu_count} | {cost_str}\n")
|
|
85
|
+
else:
|
|
86
|
+
self._print(f"[Podstack] ✗ Failed: {error}\n")
|
|
87
|
+
|
|
88
|
+
def _start_spinner(self, template: str):
|
|
89
|
+
self._stop_evt.clear()
|
|
90
|
+
|
|
91
|
+
def _spin():
|
|
92
|
+
i = 0
|
|
93
|
+
while not self._stop_evt.is_set():
|
|
94
|
+
frame = SPINNER_FRAMES[i % len(SPINNER_FRAMES)]
|
|
95
|
+
sys.stdout.write(f"\r{template.format(spinner=frame)} ")
|
|
96
|
+
sys.stdout.flush()
|
|
97
|
+
self._stop_evt.wait(0.1)
|
|
98
|
+
i += 1
|
|
99
|
+
|
|
100
|
+
self._spinner_thread = threading.Thread(target=_spin, daemon=True)
|
|
101
|
+
self._spinner_thread.start()
|
|
102
|
+
|
|
103
|
+
def _stop_spinner(self):
|
|
104
|
+
if self._spinner_thread and self._spinner_thread.is_alive():
|
|
105
|
+
self._stop_evt.set()
|
|
106
|
+
self._spinner_thread.join(timeout=0.5)
|
|
107
|
+
self._spinner_thread = None
|
|
108
|
+
|
|
109
|
+
@staticmethod
|
|
110
|
+
def _print(msg: str):
|
|
111
|
+
sys.stdout.write(msg)
|
|
112
|
+
sys.stdout.flush()
|
|
113
|
+
|
|
21
114
|
|
|
22
115
|
def is_jupyter() -> bool:
|
|
23
116
|
"""Check if running in a Jupyter notebook."""
|
|
@@ -728,8 +821,6 @@ _stream_install(
|
|
|
728
821
|
if not execution_id:
|
|
729
822
|
raise RuntimeError(f"No execution_id in response: {submission}")
|
|
730
823
|
|
|
731
|
-
print(f"[Podstack] Execution submitted: {execution_id}")
|
|
732
|
-
|
|
733
824
|
if not wait:
|
|
734
825
|
return GPUExecutionResult(
|
|
735
826
|
execution_id=execution_id,
|
|
@@ -742,9 +833,9 @@ _stream_install(
|
|
|
742
833
|
should_stream = stream if stream is not None else is_jupyter()
|
|
743
834
|
|
|
744
835
|
if should_stream:
|
|
745
|
-
return self._run_with_streaming(execution_id, gpu, count, timeout, max_retries, cancel_on_timeout)
|
|
836
|
+
return self._run_with_streaming(execution_id, gpu, count, timeout, max_retries, cancel_on_timeout, fraction)
|
|
746
837
|
else:
|
|
747
|
-
return self._run_with_polling(execution_id, gpu, count, timeout, poll_interval, max_retries, provisioning_timeout, cancel_on_timeout)
|
|
838
|
+
return self._run_with_polling(execution_id, gpu, count, timeout, poll_interval, max_retries, provisioning_timeout, cancel_on_timeout, fraction)
|
|
748
839
|
|
|
749
840
|
def _run_with_streaming(
|
|
750
841
|
self,
|
|
@@ -753,10 +844,12 @@ _stream_install(
|
|
|
753
844
|
count: int,
|
|
754
845
|
timeout: int,
|
|
755
846
|
max_retries: int,
|
|
756
|
-
cancel_on_timeout: bool
|
|
847
|
+
cancel_on_timeout: bool,
|
|
848
|
+
fraction: int = 100
|
|
757
849
|
) -> GPUExecutionResult:
|
|
758
850
|
"""Run execution with real-time output streaming."""
|
|
759
|
-
|
|
851
|
+
display = LiveDisplay(gpu, count, fraction)
|
|
852
|
+
display.set_phase("pending")
|
|
760
853
|
|
|
761
854
|
start_time = time.time()
|
|
762
855
|
output_buffer = []
|
|
@@ -764,13 +857,14 @@ _stream_install(
|
|
|
764
857
|
final_event = {}
|
|
765
858
|
|
|
766
859
|
try:
|
|
767
|
-
for event in self.stream_output(execution_id, show_output=True):
|
|
860
|
+
for event in self.stream_output(execution_id, show_output=False):
|
|
768
861
|
elapsed = time.time() - start_time
|
|
769
862
|
if elapsed > timeout:
|
|
770
863
|
if cancel_on_timeout:
|
|
771
864
|
try:
|
|
772
865
|
self.cancel(execution_id)
|
|
773
|
-
|
|
866
|
+
display._stop_spinner()
|
|
867
|
+
display._print(f"\r[Podstack] Execution cancelled due to timeout{' ' * 30}\n")
|
|
774
868
|
except Exception as e:
|
|
775
869
|
logger.warning(f"Failed to cancel execution: {e}")
|
|
776
870
|
|
|
@@ -787,37 +881,46 @@ _stream_install(
|
|
|
787
881
|
)
|
|
788
882
|
)
|
|
789
883
|
|
|
790
|
-
# Track
|
|
791
|
-
if event.get("type") in ("stdout", "stderr", "output"):
|
|
792
|
-
content = event.get("content", "")
|
|
793
|
-
if content:
|
|
794
|
-
output_buffer.append(content)
|
|
795
|
-
|
|
796
|
-
# Track status
|
|
884
|
+
# Track status transitions
|
|
797
885
|
if "status" in event:
|
|
798
886
|
new_status = event["status"]
|
|
799
887
|
if new_status != final_status:
|
|
800
888
|
final_status = new_status
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
889
|
+
extra = ""
|
|
890
|
+
if new_status == "queued":
|
|
891
|
+
pos = event.get("queue_position", "?")
|
|
892
|
+
extra = f"(position: {pos})"
|
|
893
|
+
display.set_phase(new_status, extra)
|
|
805
894
|
|
|
806
895
|
# Check for terminal status
|
|
807
896
|
if final_status in ("completed", "failed", "timeout", "cancelled"):
|
|
808
897
|
final_event = event
|
|
809
898
|
break
|
|
810
899
|
|
|
900
|
+
# Stream output lines
|
|
901
|
+
if event.get("type") in ("stdout", "stderr", "output"):
|
|
902
|
+
content = event.get("content", "")
|
|
903
|
+
if content:
|
|
904
|
+
# Logs arriving means the pod is running — advance phase
|
|
905
|
+
# if we missed the "running" status event (connected late)
|
|
906
|
+
if final_status not in ("running", "completed", "failed", "timeout", "cancelled"):
|
|
907
|
+
final_status = "running"
|
|
908
|
+
display.set_phase("running")
|
|
909
|
+
output_buffer.append(content)
|
|
910
|
+
display.log(content, event.get("type", "stdout"))
|
|
911
|
+
|
|
811
912
|
except RuntimeError as e:
|
|
812
913
|
if "HTTP 401" in str(e):
|
|
813
914
|
# Auth failed on stream — fall back to polling
|
|
814
|
-
|
|
815
|
-
|
|
915
|
+
display._stop_spinner()
|
|
916
|
+
display._print(f"\r[Podstack] Streaming auth failed, falling back to polling...{' ' * 10}\n")
|
|
917
|
+
return self._run_with_polling(execution_id, gpu, count, timeout, 2.0, max_retries, 300, cancel_on_timeout, fraction)
|
|
816
918
|
raise
|
|
817
919
|
except (ConnectionError, httpx.ConnectError) as e:
|
|
818
920
|
# Try to recover and get the result
|
|
819
921
|
logger.warning(f"Stream connection lost: {e}")
|
|
820
|
-
|
|
922
|
+
display._stop_spinner()
|
|
923
|
+
display._print(f"\r[Podstack] Stream connection lost, fetching final result...{' ' * 10}\n")
|
|
821
924
|
|
|
822
925
|
# Get final result
|
|
823
926
|
result = None
|
|
@@ -837,11 +940,7 @@ _stream_install(
|
|
|
837
940
|
if "__PODSTACK_RESULT__" not in result.output and "__PODSTACK_RESULT__" in streamed:
|
|
838
941
|
result.output = streamed
|
|
839
942
|
|
|
840
|
-
|
|
841
|
-
print(f"\n[Podstack] Completed in {result.gpu_seconds:.1f}s (cost: ₹{result.cost_paise/100:.2f})")
|
|
842
|
-
else:
|
|
843
|
-
error_msg = result.error or 'Unknown error'
|
|
844
|
-
print(f"\n[Podstack] Failed: {error_msg}")
|
|
943
|
+
display.complete(result.success, result.gpu_seconds, result.cost_paise, result.error or "Unknown error")
|
|
845
944
|
|
|
846
945
|
return result
|
|
847
946
|
|
|
@@ -854,10 +953,13 @@ _stream_install(
|
|
|
854
953
|
poll_interval: float,
|
|
855
954
|
max_retries: int,
|
|
856
955
|
provisioning_timeout: int,
|
|
857
|
-
cancel_on_timeout: bool
|
|
956
|
+
cancel_on_timeout: bool,
|
|
957
|
+
fraction: int = 100
|
|
858
958
|
) -> GPUExecutionResult:
|
|
859
959
|
"""Run execution with polling (non-streaming mode)."""
|
|
860
|
-
|
|
960
|
+
display = LiveDisplay(gpu, count, fraction)
|
|
961
|
+
display.set_phase("pending")
|
|
962
|
+
|
|
861
963
|
start_time = time.time()
|
|
862
964
|
provisioning_start = None
|
|
863
965
|
last_status = ""
|
|
@@ -872,7 +974,8 @@ _stream_install(
|
|
|
872
974
|
if cancel_on_timeout:
|
|
873
975
|
try:
|
|
874
976
|
self.cancel(execution_id)
|
|
875
|
-
|
|
977
|
+
display._stop_spinner()
|
|
978
|
+
display._print(f"\r[Podstack] Execution cancelled due to timeout{' ' * 30}\n")
|
|
876
979
|
except Exception as e:
|
|
877
980
|
logger.warning(f"Failed to cancel execution: {e}")
|
|
878
981
|
|
|
@@ -914,24 +1017,17 @@ _stream_install(
|
|
|
914
1017
|
|
|
915
1018
|
if current_status != last_status:
|
|
916
1019
|
last_status = current_status
|
|
917
|
-
if current_status == "pending":
|
|
918
|
-
print(f"[Podstack] Pending...")
|
|
919
|
-
elif current_status == "queued":
|
|
920
|
-
pos = status_data.get("queue_position", "?")
|
|
921
|
-
print(f"[Podstack] Queued (position: {pos})")
|
|
922
|
-
elif current_status == "provisioning":
|
|
1020
|
+
if current_status == "provisioning":
|
|
923
1021
|
provisioning_start = time.time()
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
else:
|
|
1022
|
+
if current_status in ("pending", "queued", "provisioning", "running"):
|
|
1023
|
+
extra = ""
|
|
1024
|
+
if current_status == "queued":
|
|
1025
|
+
pos = status_data.get("queue_position", "?")
|
|
1026
|
+
extra = f"(position: {pos})"
|
|
1027
|
+
display.set_phase(current_status, extra)
|
|
1028
|
+
elif current_status not in ("completed", "failed", "timeout", "cancelled"):
|
|
932
1029
|
# Unknown status - log but continue
|
|
933
1030
|
logger.warning(f"Unknown status: {current_status}")
|
|
934
|
-
print(f"[Podstack] Status: {current_status}")
|
|
935
1031
|
|
|
936
1032
|
# Check for provisioning timeout
|
|
937
1033
|
if provisioning_start and current_status == "provisioning":
|
|
@@ -974,14 +1070,12 @@ _stream_install(
|
|
|
974
1070
|
raise ConnectionError(f"Failed to get result after {max_retries} attempts: {e}")
|
|
975
1071
|
time.sleep(poll_interval * (attempt + 1))
|
|
976
1072
|
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
if result.output:
|
|
984
|
-
print(f"[Podstack] Output (last 500 chars):\n{result.output[-500:]}")
|
|
1073
|
+
display.complete(result.success, result.gpu_seconds, result.cost_paise, result.error or "Unknown error")
|
|
1074
|
+
|
|
1075
|
+
# In polling mode, output isn't streamed — print it after completion
|
|
1076
|
+
if result.output and not result.success:
|
|
1077
|
+
sys.stdout.write(f"[Podstack] Output (last 500 chars):\n{result.output[-500:]}\n")
|
|
1078
|
+
sys.stdout.flush()
|
|
985
1079
|
|
|
986
1080
|
return result
|
|
987
1081
|
|
|
@@ -176,7 +176,13 @@ def list_experiments(limit: int = 20, offset: int = 0) -> list:
|
|
|
176
176
|
return _get_client().list_experiments(limit, offset)
|
|
177
177
|
|
|
178
178
|
|
|
179
|
-
def start_run(name: str = None, tags: dict = None) -> Run:
|
|
179
|
+
def start_run(
|
|
180
|
+
name: str = None,
|
|
181
|
+
tags: dict = None,
|
|
182
|
+
capture_env: bool = True,
|
|
183
|
+
system_metrics: bool = True,
|
|
184
|
+
system_metrics_interval: float = 10.0,
|
|
185
|
+
) -> Run:
|
|
180
186
|
"""
|
|
181
187
|
Start a new run in the active experiment.
|
|
182
188
|
|
|
@@ -188,11 +194,14 @@ def start_run(name: str = None, tags: dict = None) -> Run:
|
|
|
188
194
|
Args:
|
|
189
195
|
name: Optional run name
|
|
190
196
|
tags: Optional tags dict
|
|
197
|
+
capture_env: Auto-capture Python/pip/git/CUDA as _env.* params
|
|
198
|
+
system_metrics: Log CPU/RAM/GPU metrics every system_metrics_interval seconds
|
|
199
|
+
system_metrics_interval: Seconds between metric samples (default 10)
|
|
191
200
|
|
|
192
201
|
Returns:
|
|
193
202
|
Run object (context manager)
|
|
194
203
|
"""
|
|
195
|
-
return _get_client().start_run(name, tags)
|
|
204
|
+
return _get_client().start_run(name, tags, capture_env, system_metrics, system_metrics_interval)
|
|
196
205
|
|
|
197
206
|
|
|
198
207
|
def end_run(status: str = "completed"):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|