podstack 1.3.18__tar.gz → 1.3.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {podstack-1.3.18 → podstack-1.3.20}/PKG-INFO +1 -1
- {podstack-1.3.18 → podstack-1.3.20}/podstack/annotations.py +0 -3
- {podstack-1.3.18 → podstack-1.3.20}/podstack/gpu_runner.py +142 -53
- {podstack-1.3.18 → podstack-1.3.20}/podstack/registry/__init__.py +11 -2
- {podstack-1.3.18 → podstack-1.3.20}/podstack.egg-info/PKG-INFO +1 -1
- {podstack-1.3.18 → podstack-1.3.20}/pyproject.toml +1 -1
- {podstack-1.3.18 → podstack-1.3.20}/LICENSE +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/README.md +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/__init__.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/client.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/exceptions.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/execution.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/models.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/notebook.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/registry/autolog.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/registry/client.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/registry/exceptions.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/registry/experiment.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/registry/model.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack/registry/model_utils.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack.egg-info/SOURCES.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack.egg-info/dependency_links.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack.egg-info/requires.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack.egg-info/top_level.txt +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/__init__.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/app.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/exceptions.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/image.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/runner.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/secret.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/utils.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/podstack_gpu/volume.py +0 -0
- {podstack-1.3.18 → podstack-1.3.20}/setup.cfg +0 -0
|
@@ -158,9 +158,6 @@ class GPUConfig:
|
|
|
158
158
|
print(f"[Podstack] GPU Config (local): {self.type} x{self.count} @ {self.fraction}%")
|
|
159
159
|
return func(*args, **kwargs)
|
|
160
160
|
|
|
161
|
-
# Remote execution on GPU
|
|
162
|
-
print(f"[Podstack] Provisioning GPU: {self.type} x{self.count} @ {self.fraction}%")
|
|
163
|
-
|
|
164
161
|
try:
|
|
165
162
|
runner = get_runner()
|
|
166
163
|
except ValueError as e:
|
|
@@ -18,6 +18,99 @@ import httpx
|
|
|
18
18
|
# Configure logging
|
|
19
19
|
logger = logging.getLogger("podstack.gpu_runner")
|
|
20
20
|
|
|
21
|
+
SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LiveDisplay:
|
|
25
|
+
"""Animated phase display for GPU runner lifecycle in Jupyter + terminal."""
|
|
26
|
+
|
|
27
|
+
PHASES = {
|
|
28
|
+
"pending": ("🔍", "Searching for GPU..."),
|
|
29
|
+
"queued": ("📋", "Queued — waiting for available GPU..."),
|
|
30
|
+
"provisioning": ("🚀", "Allocating GPU pod..."),
|
|
31
|
+
"running": None, # No spinner — logs stream directly
|
|
32
|
+
}
|
|
33
|
+
CHECKMARKS = {
|
|
34
|
+
"pending": "🔍 Submitted",
|
|
35
|
+
"queued": "📋 In queue",
|
|
36
|
+
"provisioning": "✓ GPU pod provisioning...",
|
|
37
|
+
"running": "✓ Pod ready — logging live output:",
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
def __init__(self, gpu_type: str, gpu_count: int, fraction: int):
|
|
41
|
+
self._gpu_type = gpu_type
|
|
42
|
+
self._gpu_count = gpu_count
|
|
43
|
+
self._fraction = fraction
|
|
44
|
+
self._phase = None
|
|
45
|
+
self._spinner_thread: Optional[threading.Thread] = None
|
|
46
|
+
self._stop_evt = threading.Event()
|
|
47
|
+
self._logs_started = False
|
|
48
|
+
|
|
49
|
+
def set_phase(self, status: str, extra: str = ""):
|
|
50
|
+
"""Transition to a new lifecycle phase."""
|
|
51
|
+
self._stop_spinner()
|
|
52
|
+
if self._phase and self._phase != status:
|
|
53
|
+
label = self.CHECKMARKS.get(self._phase, f"✓ {self._phase}")
|
|
54
|
+
self._print(f"\r[Podstack] {label}{' ' * 30}\n")
|
|
55
|
+
self._phase = status
|
|
56
|
+
if status == "running":
|
|
57
|
+
if not self._logs_started:
|
|
58
|
+
self._logs_started = True
|
|
59
|
+
self._print(f"[Podstack] ─── Live Logs ({self._gpu_type} x{self._gpu_count}) ───\n\n")
|
|
60
|
+
else:
|
|
61
|
+
phase_info = self.PHASES.get(status)
|
|
62
|
+
if phase_info:
|
|
63
|
+
_, label = phase_info
|
|
64
|
+
if extra:
|
|
65
|
+
label = f"{label} {extra}"
|
|
66
|
+
self._start_spinner(f"[Podstack] {{spinner}} {label}")
|
|
67
|
+
|
|
68
|
+
def log(self, content: str, output_type: str = "stdout"):
|
|
69
|
+
"""Write a log line from the pod (indented, real-time)."""
|
|
70
|
+
if output_type == "stderr":
|
|
71
|
+
sys.stderr.write(f" {content}" if not content.startswith(" ") else content)
|
|
72
|
+
sys.stderr.flush()
|
|
73
|
+
else:
|
|
74
|
+
sys.stdout.write(f" {content}" if not content.startswith(" ") else content)
|
|
75
|
+
sys.stdout.flush()
|
|
76
|
+
|
|
77
|
+
def complete(self, success: bool, gpu_seconds: float, cost_paise: int, error: str = None):
|
|
78
|
+
"""Print final summary line."""
|
|
79
|
+
self._stop_spinner()
|
|
80
|
+
if self._logs_started:
|
|
81
|
+
self._print(f"\n[Podstack] ─────────────────────────────────────\n")
|
|
82
|
+
if success:
|
|
83
|
+
cost_str = f"₹{cost_paise/100:.2f}" if cost_paise else ""
|
|
84
|
+
self._print(f"[Podstack] ✓ Completed in {gpu_seconds:.1f}s | {self._gpu_type} x{self._gpu_count} | {cost_str}\n")
|
|
85
|
+
else:
|
|
86
|
+
self._print(f"[Podstack] ✗ Failed: {error}\n")
|
|
87
|
+
|
|
88
|
+
def _start_spinner(self, template: str):
|
|
89
|
+
self._stop_evt.clear()
|
|
90
|
+
|
|
91
|
+
def _spin():
|
|
92
|
+
i = 0
|
|
93
|
+
while not self._stop_evt.is_set():
|
|
94
|
+
frame = SPINNER_FRAMES[i % len(SPINNER_FRAMES)]
|
|
95
|
+
sys.stdout.write(f"\r{template.format(spinner=frame)} ")
|
|
96
|
+
sys.stdout.flush()
|
|
97
|
+
self._stop_evt.wait(0.1)
|
|
98
|
+
i += 1
|
|
99
|
+
|
|
100
|
+
self._spinner_thread = threading.Thread(target=_spin, daemon=True)
|
|
101
|
+
self._spinner_thread.start()
|
|
102
|
+
|
|
103
|
+
def _stop_spinner(self):
|
|
104
|
+
if self._spinner_thread and self._spinner_thread.is_alive():
|
|
105
|
+
self._stop_evt.set()
|
|
106
|
+
self._spinner_thread.join(timeout=0.5)
|
|
107
|
+
self._spinner_thread = None
|
|
108
|
+
|
|
109
|
+
@staticmethod
|
|
110
|
+
def _print(msg: str):
|
|
111
|
+
sys.stdout.write(msg)
|
|
112
|
+
sys.stdout.flush()
|
|
113
|
+
|
|
21
114
|
|
|
22
115
|
def is_jupyter() -> bool:
|
|
23
116
|
"""Check if running in a Jupyter notebook."""
|
|
@@ -728,8 +821,6 @@ _stream_install(
|
|
|
728
821
|
if not execution_id:
|
|
729
822
|
raise RuntimeError(f"No execution_id in response: {submission}")
|
|
730
823
|
|
|
731
|
-
print(f"[Podstack] Execution submitted: {execution_id}")
|
|
732
|
-
|
|
733
824
|
if not wait:
|
|
734
825
|
return GPUExecutionResult(
|
|
735
826
|
execution_id=execution_id,
|
|
@@ -742,9 +833,9 @@ _stream_install(
|
|
|
742
833
|
should_stream = stream if stream is not None else is_jupyter()
|
|
743
834
|
|
|
744
835
|
if should_stream:
|
|
745
|
-
return self._run_with_streaming(execution_id, gpu, count, timeout, max_retries, cancel_on_timeout)
|
|
836
|
+
return self._run_with_streaming(execution_id, gpu, count, timeout, max_retries, cancel_on_timeout, fraction)
|
|
746
837
|
else:
|
|
747
|
-
return self._run_with_polling(execution_id, gpu, count, timeout, poll_interval, max_retries, provisioning_timeout, cancel_on_timeout)
|
|
838
|
+
return self._run_with_polling(execution_id, gpu, count, timeout, poll_interval, max_retries, provisioning_timeout, cancel_on_timeout, fraction)
|
|
748
839
|
|
|
749
840
|
def _run_with_streaming(
|
|
750
841
|
self,
|
|
@@ -753,10 +844,12 @@ _stream_install(
|
|
|
753
844
|
count: int,
|
|
754
845
|
timeout: int,
|
|
755
846
|
max_retries: int,
|
|
756
|
-
cancel_on_timeout: bool
|
|
847
|
+
cancel_on_timeout: bool,
|
|
848
|
+
fraction: int = 100
|
|
757
849
|
) -> GPUExecutionResult:
|
|
758
850
|
"""Run execution with real-time output streaming."""
|
|
759
|
-
|
|
851
|
+
display = LiveDisplay(gpu, count, fraction)
|
|
852
|
+
display.set_phase("pending")
|
|
760
853
|
|
|
761
854
|
start_time = time.time()
|
|
762
855
|
output_buffer = []
|
|
@@ -764,13 +857,14 @@ _stream_install(
|
|
|
764
857
|
final_event = {}
|
|
765
858
|
|
|
766
859
|
try:
|
|
767
|
-
for event in self.stream_output(execution_id, show_output=True):
|
|
860
|
+
for event in self.stream_output(execution_id, show_output=False):
|
|
768
861
|
elapsed = time.time() - start_time
|
|
769
862
|
if elapsed > timeout:
|
|
770
863
|
if cancel_on_timeout:
|
|
771
864
|
try:
|
|
772
865
|
self.cancel(execution_id)
|
|
773
|
-
|
|
866
|
+
display._stop_spinner()
|
|
867
|
+
display._print(f"\r[Podstack] Execution cancelled due to timeout{' ' * 30}\n")
|
|
774
868
|
except Exception as e:
|
|
775
869
|
logger.warning(f"Failed to cancel execution: {e}")
|
|
776
870
|
|
|
@@ -787,37 +881,41 @@ _stream_install(
|
|
|
787
881
|
)
|
|
788
882
|
)
|
|
789
883
|
|
|
790
|
-
# Track
|
|
791
|
-
if event.get("type") in ("stdout", "stderr", "output"):
|
|
792
|
-
content = event.get("content", "")
|
|
793
|
-
if content:
|
|
794
|
-
output_buffer.append(content)
|
|
795
|
-
|
|
796
|
-
# Track status
|
|
884
|
+
# Track status transitions
|
|
797
885
|
if "status" in event:
|
|
798
886
|
new_status = event["status"]
|
|
799
887
|
if new_status != final_status:
|
|
800
888
|
final_status = new_status
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
889
|
+
extra = ""
|
|
890
|
+
if new_status == "queued":
|
|
891
|
+
pos = event.get("queue_position", "?")
|
|
892
|
+
extra = f"(position: {pos})"
|
|
893
|
+
display.set_phase(new_status, extra)
|
|
805
894
|
|
|
806
895
|
# Check for terminal status
|
|
807
896
|
if final_status in ("completed", "failed", "timeout", "cancelled"):
|
|
808
897
|
final_event = event
|
|
809
898
|
break
|
|
810
899
|
|
|
900
|
+
# Stream output lines
|
|
901
|
+
if event.get("type") in ("stdout", "stderr", "output"):
|
|
902
|
+
content = event.get("content", "")
|
|
903
|
+
if content:
|
|
904
|
+
output_buffer.append(content)
|
|
905
|
+
display.log(content, event.get("type", "stdout"))
|
|
906
|
+
|
|
811
907
|
except RuntimeError as e:
|
|
812
908
|
if "HTTP 401" in str(e):
|
|
813
909
|
# Auth failed on stream — fall back to polling
|
|
814
|
-
|
|
815
|
-
|
|
910
|
+
display._stop_spinner()
|
|
911
|
+
display._print(f"\r[Podstack] Streaming auth failed, falling back to polling...{' ' * 10}\n")
|
|
912
|
+
return self._run_with_polling(execution_id, gpu, count, timeout, 2.0, max_retries, 300, cancel_on_timeout, fraction)
|
|
816
913
|
raise
|
|
817
914
|
except (ConnectionError, httpx.ConnectError) as e:
|
|
818
915
|
# Try to recover and get the result
|
|
819
916
|
logger.warning(f"Stream connection lost: {e}")
|
|
820
|
-
|
|
917
|
+
display._stop_spinner()
|
|
918
|
+
display._print(f"\r[Podstack] Stream connection lost, fetching final result...{' ' * 10}\n")
|
|
821
919
|
|
|
822
920
|
# Get final result
|
|
823
921
|
result = None
|
|
@@ -837,11 +935,7 @@ _stream_install(
|
|
|
837
935
|
if "__PODSTACK_RESULT__" not in result.output and "__PODSTACK_RESULT__" in streamed:
|
|
838
936
|
result.output = streamed
|
|
839
937
|
|
|
840
|
-
|
|
841
|
-
print(f"\n[Podstack] Completed in {result.gpu_seconds:.1f}s (cost: ₹{result.cost_paise/100:.2f})")
|
|
842
|
-
else:
|
|
843
|
-
error_msg = result.error or 'Unknown error'
|
|
844
|
-
print(f"\n[Podstack] Failed: {error_msg}")
|
|
938
|
+
display.complete(result.success, result.gpu_seconds, result.cost_paise, result.error or "Unknown error")
|
|
845
939
|
|
|
846
940
|
return result
|
|
847
941
|
|
|
@@ -854,10 +948,13 @@ _stream_install(
|
|
|
854
948
|
poll_interval: float,
|
|
855
949
|
max_retries: int,
|
|
856
950
|
provisioning_timeout: int,
|
|
857
|
-
cancel_on_timeout: bool
|
|
951
|
+
cancel_on_timeout: bool,
|
|
952
|
+
fraction: int = 100
|
|
858
953
|
) -> GPUExecutionResult:
|
|
859
954
|
"""Run execution with polling (non-streaming mode)."""
|
|
860
|
-
|
|
955
|
+
display = LiveDisplay(gpu, count, fraction)
|
|
956
|
+
display.set_phase("pending")
|
|
957
|
+
|
|
861
958
|
start_time = time.time()
|
|
862
959
|
provisioning_start = None
|
|
863
960
|
last_status = ""
|
|
@@ -872,7 +969,8 @@ _stream_install(
|
|
|
872
969
|
if cancel_on_timeout:
|
|
873
970
|
try:
|
|
874
971
|
self.cancel(execution_id)
|
|
875
|
-
|
|
972
|
+
display._stop_spinner()
|
|
973
|
+
display._print(f"\r[Podstack] Execution cancelled due to timeout{' ' * 30}\n")
|
|
876
974
|
except Exception as e:
|
|
877
975
|
logger.warning(f"Failed to cancel execution: {e}")
|
|
878
976
|
|
|
@@ -914,24 +1012,17 @@ _stream_install(
|
|
|
914
1012
|
|
|
915
1013
|
if current_status != last_status:
|
|
916
1014
|
last_status = current_status
|
|
917
|
-
if current_status == "pending":
|
|
918
|
-
print(f"[Podstack] Pending...")
|
|
919
|
-
elif current_status == "queued":
|
|
920
|
-
pos = status_data.get("queue_position", "?")
|
|
921
|
-
print(f"[Podstack] Queued (position: {pos})")
|
|
922
|
-
elif current_status == "provisioning":
|
|
1015
|
+
if current_status == "provisioning":
|
|
923
1016
|
provisioning_start = time.time()
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
else:
|
|
1017
|
+
if current_status in ("pending", "queued", "provisioning", "running"):
|
|
1018
|
+
extra = ""
|
|
1019
|
+
if current_status == "queued":
|
|
1020
|
+
pos = status_data.get("queue_position", "?")
|
|
1021
|
+
extra = f"(position: {pos})"
|
|
1022
|
+
display.set_phase(current_status, extra)
|
|
1023
|
+
elif current_status not in ("completed", "failed", "timeout", "cancelled"):
|
|
932
1024
|
# Unknown status - log but continue
|
|
933
1025
|
logger.warning(f"Unknown status: {current_status}")
|
|
934
|
-
print(f"[Podstack] Status: {current_status}")
|
|
935
1026
|
|
|
936
1027
|
# Check for provisioning timeout
|
|
937
1028
|
if provisioning_start and current_status == "provisioning":
|
|
@@ -974,14 +1065,12 @@ _stream_install(
|
|
|
974
1065
|
raise ConnectionError(f"Failed to get result after {max_retries} attempts: {e}")
|
|
975
1066
|
time.sleep(poll_interval * (attempt + 1))
|
|
976
1067
|
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
if result.output:
|
|
984
|
-
print(f"[Podstack] Output (last 500 chars):\n{result.output[-500:]}")
|
|
1068
|
+
display.complete(result.success, result.gpu_seconds, result.cost_paise, result.error or "Unknown error")
|
|
1069
|
+
|
|
1070
|
+
# In polling mode, output isn't streamed — print it after completion
|
|
1071
|
+
if result.output and not result.success:
|
|
1072
|
+
sys.stdout.write(f"[Podstack] Output (last 500 chars):\n{result.output[-500:]}\n")
|
|
1073
|
+
sys.stdout.flush()
|
|
985
1074
|
|
|
986
1075
|
return result
|
|
987
1076
|
|
|
@@ -176,7 +176,13 @@ def list_experiments(limit: int = 20, offset: int = 0) -> list:
|
|
|
176
176
|
return _get_client().list_experiments(limit, offset)
|
|
177
177
|
|
|
178
178
|
|
|
179
|
-
def start_run(
|
|
179
|
+
def start_run(
|
|
180
|
+
name: str = None,
|
|
181
|
+
tags: dict = None,
|
|
182
|
+
capture_env: bool = True,
|
|
183
|
+
system_metrics: bool = True,
|
|
184
|
+
system_metrics_interval: float = 10.0,
|
|
185
|
+
) -> Run:
|
|
180
186
|
"""
|
|
181
187
|
Start a new run in the active experiment.
|
|
182
188
|
|
|
@@ -188,11 +194,14 @@ def start_run(name: str = None, tags: dict = None) -> Run:
|
|
|
188
194
|
Args:
|
|
189
195
|
name: Optional run name
|
|
190
196
|
tags: Optional tags dict
|
|
197
|
+
capture_env: Auto-capture Python/pip/git/CUDA as _env.* params
|
|
198
|
+
system_metrics: Log CPU/RAM/GPU metrics every system_metrics_interval seconds
|
|
199
|
+
system_metrics_interval: Seconds between metric samples (default 10)
|
|
191
200
|
|
|
192
201
|
Returns:
|
|
193
202
|
Run object (context manager)
|
|
194
203
|
"""
|
|
195
|
-
return _get_client().start_run(name, tags)
|
|
204
|
+
return _get_client().start_run(name, tags, capture_env, system_metrics, system_metrics_interval)
|
|
196
205
|
|
|
197
206
|
|
|
198
207
|
def end_run(status: str = "completed"):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|