swm-gpu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swm/__init__.py +1 -0
- swm/bootstrap.py +337 -0
- swm/bootstrap_frameworks.py +177 -0
- swm/bootstrap_ssh.py +175 -0
- swm/cli.py +281 -0
- swm/commands/__init__.py +1 -0
- swm/commands/_helpers.py +265 -0
- swm/commands/config.py +77 -0
- swm/commands/costs.py +423 -0
- swm/commands/guard.py +191 -0
- swm/commands/models.py +484 -0
- swm/commands/pod.py +693 -0
- swm/commands/pricing.py +162 -0
- swm/commands/remote.py +229 -0
- swm/commands/setup.py +421 -0
- swm/commands/storage.py +290 -0
- swm/commands/sync.py +341 -0
- swm/commands/use.py +79 -0
- swm/config.py +78 -0
- swm/costs/__init__.py +1 -0
- swm/costs/billing.py +60 -0
- swm/costs/budget.py +68 -0
- swm/costs/db.py +268 -0
- swm/costs/reconcile.py +288 -0
- swm/costs/tracker.py +101 -0
- swm/frameworks/__init__.py +100 -0
- swm/frameworks/axolotl.py +54 -0
- swm/frameworks/comfyui.py +128 -0
- swm/frameworks/llm_studio.py +56 -0
- swm/frameworks/ollama.py +85 -0
- swm/frameworks/open_webui.py +98 -0
- swm/frameworks/swarmui.py +124 -0
- swm/frameworks/vllm_server.py +140 -0
- swm/guard.py +537 -0
- swm/models/__init__.py +1 -0
- swm/models/huggingface.py +84 -0
- swm/pricing/__init__.py +12 -0
- swm/pricing/calculator.py +114 -0
- swm/pricing/providers.py +207 -0
- swm/providers/__init__.py +106 -0
- swm/providers/aws.py +170 -0
- swm/providers/azure.py +307 -0
- swm/providers/base.py +187 -0
- swm/providers/coreweave.py +167 -0
- swm/providers/fluidstack.py +274 -0
- swm/providers/gcp.py +260 -0
- swm/providers/lambda_labs.py +202 -0
- swm/providers/runpod.py +212 -0
- swm/providers/tensordock.py +266 -0
- swm/providers/vastai.py +274 -0
- swm/providers/vultr.py +233 -0
- swm/remote/__init__.py +13 -0
- swm/remote/ssh.py +438 -0
- swm/storage/__init__.py +92 -0
- swm/storage/b2.py +88 -0
- swm/storage/base.py +212 -0
- swm/storage/gcs.py +74 -0
- swm/storage/s3.py +59 -0
- swm/sync/__init__.py +34 -0
- swm/sync/_autosync_daemon.sh +128 -0
- swm/sync/_common.py +59 -0
- swm/sync/autosync.py +179 -0
- swm/sync/paths.py +40 -0
- swm/sync/preflight.py +90 -0
- swm/sync/pull.py +116 -0
- swm/sync/push.py +326 -0
- swm/sync/watcher.py +75 -0
- swm_gpu-0.1.0.dist-info/METADATA +236 -0
- swm_gpu-0.1.0.dist-info/RECORD +73 -0
- swm_gpu-0.1.0.dist-info/WHEEL +4 -0
- swm_gpu-0.1.0.dist-info/entry_points.txt +2 -0
- swm_gpu-0.1.0.dist-info/licenses/LICENSE +190 -0
- swm_gpu-0.1.0.dist-info/licenses/NOTICE +5 -0
swm/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
swm/bootstrap.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
"""Bootstrap scripts for setting up remote GPU instances with storage and tools."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
from swm.remote.ssh import RemoteSession
|
|
10
|
+
|
|
11
|
+
console = Console()
|
|
12
|
+
|
|
13
|
+
# Pinned s5cmd release; the download URL below is derived from it.
S5CMD_VERSION = "2.3.0"
S5CMD_URL = "/".join(
    (
        "https://github.com/peak/s5cmd/releases/download",
        f"v{S5CMD_VERSION}",
        f"s5cmd_{S5CMD_VERSION}_Linux-64bit.tar.gz",
    )
)

# NOTE(review): not referenced by the code visible in this module —
# presumably consumed elsewhere in the package; confirm before changing.
SAFETY_MARGIN = 0.90
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _humanize(n: int | float) -> str:
|
|
23
|
+
v = float(n)
|
|
24
|
+
for unit in ("B", "KB", "MB", "GB", "TB"):
|
|
25
|
+
if v < 1024:
|
|
26
|
+
return f"{v:.1f} {unit}" if unit != "B" else f"{int(v)} B"
|
|
27
|
+
v /= 1024
|
|
28
|
+
return f"{v:.1f} PB"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _step(session: RemoteSession, label: str, command: str) -> tuple[int, str, str]:
    """Announce *label*, run *command* on the remote with streaming, and return its result.

    Returns:
        The ``(exit_code, stdout, stderr)`` triple produced by ``session.exec``.

    Raises:
        RuntimeError: if the command exits with a non-zero status.
    """
    console.print(f"\n[bold cyan]▸ {label}[/bold cyan]")
    exit_code, out, err = session.exec(command, stream=True)
    if exit_code == 0:
        return exit_code, out, err
    raise RuntimeError(f"Step failed (exit {exit_code}): {label}")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── s5cmd ──────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _s3_env(storage_slug: str) -> str:
|
|
44
|
+
"""Build env-var prefix for s5cmd from swm config."""
|
|
45
|
+
from swm import config as cfg
|
|
46
|
+
|
|
47
|
+
if storage_slug == "b2":
|
|
48
|
+
endpoint = cfg.get("b2.s3_endpoint") or ""
|
|
49
|
+
ak = cfg.get("b2.key_id") or ""
|
|
50
|
+
sk = cfg.get("b2.app_key") or ""
|
|
51
|
+
elif storage_slug == "gcs":
|
|
52
|
+
endpoint = "https://storage.googleapis.com"
|
|
53
|
+
ak = cfg.get("gcs.hmac_access") or ""
|
|
54
|
+
sk = cfg.get("gcs.hmac_secret") or ""
|
|
55
|
+
elif storage_slug == "s3":
|
|
56
|
+
endpoint = ""
|
|
57
|
+
ak = cfg.get("s3.access_key") or ""
|
|
58
|
+
sk = cfg.get("s3.secret_key") or ""
|
|
59
|
+
else:
|
|
60
|
+
raise ValueError(f"Unknown storage slug: {storage_slug}")
|
|
61
|
+
|
|
62
|
+
parts = [
|
|
63
|
+
f"AWS_ACCESS_KEY_ID='{ak}'",
|
|
64
|
+
f"AWS_SECRET_ACCESS_KEY='{sk}'",
|
|
65
|
+
]
|
|
66
|
+
if endpoint:
|
|
67
|
+
parts.append(f"S3_ENDPOINT_URL='{endpoint}'")
|
|
68
|
+
return " ".join(parts)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def install_s5cmd(session: RemoteSession) -> None:
    """Ensure the ``s5cmd`` binary exists on the remote.

    When ``s5cmd`` is already on the remote ``PATH`` only a notice is
    echoed; otherwise the release tarball at ``S5CMD_URL`` is streamed
    into ``/usr/local/bin`` and the installed version is printed.
    Failures surface as the ``RuntimeError`` raised by ``_step``.
    """
    already_there = "command -v s5cmd >/dev/null 2>&1 && echo 's5cmd already installed'"
    download = (
        f"curl -sL '{S5CMD_URL}' | tar xz -C /usr/local/bin s5cmd "
        "&& chmod +x /usr/local/bin/s5cmd && s5cmd version"
    )
    _step(session, "Installing s5cmd", f"{already_there} || ({download})")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _install_inotify(session: RemoteSession) -> None:
    """Ensure ``inotifywait`` is available on the remote.

    Skips the install when the binary is already on ``PATH``; otherwise
    installs the ``inotify-tools`` package through ``apt-get``. Failures
    surface as the ``RuntimeError`` raised by ``_step``.
    """
    already_there = (
        "command -v inotifywait >/dev/null 2>&1 "
        "&& echo 'inotify-tools already installed'"
    )
    install = "apt-get update -qq && apt-get install -y -qq inotify-tools && echo 'installed'"
    _step(session, "Installing inotify-tools", f"{already_there} || ({install})")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def configure_storage(
    session: RemoteSession, storage_slug: str, bucket: str = "",
) -> None:
    """Prepare the remote for storage access and list the target as a check.

    Installs s5cmd (required), tries to install inotify-tools (optional —
    a failure is only reported), then runs ``s5cmd ls`` against the bucket,
    or with no target when *bucket* is empty.
    """
    install_s5cmd(session)

    try:
        _install_inotify(session)
    except RuntimeError:
        # The watcher is optional, so a failed install is not fatal here.
        console.print(" [yellow]⚠ inotify-tools install failed — push will use find[/yellow]")

    credentials = _s3_env(storage_slug)
    if bucket:
        target = f"s3://{bucket}/"
    else:
        target = ""

    # NOTE(review): the trailing "|| true" means this step exits 0 even when
    # the listing fails, so it shows output rather than enforcing connectivity.
    listing_cmd = f"{credentials} s5cmd ls {target} 2>&1 | head -5 || true"
    _step(session, f"Verifying {storage_slug} connection", listing_cmd)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
_WS_MARKER_NAMES = (
|
|
111
|
+
".swm_last_push",
|
|
112
|
+
".swm_changes.log",
|
|
113
|
+
".swm_workspace.tar.gz",
|
|
114
|
+
".swm_autosync.log",
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _ensure_workspace_empty_on_pod(session: RemoteSession) -> None:
|
|
119
|
+
"""Raise if /workspace/ on the pod contains any non-marker files.
|
|
120
|
+
|
|
121
|
+
A blind ``touch PUSH_STAMP`` on a non-empty workspace would silently
|
|
122
|
+
declare those files synced even though they were never uploaded.
|
|
123
|
+
Refuse, and tell the user to push or clear before attaching.
|
|
124
|
+
"""
|
|
125
|
+
excludes = " ".join(f"-not -name '{n}'" for n in _WS_MARKER_NAMES)
|
|
126
|
+
cmd = (
|
|
127
|
+
f"find /workspace -mindepth 1 -maxdepth 1 {excludes} 2>/dev/null "
|
|
128
|
+
"| head -5"
|
|
129
|
+
)
|
|
130
|
+
_, out, _ = session.exec(cmd, stream=False)
|
|
131
|
+
leftover = [line for line in out.splitlines() if line.strip()]
|
|
132
|
+
if leftover:
|
|
133
|
+
sample = ", ".join(p.rsplit("/", 1)[-1] for p in leftover[:3])
|
|
134
|
+
more = "" if len(leftover) <= 3 else f" (+ more)"
|
|
135
|
+
raise RuntimeError(
|
|
136
|
+
f"/workspace/ on the pod is not empty (e.g. {sample}{more}). "
|
|
137
|
+
"Refusing to mark this as a fresh workspace because those "
|
|
138
|
+
"files would not be uploaded. Either: (a) upload them first "
|
|
139
|
+
"with `swm sync push <pod> -b <provider:bucket> -d <name> "
|
|
140
|
+
"--force`, then re-run setup; or (b) clear /workspace/ on "
|
|
141
|
+
"the pod and re-run; or (c) re-run with an existing "
|
|
142
|
+
"workspace name to pull from storage."
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def bootstrap_workspace_on_pod(
    session: RemoteSession,
    storage_slug: str,
    bucket: str,
    workspace: str,
    *,
    qualified_id: str,
    is_new: bool,
    extra_excludes: list[str] | None = None,
    autosync_interval: int = 60,
    console_obj: Console | None = None,
) -> list[tuple[str, str]]:
    """Run the three pod bootstrap stages: storage, workspace, auto-sync.

    Each stage only runs if the previous one succeeded. A stage that fails
    (or is skipped because an earlier one failed) contributes a
    ``(label, recovery_command)`` tuple to the result, so the caller can
    tell the user what to re-run by hand.

    Args:
        session: remote session used for every command.
        storage_slug: storage backend identifier passed through to helpers.
        bucket: bucket name.
        workspace: workspace name inside the bucket.
        qualified_id: pod identifier embedded in the recovery commands.
        is_new: when true, no pull happens; the pod's /workspace must be
            empty and change tracking is initialised instead.
        extra_excludes: forwarded to ``workspace_pull``.
        autosync_interval: seconds between auto-sync runs.
        console_obj: console to print to; defaults to the module console.

    Returns:
        The list of failed/skipped stages; empty on full success.
    """
    from swm.sync import start_watcher
    from swm.sync.autosync import AutosyncUnsafeError, start_autosync
    from swm.sync.paths import PUSH_STAMP, WATCH_LOG
    from swm.sync.pull import workspace_pull

    out = console_obj or console
    problems: list[tuple[str, str]] = []

    pull_entry = ("Workspace pull", f"swm sync pull {qualified_id}")
    autosync_entry = ("Auto-sync start", f"swm sync auto {qualified_id}")

    def _record(label: str, cmd: str, exc: Exception) -> None:
        out.print(f" [yellow]⚠ {label} failed: {exc}[/yellow]")
        problems.append((label, cmd))

    # Stage 1 — storage tooling and credentials.
    try:
        with out.status("Installing s5cmd & configuring storage…", spinner="dots"):
            configure_storage(session, storage_slug, bucket=bucket)
        out.print("[green]✓[/green] Storage configured")
    except Exception as exc:
        _record("Storage configuration", f"swm setup storage {qualified_id}", exc)
        storage_ready = False
    else:
        storage_ready = True

    # Stage 2 — populate the workspace (pull) or initialise a fresh one.
    workspace_ready = False
    if not storage_ready:
        # Skipped rather than failed: no warning line, only the recovery hint.
        problems.append(pull_entry)
    else:
        try:
            if not is_new:
                workspace_pull(
                    session, storage_slug, bucket, workspace,
                    extra_excludes=extra_excludes,
                )
            else:
                _ensure_workspace_empty_on_pod(session)
                out.print(" [dim]New workspace — skipping pull[/dim]")
                # Reset the change log and stamp "now" as the last push.
                session.exec(
                    f": > {WATCH_LOG} 2>/dev/null; touch {PUSH_STAMP}",
                    stream=False,
                )
                if start_watcher(session, "/workspace"):
                    out.print(" [dim]Watcher started for change tracking[/dim]")
        except Exception as exc:
            _record(*pull_entry, exc)
        else:
            workspace_ready = True

    # Stage 3 — background auto-sync.
    if not workspace_ready:
        problems.append(autosync_entry)
        return problems

    try:
        started = start_autosync(
            session, storage_slug, bucket, workspace,
            interval=autosync_interval,
        )
        if started:
            summary = (
                " [dim]Auto-sync started "
                f"(every {autosync_interval}s → "
                f"{storage_slug}:{bucket}/{workspace})[/dim]"
            )
            out.print(summary)
    except (AutosyncUnsafeError, Exception) as exc:
        _record(*autosync_entry, exc)

    return problems
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# Remote path of the PID file that marks a transfer in progress.
_LOCK_FILE = "/tmp/.swm_transfer.lock"


def _acquire_transfer_lock(session: RemoteSession, force: bool = False) -> None:
    """Check for an existing transfer on the pod and take over the lock file.

    The lock file is expected to hold the PID of the transfer that wrote it.

    * No lock content: the lock is simply written.
    * Lock with a live PID: raises, unless *force* is true, in which case the
      running process is killed first.
    * Lock with a dead PID, a killed PID, or unusable content: treated as
      stale — orphaned temp files under /workspace are removed.

    Args:
        session: remote session used to run the shell commands.
        force: kill a still-running transfer instead of raising.

    Raises:
        RuntimeError: if a transfer is still running and *force* is false.
    """
    _, out, _ = session.exec(
        f"cat {_LOCK_FILE} 2>/dev/null", stream=False,
    )
    old_pid = out.strip()

    if old_pid:
        # The content is pasted into `kill` command lines below and the file
        # lives under /tmp, so only a plain decimal PID is trusted. PIDs 0
        # and 1 are rejected too: `kill` treats 0 as "my own process group"
        # and 1 is the init process.
        pid_is_usable = (
            old_pid.isascii() and old_pid.isdigit() and int(old_pid) > 1
        )

        if pid_is_usable:
            _, alive, _ = session.exec(
                f"kill -0 {old_pid} 2>/dev/null && echo alive || echo dead",
                stream=False,
            )
            if "alive" in alive:
                if not force:
                    raise RuntimeError(
                        f"A transfer is already running (PID {old_pid}). "
                        "Use --force to kill it and start a new one."
                    )
                console.print(
                    f" [yellow]⚠ Killing existing transfer (PID {old_pid})[/yellow]"
                )
                session.exec(f"kill -9 {old_pid} 2>/dev/null; sleep 1", stream=False)
        else:
            # Do not echo the content itself: it is untrusted text.
            console.print(
                " [yellow]⚠ Ignoring transfer lock with unexpected content[/yellow]"
            )

        # Stale lock — clean up temp files left behind
        # NOTE(review): the pattern matches any file whose name ends in
        # ".<lowercase letters><9+ digits>", which could also hit user files
        # such as timestamp-suffixed backups — confirm this is acceptable.
        console.print(" [dim]Cleaning up stale temp files…[/dim]")
        _, cleanup_out, _ = session.exec(
            "find /workspace -maxdepth 5 -type f -regex '.*\\.[a-z]*[0-9]\\{9,\\}$' "
            "-delete -print 2>/dev/null | wc -l",
            stream=False,
        )
        n = cleanup_out.strip()
        if n and n != "0":
            console.print(f" [dim]Removed {n} orphaned temp files[/dim]")

    # `$$` expands on the remote to the PID of the shell running this command.
    session.exec(f"echo $$ > {_LOCK_FILE}", stream=False)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _s5cmd_transfer(
    session: RemoteSession,
    label: str,
    s5cmd_cmd: str,
    force: bool = False,
    total_bytes: int = 0,
    total_files: int = 0,
) -> int:
    """Run an s5cmd command on the pod with its output attached to this terminal.

    The lock is acquired first (see ``_acquire_transfer_lock``). The remote
    command is then prefixed with an ``EXIT`` trap that removes the lock file
    and with a line that records the remote shell's PID in it. The command
    runs through a local ``ssh`` child process whose stdio is inherited.

    Args:
        session: remote session; supplies the ssh argument vector.
        label: heading printed before the transfer and in the success line.
        s5cmd_cmd: the shell command to run remotely.
        force: forwarded to the lock acquisition.
        total_bytes: accepted for interface compatibility; not read here.
        total_files: accepted for interface compatibility; not read here.

    Returns:
        The exit status of the ssh child process.
    """
    console.print(f"\n[bold cyan]▸ {label}[/bold cyan]")
    _acquire_transfer_lock(session, force=force)

    remote_script = "; ".join(
        (
            f"trap 'rm -f {_LOCK_FILE}' EXIT",
            f"echo $$ > {_LOCK_FILE}",
            f"{s5cmd_cmd}",
        )
    )
    argv = session._ssh_cmd() + [remote_script]
    exit_code = subprocess.call(argv)

    if exit_code == 0:
        console.print(f" [green]✓ {label} — done[/green]")
    else:
        console.print(f" [yellow]⚠ Transfer finished with warnings (exit {exit_code})[/yellow]")

    return exit_code
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
# Backward-compatible re-exports (lazy to avoid circular imports)
|
|
308
|
+
_RE_EXPORTS: dict[str, str] = {
|
|
309
|
+
"install_framework": "swm.bootstrap_frameworks",
|
|
310
|
+
"start_framework": "swm.bootstrap_frameworks",
|
|
311
|
+
"stop_framework": "swm.bootstrap_frameworks",
|
|
312
|
+
"install_comfyui": "swm.bootstrap_frameworks",
|
|
313
|
+
"install_swarmui": "swm.bootstrap_frameworks",
|
|
314
|
+
"link_models_to_comfyui": "swm.bootstrap_frameworks",
|
|
315
|
+
"wait_for_ssh": "swm.bootstrap_ssh",
|
|
316
|
+
"next_workspace_name": "swm.bootstrap_ssh",
|
|
317
|
+
"DiskCheck": "swm.sync",
|
|
318
|
+
"preflight_check": "swm.sync",
|
|
319
|
+
"start_watcher": "swm.sync",
|
|
320
|
+
"stop_watcher": "swm.sync",
|
|
321
|
+
"is_watcher_alive": "swm.sync",
|
|
322
|
+
"workspace_pull": "swm.sync",
|
|
323
|
+
"workspace_push": "swm.sync",
|
|
324
|
+
"tar_pull": "swm.sync",
|
|
325
|
+
"start_autosync": "swm.sync",
|
|
326
|
+
"stop_autosync": "swm.sync",
|
|
327
|
+
"is_autosync_alive": "swm.sync",
|
|
328
|
+
"autosync_status": "swm.sync",
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def __getattr__(name: str): # noqa: E302
|
|
333
|
+
if name in _RE_EXPORTS:
|
|
334
|
+
import importlib
|
|
335
|
+
mod = importlib.import_module(_RE_EXPORTS[name])
|
|
336
|
+
return getattr(mod, name)
|
|
337
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
swm/bootstrap_frameworks.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Framework lifecycle management for remote GPU instances."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
from swm.bootstrap import _step
|
|
10
|
+
from swm.remote.ssh import RemoteSession
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def install_framework(
    session: RemoteSession,
    name: str,
    console: Console | None = None,
) -> None:
    """Install the framework registered under *name* by running its steps.

    The framework's ``steps`` run first, followed by its ``post_install``
    steps; both are numbered in one continuous ``[i/total]`` sequence. A
    step that defines a ``check`` command is skipped on the remote when the
    check succeeds. Each step runs from its own ``workdir`` or, failing
    that, the framework's ``install_dir``, after the optional ``env_setup``.

    Args:
        session: remote session to run the steps on.
        name: framework name passed to ``get_framework``.
        console: console for the heading; defaults to the module console.
    """
    from swm.frameworks import Framework, get_framework

    # The parameter shadows the module-level console, hence the globals() lookup.
    out = console or globals()["console"]

    framework: Framework = get_framework(name)
    out.print(f"\n[bold]Installing {framework.label}[/bold]")
    prefix = f"{framework.env_setup} && " if framework.env_setup else ""

    def _shell_for(step) -> str:
        where = step.workdir or framework.install_dir
        run = f"{prefix}cd {where} && {step.command}"
        if not step.check:
            return run
        return f"{step.check} && echo '{step.label}: already done' || ({run})"

    main_steps = framework.steps
    followups = framework.post_install
    total = len(main_steps) + len(followups)

    for position, step in enumerate([*main_steps, *followups], 1):
        _step(session, f"[{position}/{total}] {step.label}", _shell_for(step))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def start_framework(
    session: RemoteSession,
    name: str,
    port: int | None = None,
    console: Console | None = None,
    qualified_id: str | None = None,
) -> str | None:
    """Launch a framework in the background and confirm the process shows up.

    Steps: bail out if a matching process is already running, run the
    framework's ``pre_start`` steps, start ``launch_cmd`` in the background
    with output going to ``/tmp/<name>.log``, then poll up to five times for
    a process matching ``process_pattern``.

    Args:
        session: remote session to run commands on.
        name: framework name passed to ``get_framework``.
        port: optional port that replaces the framework's first declared
            port inside the launch command.
        console: console to print to; defaults to the module console.
        qualified_id: pod identifier shown in the "Logs:" hint.

    Returns:
        ``None`` on every path of the current implementation; the ``str``
        half of the annotation is kept for interface compatibility.

    Raises:
        RuntimeError: if no matching process appears after the launch, or
            if a ``pre_start`` step fails.
    """
    import re

    from swm.frameworks import get_framework

    # The parameter shadows the module-level console, hence the globals() lookup.
    _con = console or globals()["console"]
    fw = get_framework(name)

    def _matching_processes() -> str:
        # "grep -v grep" also drops the remote shell running this pipeline,
        # whose own command line contains the pattern (and the word "pgrep").
        _, listing, _ = session.exec(
            f"pgrep -fa '{fw.process_pattern}' | grep -v grep || true",
            stream=False,
        )
        return listing.strip()

    if fw.process_pattern:
        with _con.status(f"Checking if {fw.label} is running…", spinner="dots"):
            running = _matching_processes()
        if running:
            pid = running.split("\n")[0].split()[0]
            _con.print(
                f" [yellow]{fw.label} is already running (PID {pid})[/yellow]"
            )
            return None

    env_prefix = f"{fw.env_setup} && " if fw.env_setup else ""
    if fw.pre_start:
        _con.print(f"\n[bold cyan]▸ Preparing {fw.label}[/bold cyan]")
        for step in fw.pre_start:
            workdir = step.workdir or fw.install_dir
            if step.check:
                cmd = (
                    f"{step.check} && echo '{step.label}: already done' "
                    f"|| ({env_prefix}cd {workdir} && {step.command})"
                )
            else:
                cmd = f"{env_prefix}cd {workdir} && {step.command}"
            _step(session, step.label, cmd)

    _con.print(f"\n[bold cyan]▸ Starting {fw.label}[/bold cyan]")
    launch = fw.launch_cmd
    if port and fw.ports:
        default_port = str(next(iter(fw.ports)))
        # Replace the default port only where it stands alone as a number.
        # A plain str.replace would also rewrite the same digits inside a
        # longer number (default 80 inside "--max-len 8000", for example).
        launch = re.sub(
            rf"(?<!\d){re.escape(default_port)}(?!\d)",
            lambda _match: str(port),
            launch,
        )

    logfile = f"/tmp/{fw.name}.log"

    with _con.status(f"Launching {fw.label}…", spinner="dots"):
        session.exec_background(
            launch,
            logfile=logfile,
            workdir=fw.launch_workdir,
            env_setup=fw.env_setup,
        )

    max_checks = 5
    alive = False
    for attempt in range(max_checks):
        # Give the process a little longer before the first look.
        time.sleep(3 if attempt == 0 else 2)
        # Without a pattern there is nothing to look for: assume it started.
        if not fw.process_pattern or _matching_processes():
            alive = True
            break

    if alive:
        _con.print(f" [green]✓ {fw.label} started[/green]")
        _pod_ref = qualified_id or "<pod>"
        _con.print(f" Logs: swm run {_pod_ref} 'tail -f {logfile}'")
    else:
        _con.print(f" [red]✗ {fw.label} failed to start[/red]")
        _con.print(f" Last lines from {logfile}:")
        _, tail, _ = session.exec(f"tail -15 {logfile} 2>/dev/null", stream=False)
        if tail.strip():
            for line in tail.strip().splitlines():
                _con.print(f" [dim]{line}[/dim]")
        raise RuntimeError(f"{fw.label} exited immediately — check logs above")

    return None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def stop_framework(
    session: RemoteSession,
    name: str,
    console: Console | None = None,
) -> None:
    """Run the stop command of the framework registered under *name*.

    Prints a notice and does nothing else when the framework defines no
    stop command. The stop command's exit status is not inspected.
    """
    from swm.frameworks import get_framework

    # The parameter shadows the module-level console, hence the globals() lookup.
    out = console or globals()["console"]
    framework = get_framework(name)

    if framework.stop_cmd:
        out.print(f"\n[bold cyan]▸ Stopping {framework.label}[/bold cyan]")
        with out.status(f"Stopping {framework.label}…", spinner="dots"):
            session.exec(framework.stop_cmd, stream=False)
        out.print(f" [green]✓ {framework.label} stopped[/green]")
    else:
        out.print(f" [yellow]{framework.label} has no stop command defined[/yellow]")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def install_comfyui(session: RemoteSession) -> None:
    """Install ComfyUI; kept as a named shortcut for ``install_framework``."""
    install_framework(session, name="comfyui")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def install_swarmui(session: RemoteSession) -> None:
    """Install SwarmUI; kept as a named shortcut for ``install_framework``."""
    install_framework(session, name="swarmui")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ── symlinks ────────────────────────────────────────────────────────
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def link_models_to_comfyui(session: RemoteSession) -> None:
    """Point ComfyUI's per-type model directories at /workspace/models.

    Creates ``/workspace/models/<type>`` for every model type, then makes
    ``/workspace/ComfyUI/models/<type>`` a symlink to it unless it already
    is one. Everything runs as a single ``&&``-chained remote command.
    """
    dirs = ["checkpoints", "loras", "vae", "controlnet", "clip", "upscale_models", "unet"]
    shared_root = "/workspace/models"
    comfy_root = "/workspace/ComfyUI/models"

    # Each directory is spelled out: "{a,b,c}" brace expansion is a bash
    # feature, and a POSIX-only remote shell would create one directory
    # literally named "{checkpoints,loras,...}" instead.
    cmds = ["mkdir -p " + " ".join(f"{shared_root}/{d}" for d in dirs)]
    for d in dirs:
        # NOTE(review): a pre-existing real directory is removed with its
        # contents before the link is created — confirm that is intended.
        cmds.append(
            f"[ -L {comfy_root}/{d} ] || "
            f"(rm -rf {comfy_root}/{d} "
            f"&& ln -s {shared_root}/{d} {comfy_root}/{d})"
        )
    _step(session, "Symlinking models → ComfyUI", " && ".join(cmds))
|
swm/bootstrap_ssh.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""SSH waiting utilities for remote GPU instances."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
|
|
10
|
+
from swm.providers.base import Instance, InstanceStatus
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _has_direct_ssh(inst: Instance) -> bool:
|
|
16
|
+
"""True when the instance exposes a public IP with a mapped SSH port."""
|
|
17
|
+
return bool(inst.ip_address and inst.ports.get(22))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _has_relay_ssh(inst: Instance) -> bool:
|
|
21
|
+
return bool(inst.ssh_host and inst.ssh_port)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _is_relay_host(inst: Instance) -> bool:
|
|
25
|
+
"""True when ssh_host is a relay proxy, not the instance's own IP."""
|
|
26
|
+
if not inst.ssh_host:
|
|
27
|
+
return False
|
|
28
|
+
return inst.ssh_host != inst.ip_address
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def wait_for_ssh(
    provider,
    instance_id: str,
    timeout: int = 600,
    poll_interval: int = 10,
    direct_grace: int = 30,
) -> Instance:
    """Poll until the instance is running and SSH is reachable.

    Phase 1 polls the provider until the instance is RUNNING and exposes an
    SSH endpoint. A direct IP+port endpoint is preferred; when only a relay
    endpoint (``ssh_host`` different from the instance IP) is present,
    polling continues for up to *direct_grace* more seconds in case a direct
    endpoint appears. An ``ssh_host`` equal to the instance IP is accepted
    immediately.

    Phase 2 picks the endpoint (direct wins) and phase 3 repeatedly runs a
    short ``ssh ... echo`` probe until it answers.

    Status changes, ``status_detail`` changes and provider polling errors
    are printed as they occur.

    Args:
        provider: provider object; ``get_instance`` is used when available,
            otherwise ``list_instances``. Its ``slug`` selects the SSH key
            config entry.
        instance_id: id of the instance to wait for.
        timeout: overall budget in seconds, shared by all phases.
        poll_interval: seconds between provider polls.
        direct_grace: extra seconds to wait for a direct endpoint once a
            relay endpoint has been seen.

    Returns:
        The last instance object fetched from the provider. It is also
        returned, with a warning, when the instance is running but exposes
        no SSH endpoint.

    Raises:
        TimeoutError: if the instance is not ready, or SSH does not answer,
            within *timeout* seconds.
    """
    from swm import config as _cfg

    start = time.time()
    last_status = ""
    last_detail = ""
    last_error = ""
    inst = None
    relay_seen_at: float | None = None

    def _elapsed() -> str:
        return f"{int(time.time() - start)}s"

    # Phase 1: wait for the instance to be RUNNING with an SSH endpoint.
    while time.time() - start < timeout:
        try:
            if hasattr(provider, "get_instance"):
                inst = provider.get_instance(instance_id)
            else:
                instances = provider.list_instances()
                inst = next((i for i in instances if i.id == instance_id), None)

            if inst:
                status = inst.status.value
                if status != last_status:
                    console.print(f" Status: [bold]{status}[/bold] ({_elapsed()})")
                    last_status = status

                detail = inst.status_detail or ""
                if detail and detail != last_detail:
                    truncated = (detail[:100] + "…") if len(detail) > 100 else detail
                    console.print(f" [dim]{truncated}[/dim]")
                    last_detail = detail

                if inst.status == InstanceStatus.RUNNING:
                    if _has_direct_ssh(inst):
                        break
                    if _has_relay_ssh(inst):
                        if not _is_relay_host(inst):
                            break
                        if relay_seen_at is None:
                            relay_seen_at = time.time()
                        elif time.time() - relay_seen_at >= direct_grace:
                            break
        except Exception as exc:
            # Polling stays best-effort (transient API errors are retried),
            # but the error is no longer hidden: it is shown once per change
            # and repeated in the timeout message below.
            error_text = f"{type(exc).__name__}: {exc}"
            if error_text != last_error:
                # markup=False: the error text is arbitrary and may contain
                # square brackets that would otherwise be parsed as markup.
                console.print(
                    f" Provider poll failed: {error_text} ({_elapsed()})",
                    style="dim",
                    markup=False,
                )
                last_error = error_text

        time.sleep(poll_interval)
    else:
        # Reached only when the loop ran out of time without a break.
        message = (
            f"Pod not running after {timeout}s for instance {instance_id}. "
            f"Last status: {last_status}"
        )
        if last_error:
            message += f". Last provider error: {last_error}"
        raise TimeoutError(message)

    # Phase 2: pick the best SSH path — direct mapped port always wins.
    if _has_direct_ssh(inst):
        ssh_target = inst.ip_address
        port = inst.ports[22]
        ssh_user = "root"
        console.print(f" Direct SSH: {ssh_target}:{port}")
    elif _has_relay_ssh(inst):
        ssh_target = inst.ssh_host
        port = inst.ssh_port
        ssh_user = inst.ssh_user or "root"
        if _is_relay_host(inst):
            console.print(f" Relay SSH: {ssh_user}@{ssh_target}:{port}")
        else:
            console.print(f" SSH: {ssh_user}@{ssh_target}:{port}")
    else:
        console.print(" [yellow]No SSH endpoint found — returning anyway[/yellow]")
        return inst

    # Phase 3: probe until SSH actually responds.
    console.print(f" Probing SSH… ({_elapsed()})")
    probe = [
        "ssh",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=5",
        "-o", "LogLevel=ERROR",
        "-p", str(port),
    ]
    # A provider-specific key takes precedence over the global one.
    key = _cfg.get(f"{provider.slug}.ssh_key") or _cfg.get("ssh.key_path")
    if key:
        probe.extend(["-i", str(key)])
    probe.extend([f"{ssh_user}@{ssh_target}", "echo __SWM_OK__"])

    while time.time() - start < timeout:
        try:
            result = subprocess.run(probe, capture_output=True, timeout=15)
            if b"__SWM_OK__" in result.stdout:
                console.print(f" [green]✓ SSH ready[/green] ({_elapsed()})")
                return inst
        except (subprocess.TimeoutExpired, OSError):
            # A hung or unlaunchable probe counts as "not ready yet".
            pass
        time.sleep(5)

    raise TimeoutError(
        f"SSH not reachable after {timeout}s for instance {instance_id}"
    )
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def next_workspace_name(storage_provider, bucket: str) -> str:
    """Find the next available workspace name in a bucket.

    Existing: workspace/, workspace2/ → returns "workspace3".
    Empty bucket → returns "workspace".

    Only the first path segment of each key is considered, so both a bare
    prefix (``workspace2/``) and an object beneath it
    (``workspace2/model.bin``) mark ``workspace2`` as taken. ``workspace``
    counts as number 1. Segments whose suffix is not made purely of ASCII
    digits (``workspace-1``, ``workspace_old``) are ignored.

    Args:
        storage_provider: object with an ``ls(bucket)`` method yielding
            items that have a ``key`` attribute.
        bucket: bucket to inspect.

    Returns:
        ``"workspace"`` when nothing is taken or the listing fails,
        otherwise ``"workspace<N>"`` with N one above the highest number
        found.
    """
    try:
        objects = storage_provider.ls(bucket)
    except Exception:
        # Deliberate best-effort: an unlistable bucket falls back to the
        # default name instead of aborting the caller.
        return "workspace"

    prefix = "workspace"
    taken: set[int] = set()
    for obj in objects:
        top_level = obj.key.split("/", 1)[0]
        if not top_level.startswith(prefix):
            continue
        suffix = top_level[len(prefix):]
        if not suffix:
            taken.add(1)
        elif suffix.isascii() and suffix.isdigit():
            # int() alone would also accept "-1", " 3" or "1_0".
            taken.add(int(suffix))

    if not taken:
        return "workspace"

    return f"workspace{max(taken) + 1}"
|