swm-gpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. swm/__init__.py +1 -0
  2. swm/bootstrap.py +337 -0
  3. swm/bootstrap_frameworks.py +177 -0
  4. swm/bootstrap_ssh.py +175 -0
  5. swm/cli.py +281 -0
  6. swm/commands/__init__.py +1 -0
  7. swm/commands/_helpers.py +265 -0
  8. swm/commands/config.py +77 -0
  9. swm/commands/costs.py +423 -0
  10. swm/commands/guard.py +191 -0
  11. swm/commands/models.py +484 -0
  12. swm/commands/pod.py +693 -0
  13. swm/commands/pricing.py +162 -0
  14. swm/commands/remote.py +229 -0
  15. swm/commands/setup.py +421 -0
  16. swm/commands/storage.py +290 -0
  17. swm/commands/sync.py +341 -0
  18. swm/commands/use.py +79 -0
  19. swm/config.py +78 -0
  20. swm/costs/__init__.py +1 -0
  21. swm/costs/billing.py +60 -0
  22. swm/costs/budget.py +68 -0
  23. swm/costs/db.py +268 -0
  24. swm/costs/reconcile.py +288 -0
  25. swm/costs/tracker.py +101 -0
  26. swm/frameworks/__init__.py +100 -0
  27. swm/frameworks/axolotl.py +54 -0
  28. swm/frameworks/comfyui.py +128 -0
  29. swm/frameworks/llm_studio.py +56 -0
  30. swm/frameworks/ollama.py +85 -0
  31. swm/frameworks/open_webui.py +98 -0
  32. swm/frameworks/swarmui.py +124 -0
  33. swm/frameworks/vllm_server.py +140 -0
  34. swm/guard.py +537 -0
  35. swm/models/__init__.py +1 -0
  36. swm/models/huggingface.py +84 -0
  37. swm/pricing/__init__.py +12 -0
  38. swm/pricing/calculator.py +114 -0
  39. swm/pricing/providers.py +207 -0
  40. swm/providers/__init__.py +106 -0
  41. swm/providers/aws.py +170 -0
  42. swm/providers/azure.py +307 -0
  43. swm/providers/base.py +187 -0
  44. swm/providers/coreweave.py +167 -0
  45. swm/providers/fluidstack.py +274 -0
  46. swm/providers/gcp.py +260 -0
  47. swm/providers/lambda_labs.py +202 -0
  48. swm/providers/runpod.py +212 -0
  49. swm/providers/tensordock.py +266 -0
  50. swm/providers/vastai.py +274 -0
  51. swm/providers/vultr.py +233 -0
  52. swm/remote/__init__.py +13 -0
  53. swm/remote/ssh.py +438 -0
  54. swm/storage/__init__.py +92 -0
  55. swm/storage/b2.py +88 -0
  56. swm/storage/base.py +212 -0
  57. swm/storage/gcs.py +74 -0
  58. swm/storage/s3.py +59 -0
  59. swm/sync/__init__.py +34 -0
  60. swm/sync/_autosync_daemon.sh +128 -0
  61. swm/sync/_common.py +59 -0
  62. swm/sync/autosync.py +179 -0
  63. swm/sync/paths.py +40 -0
  64. swm/sync/preflight.py +90 -0
  65. swm/sync/pull.py +116 -0
  66. swm/sync/push.py +326 -0
  67. swm/sync/watcher.py +75 -0
  68. swm_gpu-0.1.0.dist-info/METADATA +236 -0
  69. swm_gpu-0.1.0.dist-info/RECORD +73 -0
  70. swm_gpu-0.1.0.dist-info/WHEEL +4 -0
  71. swm_gpu-0.1.0.dist-info/entry_points.txt +2 -0
  72. swm_gpu-0.1.0.dist-info/licenses/LICENSE +190 -0
  73. swm_gpu-0.1.0.dist-info/licenses/NOTICE +5 -0
swm/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
swm/bootstrap.py ADDED
@@ -0,0 +1,337 @@
1
+ """Bootstrap scripts for setting up remote GPU instances with storage and tools."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+
7
+ from rich.console import Console
8
+
9
+ from swm.remote.ssh import RemoteSession
10
+
11
# Module-wide Rich console used for progress and status output in this file.
console = Console()

# Pinned s5cmd release; the download URL below is derived from this version.
S5CMD_VERSION = "2.3.0"
# GitHub release tarball of the Linux 64-bit s5cmd build for that version.
S5CMD_URL = (
    f"https://github.com/peak/s5cmd/releases/download/v{S5CMD_VERSION}/"
    f"s5cmd_{S5CMD_VERSION}_Linux-64bit.tar.gz"
)

# NOTE(review): not referenced in the visible part of this module; presumably a
# capacity safety factor consumed elsewhere — confirm before changing.
SAFETY_MARGIN = 0.90
20
+
21
+
22
+ def _humanize(n: int | float) -> str:
23
+ v = float(n)
24
+ for unit in ("B", "KB", "MB", "GB", "TB"):
25
+ if v < 1024:
26
+ return f"{v:.1f} {unit}" if unit != "B" else f"{int(v)} B"
27
+ v /= 1024
28
+ return f"{v:.1f} PB"
29
+
30
+
31
def _step(session: RemoteSession, label: str, command: str) -> tuple[int, str, str]:
    """Execute one labelled remote command with streamed output.

    Prints the label as a heading, runs *command* through ``session.exec``
    with ``stream=True`` and returns its ``(exit_code, stdout, stderr)``.

    Raises:
        RuntimeError: if the command exits with a non-zero status.
    """
    console.print(f"\n[bold cyan]▸ {label}[/bold cyan]")
    exit_code, out_text, err_text = session.exec(command, stream=True)
    if exit_code == 0:
        return exit_code, out_text, err_text
    raise RuntimeError(f"Step failed (exit {exit_code}): {label}")
38
+
39
+
40
+ # ── s5cmd ──────────────────────────────────────────────────────────
41
+
42
+
43
+ def _s3_env(storage_slug: str) -> str:
44
+ """Build env-var prefix for s5cmd from swm config."""
45
+ from swm import config as cfg
46
+
47
+ if storage_slug == "b2":
48
+ endpoint = cfg.get("b2.s3_endpoint") or ""
49
+ ak = cfg.get("b2.key_id") or ""
50
+ sk = cfg.get("b2.app_key") or ""
51
+ elif storage_slug == "gcs":
52
+ endpoint = "https://storage.googleapis.com"
53
+ ak = cfg.get("gcs.hmac_access") or ""
54
+ sk = cfg.get("gcs.hmac_secret") or ""
55
+ elif storage_slug == "s3":
56
+ endpoint = ""
57
+ ak = cfg.get("s3.access_key") or ""
58
+ sk = cfg.get("s3.secret_key") or ""
59
+ else:
60
+ raise ValueError(f"Unknown storage slug: {storage_slug}")
61
+
62
+ parts = [
63
+ f"AWS_ACCESS_KEY_ID='{ak}'",
64
+ f"AWS_SECRET_ACCESS_KEY='{sk}'",
65
+ ]
66
+ if endpoint:
67
+ parts.append(f"S3_ENDPOINT_URL='{endpoint}'")
68
+ return " ".join(parts)
69
+
70
+
71
def install_s5cmd(session: RemoteSession) -> None:
    """Make sure the s5cmd binary exists on the remote host.

    If ``s5cmd`` is already on the remote PATH nothing is downloaded;
    otherwise the pinned release tarball is fetched and the binary is
    unpacked into ``/usr/local/bin``.
    """
    already_there = "command -v s5cmd >/dev/null 2>&1 && echo 's5cmd already installed'"
    download = (
        f"(curl -sL '{S5CMD_URL}' | tar xz -C /usr/local/bin s5cmd "
        "&& chmod +x /usr/local/bin/s5cmd && s5cmd version)"
    )
    _step(session, "Installing s5cmd", f"{already_there} || {download}")
80
+
81
+
82
def _install_inotify(session: RemoteSession) -> None:
    """Ensure ``inotifywait`` is available on the remote host.

    Skips the install when the binary is already on the PATH; otherwise
    installs the ``inotify-tools`` package through apt-get.
    """
    already_there = (
        "command -v inotifywait >/dev/null 2>&1 && echo 'inotify-tools already installed'"
    )
    apt_install = (
        "(apt-get update -qq && apt-get install -y -qq inotify-tools && echo 'installed')"
    )
    _step(session, "Installing inotify-tools", f"{already_there} || {apt_install}")
90
+
91
+
92
def configure_storage(
    session: RemoteSession, storage_slug: str, bucket: str = "",
) -> None:
    """Prepare the remote host for S3-compatible transfers.

    Installs s5cmd (required), tries to install inotify-tools (optional —
    a failure only prints a warning), then lists the bucket, or the
    account root when *bucket* is empty, as a connection check.
    """
    install_s5cmd(session)

    try:
        _install_inotify(session)
    except RuntimeError:
        # The watcher is optional, so this failure is reported but not raised.
        console.print(" [yellow]⚠ inotify-tools install failed — push will use find[/yellow]")

    credentials = _s3_env(storage_slug)
    listing_target = "" if not bucket else f"s3://{bucket}/"
    verify_cmd = f"{credentials} s5cmd ls {listing_target} 2>&1 | head -5 || true"
    _step(session, f"Verifying {storage_slug} connection", verify_cmd)
108
+
109
+
110
+ _WS_MARKER_NAMES = (
111
+ ".swm_last_push",
112
+ ".swm_changes.log",
113
+ ".swm_workspace.tar.gz",
114
+ ".swm_autosync.log",
115
+ )
116
+
117
+
118
+ def _ensure_workspace_empty_on_pod(session: RemoteSession) -> None:
119
+ """Raise if /workspace/ on the pod contains any non-marker files.
120
+
121
+ A blind ``touch PUSH_STAMP`` on a non-empty workspace would silently
122
+ declare those files synced even though they were never uploaded.
123
+ Refuse, and tell the user to push or clear before attaching.
124
+ """
125
+ excludes = " ".join(f"-not -name '{n}'" for n in _WS_MARKER_NAMES)
126
+ cmd = (
127
+ f"find /workspace -mindepth 1 -maxdepth 1 {excludes} 2>/dev/null "
128
+ "| head -5"
129
+ )
130
+ _, out, _ = session.exec(cmd, stream=False)
131
+ leftover = [line for line in out.splitlines() if line.strip()]
132
+ if leftover:
133
+ sample = ", ".join(p.rsplit("/", 1)[-1] for p in leftover[:3])
134
+ more = "" if len(leftover) <= 3 else f" (+ more)"
135
+ raise RuntimeError(
136
+ f"/workspace/ on the pod is not empty (e.g. {sample}{more}). "
137
+ "Refusing to mark this as a fresh workspace because those "
138
+ "files would not be uploaded. Either: (a) upload them first "
139
+ "with `swm sync push <pod> -b <provider:bucket> -d <name> "
140
+ "--force`, then re-run setup; or (b) clear /workspace/ on "
141
+ "the pod and re-run; or (c) re-run with an existing "
142
+ "workspace name to pull from storage."
143
+ )
144
+
145
+
146
def bootstrap_workspace_on_pod(
    session: RemoteSession,
    storage_slug: str,
    bucket: str,
    workspace: str,
    *,
    qualified_id: str,
    is_new: bool,
    extra_excludes: list[str] | None = None,
    autosync_interval: int = 60,
    console_obj: Console | None = None,
) -> list[tuple[str, str]]:
    """Configure storage, pull (or watcher-init), and start auto-sync on a pod.

    The three stages run in order and each depends on the previous one:
    when a stage fails, the later stages are not attempted and are
    reported as failed too.

    Args:
        session: Remote session for the pod being set up.
        storage_slug: Storage backend identifier passed to the sync helpers.
        bucket: Bucket used for the connection check, pull and auto-sync.
        workspace: Workspace name inside the bucket.
        qualified_id: Pod reference; only used to build the recovery
            commands returned to the caller.
        is_new: When true, no pull is done; the pod's /workspace/ must be
            empty, the change log and push stamp are reset, and the
            watcher is started. When false, the workspace is pulled.
        extra_excludes: Forwarded to ``workspace_pull``.
        autosync_interval: Forwarded to ``start_autosync`` as ``interval``.
        console_obj: Console to print to; defaults to the module console.

    Returns a list of ``(label, recovery_command)`` tuples for any sub-step
    that failed. An empty list means full success.
    """
    from swm.sync import start_watcher
    from swm.sync.autosync import AutosyncUnsafeError, start_autosync
    from swm.sync.paths import PUSH_STAMP, WATCH_LOG
    from swm.sync.pull import workspace_pull

    _con = console_obj or console
    failed: list[tuple[str, str]] = []

    def _fail(label: str, cmd: str, exc: Exception) -> None:
        # Report the failure and remember how the user can retry that step.
        _con.print(f" [yellow]⚠ {label} failed: {exc}[/yellow]")
        failed.append((label, cmd))

    # Stage 1: tooling + connection check.
    storage_ok = False
    try:
        with _con.status(
            "Installing s5cmd & configuring storage…", spinner="dots"
        ):
            configure_storage(session, storage_slug, bucket=bucket)
        _con.print("[green]✓[/green] Storage configured")
        storage_ok = True
    except Exception as exc:
        _fail("Storage configuration", f"swm setup storage {qualified_id}", exc)

    # Stage 2: populate (or initialise) /workspace/.
    pull_ok = False
    if storage_ok:
        try:
            if is_new:
                _ensure_workspace_empty_on_pod(session)
                _con.print(" [dim]New workspace — skipping pull[/dim]")
                # Truncate the change log and set the push stamp to "now" so
                # change tracking starts from this moment.
                session.exec(
                    f": > {WATCH_LOG} 2>/dev/null; touch {PUSH_STAMP}",
                    stream=False,
                )
                if start_watcher(session, "/workspace"):
                    _con.print(" [dim]Watcher started for change tracking[/dim]")
            else:
                workspace_pull(
                    session, storage_slug, bucket, workspace,
                    extra_excludes=extra_excludes,
                )
            pull_ok = True
        except Exception as exc:
            _fail("Workspace pull", f"swm sync pull {qualified_id}", exc)
    else:
        # Storage never came up, so the pull was not attempted.
        failed.append(("Workspace pull", f"swm sync pull {qualified_id}"))

    # Stage 3: background auto-sync.
    if pull_ok:
        try:
            if start_autosync(
                session, storage_slug, bucket, workspace,
                interval=autosync_interval,
            ):
                _con.print(
                    " [dim]Auto-sync started "
                    f"(every {autosync_interval}s → "
                    f"{storage_slug}:{bucket}/{workspace})[/dim]"
                )
        # NOTE(review): the tuple looks redundant if AutosyncUnsafeError
        # derives from Exception — confirm and simplify.
        except (AutosyncUnsafeError, Exception) as exc:
            _fail("Auto-sync start", f"swm sync auto {qualified_id}", exc)
    else:
        # The workspace is not in a known state, so auto-sync is not started.
        failed.append(("Auto-sync start", f"swm sync auto {qualified_id}"))

    return failed
226
+
227
+
228
+ _LOCK_FILE = "/tmp/.swm_transfer.lock"
229
+
230
+
231
+ def _acquire_transfer_lock(session: RemoteSession, force: bool = False) -> None:
232
+ """Check for an existing transfer and acquire the lock.
233
+
234
+ If a lock exists with a live PID, raises unless *force* is True
235
+ (which kills the stale process first).
236
+ """
237
+ code, out, _ = session.exec(
238
+ f"cat {_LOCK_FILE} 2>/dev/null", stream=False,
239
+ )
240
+ old_pid = out.strip()
241
+
242
+ if old_pid:
243
+ _, alive, _ = session.exec(
244
+ f"kill -0 {old_pid} 2>/dev/null && echo alive || echo dead",
245
+ stream=False,
246
+ )
247
+ if "alive" in alive:
248
+ if not force:
249
+ raise RuntimeError(
250
+ f"A transfer is already running (PID {old_pid}). "
251
+ "Use --force to kill it and start a new one."
252
+ )
253
+ console.print(
254
+ f" [yellow]⚠ Killing existing transfer (PID {old_pid})[/yellow]"
255
+ )
256
+ session.exec(f"kill -9 {old_pid} 2>/dev/null; sleep 1", stream=False)
257
+
258
+ # Stale lock — clean up temp files left behind
259
+ console.print(" [dim]Cleaning up stale temp files…[/dim]")
260
+ _, cleanup_out, _ = session.exec(
261
+ "find /workspace -maxdepth 5 -type f -regex '.*\\.[a-z]*[0-9]\\{9,\\}$' "
262
+ "-delete -print 2>/dev/null | wc -l",
263
+ stream=False,
264
+ )
265
+ n = cleanup_out.strip()
266
+ if n and n != "0":
267
+ console.print(f" [dim]Removed {n} orphaned temp files[/dim]")
268
+
269
+ session.exec(f"echo $$ > {_LOCK_FILE}", stream=False)
270
+
271
+
272
def _s5cmd_transfer(
    session: RemoteSession,
    label: str,
    s5cmd_cmd: str,
    force: bool = False,
    total_bytes: int = 0,
    total_files: int = 0,
) -> int:
    """Run an s5cmd command on the pod with its output shown live.

    The transfer lock is acquired first. The command is then wrapped in a
    small shell script that records its own PID in the lock file and
    removes the file through an EXIT trap, and that script is run over
    SSH as a child process so its output goes straight to the terminal.

    ``total_bytes`` and ``total_files`` are accepted but not used here.

    Returns:
        The exit status of the SSH child process.
    """
    console.print(f"\n[bold cyan]▸ {label}[/bold cyan]")
    _acquire_transfer_lock(session, force=force)

    remote_script = "".join(
        (
            f"trap 'rm -f {_LOCK_FILE}' EXIT; ",
            f"echo $$ > {_LOCK_FILE}; ",
            f"{s5cmd_cmd}",
        )
    )
    argv = session._ssh_cmd() + [remote_script]
    exit_status = subprocess.call(argv)

    if exit_status == 0:
        console.print(f" [green]✓ {label} — done[/green]")
    else:
        console.print(f" [yellow]⚠ Transfer finished with warnings (exit {exit_status})[/yellow]")

    return exit_status
305
+
306
+
307
# Names that used to live in this module, mapped to the module that now
# provides them. They are resolved on first access to avoid circular imports.
_RE_EXPORTS: dict[str, str] = {
    **dict.fromkeys(
        (
            "install_framework",
            "start_framework",
            "stop_framework",
            "install_comfyui",
            "install_swarmui",
            "link_models_to_comfyui",
        ),
        "swm.bootstrap_frameworks",
    ),
    **dict.fromkeys(
        (
            "wait_for_ssh",
            "next_workspace_name",
        ),
        "swm.bootstrap_ssh",
    ),
    **dict.fromkeys(
        (
            "DiskCheck",
            "preflight_check",
            "start_watcher",
            "stop_watcher",
            "is_watcher_alive",
            "workspace_pull",
            "workspace_push",
            "tar_pull",
            "start_autosync",
            "stop_autosync",
            "is_autosync_alive",
            "autosync_status",
        ),
        "swm.sync",
    ),
}


def __getattr__(name: str):  # noqa: E302
    """Resolve a moved name from its new home module on first access.

    Raises:
        AttributeError: if *name* is not one of the re-exported names.
    """
    home = _RE_EXPORTS.get(name)
    if home is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    import importlib

    return getattr(importlib.import_module(home), name)
swm/bootstrap_frameworks.py ADDED
@@ -0,0 +1,177 @@
1
+ """Framework lifecycle management for remote GPU instances."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+
7
+ from rich.console import Console
8
+
9
+ from swm.bootstrap import _step
10
+ from swm.remote.ssh import RemoteSession
11
+
12
# Default console; the functions below fall back to it when their own
# ``console`` argument (which shadows this name) is not supplied.
console = Console()
13
+
14
+
15
def install_framework(
    session: RemoteSession,
    name: str,
    console: Console | None = None,
) -> None:
    """Run every install and post-install step of the named framework.

    Steps run in declaration order (install steps first, then post-install
    steps) and are numbered continuously. A step with a ``check`` command
    is skipped on the remote when that check succeeds. The first failing
    step aborts the installation.
    """
    from swm.frameworks import Framework, get_framework

    # The parameter shadows the module-level console, hence the globals() lookup.
    _con = console or globals()["console"]

    fw: Framework = get_framework(name)
    _con.print(f"\n[bold]Installing {fw.label}[/bold]")
    env_prefix = f"{fw.env_setup} && " if fw.env_setup else ""

    def _shell_for(step) -> str:
        # Build the remote command line for one declarative step.
        workdir = step.workdir or fw.install_dir
        action = f"{env_prefix}cd {workdir} && {step.command}"
        if not step.check:
            return action
        return f"{step.check} && echo '{step.label}: already done' || ({action})"

    total = len(fw.steps) + len(fw.post_install)
    for position, step in enumerate([*fw.steps, *fw.post_install], 1):
        _step(session, f"[{position}/{total}] {step.label}", _shell_for(step))
45
+
46
+
47
def start_framework(
    session: RemoteSession,
    name: str,
    port: int | None = None,
    console: Console | None = None,
    qualified_id: str | None = None,
) -> str | None:
    """Launch a framework in the background on the remote host.

    Does nothing (besides printing a notice) when a process matching the
    framework's ``process_pattern`` is already running. Otherwise runs the
    framework's ``pre_start`` steps, launches it in the background with
    its output in ``/tmp/<name>.log``, and polls a few times for the
    process to appear.

    Args:
        session: Remote session for the pod.
        name: Framework name understood by ``get_framework``.
        port: Optional port that replaces the framework's first declared
            port in the launch command.
        console: Console to print to; defaults to the module console.
        qualified_id: Pod reference shown in the "how to view logs" hint.

    Returns:
        Always ``None`` at present; the ``str | None`` annotation is kept
        for callers that expect an optional URL.

    Raises:
        RuntimeError: if the process is not found after launching, or if a
            ``pre_start`` step fails.
    """
    from swm.frameworks import get_framework

    # The parameter shadows the module-level console, hence the globals() lookup.
    _con = console or globals()["console"]
    fw = get_framework(name)

    if fw.process_pattern:
        with _con.status(f"Checking if {fw.label} is running…", spinner="dots"):
            _, out, _ = session.exec(
                f"pgrep -fa '{fw.process_pattern}' | grep -v grep || true",
                stream=False,
            )
        if out.strip():
            # First column of the first pgrep line is the PID.
            pid = out.strip().split("\n")[0].split()[0]
            _con.print(
                f" [yellow]{fw.label} is already running (PID {pid})[/yellow]"
            )
            return None

    env_prefix = f"{fw.env_setup} && " if fw.env_setup else ""
    if fw.pre_start:
        _con.print(f"\n[bold cyan]▸ Preparing {fw.label}[/bold cyan]")
        for step in fw.pre_start:
            workdir = step.workdir or fw.install_dir
            if step.check:
                cmd = (
                    f"{step.check} && echo '{step.label}: already done' "
                    f"|| ({env_prefix}cd {workdir} && {step.command})"
                )
            else:
                cmd = f"{env_prefix}cd {workdir} && {step.command}"
            _step(session, step.label, cmd)

    _con.print(f"\n[bold cyan]▸ Starting {fw.label}[/bold cyan]")
    launch = fw.launch_cmd
    if port and fw.ports:
        import re

        default_port = str(next(iter(fw.ports)))
        # Replace the default port only where it stands alone as a number, so
        # e.g. a default of 80 does not rewrite part of 8080.
        launch = re.sub(
            rf"(?<!\d){re.escape(default_port)}(?!\d)", str(port), launch
        )

    logfile = f"/tmp/{fw.name}.log"

    with _con.status(f"Launching {fw.label}…", spinner="dots"):
        session.exec_background(
            launch,
            logfile=logfile,
            workdir=fw.launch_workdir,
            env_setup=fw.env_setup,
        )

        # Give the process a moment, then look for it a limited number of times.
        max_checks = 5
        alive = False
        for i in range(max_checks):
            time.sleep(3 if i == 0 else 2)
            if not fw.process_pattern:
                # Nothing to match against, so the launch is assumed to be fine.
                alive = True
                break
            _, out, _ = session.exec(
                f"pgrep -fa '{fw.process_pattern}' | grep -v grep || true",
                stream=False,
            )
            if out.strip():
                alive = True
                break

    if alive:
        _con.print(f" [green]✓ {fw.label} started[/green]")
        _pod_ref = qualified_id or "<pod>"
        _con.print(f" Logs: swm run {_pod_ref} 'tail -f {logfile}'")
    else:
        _con.print(f" [red]✗ {fw.label} failed to start[/red]")
        _con.print(f" Last lines from {logfile}:")
        _, tail, _ = session.exec(f"tail -15 {logfile} 2>/dev/null", stream=False)
        if tail.strip():
            for line in tail.strip().splitlines():
                _con.print(f" [dim]{line}[/dim]")
        raise RuntimeError(f"{fw.label} exited immediately — check logs above")

    return None
132
+
133
+
134
def stop_framework(
    session: RemoteSession,
    name: str,
    console: Console | None = None,
) -> None:
    """Run the named framework's stop command on the remote host.

    Prints a notice and does nothing else when the framework does not
    define a stop command.
    """
    from swm.frameworks import get_framework

    # The parameter shadows the module-level console, hence the globals() lookup.
    _con = console or globals()["console"]
    fw = get_framework(name)

    if fw.stop_cmd:
        _con.print(f"\n[bold cyan]▸ Stopping {fw.label}[/bold cyan]")
        with _con.status(f"Stopping {fw.label}…", spinner="dots"):
            session.exec(fw.stop_cmd, stream=False)
        _con.print(f" [green]✓ {fw.label} stopped[/green]")
    else:
        _con.print(f" [yellow]{fw.label} has no stop command defined[/yellow]")
152
+
153
+
154
def install_comfyui(session: RemoteSession) -> None:
    """Install ComfyUI; kept so older call sites keep working."""
    framework_name = "comfyui"
    install_framework(session, framework_name)
157
+
158
+
159
def install_swarmui(session: RemoteSession) -> None:
    """Install SwarmUI; kept so older call sites keep working."""
    framework_name = "swarmui"
    install_framework(session, framework_name)
162
+
163
+
164
+ # ── symlinks ────────────────────────────────────────────────────────
165
+
166
+
167
def link_models_to_comfyui(session: RemoteSession) -> None:
    """Symlink /workspace/models into ComfyUI's model directory.

    Creates one sub-directory per model category under
    ``/workspace/models`` and makes the matching directory under
    ``/workspace/ComfyUI/models`` a symlink to it. A ComfyUI directory
    that is not already a symlink is removed first.
    """
    dirs = ["checkpoints", "loras", "vae", "controlnet", "clip", "upscale_models", "unet"]
    # Spell every path out instead of using {a,b,c}: brace expansion is a bash
    # feature, and a plain POSIX sh would create one literally named directory.
    cmds = ["mkdir -p " + " ".join(f"/workspace/models/{d}" for d in dirs)]
    for d in dirs:
        cmds.append(
            f"[ -L /workspace/ComfyUI/models/{d} ] || "
            f"(rm -rf /workspace/ComfyUI/models/{d} "
            f"&& ln -s /workspace/models/{d} /workspace/ComfyUI/models/{d})"
        )
    _step(session, "Symlinking models → ComfyUI", " && ".join(cmds))
swm/bootstrap_ssh.py ADDED
@@ -0,0 +1,175 @@
1
+ """SSH waiting utilities for remote GPU instances."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ import time
7
+
8
+ from rich.console import Console
9
+
10
+ from swm.providers.base import Instance, InstanceStatus
11
+
12
# Module-wide Rich console used for the status lines printed while waiting.
console = Console()
13
+
14
+
15
+ def _has_direct_ssh(inst: Instance) -> bool:
16
+ """True when the instance exposes a public IP with a mapped SSH port."""
17
+ return bool(inst.ip_address and inst.ports.get(22))
18
+
19
+
20
+ def _has_relay_ssh(inst: Instance) -> bool:
21
+ return bool(inst.ssh_host and inst.ssh_port)
22
+
23
+
24
+ def _is_relay_host(inst: Instance) -> bool:
25
+ """True when ssh_host is a relay proxy, not the instance's own IP."""
26
+ if not inst.ssh_host:
27
+ return False
28
+ return inst.ssh_host != inst.ip_address
29
+
30
+
31
def wait_for_ssh(
    provider,
    instance_id: str,
    timeout: int = 600,
    poll_interval: int = 10,
    direct_grace: int = 30,
) -> Instance:
    """Poll until the instance is running and SSH is reachable.

    Prefers a direct IP+port connection over the provider relay. When
    only a relay endpoint is found, keeps polling up to *direct_grace*
    extra seconds for a direct endpoint before falling back. Providers
    that expose SSH directly (Lambda, GCP, AWS) skip the grace window.

    Displays ``status_detail`` from the provider (e.g. Vast.ai Docker
    build progress) when available to give the user visibility into
    what the remote host is doing.

    Args:
        provider: Provider object; ``get_instance`` is used when present,
            otherwise ``list_instances``. Its ``slug`` selects the
            provider-specific SSH key from config.
        instance_id: Identifier of the instance to wait for.
        timeout: Overall budget in seconds, shared by all phases.
        poll_interval: Seconds to sleep between provider polls.
        direct_grace: Seconds to keep waiting for a direct endpoint after
            a relay endpoint was first seen.

    Returns:
        The last fetched instance. When no SSH endpoint is found at all,
        it is returned without probing.

    Raises:
        TimeoutError: if the instance does not reach a running state with
            an SSH endpoint, or SSH does not answer, within *timeout*.
    """
    from swm import config as _cfg

    start = time.time()
    last_status = ""
    last_detail = ""
    inst = None
    # When a relay-only endpoint was first observed; starts the grace window.
    relay_seen_at: float | None = None

    def _elapsed() -> str:
        return f"{int(time.time() - start)}s"

    # Phase 1: wait for the instance to be RUNNING with an SSH endpoint.
    while time.time() - start < timeout:
        try:
            if hasattr(provider, "get_instance"):
                inst = provider.get_instance(instance_id)
            else:
                instances = provider.list_instances()
                inst = next((i for i in instances if i.id == instance_id), None)

            if inst:
                # Print status and detail only when they change.
                status = inst.status.value
                if status != last_status:
                    console.print(f" Status: [bold]{status}[/bold] ({_elapsed()})")
                    last_status = status

                detail = inst.status_detail or ""
                if detail and detail != last_detail:
                    truncated = (detail[:100] + "…") if len(detail) > 100 else detail
                    console.print(f" [dim]{truncated}[/dim]")
                    last_detail = detail

                if inst.status == InstanceStatus.RUNNING:
                    if _has_direct_ssh(inst):
                        break
                    if _has_relay_ssh(inst):
                        if not _is_relay_host(inst):
                            # ssh_host is the instance itself: no need to wait.
                            break
                        if relay_seen_at is None:
                            relay_seen_at = time.time()
                        elif time.time() - relay_seen_at >= direct_grace:
                            # Grace window used up: settle for the relay.
                            break
        except Exception:
            # Best effort: any error while polling is ignored and retried.
            pass

        time.sleep(poll_interval)
    else:
        # Reached only when the loop ran out of time without a ``break``.
        raise TimeoutError(
            f"Pod not running after {timeout}s for instance {instance_id}. "
            f"Last status: {last_status}"
        )

    # Phase 2: pick the best SSH path — direct mapped port always wins.
    if _has_direct_ssh(inst):
        ssh_target = inst.ip_address
        port = inst.ports[22]
        # The direct path always logs in as root; inst.ssh_user is not consulted.
        ssh_user = "root"
        console.print(f" Direct SSH: {ssh_target}:{port}")
    elif _has_relay_ssh(inst):
        ssh_target = inst.ssh_host
        port = inst.ssh_port
        ssh_user = inst.ssh_user or "root"
        if _is_relay_host(inst):
            console.print(f" Relay SSH: {ssh_user}@{ssh_target}:{port}")
        else:
            console.print(f" SSH: {ssh_user}@{ssh_target}:{port}")
    else:
        console.print(" [yellow]No SSH endpoint found — returning anyway[/yellow]")
        return inst

    # Phase 3: probe until SSH actually responds.
    console.print(f" Probing SSH… ({_elapsed()})")
    # Host-key checking is disabled and no known_hosts file is written.
    probe = [
        "ssh",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=5",
        "-o", "LogLevel=ERROR",
        "-p", str(port),
    ]
    # A provider-specific key takes precedence over the global key path.
    key = _cfg.get(f"{provider.slug}.ssh_key") or _cfg.get("ssh.key_path")
    if key:
        probe.extend(["-i", str(key)])
    probe.extend([f"{ssh_user}@{ssh_target}", "echo __SWM_OK__"])

    while time.time() - start < timeout:
        try:
            result = subprocess.run(probe, capture_output=True, timeout=15)
            # Success is the marker in stdout; the exit status is not checked.
            if b"__SWM_OK__" in result.stdout:
                console.print(f" [green]✓ SSH ready[/green] ({_elapsed()})")
                return inst
        except (subprocess.TimeoutExpired, OSError):
            pass
        time.sleep(5)

    raise TimeoutError(
        f"SSH not reachable after {timeout}s for instance {instance_id}"
    )
147
+
148
+
149
def next_workspace_name(storage_provider, bucket: str) -> str:
    """Find the next available workspace name in a bucket.

    Existing: workspace/, workspace2/ → returns "workspace3".
    Empty bucket → returns "workspace".

    A bare ``workspace`` counts as number 1. Only names of the form
    ``workspace<digits>`` are counted; names such as ``workspace-1`` or
    ``workspace_old`` are ignored.

    Args:
        storage_provider: Object whose ``ls(bucket)`` yields entries with
            a ``key`` attribute.
        bucket: Bucket to inspect.

    Returns:
        ``"workspace"`` when nothing is taken (or the listing fails),
        otherwise ``"workspace<N>"`` with N one above the highest number.
    """
    try:
        objects = storage_provider.ls(bucket)
    except Exception:
        # Best effort: if the bucket cannot be listed, use the default name.
        return "workspace"

    existing: set[int] = set()
    for obj in objects:
        name = obj.key.rstrip("/")
        if name == "workspace":
            existing.add(1)
        elif name.startswith("workspace"):
            suffix = name.removeprefix("workspace")
            # int() would also accept "-1", "+7", " 3" or "1_0"; require plain
            # ASCII digits so such names cannot skew the numbering.
            if suffix.isascii() and suffix.isdigit():
                existing.add(int(suffix))

    if not existing:
        return "workspace"

    return f"workspace{max(existing) + 1}"