icefold-runner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ """icefold-runner — a self-hosted execution runner for IceFold nodes.
2
+
3
+ Like a GitHub self-hosted CI runner: you start it on your own machine, it
4
+ reverse-connects to an IceFold server, receives node-execution jobs, and runs
5
+ them locally — pulling input media over HTTP and pushing products back.
6
+
7
+ The runner is a *generic execution framework*: it ships no node implementations
8
+ of its own. The server renders each node into a self-contained ``.py`` bundle;
9
+ the runner fetches the bundle on demand, pre-flights its declared dependencies,
10
+ and runs it. So upgrading or adding nodes on the server never requires updating
11
+ the runner.
12
+ """
13
+
14
+ __version__ = "0.1.0"
15
+
16
+ __all__ = ["__version__", "WorkerClient", "NodeRunner"]
17
+
18
+
19
+ def __getattr__(name): # lazy so importing the package is cheap
20
+ if name == "WorkerClient":
21
+ from icefold_runner.client import WorkerClient
22
+ return WorkerClient
23
+ if name == "NodeRunner":
24
+ from icefold_runner.runner import NodeRunner
25
+ return NodeRunner
26
+ raise AttributeError(name)
@@ -0,0 +1,75 @@
1
+ """CLI entrypoint: icefold-runner --token <token>
2
+
3
+ Run IceFold nodes on this machine. The runner reverse-connects to IceFold and
4
+ serves the account the token belongs to — the token (generated in the IceFold
5
+ app, Nodes ▸ Connect a runner) encodes + signs your user id, so there's no
6
+ server URL or user id to pass.
7
+
8
+ Bootstrap order matters: we point ``ICEFOLD_PROJECT_ROOT`` at the runner's
9
+ ``--work-dir`` *before* importing ``icefold``, so the SDK's ``DATA_DIR``
10
+ (hence where ffmpeg writes products) resolves under this runner's own dir.
11
+ ``icefold`` itself is an installed dependency (``pip install icefold-sdk``).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import asyncio
18
+ import os
19
+ import socket
20
+
21
+ # Built-in server. Self-hosters / dev can override via the ICEFOLD_RUNNER_SERVER
22
+ # env var (intentionally not a CLI flag — the normal user never sets it).
23
+ DEFAULT_SERVER = "wss://api.icefold.com"
24
+
25
+
26
+ def _parse_args(argv):
27
+ p = argparse.ArgumentParser(
28
+ prog="icefold-runner",
29
+ description="Run IceFold nodes on this machine. "
30
+ "Get a token from the IceFold app (Nodes ▸ Connect a runner).",
31
+ )
32
+ p.add_argument("--token", default=os.environ.get("ICEFOLD_RUNNER_TOKEN", ""),
33
+ help="Runner token from the IceFold app. env: ICEFOLD_RUNNER_TOKEN")
34
+ p.add_argument("--runner-id", default=os.environ.get("ICEFOLD_RUNNER_ID", "") or socket.gethostname(),
35
+ help="Stable id for this runner (default: hostname). env: ICEFOLD_RUNNER_ID")
36
+ p.add_argument("--work-dir",
37
+ default=os.environ.get("ICEFOLD_RUNNER_DIR", "") or os.path.abspath("./icefold-runner-data"),
38
+ help="Scratch dir for staged inputs + ffmpeg products. env: ICEFOLD_RUNNER_DIR")
39
+ args = p.parse_args(argv)
40
+
41
+ if not args.token:
42
+ p.error("missing required argument: --token "
43
+ "(generate one in the IceFold app: Nodes ▸ Connect a runner)")
44
+ return args
45
+
46
+
47
def main(argv=None) -> int:
    """Run the CLI: parse arguments, prepare the work dir, serve until stopped.

    Args:
        argv: Argument list; ``None`` means use the process arguments.

    Returns:
        ``0`` after a Ctrl-C shutdown, ``1`` when the client loop ended on its
        own (it only does so after the server rejected the credentials).
    """
    args = _parse_args(argv)

    # Built-in server; ICEFOLD_RUNNER_SERVER overrides for self-host / dev.
    server = os.environ.get("ICEFOLD_RUNNER_SERVER", "").strip() or DEFAULT_SERVER

    work_dir = os.path.abspath(args.work_dir)
    os.makedirs(os.path.join(work_dir, "data", "download"), exist_ok=True)
    os.makedirs(os.path.join(work_dir, "data", "upload"), exist_ok=True)

    # Must precede any icefold import so DATA_DIR resolves under work_dir.
    os.environ["ICEFOLD_PROJECT_ROOT"] = work_dir

    # Deferred on purpose: this import pulls in icefold, which must see the
    # environment variable set just above.
    from icefold_runner.client import WorkerClient

    client = WorkerClient(
        server=server,
        token=args.token,
        worker_id=args.runner_id,
    )
    try:
        asyncio.run(client.run_forever())
    except KeyboardInterrupt:
        print("\nicefold-runner stopped")
        return 0
    # run_forever() reconnects indefinitely and returns only once
    # authentication has been rejected, so reaching this point is a failure
    # that the exit status should reflect.
    return 1
72
+
73
+
74
+ if __name__ == "__main__":
75
+ raise SystemExit(main())
@@ -0,0 +1,306 @@
1
+ """Reverse WebSocket client for icefold-runner.
2
+
3
+ Dials out to ``<server>/v1/ws/worker``, authenticates with the shared token
4
+ (also the XOR keystream), then serves leaf ``node_exec`` jobs concurrently.
5
+ Reconnects with jittered exponential backoff; an auth rejection is fatal.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import json
12
+ import random
13
+ from typing import Dict, Optional
14
+ from urllib.parse import urlencode, urlsplit, urlunsplit
15
+
16
+ import websockets
17
+
18
+ import uuid
19
+
20
+ from icefold.crypto import xor_bytes
21
+ from icefold.exceptions import MissingDependencyError
22
+ from icefold.wire import (
23
+ SRV_CANCEL,
24
+ SRV_NODE_CALLBACK_RESULT,
25
+ SRV_NODE_EXEC,
26
+ SRV_PING,
27
+ WKR_HELLO,
28
+ WKR_NODE_DONE,
29
+ WKR_PONG,
30
+ WKR_PING,
31
+ make_missing_dep,
32
+ make_node_callback,
33
+ )
34
+ from icefold_runner import __version__ as VERSION
35
+ from icefold_runner.runner import NodeRunner
36
+ from icefold import log_error, log_info, log_warning
37
+
38
+ _BACKOFF_MIN = 1.0
39
+ _BACKOFF_MAX = 30.0
40
+ _KEEPALIVE_S = 20.0
41
+ _MAX_FRAME = 8 * 1024 * 1024
42
+
43
+
44
class AuthError(Exception):
    """Raised when the server refuses the runner's credentials.

    This is a fatal condition: reconnecting with the same token cannot succeed.
    """
46
+
47
+
48
def _log(level: str, msg: str, **kw) -> None:
    """Forward *msg* to the icefold log function selected by *level*.

    ``"warn"`` and ``"error"`` pick the warning / error loggers; every other
    level falls back to the info logger. ``"worker"`` is always passed as the
    first argument and ``kw`` is forwarded unchanged.
    """
    if level == "warn":
        emit = log_warning
    elif level == "error":
        emit = log_error
    else:
        emit = log_info
    emit("worker", msg, **kw)
50
+
51
+
52
+ class WorkerClient:
53
def __init__(
    self,
    *,
    server: str,
    token: str,
    worker_id: str,
    http_base: Optional[str] = None,
) -> None:
    """Store connection settings and build the shared node runner.

    Args:
        server: Server URL to dial.
        token: Runner token; also used as the XOR key for frames.
        worker_id: Identifier this runner announces to the server.
        http_base: HTTP base URL for file transfers; derived from *server*
            when omitted or empty.
    """
    self.server = server
    self.token = token
    self.worker_id = worker_id
    # An empty token leaves the key empty, which disables the XOR step.
    self.xor_key = b"" if not token else token.encode("utf-8")
    base_url = http_base if http_base else self._derive_http_base(server)
    self.http_base = base_url.rstrip("/")
    self.runner = NodeRunner(self.http_base, token, _log)
    # Running node tasks keyed by call_id, so a cancel frame can find them.
    self._tasks: Dict[str, asyncio.Task] = {}
    # Callback bookkeeping: bundle code reaches back into the server through
    # outbound ``node_callback`` frames keyed by ``req_id``; each entry is the
    # future that the matching ``node_callback_result`` frame resolves.
    self._pending_callbacks: Dict[str, "asyncio.Future[dict]"] = {}
74
+
75
+ # ── URL helpers ──
76
+
77
+ @staticmethod
78
+ def _derive_http_base(server: str) -> str:
79
+ parts = urlsplit(server)
80
+ scheme = {"ws": "http", "wss": "https"}.get(parts.scheme, parts.scheme or "http")
81
+ return urlunsplit((scheme, parts.netloc, "", "", ""))
82
+
83
+ def _ws_url(self) -> str:
84
+ parts = urlsplit(self.server)
85
+ scheme = {"http": "ws", "https": "wss"}.get(parts.scheme, parts.scheme or "ws")
86
+ path = parts.path.rstrip("/")
87
+ if not path.endswith("/v1/ws/worker"):
88
+ path = path + "/v1/ws/worker"
89
+ # No user_id: the token encodes + signs it; the server derives the
90
+ # identity from the token so this runner can't claim another account.
91
+ query = urlencode({
92
+ "token": self.token,
93
+ "worker_id": self.worker_id,
94
+ })
95
+ return urlunsplit((scheme, parts.netloc, path, query, ""))
96
+
97
+ # ── main loop ──
98
+
99
async def run_forever(self) -> None:
    """Keep the runner connected, reconnecting with jittered backoff.

    Returns only when the server rejects the credentials (``AuthError``);
    every other failure is logged and retried.
    """
    _log("info", f"icefold-runner {VERSION} starting",
         server=self.server, worker_id=self.worker_id)
    clock = asyncio.get_running_loop().time
    backoff = _BACKOFF_MIN
    while True:
        started = clock()
        try:
            await self._run_once()
            backoff = _BACKOFF_MIN
            _log("info", "connection closed; will reconnect")
        except AuthError as e:
            _log("error", f"authentication failed; exiting: {e}")
            return
        except Exception as e:  # noqa: BLE001
            # An attempt that lasted at least the maximum delay was a working
            # session that dropped later, not a failed dial. Start the
            # backoff over so delays from unrelated earlier outages do not
            # accumulate across healthy sessions.
            if clock() - started >= _BACKOFF_MAX:
                backoff = _BACKOFF_MIN
            _log("error", f"connection failed: {e}", next_retry=round(backoff, 1))
        # Up to 25% jitter on top of the current delay.
        delay = backoff + random.uniform(0, backoff / 4)
        await asyncio.sleep(delay)
        backoff = min(backoff * 2, _BACKOFF_MAX)
116
+
117
async def _run_once(self) -> None:
    """Open one connection and serve frames until it closes.

    Raises:
        AuthError: if the connection attempt was rejected with 401/403.
        Exception: any other connect or receive failure, left to the caller
            to retry.
    """
    url = self._ws_url()
    _log("info", "dialing", url=self._redact(url))
    try:
        ws = await websockets.connect(
            url, max_size=_MAX_FRAME, open_timeout=15,
            ping_interval=_KEEPALIVE_S, ping_timeout=_KEEPALIVE_S,
        )
    except Exception as e:  # noqa: BLE001
        if self._is_auth_rejection(e):
            # Chain explicitly so the original handshake error stays
            # attached as the cause.
            raise AuthError(str(e)) from e
        raise
    async with ws:
        await self._send(ws, {
            "type": WKR_HELLO,
            "worker_id": self.worker_id,
            "version": VERSION,
            "capabilities": ["builtin"],
        })
        _log("info", "connected", worker_id=self.worker_id)
        keepalive = asyncio.create_task(self._keepalive(ws))
        try:
            async for raw in ws:
                msg = self._decode(raw)
                if msg is not None:
                    await self._handle(ws, msg)
        finally:
            # The connection is gone: stop the pinger and cancel every job
            # that was running on it.
            keepalive.cancel()
            for t in list(self._tasks.values()):
                t.cancel()
            self._tasks.clear()
148
+
149
async def _keepalive(self, ws) -> None:
    """Send an application-level ping frame every ``_KEEPALIVE_S`` seconds.

    Runs until cancelled. A send failure ends the loop and is logged rather
    than raised.
    """
    try:
        while True:
            await asyncio.sleep(_KEEPALIVE_S)
            await self._send(ws, {"type": WKR_PING})
    except asyncio.CancelledError:
        # Normal shutdown path: the connection loop cancels this task.
        pass
    except Exception as e:  # noqa: BLE001
        # Best effort, but leave a trace so a dead pinger is diagnosable.
        _log("warn", f"keepalive stopped: {e}")
158
+
159
+ # ── frame codec ──
160
+
161
+ def _decode(self, raw) -> Optional[dict]:
162
+ try:
163
+ if isinstance(raw, (bytes, bytearray)):
164
+ data = xor_bytes(bytes(raw), self.xor_key) if self.xor_key else bytes(raw)
165
+ return json.loads(data.decode("utf-8"))
166
+ return json.loads(raw)
167
+ except Exception as e: # noqa: BLE001
168
+ _log("warn", f"bad frame from server: {e}")
169
+ return None
170
+
171
+ async def _send(self, ws, msg: dict) -> None:
172
+ payload = json.dumps(msg).encode("utf-8")
173
+ if self.xor_key:
174
+ await ws.send(xor_bytes(payload, self.xor_key))
175
+ else:
176
+ await ws.send(payload.decode("utf-8"))
177
+
178
+ # ── dispatch ──
179
+
180
async def _handle(self, ws, msg: dict) -> None:
    """Dispatch one decoded server frame by its ``type`` field.

    Frames with an unrecognised type are ignored.
    """
    kind = msg.get("type", "")

    if kind == SRV_NODE_EXEC:
        call_id = msg.get("call_id", "")
        if call_id:
            # Each job runs as its own task so the receive loop stays free.
            self._tasks[call_id] = asyncio.create_task(self._run_node(ws, msg))
        return

    if kind == SRV_CANCEL:
        running = self._tasks.get(msg.get("call_id", ""))
        if running is not None:
            running.cancel()
        return

    if kind == SRV_NODE_CALLBACK_RESULT:
        # The server is answering a callback a bundle issued. Find the
        # waiting future by req_id and hand it the reply; the bundle's
        # coroutine then resumes inside its own node task.
        waiter = self._pending_callbacks.pop(msg.get("req_id", ""), None)
        if waiter is not None and not waiter.done():
            waiter.set_result(msg)
        return

    if kind == SRV_PING:
        await self._send(ws, {"type": WKR_PONG})
202
+
203
async def _run_node(self, ws, msg: dict) -> None:
    """Run one ``node_exec`` job and report its outcome to the server.

    Success and ordinary failures are reported with a ``node_done`` frame,
    a missing dependency with the typed ``missing_dep`` reply, and a timeout
    with ``killed=True``. Cancellation is re-raised without a reply.
    """
    call_id = msg["call_id"]
    node_type = msg.get("node_type", "")
    # Callback request ids issued on behalf of THIS job. Jobs run
    # concurrently and share ``_pending_callbacks``, so cleanup must touch
    # only the entries this job created.
    owned_req_ids: set = set()
    try:
        _log("info", f"running node {node_type}", call_id=call_id)
        send_callback = self._make_send_callback(ws, call_id, owned_req_ids)
        output = await self.runner.run(msg, send_callback=send_callback)
        await self._send(ws, {
            "type": WKR_NODE_DONE, "call_id": call_id,
            "output": output, "err": "", "killed": False,
        })
        _log("info", f"node done {node_type}", call_id=call_id)
    except MissingDependencyError as dep:
        # Bundle pre-flight detected a missing native/python dep. Send the
        # typed reply (not node_done) so the server can surface a
        # user-actionable "install X via …" notification.
        _log(
            "warn",
            f"node {node_type} skipped: missing deps "
            f"binaries={list(dep.missing_binaries)} python={list(dep.missing_python)}",
            call_id=call_id,
        )
        await self._safe_send(ws, make_missing_dep(
            call_id=call_id,
            missing_binaries=dep.missing_binaries,
            missing_python=dep.missing_python,
            install_hint=dep.install_hint,
        ))
    except asyncio.TimeoutError:
        await self._safe_send(ws, {
            "type": WKR_NODE_DONE, "call_id": call_id,
            "output": None, "err": "remote node timed out", "killed": True,
        })
    except asyncio.CancelledError:
        # Server asked us to cancel (or we're tearing down). The server's
        # awaiting future is already cancelled, so no node_done is needed.
        raise
    except Exception as e:  # noqa: BLE001
        _log("error", f"node failed {node_type}: {e}", call_id=call_id)
        await self._safe_send(ws, {
            "type": WKR_NODE_DONE, "call_id": call_id,
            "output": None, "err": str(e), "killed": False,
        })
    finally:
        # Drop our registration only if it is still ours: a newer job with
        # the same call_id may have replaced the entry in the meantime.
        if self._tasks.get(call_id) is asyncio.current_task():
            self._tasks.pop(call_id, None)
        # Fail this job's still-pending callbacks (e.g. the bundle was
        # cancelled mid-LLM-call) so its awaiter doesn't hang on shutdown.
        for req_id in list(owned_req_ids):
            fut = self._pending_callbacks.pop(req_id, None)
            if fut is not None and not fut.done():
                fut.set_result({
                    "type": SRV_NODE_CALLBACK_RESULT,
                    "call_id": call_id, "req_id": req_id,
                    "ok": False, "result": None,
                    "error": "node ended before callback resolved",
                })
        owned_req_ids.clear()


def _make_send_callback(self, ws, call_id: str, owned_req_ids=None):
    """Return the bundle-host callback sender bound to one node_exec.

    Bundles only ever see this closure (never the raw WS). It allocates
    a ``req_id``, sends a ``node_callback`` frame, and awaits the
    server's matching ``node_callback_result``. Result frames where
    ``ok`` is falsy are translated into ``RuntimeError`` so the bundle can
    catch them like any synchronous failure.

    Args:
        ws: Connection the callback frames are sent on.
        call_id: Id of the job the callbacks belong to.
        owned_req_ids: Optional set in which the closure records the
            ``req_id`` of every in-flight callback, letting the caller clean
            up exactly this job's requests. A private set is used when
            omitted.
    """
    if owned_req_ids is None:
        owned_req_ids = set()

    async def _send(kind: str, payload: dict):
        req_id = uuid.uuid4().hex
        fut: "asyncio.Future[dict]" = asyncio.get_running_loop().create_future()
        self._pending_callbacks[req_id] = fut
        owned_req_ids.add(req_id)
        try:
            await self._send(ws, make_node_callback(
                call_id=call_id, req_id=req_id, kind=kind, payload=payload,
            ))
            reply = await fut
        finally:
            # Whatever happened (reply, send failure, cancellation), this
            # request is no longer in flight.
            self._pending_callbacks.pop(req_id, None)
            owned_req_ids.discard(req_id)
        if not reply.get("ok"):
            raise RuntimeError(reply.get("error") or f"callback {kind!r} failed")
        return reply.get("result")

    return _send
288
+
289
+ async def _safe_send(self, ws, msg: dict) -> None:
290
+ try:
291
+ await self._send(ws, msg)
292
+ except Exception: # noqa: BLE001
293
+ pass
294
+
295
+ # ── error classification ──
296
+
297
+ @staticmethod
298
+ def _is_auth_rejection(e: Exception) -> bool:
299
+ resp = getattr(e, "response", None)
300
+ status = getattr(resp, "status_code", None) or getattr(e, "status_code", None)
301
+ return status in (401, 403)
302
+
303
+ @staticmethod
304
+ def _redact(url: str) -> str:
305
+ parts = urlsplit(url)
306
+ return urlunsplit((parts.scheme, parts.netloc, parts.path, "token=<redacted>", ""))
@@ -0,0 +1,282 @@
1
+ """Run one leaf node-exec job on this machine.
2
+
3
+ Each ``node_exec`` frame is a single, already-sliced variant (the server did
4
+ all variant planning) and carries a **bundle hash** — the server has already
5
+ rendered the node into a self-contained ``.py``. The runner ships no node
6
+ implementations of its own and never compiles user source.
7
+
8
+ Per call:
9
+
10
+ 1. fetch (cache-aware) ``/v1/bundles/<hash>`` into ``runner_work_dir/bundles/``
11
+ 2. exec the bundle in a fresh module namespace — it self-declares
12
+ ``__icefold_python_deps__`` / ``__icefold_binary_deps__`` plus the
13
+ ``async def __icefold_run__(inputs, ctx_dict) -> Any`` entry point
14
+ 3. pre-flight the declared deps (``shutil.which`` + ``import_module``);
15
+ surface ``MissingDependencyError`` so the client wraps a structured
16
+ ``missing_dep`` reply instead of ``node_done``
17
+ 4. download ``/upload/`` & ``/download/`` input refs to a staging dir and
18
+ rewrite them to local paths
19
+ 5. await ``__icefold_run__(local_inputs, ctx_dict)``
20
+ 6. upload product files back to the server and rewrite the output to the
21
+ server-canonical paths it hands back
22
+
23
+ Output that isn't a file (text, numbers, None) passes through untouched.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import asyncio
29
+ import hashlib
30
+ import importlib.util
31
+ import os
32
+ import shutil
33
+ import sys
34
+ from types import ModuleType
35
+ from typing import Any, Dict, List, Tuple
36
+
37
+ import httpx
38
+
39
+ # Node SDK (``icefold``, an installed dependency: ``pip install icefold-sdk``). DATA_DIR
40
+ # reflects this runner's --work-dir because the CLI sets ICEFOLD_PROJECT_ROOT
41
+ # before these imports, so executors write products under our download dir.
42
+ from icefold.config import DATA_DIR
43
+ from icefold.exceptions import MissingDependencyError
44
+ from icefold.wire import OUTPUT_UPLOAD_PATH, binary_install_hint
45
+
46
+ _STAGED_DIR = os.path.join(DATA_DIR, "staged")
47
+ _BUNDLES_DIR = os.path.join(DATA_DIR, "bundles")
48
+
49
+
50
+ def _is_server_ref(value: Any) -> bool:
51
+ return isinstance(value, str) and (
52
+ value.startswith("/upload/") or value.startswith("/download/")
53
+ )
54
+
55
+
56
+ def _ext_from_ref(ref: str) -> str:
57
+ ext = os.path.splitext(ref.split("?", 1)[0])[1].lower()
58
+ if not ext or len(ext) > 12 or not ext[1:].isalnum():
59
+ return ""
60
+ return ext
61
+
62
+
63
+ class NodeRunner:
64
+ """Stateless per-worker runner; one instance shared across jobs."""
65
+
66
def __init__(self, http_base: str, token: str, log) -> None:
    """Record connection settings and make sure the scratch dirs exist.

    Args:
        http_base: Base URL for bundle, input and output transfers; a
            trailing slash is stripped.
        token: Runner token, sent as ``X-Worker-Token`` where required.
        log: Callable ``log(level, msg, **kw)`` used for progress messages.
    """
    self._http_base = http_base.rstrip("/")
    self._token = token
    self._log = log
    # Bundle modules keyed by bundle hash. A bundle is a self-contained
    # .py; once exec'd the module is kept for the lifetime of this runner
    # process.
    self._bundles: Dict[str, ModuleType] = {}
    for directory in (_STAGED_DIR, _BUNDLES_DIR):
        os.makedirs(directory, exist_ok=True)
76
+
77
async def run(self, msg: dict, *, send_callback=None) -> Any:
    """Execute one ``node_exec`` frame against a server-rendered bundle.

    Inputs are staged locally, the bundle is run under the frame's timeout,
    and product files are uploaded. Staged input files are deleted again
    once the job ends, whatever its outcome.

    Args:
        msg: The decoded ``node_exec`` frame.
        send_callback: Optional ``send_callback(kind, payload) -> awaitable``
            seam the bundle uses to reach back into the server (``progress``,
            ``llm.*``). ``None`` means no host is wired (e.g. self-check).

    Returns:
        The bundle output with local product paths rewritten to the paths
        the server handed back.

    Raises:
        RuntimeError: if the frame carries no ``bundle_hash``.
        asyncio.TimeoutError: if the bundle exceeds its timeout.
    """
    bundle_hash = msg.get("bundle_hash") or ""
    if not bundle_hash:
        node_type = msg.get("node_type") or msg.get("node_id", "")
        raise RuntimeError(
            f"node_exec for {node_type!r} arrived without bundle_hash; "
            "the server must render a bundle via codegen before dispatch"
        )
    # An explicit null behaves like an absent key instead of raising
    # TypeError on the division below.
    timeout_ms = msg.get("timeout_ms")
    if timeout_ms is None:
        timeout_ms = 1800_000
    timeout = max(1.0, timeout_ms / 1000.0)

    local_inputs: Any = None
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(600.0)) as http:
            local_inputs = await self._download_inputs(http, msg.get("inputs") or {})
            output = await asyncio.wait_for(
                self._run_bundle(http, bundle_hash, msg, local_inputs, send_callback),
                timeout=timeout,
            )
            return await self._upload_outputs(http, output, msg.get("session_id", ""))
    finally:
        # Staged inputs get a fresh random name per job, so they are never
        # reused. Without this a long-lived runner fills its disk.
        self._discard_staged(local_inputs)

def _discard_staged(self, value: Any) -> None:
    """Delete every file directly inside the staging dir referenced by *value*."""
    if isinstance(value, str):
        if value and os.path.dirname(os.path.abspath(value)) == os.path.abspath(_STAGED_DIR):
            try:
                os.unlink(value)
            except OSError:
                # Already gone or not removable; cleanup is best effort.
                pass
    elif isinstance(value, dict):
        for item in value.values():
            self._discard_staged(item)
    elif isinstance(value, (list, tuple)):
        for item in value:
            self._discard_staged(item)
105
+
106
+ # ── bundle path ──
107
+
108
async def _run_bundle(
    self,
    http: httpx.AsyncClient,
    bundle_hash: str,
    msg: dict,
    local_inputs: Any,
    send_callback,
) -> Any:
    """Load the bundle (from cache or the server), check deps, and run it.

    Raises:
        MissingDependencyError: from the dependency pre-flight.
        RuntimeError: if the bundle defines no ``__icefold_run__``.
    """
    bundle = self._bundles.get(bundle_hash)
    if bundle is None:
        source_path = await self._fetch_bundle(http, bundle_hash, msg.get("bundle_url") or "")
        bundle = self._import_bundle(bundle_hash, source_path)
        self._bundles[bundle_hash] = bundle

    # Pre-flight the declared deps (binaries first, then python). A typed
    # exception lets the client answer with ``missing_dep`` instead of
    # ``node_done``.
    declared_binaries = tuple(getattr(bundle, "__icefold_binary_deps__", ()) or ())
    declared_python = tuple(getattr(bundle, "__icefold_python_deps__", ()) or ())
    self._preflight_deps(declared_binaries, declared_python)

    ctx_dict = {
        "node_id": msg.get("node_id", msg.get("node_type", "")),
        "node_config": msg.get("node_config") or {},
        "user_id": msg.get("user_id", ""),
        "session_id": msg.get("session_id") or None,
        "space_name": msg.get("space_name") or None,
        "variant": msg.get("variant") or {},
        "raw_inputs": local_inputs if isinstance(local_inputs, dict) else {},
        "provider": msg.get("provider") or {},
        "model": msg.get("model", ""),
    }
    # Callback seam: the bundle's context reaches the server through this
    # callable, which the runner client wires to correlate replies by
    # ``req_id``.
    if send_callback is not None:
        ctx_dict["_send_callback"] = send_callback

    entry = getattr(bundle, "__icefold_run__", None)
    if entry is not None:
        return await entry(local_inputs if isinstance(local_inputs, dict) else {}, ctx_dict)
    raise RuntimeError(
        f"bundle {bundle_hash[:8]} is missing __icefold_run__ entry point"
    )
155
+
156
async def _fetch_bundle(
    self, http: httpx.AsyncClient, bundle_hash: str, bundle_url: str,
) -> str:
    """Cache-aware, hash-verified bundle fetch. Returns the on-disk path.

    The bundle is streamed to a private temporary file and moved to its
    cache path only after its sha256 matches *bundle_hash*, so an unverified
    file is never visible under the cache name. An existing cache file is
    re-hashed before it is trusted.

    Raises:
        RuntimeError: if *bundle_hash* is not a lowercase sha256 hex digest
            or the downloaded content does not hash to it.
    """
    # The hash becomes part of a file name; accept only what a sha256
    # hexdigest can be so it cannot smuggle path separators.
    if len(bundle_hash) != 64 or any(c not in "0123456789abcdef" for c in bundle_hash):
        raise RuntimeError(f"invalid bundle hash {bundle_hash[:8]!r}")

    path = os.path.join(_BUNDLES_DIR, f"{bundle_hash}.py")
    if os.path.isfile(path):
        with open(path, "rb") as fh:
            cached = hashlib.sha256(fh.read()).hexdigest()
        if cached == bundle_hash:
            return path
        self._log("warn", f"cached bundle {bundle_hash[:8]} failed verification; pulling again")
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass

    url = bundle_url or f"{self._http_base}/v1/bundles/{bundle_hash}"
    self._log("info", f"pulling bundle {bundle_hash[:8]}")
    # NOTE(review): the token header is sent to whatever ``bundle_url`` the
    # server supplies, as before; confirm that URL is always the same host.
    headers = {"X-Worker-Token": self._token} if self._token else {}
    # Unique temp name: concurrent jobs may fetch the same bundle at once.
    tmp = f"{path}.{os.urandom(4).hex()}.part"
    digest = hashlib.sha256()
    published = False
    try:
        async with http.stream("GET", url, headers=headers) as resp:
            resp.raise_for_status()
            with open(tmp, "wb") as fh:
                async for chunk in resp.aiter_bytes(64 * 1024):
                    digest.update(chunk)
                    fh.write(chunk)
        got = digest.hexdigest()
        if got != bundle_hash:
            raise RuntimeError(
                f"bundle hash mismatch: expected {bundle_hash[:8]}, got {got[:8]}"
            )
        os.replace(tmp, path)
        published = True
    finally:
        if not published:
            # Failed, mismatched or cancelled download: leave nothing behind.
            try:
                os.unlink(tmp)
            except OSError:
                pass
    return path
182
+
183
+ @staticmethod
184
+ def _import_bundle(bundle_hash: str, path: str) -> ModuleType:
185
+ """exec the bundle in a fresh module namespace. No sys.modules pollution."""
186
+ mod_name = f"_icefold_bundle_{bundle_hash[:16]}"
187
+ spec = importlib.util.spec_from_file_location(mod_name, path)
188
+ if spec is None or spec.loader is None:
189
+ raise RuntimeError(f"failed to create import spec for bundle {bundle_hash[:8]}")
190
+ mod = importlib.util.module_from_spec(spec)
191
+ sys.modules[mod_name] = mod
192
+ try:
193
+ spec.loader.exec_module(mod)
194
+ except Exception:
195
+ sys.modules.pop(mod_name, None)
196
+ raise
197
+ return mod
198
+
199
+ def _preflight_deps(
200
+ self, binary_deps: Tuple[str, ...], python_deps: Tuple[str, ...],
201
+ ) -> None:
202
+ """Surface a structured ``MissingDependencyError`` when any dep is absent."""
203
+ missing_bin: List[str] = [b for b in binary_deps if b and shutil.which(b) is None]
204
+ missing_py: List[str] = []
205
+ for pkg in python_deps:
206
+ if not pkg:
207
+ continue
208
+ try:
209
+ __import__(pkg.split(".")[0])
210
+ except ImportError:
211
+ missing_py.append(pkg)
212
+ if not (missing_bin or missing_py):
213
+ return
214
+ # Build a platform-aware install hint covering both categories.
215
+ plat = sys.platform if sys.platform in ("linux", "darwin", "win32") else "linux"
216
+ lines: List[str] = []
217
+ for b in missing_bin:
218
+ lines.append(f" · {b} (binary) → {binary_install_hint(b, plat)}")
219
+ for p in missing_py:
220
+ lines.append(f" · {p} (python) → pip install {p}")
221
+ hint = "Install the following on this runner host:\n" + "\n".join(lines)
222
+ raise MissingDependencyError(
223
+ missing_binaries=tuple(missing_bin),
224
+ missing_python=tuple(missing_py),
225
+ install_hint=hint,
226
+ )
227
+
228
+ # ── input staging (download) ──
229
+
230
+ async def _download_inputs(self, http: httpx.AsyncClient, inputs: Any) -> Any:
231
+ if isinstance(inputs, str):
232
+ if _is_server_ref(inputs):
233
+ return await self._download_one(http, inputs)
234
+ return inputs
235
+ if isinstance(inputs, dict):
236
+ return {k: await self._download_inputs(http, v) for k, v in inputs.items()}
237
+ if isinstance(inputs, (list, tuple)):
238
+ return [await self._download_inputs(http, v) for v in inputs]
239
+ return inputs
240
+
241
async def _download_one(self, http: httpx.AsyncClient, ref: str) -> str:
    """Stream one server ref into the staging dir and return the local path.

    The file gets a random name that keeps the ref's extension. A partially
    written file is removed when the download fails or is cancelled.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
    """
    url = self._http_base + ref
    dest = os.path.join(_STAGED_DIR, f"{os.urandom(8).hex()}{_ext_from_ref(ref)}")
    self._log("info", f"pulling input {ref}")
    try:
        async with http.stream("GET", url) as resp:
            resp.raise_for_status()
            with open(dest, "wb") as fh:
                async for chunk in resp.aiter_bytes(1024 * 1024):
                    fh.write(chunk)
    except BaseException:
        # BaseException so task cancellation is covered too: nobody else
        # knows this random file name, so clean up the partial file here.
        try:
            os.unlink(dest)
        except OSError:
            pass
        raise
    return dest
251
+
252
+ # ── output staging (upload) ──
253
+
254
+ async def _upload_outputs(self, http: httpx.AsyncClient, output: Any, session_id: str) -> Any:
255
+ if isinstance(output, str):
256
+ if output and os.path.isfile(output) and os.path.abspath(output).startswith(
257
+ os.path.abspath(DATA_DIR)
258
+ ):
259
+ return await self._upload_one(http, output, session_id)
260
+ return output
261
+ if isinstance(output, dict):
262
+ return {k: await self._upload_outputs(http, v, session_id) for k, v in output.items()}
263
+ if isinstance(output, (list, tuple)):
264
+ return [await self._upload_outputs(http, v, session_id) for v in output]
265
+ return output
266
+
267
async def _upload_one(self, http: httpx.AsyncClient, path: str, session_id: str) -> str:
    """POST one product file to the server and return its stored path.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
        RuntimeError: if the response carries no ``path``.
    """
    filename = os.path.basename(path)
    endpoint = self._http_base + OUTPUT_UPLOAD_PATH
    self._log("info", f"pushing product {filename}")

    auth_headers = {}
    if self._token:
        auth_headers["X-Worker-Token"] = self._token

    with open(path, "rb") as fh:
        resp = await http.post(
            endpoint,
            headers=auth_headers,
            data={"session_id": session_id or ""},
            files={"file": (filename, fh, "application/octet-stream")},
        )
    resp.raise_for_status()

    stored = resp.json().get("path")
    if stored:
        return stored
    raise RuntimeError("server did not return a stored path for output")
@@ -0,0 +1,166 @@
1
+ Metadata-Version: 2.4
2
+ Name: icefold-runner
3
+ Version: 0.1.0
4
+ Summary: Self-hosted execution runner for IceFold nodes (reverse-connects to an IceFold server, like a self-hosted CI runner).
5
+ Author: IceFold
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/IceFold/icefold-runner
8
+ Project-URL: Repository, https://github.com/IceFold/icefold-runner
9
+ Keywords: icefold,runner,self-hosted,node-execution
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Intended Audience :: Developers
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: icefold-sdk>=0.1.0
19
+ Requires-Dist: websockets>=12
20
+ Requires-Dist: httpx>=0.27
21
+ Dynamic: license-file
22
+
23
+ # icefold-runner
24
+
25
+ A self-hosted execution runner for [IceFold](https://icefold.com) nodes — like
26
+ a GitHub self-hosted CI runner. You start it on your own machine; it
27
+ **reverse-connects** to an IceFold server (so it works behind NAT with no
28
+ inbound ports, no public IP, no tunnel), receives node-execution jobs, runs them
29
+ locally, and streams results back.
30
+
31
+ It is the place where **your uploaded node code runs** — on your hardware, with
32
+ full `subprocess` / `ffmpeg` / GPU / any-dependency access — instead of inside
33
+ the server's restricted sandbox.
34
+
35
+ ## How it works
36
+
37
+ ```
38
+ your machine (private, behind NAT) IceFold server (public)
39
+ ┌──────────────────────────────────┐ reverse WSS ┌───────────────────────────┐
40
+ │ icefold-runner │ ───────────► │ /v1/ws/worker?token │
41
+ │ • dials out, token auth │ node_exec ◄─│ routes node runs (per user) │
42
+ │ • reconnect + keepalive │ node_done ─►│ │
43
+ │ • bundle runner: │ │ │
44
+ │ GET /v1/bundles/<hash> │ HTTP pull │ /upload /download │
45
+ │ import bundle + preflight deps │ ◄──────────► │ /v1/workers/output │
46
+ │ await __icefold_run__ │ │ │
47
+ └──────────────────────────────────┘ HTTP push └───────────────────────────┘
48
+ ```
49
+
50
+ - **Control plane** rides the reverse WebSocket (`node_exec` / `cancel` →
51
+ `node_status` / `node_done` / `missing_dep`), JSON frames XOR-obfuscated with
52
+ the token (TLS still does the real protection). Each `node_exec` frame only
53
+ carries a `bundle_hash` and a single already-sliced variant — no source.
54
+ - **Bulk media + bundles** ride plain HTTP: the runner GETs inputs from the
55
+ server's `/upload` & `/download` and node bundles from `/v1/bundles/<hash>`
56
+ (sha256-addressed, cached locally as `runner_work_dir/bundles/<hash>.py`,
57
+ re-hashed on every download), runs the bundle, POSTs products back to
58
+ `/v1/workers/output` (which returns server-canonical paths).
59
+ - **The runner ships no node implementations and never compiles user source.**
60
+ The IceFold server renders every node (your custom ones *and* the platform's
61
+ built-in ones) into a self-contained `.py` bundle, with `python_deps` /
62
+ `binary_deps` declared in the bundle header. The runner imports the bundle,
63
+ pre-flights the deps (sending back a structured `missing_dep` reply with
64
+ platform-aware install hints if anything is absent), and awaits
65
+ `__icefold_run__(inputs, ctx_dict)`. So when the server adds or upgrades
66
+ nodes, **you never have to upgrade the runner.**
67
+ - Variant planning / dimension & provider resolution all stay on the server;
68
+ each job is a single already-sliced leaf call.
69
+
70
+ ## Install
71
+
72
+ Requires **Python ≥ 3.11**, **ffmpeg/ffprobe** on `PATH` (for media nodes), and
73
+ whatever third-party packages your custom nodes import.
74
+
75
+ ```bash
76
+ pip install icefold-runner # pulls in icefold-sdk
77
+ ```
78
+
79
+ From source:
80
+
81
+ ```bash
82
+ git clone <this-repo> icefold-runner
83
+ cd icefold-runner
84
+ python -m venv .venv && . .venv/bin/activate
85
+ pip install -e .
86
+ ```
87
+
88
+ ## Run
89
+
90
+ Generate a token in the IceFold app (**Nodes ▸ Connect a runner**), then:
91
+
92
+ ```bash
93
+ icefold-runner --token <your-token>
94
+ ```
95
+
96
+ That's it — the token (GitHub-CI style) encodes + signs your IceFold user id, so
97
+ there's no server URL or user id to pass. The server is built in.
98
+
99
+ Every flag also reads an env var (see [`.env.example`](.env.example)):
100
+
101
+ | flag | env | meaning |
102
+ |---|---|---|
103
+ | `--token` | `ICEFOLD_RUNNER_TOKEN` | runner token from the IceFold app |
104
+ | `--runner-id` | `ICEFOLD_RUNNER_ID` | stable id (default: hostname) |
105
+ | `--work-dir` | `ICEFOLD_RUNNER_DIR` | scratch for staged inputs + products |
106
+
107
+ The runner honors standard proxy env vars (`HTTPS_PROXY`, …) for reaching the
108
+ server. It reconnects automatically with backoff; an auth rejection is fatal.
109
+
110
+ > Self-hosting / dev: point the runner at a different server with the
111
+ > `ICEFOLD_RUNNER_SERVER` env var (e.g. `ws://127.0.0.1:7000`).
112
+
113
+ ### Run as a service (systemd)
114
+
115
+ ```ini
116
+ # /etc/systemd/system/icefold-runner.service
117
+ [Unit]
118
+ Description=IceFold runner
119
+ After=network-online.target
120
+
121
+ [Service]
122
+ EnvironmentFile=/etc/icefold-runner.env
123
+ ExecStart=/opt/icefold-runner/.venv/bin/icefold-runner
124
+ Restart=always
125
+ RestartSec=5
126
+
127
+ [Install]
128
+ WantedBy=multi-user.target
129
+ ```
130
+
131
+ ## Layout
132
+
133
+ ```
134
+ icefold_runner/ the runner agent (connection, file staging, bundle exec)
135
+ client.py reverse-WS client: dial / auth / reconnect / keepalive
136
+ runner.py fetch /v1/bundles/<hash>, preflight deps, await __icefold_run__
137
+ __main__.py CLI entrypoint (icefold-runner)
138
+ ```
139
+
140
+ The runner imports the bundle on demand; the bundle is **self-contained** and
141
+ already inlines whatever it needs (the author's function body, the
142
+ `Inputs` / `Output` dataclasses, and a minimal `NodeContext` shim). The only
143
+ runtime dependency on `icefold-sdk` is the wire protocol + a small helper kit
144
+ (`get_file_id` / `run_blocking` / `write_text`), used by the runner agent
145
+ itself, not by node code.
146
+
147
+ ## Security model
148
+
149
+ - Node code runs **unsandboxed** here — it's your machine, your risk. That's the
150
+ point: code the server sandbox forbids (subprocess/ffmpeg/native deps) runs on
151
+ the runner instead. The runner downloads each bundle from the server and
152
+ executes it; it verifies the bundle's sha256 matches the requested hash, but
153
+ the bundle itself is whatever the server you authenticated to sends. Only
154
+ point a runner at a server you trust.
155
+ - The runner only talks to the one server you point it at, authenticated by the
156
+ shared token; it pulls input files and pushes products over HTTP to that host.
157
+
158
+ ## Self-check
159
+
160
+ A no-network sanity check that `icefold` is importable and the bundle execution
161
+ path (fetch + import + run `__icefold_run__`) works against a locally-rendered
162
+ bundle:
163
+
164
+ ```bash
165
+ python selfcheck.py
166
+ ```
@@ -0,0 +1,10 @@
1
+ icefold_runner/__init__.py,sha256=E5P4VMUKq7Kva5iRBnTsvZkM_nCsWUfCUB8dmmicyys,1035
2
+ icefold_runner/__main__.py,sha256=BEjQKyeniGOTaai0r5QuiKDV6CVPwCVu7_zkxVd10Jw,2881
3
+ icefold_runner/client.py,sha256=mEnjkzMRlVVcuO8KG0_3TjhNAFhu4paToGt_wcIMlE0,11842
4
+ icefold_runner/runner.py,sha256=BWVc7M7RnyO2eqmA0M6Su_0eWUc-9RTiBSBR0zU0Xoc,12280
5
+ icefold_runner-0.1.0.dist-info/licenses/LICENSE,sha256=GkZpO-PWJeVUDysnFXlj-lDZPx0qQWrtOe1Cuzk4phA,1064
6
+ icefold_runner-0.1.0.dist-info/METADATA,sha256=alMtzr2kXRm2dfGLh-Y00M-hlCUXMAYf8HekpcJRjpI,7171
7
+ icefold_runner-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ icefold_runner-0.1.0.dist-info/entry_points.txt,sha256=U63xdZSCxgXJhs8JT-F1FHHr_aH_Ov97yUrbJB5lEs4,64
9
+ icefold_runner-0.1.0.dist-info/top_level.txt,sha256=o5xHC1eeRUfy6gGuAUA-tTXazxLy0zpDR9C8IKgaf68,15
10
+ icefold_runner-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ icefold-runner = icefold_runner.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 IceFold
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ icefold_runner