PyPI - nvdc - Versions diffs - 0.1.0__tar.gz - Mend

nvdc 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

nvdc-0.1.0/PKG-INFO +169 -0
nvdc-0.1.0/README.md +149 -0
nvdc-0.1.0/pyproject.toml +34 -0
nvdc-0.1.0/setup.cfg +4 -0
nvdc-0.1.0/src/nvdc/__init__.py +3 -0
nvdc-0.1.0/src/nvdc/agent.py +329 -0
nvdc-0.1.0/src/nvdc/app.py +306 -0
nvdc-0.1.0/src/nvdc/attestation.py +122 -0
nvdc-0.1.0/src/nvdc/catalog.py +104 -0
nvdc-0.1.0/src/nvdc/cli.py +217 -0
nvdc-0.1.0/src/nvdc/config.py +17 -0
nvdc-0.1.0/src/nvdc/coordinator.py +869 -0
nvdc-0.1.0/src/nvdc/gpu.py +167 -0
nvdc-0.1.0/src/nvdc/hardware.py +157 -0
nvdc-0.1.0/src/nvdc/inference.py +133 -0
nvdc-0.1.0/src/nvdc/keys.py +114 -0
nvdc-0.1.0/src/nvdc/payments.py +125 -0
nvdc-0.1.0/src/nvdc/protocol.py +91 -0
nvdc-0.1.0/src/nvdc/runtime.py +317 -0
nvdc-0.1.0/src/nvdc/storage.py +207 -0
nvdc-0.1.0/src/nvdc/wallet.py +26 -0
nvdc-0.1.0/src/nvdc/web/index.html +605 -0
nvdc-0.1.0/src/nvdc.egg-info/PKG-INFO +169 -0
nvdc-0.1.0/src/nvdc.egg-info/SOURCES.txt +26 -0
nvdc-0.1.0/src/nvdc.egg-info/dependency_links.txt +1 -0
nvdc-0.1.0/src/nvdc.egg-info/entry_points.txt +2 -0
nvdc-0.1.0/src/nvdc.egg-info/requires.txt +12 -0
nvdc-0.1.0/src/nvdc.egg-info/top_level.txt +1 -0

nvdc-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,169 @@
+Metadata-Version: 2.4
+Name: nvdc
+Version: 0.1.0
+Summary: Bring your GPU onto the network: one command turns a GPU into a verifiable, OpenAI-compatible inference node.
+Author: NVDC
+License: Apache-2.0
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: fastapi>=0.110
+Requires-Dist: uvicorn[standard]>=0.27
+Requires-Dist: websockets>=12.0
+Requires-Dist: httpx>=0.27
+Requires-Dist: cryptography>=42.0
+Requires-Dist: nvidia-ml-py>=12.535.77
+Requires-Dist: redis>=5.0
+Requires-Dist: stripe>=9.0
+Provides-Extra: attestation
+Requires-Dist: nv-attestation-sdk>=2.7.0; extra == "attestation"
+Requires-Dist: nv-local-gpu-verifier>=2.7.0; extra == "attestation"
+# NVDC — bring your GPU onto the network
+NVDC turns any GPU machine into a **verifiable, OpenAI-compatible inference node**
+on a shared network. The node operator runs one command, opens a small visual
+client, picks a model to hold hot in memory, and flips the switch to go live.
+A coordinator exposes a standard `POST /v1/chat/completions` endpoint and routes
+each request — over an outbound tunnel — to a connected GPU node.
+```
+┌─────────────┐   OpenAI API    ┌──────────────┐   WebSocket tunnel   ┌──────────────┐
+│  any client │ ───────────────▶│ coordinator  │◀────────────────────▶│  GPU node    │
+│ (OpenAI SDK)│  /v1/chat/...   │  (public)    │   (node dials out)    │ Ollama + UI  │
+└─────────────┘                 └──────────────┘                       └──────────────┘
+```
+## Why a tunnel?
+The node opens a single **outbound** WebSocket to the coordinator, so it never
+needs an inbound public port and its IP stays private — the same pattern used by
+`brev register` (NetBird) and consumer GPU marketplaces.
+## Deployment (split: hosted web + downloadable client)
+Three pieces, three homes:
+| Component | Where it runs | Notes |
+|---|---|---|
+| **Coordinator** (`nvdc coordinator`) | A **persistent host** (Railway / Render / Fly.io / VM) | Needs long-lived WebSockets + in-memory state. **Not** Vercel serverless. A `Dockerfile` + `Procfile` are included. |
+| **Web app** (`site/`) | **Vercel** (static) | Mirrors the client UI: Home/Chat/Network read from the coordinator (CORS); Mine shows a download CTA + live market figures, and lights up with real data if the client is running locally. |
+| **Downloadable client** (`nvdc app`) | The miner's GPU box | The full app from above — detects the GPU, mines, holds the signing identity. |
+### Deploy the coordinator (example: Railway)
+```bash
+# from the repo root — Railway/Render auto-detect the Dockerfile
+#   exposes the OpenAI API + /node/ws tunnel + ledger on $PORT
+# After deploy you'll get a URL like https://nvdc-xxxx.up.railway.app
+```
+### Deploy the web app to Vercel
+The root `vercel.json` deploys `site/` as a static site (bypassing the Python
+FastAPI auto-detection). If Vercel still tries a Python build, set the project's
+**Root Directory** to `site/` in the Vercel dashboard.
+In the deployed site, click **"set network…"** under the logo and paste your
+coordinator URL (or load it with `?coordinator=https://...`). The page then reads
+the live network and, if the downloadable client is running on the visitor's
+machine, recognizes it automatically (CORS + Private Network Access).
+## Quick start
+One-line install (installs Python deps + Ollama + the `nvdc` client, then launches it):
+```bash
+# macOS / Linux
+curl -fsSL https://nvdc.ai/download/install.sh | bash     # Linux
+curl -fsSL https://nvdc.ai/download/install.command | bash # macOS
+```
+```powershell
+# Windows (PowerShell)
+irm https://nvdc.ai/download/install.ps1 | iex
+```
+Or install the package directly (Python 3.9+):
+```bash
+pipx install nvdc        # or: pip install nvdc
+# on the GPU machine, launch the visual client
+#   it defaults to the public network at wss://api.nvdc.ai
+nvdc app
+# (running your own hub? point the client at it)
+nvdc coordinator --port 8000
+nvdc app --coordinator ws://<coordinator-host>:8000
+```
+Then in the browser UI: see your hardware, pick a model (it must load **hot into
+memory** first), and click **Go Live**. The green light turns on only when a
+model is hot *and* the node is live.
+### Try it without a GPU / without downloading weights
+```bash
+nvdc coordinator --port 8000 &
+nvdc app --mock --coordinator ws://127.0.0.1:8000
+```
+Mock mode simulates model loading and uses an echo backend, so you can exercise
+the entire flow (load → hot → go live → green light → routed inference).
+### Use it from any OpenAI client
+```python
+from openai import OpenAI
+client = OpenAI(base_url="https://api.nvdc.ai/v1", api_key="x")
+client.chat.completions.create(model="llama3.1:8b",
+    messages=[{"role": "user", "content": "hello"}])
+```
+## CLI
+| Command | What it does |
+|---|---|
+| `nvdc app` | Launch the visual node client (web UI) |
+| `nvdc serve` | Headless node: bring this GPU onto the network |
+| `nvdc coordinator` | Run the public hub + OpenAI-compatible API |
+| `nvdc status` | Print local GPU + attestation status as JSON |
+## Models
+The catalog is pinned to the **Ollama** library (reliable, known sizes; Ollama
+also handles CUDA / Apple Metal / CPU offload). Each node reports its memory
+budget and the UI marks every model **Fits / Tight / Won't fit** against it:
+- unified-memory systems (DGX Spark / GB10, Apple Silicon) → budget = system RAM
+- dedicated-VRAM GPUs → budget = VRAM
+Popular tags included: `gpt-oss:20b`, `gpt-oss:120b`, `llama3.1:8b/70b`,
+`qwen2.5:7b/32b`, `deepseek-r1`, `mistral`, `gemma2`, `phi4`.
+## Attestation (verifiable work)
+Attestation is a first-class, pluggable component (`nvdc/attestation.py`):
+- On a **Confidential-Computing-capable** GPU (H100/H200, B100/B200, GB200,
+  RTX PRO 6000 Blackwell) with CC enabled, it performs a real NVIDIA **nvTrust**
+  local GPU attestation and reports the verdict + claims.
+- On hardware without CC (e.g. **GB10 / DGX Spark**, consumer GPUs), it reports
+  `supported: false` with a clear reason — it never fabricates a "verified"
+  result.
+A coordinator can enforce policy with `--require-attested` to only route work to
+nodes whose attestation verifies.
+> Note: the DGX Spark / GB10 cannot produce hardware attestation (NVIDIA disabled
+> CC on this SKU). It serves inference fine; it just joins as an unattested node.
+## Layout
+```
+src/nvdc/
+  cli.py          # nvdc app | serve | coordinator | status
+  app.py          # local web server for the visual client
+  web/index.html  # the visual client UI
+  runtime.py      # node state machine: load → hot → live
+  hardware.py     # accelerator + memory-budget detection (CUDA/MPS/CPU)
+  catalog.py      # curated Ollama model catalog + fit logic
+  attestation.py  # pluggable nvTrust attestation hook
+  agent.py        # node agent: outbound tunnel + request handling
+  coordinator.py  # hub: node registry + OpenAI-compatible API
+  inference.py    # Ollama + echo backends
+  protocol.py     # tiny JSON wire protocol
+```

nvdc-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,149 @@
+# NVDC — bring your GPU onto the network
+NVDC turns any GPU machine into a **verifiable, OpenAI-compatible inference node**
+on a shared network. The node operator runs one command, opens a small visual
+client, picks a model to hold hot in memory, and flips the switch to go live.
+A coordinator exposes a standard `POST /v1/chat/completions` endpoint and routes
+each request — over an outbound tunnel — to a connected GPU node.
+```
+┌─────────────┐   OpenAI API    ┌──────────────┐   WebSocket tunnel   ┌──────────────┐
+│  any client │ ───────────────▶│ coordinator  │◀────────────────────▶│  GPU node    │
+│ (OpenAI SDK)│  /v1/chat/...   │  (public)    │   (node dials out)    │ Ollama + UI  │
+└─────────────┘                 └──────────────┘                       └──────────────┘
+```
+## Why a tunnel?
+The node opens a single **outbound** WebSocket to the coordinator, so it never
+needs an inbound public port and its IP stays private — the same pattern used by
+`brev register` (NetBird) and consumer GPU marketplaces.
+## Deployment (split: hosted web + downloadable client)
+Three pieces, three homes:
+| Component | Where it runs | Notes |
+|---|---|---|
+| **Coordinator** (`nvdc coordinator`) | A **persistent host** (Railway / Render / Fly.io / VM) | Needs long-lived WebSockets + in-memory state. **Not** Vercel serverless. A `Dockerfile` + `Procfile` are included. |
+| **Web app** (`site/`) | **Vercel** (static) | Mirrors the client UI: Home/Chat/Network read from the coordinator (CORS); Mine shows a download CTA + live market figures, and lights up with real data if the client is running locally. |
+| **Downloadable client** (`nvdc app`) | The miner's GPU box | The full app from above — detects the GPU, mines, holds the signing identity. |
+### Deploy the coordinator (example: Railway)
+```bash
+# from the repo root — Railway/Render auto-detect the Dockerfile
+#   exposes the OpenAI API + /node/ws tunnel + ledger on $PORT
+# After deploy you'll get a URL like https://nvdc-xxxx.up.railway.app
+```
+### Deploy the web app to Vercel
+The root `vercel.json` deploys `site/` as a static site (bypassing the Python
+FastAPI auto-detection). If Vercel still tries a Python build, set the project's
+**Root Directory** to `site/` in the Vercel dashboard.
+In the deployed site, click **"set network…"** under the logo and paste your
+coordinator URL (or load it with `?coordinator=https://...`). The page then reads
+the live network and, if the downloadable client is running on the visitor's
+machine, recognizes it automatically (CORS + Private Network Access).
+## Quick start
+One-line install (installs Python deps + Ollama + the `nvdc` client, then launches it):
+```bash
+# macOS / Linux
+curl -fsSL https://nvdc.ai/download/install.sh | bash     # Linux
+curl -fsSL https://nvdc.ai/download/install.command | bash # macOS
+```
+```powershell
+# Windows (PowerShell)
+irm https://nvdc.ai/download/install.ps1 | iex
+```
+Or install the package directly (Python 3.9+):
+```bash
+pipx install nvdc        # or: pip install nvdc
+# on the GPU machine, launch the visual client
+#   it defaults to the public network at wss://api.nvdc.ai
+nvdc app
+# (running your own hub? point the client at it)
+nvdc coordinator --port 8000
+nvdc app --coordinator ws://<coordinator-host>:8000
+```
+Then in the browser UI: see your hardware, pick a model (it must load **hot into
+memory** first), and click **Go Live**. The green light turns on only when a
+model is hot *and* the node is live.
+### Try it without a GPU / without downloading weights
+```bash
+nvdc coordinator --port 8000 &
+nvdc app --mock --coordinator ws://127.0.0.1:8000
+```
+Mock mode simulates model loading and uses an echo backend, so you can exercise
+the entire flow (load → hot → go live → green light → routed inference).
+### Use it from any OpenAI client
+```python
+from openai import OpenAI
+client = OpenAI(base_url="https://api.nvdc.ai/v1", api_key="x")
+client.chat.completions.create(model="llama3.1:8b",
+    messages=[{"role": "user", "content": "hello"}])
+```
+## CLI
+| Command | What it does |
+|---|---|
+| `nvdc app` | Launch the visual node client (web UI) |
+| `nvdc serve` | Headless node: bring this GPU onto the network |
+| `nvdc coordinator` | Run the public hub + OpenAI-compatible API |
+| `nvdc status` | Print local GPU + attestation status as JSON |
+## Models
+The catalog is pinned to the **Ollama** library (reliable, known sizes; Ollama
+also handles CUDA / Apple Metal / CPU offload). Each node reports its memory
+budget and the UI marks every model **Fits / Tight / Won't fit** against it:
+- unified-memory systems (DGX Spark / GB10, Apple Silicon) → budget = system RAM
+- dedicated-VRAM GPUs → budget = VRAM
+Popular tags included: `gpt-oss:20b`, `gpt-oss:120b`, `llama3.1:8b/70b`,
+`qwen2.5:7b/32b`, `deepseek-r1`, `mistral`, `gemma2`, `phi4`.
+## Attestation (verifiable work)
+Attestation is a first-class, pluggable component (`nvdc/attestation.py`):
+- On a **Confidential-Computing-capable** GPU (H100/H200, B100/B200, GB200,
+  RTX PRO 6000 Blackwell) with CC enabled, it performs a real NVIDIA **nvTrust**
+  local GPU attestation and reports the verdict + claims.
+- On hardware without CC (e.g. **GB10 / DGX Spark**, consumer GPUs), it reports
+  `supported: false` with a clear reason — it never fabricates a "verified"
+  result.
+A coordinator can enforce policy with `--require-attested` to only route work to
+nodes whose attestation verifies.
+> Note: the DGX Spark / GB10 cannot produce hardware attestation (NVIDIA disabled
+> CC on this SKU). It serves inference fine; it just joins as an unattested node.
+## Layout
+```
+src/nvdc/
+  cli.py          # nvdc app | serve | coordinator | status
+  app.py          # local web server for the visual client
+  web/index.html  # the visual client UI
+  runtime.py      # node state machine: load → hot → live
+  hardware.py     # accelerator + memory-budget detection (CUDA/MPS/CPU)
+  catalog.py      # curated Ollama model catalog + fit logic
+  attestation.py  # pluggable nvTrust attestation hook
+  agent.py        # node agent: outbound tunnel + request handling
+  coordinator.py  # hub: node registry + OpenAI-compatible API
+  inference.py    # Ollama + echo backends
+  protocol.py     # tiny JSON wire protocol
+```

nvdc-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,34 @@
+[project]
+name = "nvdc"
+version = "0.1.0"
+description = "Bring your GPU onto the network: one command turns a GPU into a verifiable, OpenAI-compatible inference node."
+readme = "README.md"
+requires-python = ">=3.9"
+license = { text = "Apache-2.0" }
+authors = [{ name = "NVDC" }]
+dependencies = [
+    "fastapi>=0.110",
+    "uvicorn[standard]>=0.27",
+    "websockets>=12.0",
+    "httpx>=0.27",
+    "cryptography>=42.0",
+    "nvidia-ml-py>=12.535.77",
+    "redis>=5.0",
+    "stripe>=9.0",
+]
+[project.optional-dependencies]
+attestation = ["nv-attestation-sdk>=2.7.0", "nv-local-gpu-verifier>=2.7.0"]
+[project.scripts]
+nvdc = "nvdc.cli:main"
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.setuptools.package-data]
+nvdc = ["web/*.html"]

nvdc-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

nvdc-0.1.0/src/nvdc/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""nvdc: bring your GPU onto the network as a verifiable, OpenAI-compatible inference node."""
+__version__ = "0.1.0"

nvdc-0.1.0/src/nvdc/agent.py ADDED Viewed

@@ -0,0 +1,329 @@
+"""Node agent: the thing `nvdc serve` runs.
+Opens ONE outbound WebSocket to the coordinator (so the node never needs an
+inbound public port and its IP stays private), registers its GPU + attestation
+profile, then services inference requests over that tunnel.
+"""
+from __future__ import annotations
+import asyncio
+import hashlib
+import json as _json
+import logging
+from typing import Any, Dict, Optional
+import websockets
+from . import __version__, keys, protocol
+from .attestation import attest
+from .gpu import detect_gpu, detect_gpus, detect_interconnect
+from .hardware import detect_hardware, machine_id as hw_machine_id
+from .inference import Backend, make_backend
+from .keys import Identity
+# Module-level logger; every message from this module goes out under "nvdc.agent".
+log = logging.getLogger("nvdc.agent")
+def _extract_content(sse_line: str) -> str:
+    """Return the delta text carried by one OpenAI SSE ``data: {...}`` line.
+    Returns ``""`` when the line is not a ``data:`` event, when its payload is
+    not valid JSON (for example the ``[DONE]`` sentinel), when the
+    ``choices[0].delta`` path is missing, or when ``content`` is absent, null
+    or not a string.
+    """
+    line = sse_line.strip()
+    if not line.startswith("data:"):
+        return ""
+    try:
+        obj = _json.loads(line[len("data:"):].strip())
+        content = obj["choices"][0]["delta"].get("content")
+    except (ValueError, LookupError, TypeError, AttributeError, RecursionError):
+        # ValueError: payload is not JSON; LookupError: missing key / empty
+        # choices; TypeError / AttributeError: payload has an unexpected shape;
+        # RecursionError: pathologically nested payload.
+        return ""
+    # Only text may be returned: the streaming path concatenates the pieces
+    # with "".join(), which raises TypeError on any non-string element.
+    return content if isinstance(content, str) else ""
+class NodeAgent:
+    """Node-side agent that serves one model over a coordinator tunnel.
+    The agent dials the coordinator over a WebSocket, registers a profile
+    describing this machine, then answers inference requests that arrive on
+    that connection until it is drained.
+    """
+    def __init__(
+        self,
+        coordinator_url: str,
+        name: str,
+        backend: Backend,
+        model: str,
+        token: str = "",
+        require_attestation: bool = False,
+        status_cb=None,
+        drain_timeout: float = 120.0,
+        price_per_mtok: float = 0.0,
+        account_id: str = "",
+        identity: Optional[Identity] = None,
+        owner_account: str = "",
+        machine_id: str = "",
+        cluster: str = "",
+    ):
+        """Store configuration and initialise drain bookkeeping.
+        Args:
+            coordinator_url: URL handed to ``websockets.connect``.
+            name: Node name sent in the registration profile.
+            backend: Inference backend; ``chat_once`` and ``chat_stream`` are
+                called on it.
+            model: The single model this node advertises and serves.
+            token: Optional bearer token sent as an ``Authorization`` header.
+            require_attestation: When true, registration raises unless the
+                attestation result is verified.
+            status_cb: Optional callable invoked as ``status_cb(status, info)``.
+            drain_timeout: Seconds ``drain`` waits for in-flight requests.
+            price_per_mtok: Price value reported in the profile.
+            account_id: Defaults to ``identity.account_id`` when empty.
+            identity: Signing identity; a fresh ``Identity()`` when omitted.
+            owner_account: Defaults to ``account_id`` when empty.
+            machine_id: Defaults to the detected hardware machine id.
+            cluster: Cluster label reported in the profile.
+        """
+        # A node commits to exactly ONE hot-loaded model at a time — the
+        # "mining algorithm" it has chosen. It advertises and serves only this
+        # model; requests for anything else are rejected at the node boundary.
+        self.coordinator_url = coordinator_url
+        self.name = name
+        self.backend = backend
+        self.model = model
+        self.price_per_mtok = price_per_mtok
+        self.identity = identity or Identity()
+        self.account_id = account_id or self.identity.account_id
+        # Earnings credit the owner account; a single machine owns itself.
+        self.owner_account = owner_account or self.account_id
+        self.machine_id = machine_id or hw_machine_id()
+        self.cluster = cluster
+        self.token = token
+        self.require_attestation = require_attestation
+        self.status_cb = status_cb
+        self.drain_timeout = drain_timeout
+        self._ws = None
+        self._send_lock = asyncio.Lock()
+        # graceful drain bookkeeping
+        self._stopped = False
+        self._draining = False
+        self._inflight_ids = set()  # request ids currently being served
+        self._inflight_zero = asyncio.Event()
+        self._inflight_zero.set()  # starts idle
+        # Strong references to running inference tasks. The event loop keeps
+        # only weak references, so an unreferenced task may be collected
+        # before it finishes.
+        self._tasks = set()
+    def _emit(self, status: str, **info):
+        """Invoke ``status_cb(status, info)`` if set; callback errors are logged, not raised."""
+        if self.status_cb:
+            try:
+                self.status_cb(status, info)
+            except Exception:
+                log.debug("status_cb error", exc_info=True)
+    async def run_forever(self):
+        """Connect and serve until stopped, reconnecting after every disconnect.
+        Connection failures back off exponentially from 1s up to 30s. A clean
+        close resets the backoff and waits 1s before redialling, so a
+        coordinator that accepts and immediately closes is not hammered in a
+        tight loop. Other exceptions (for example the ``RuntimeError`` raised
+        by ``_register``) propagate to the caller.
+        """
+        backoff = 1
+        while not self._stopped:
+            try:
+                await self._connect_and_serve()
+            # asyncio.TimeoutError is listed explicitly: before Python 3.11 it
+            # is not a subclass of OSError.
+            except (OSError, asyncio.TimeoutError, websockets.WebSocketException) as e:
+                if self._stopped:
+                    break
+                log.warning("connection lost (%s); reconnecting in %ss", e, backoff)
+                self._emit("connecting", detail=str(e))
+                await asyncio.sleep(backoff)
+                backoff = min(backoff * 2, 30)
+            else:
+                backoff = 1
+                if not self._stopped:
+                    log.info("connection closed; reconnecting in %ss", backoff)
+                    await asyncio.sleep(backoff)
+    async def drain(self):
+        """Gracefully leave: tell the coordinator to stop routing new work,
+        let in-flight requests finish, then disconnect. In-flight responses are
+        never interrupted, so the node's delivery/completion score is preserved.
+        Requests still running after ``drain_timeout`` seconds are reported to
+        the coordinator as failed. Calling ``drain`` again is a no-op.
+        """
+        if self._draining:
+            return
+        self._draining = True
+        log.info("draining: %d request(s) in flight", len(self._inflight_ids))
+        self._emit("draining", inflight=len(self._inflight_ids))
+        try:
+            await self._send(protocol.MSG_DRAIN)  # coordinator stops routing now
+        except Exception:
+            # Best effort: the tunnel may already be gone.
+            log.debug("could not send drain notice", exc_info=True)
+        try:
+            # block until all in-flight complete, but not forever
+            await asyncio.wait_for(self._inflight_zero.wait(), timeout=self.drain_timeout)
+            log.info("drain complete; disconnecting")
+        except asyncio.TimeoutError:
+            stuck = list(self._inflight_ids)
+            log.warning(
+                "drain timeout after %ss; force-failing %d stuck request(s): %s",
+                self.drain_timeout, len(stuck), stuck,
+            )
+            # Fail only the stuck requests as node_failed; completed ones are
+            # already done and unaffected.
+            for rid in stuck:
+                try:
+                    await self._send(
+                        protocol.MSG_ERROR, id=rid,
+                        error=f"node_failed: drain timeout after {self.drain_timeout}s",
+                    )
+                except Exception:
+                    log.debug("could not report stuck request %s", rid, exc_info=True)
+        self._stopped = True
+        if self._ws is not None:
+            try:
+                await self._ws.close()
+            except Exception:
+                log.debug("error while closing the tunnel", exc_info=True)
+    async def _connect_and_serve(self):
+        """Open one tunnel, register, and dispatch frames until it closes.
+        ``self._ws`` is cleared and an ``"offline"`` status is emitted when the
+        receive loop ends, whether it ended cleanly or with an exception.
+        """
+        headers = {"Authorization": f"Bearer {self.token}"} if self.token else {}
+        log.info("connecting to coordinator %s", self.coordinator_url)
+        # NOTE(review): `additional_headers` is the keyword used by the newer
+        # websockets asyncio client; the legacy client names it `extra_headers`.
+        # Confirm the installed websockets version provides the newer client.
+        async with websockets.connect(
+            self.coordinator_url,
+            additional_headers=headers,
+            max_size=32 * 1024 * 1024,
+            ping_interval=20,
+        ) as ws:
+            self._ws = ws
+            await self._register()
+            try:
+                async for raw in ws:
+                    msg = protocol.decode(raw)
+                    await self._dispatch(msg)
+            finally:
+                self._ws = None
+                self._emit("offline")
+    async def _register(self):
+        """Detect hardware, run attestation, and send the registration profile.
+        Raises:
+            RuntimeError: if ``require_attestation`` is set and the attestation
+                result is not verified.
+        """
+        gpus = detect_gpus()
+        # Fall back to the single-GPU probe when the list probe finds nothing.
+        gpu = gpus[0] if gpus else detect_gpu()
+        interconnect = detect_interconnect() if len(gpus) > 1 else ""
+        hw = detect_hardware()
+        att = attest(require=self.require_attestation)
+        if self.require_attestation and not att.verified:
+            raise RuntimeError(
+                f"attestation required but not verified: {att.reason or att.mode}"
+            )
+        profile = protocol.NodeProfile(
+            name=self.name,
+            models=[self.model],
+            gpu=gpu,
+            attestation=att,
+            gpus=gpus,
+            gpu_count=len(gpus),
+            interconnect=interconnect,
+            ram_mb=hw.ram_mb,
+            memory_budget_mb=hw.memory_budget_mb,
+            accelerator=hw.accelerator.type,
+            price_per_mtok=self.price_per_mtok,
+            account_id=self.account_id,
+            owner_account=self.owner_account,
+            machine_id=self.machine_id,
+            cluster=self.cluster,
+            agent_version=__version__,
+        )
+        await self._send(protocol.MSG_REGISTER, profile=protocol.node_profile_to_dict(profile))
+        log.info(
+            "registered '%s' | gpu=%s | serving=%s | attestation=%s(verified=%s)",
+            self.name, gpu.name, self.model, att.mode, att.verified,
+        )
+    async def _dispatch(self, msg: Dict[str, Any]):
+        """Route one decoded frame by its ``"t"`` field.
+        Inference requests run as separate tasks so the receive loop keeps
+        reading; pings are answered inline; unknown types are ignored.
+        """
+        t = msg.get("t")
+        if t == protocol.MSG_INFER:
+            task = asyncio.create_task(self._handle_infer(msg))
+            # Hold a reference until the task is done so it cannot be
+            # garbage-collected mid-request.
+            self._tasks.add(task)
+            task.add_done_callback(self._tasks.discard)
+        elif t == protocol.MSG_PING:
+            await self._send(protocol.MSG_PONG)
+        elif t == protocol.MSG_REGISTERED:
+            log.info("coordinator assigned node_id=%s", msg.get("node_id"))
+            self._emit("live", node_id=msg.get("node_id"))
+        else:
+            log.debug("ignoring message type %s", t)
+    async def _handle_infer(self, msg: Dict[str, Any]):
+        """Serve one inference request and send its result, chunks, or error.
+        The request is refused while draining or when it names a model other
+        than ``self.model``. Otherwise the backend is called (streaming or
+        one-shot, per ``body["stream"]``) and the final frame carries the token
+        count, a SHA-256 commitment of the response text, and a signature.
+        """
+        req_id = msg.get("id")
+        body = dict(msg.get("body", {}))
+        requested = body.get("model", "")
+        # Once draining, refuse new work so it can be routed elsewhere. This
+        # closes the race between the operator leaving and the coordinator
+        # marking us un-routable; in-flight requests (already past this point)
+        # are unaffected and run to completion.
+        if self._draining:
+            await self._send(
+                protocol.MSG_ERROR, id=req_id,
+                error="node is draining; request not accepted",
+            )
+            return
+        # Enforce the single committed model at the node boundary. A node only
+        # serves the model it has hot-loaded; anything else is refused so it
+        # can never be coerced into running a cold/different model.
+        if requested and requested != self.model:
+            await self._send(
+                protocol.MSG_ERROR, id=req_id,
+                error=f"this node only serves '{self.model}', not '{requested}'",
+            )
+            return
+        body["model"] = self.model  # pin, in case the request omitted it
+        prompt_commit = hashlib.sha256(
+            _json.dumps(body.get("messages", []), sort_keys=True).encode()).hexdigest()
+        self._inflight_ids.add(req_id)
+        self._inflight_zero.clear()
+        stream = bool(body.get("stream", False))
+        try:
+            if stream:
+                acc, tokens = [], 0
+                async for line in self.backend.chat_stream(body):
+                    await self._send(protocol.MSG_CHUNK, id=req_id, data=line)
+                    # Each streamed line that carries a "content" key is
+                    # counted as one token.
+                    if '"content"' in line:
+                        tokens += 1
+                        c = _extract_content(line)
+                        if c:
+                            acc.append(c)
+                response_commit = hashlib.sha256("".join(acc).encode()).hexdigest()
+                sig = self._sign_work(req_id, prompt_commit, tokens, response_commit, "complete")
+                await self._send(protocol.MSG_END, id=req_id, tokens=tokens,
+                                 response_commit=response_commit, sig=sig)
+            else:
+                result = await self.backend.chat_once(body)
+                content = ""
+                try:
+                    content = result["choices"][0]["message"].get("content") or ""
+                except Exception:
+                    pass
+                tokens = (result.get("usage", {}) or {}).get("completion_tokens", 0)
+                response_commit = hashlib.sha256(content.encode()).hexdigest()
+                sig = self._sign_work(req_id, prompt_commit, tokens, response_commit, "complete")
+                await self._send(protocol.MSG_RESULT, id=req_id, body=result,
+                                 tokens=tokens, response_commit=response_commit, sig=sig)
+        except Exception as e:
+            log.exception("inference failed for %s", req_id)
+            try:
+                await self._send(protocol.MSG_ERROR, id=req_id, error=str(e))
+            except Exception:
+                # The tunnel itself may be what failed; do not let the report
+                # turn into an unretrieved task exception.
+                log.debug("could not report failure for %s", req_id, exc_info=True)
+        finally:
+            self._inflight_ids.discard(req_id)
+            if not self._inflight_ids:
+                self._inflight_zero.set()
+    def _sign_work(self, req_id, prompt_commit, tokens, response_commit, delivery) -> str:
+        """Sign the work payload built by ``keys.work_payload`` with this node's identity."""
+        payload = keys.work_payload(req_id, self.model, prompt_commit, tokens,
+                                    response_commit, delivery)
+        return self.identity.sign(payload)
+    async def _send(self, msg_type: str, **fields: Any):
+        """Encode and send one frame; silently does nothing when disconnected.
+        Sends are serialised through ``_send_lock``.
+        """
+        if self._ws is None:
+            return
+        frame = protocol.encode(msg_type, **fields)
+        async with self._send_lock:
+            # Re-read after acquiring the lock: the connection may have been
+            # torn down (and ``_ws`` cleared) while this coroutine was waiting.
+            ws = self._ws
+            if ws is None:
+                return
+            await ws.send(frame)
+async def serve(
+    coordinator_url: str,
+    name: str,
+    backend_kind: str,
+    model: str,
+    ollama_url: str,
+    token: str = "",
+    require_attestation: bool = False,
+    warm: bool = True,
+    drain_timeout: float = 120.0,
+    owner_account: str = "",
+    cluster: str = "",
+):
+    """Run a headless node: build the backend, optionally warm the model, then serve.
+    Raises:
+        ValueError: if ``model`` is empty.
+    """
+    if not model:
+        raise ValueError("a single --model must be specified; a node serves exactly one model")
+    engine = make_backend(backend_kind, model=model, ollama_url=ollama_url)
+    # Warm-up runs before the agent is created, so nothing has been advertised
+    # to the coordinator yet. A warm-up failure is logged and serving goes on.
+    if warm:
+        log.info("hot-loading '%s' into memory ...", model)
+        try:
+            await engine.warm(model)
+            log.info("'%s' is hot", model)
+        except Exception as exc:
+            log.warning("warm-up failed for '%s' (%s); serving anyway", model, exc)
+    settings = dict(
+        coordinator_url=coordinator_url,
+        name=name,
+        backend=engine,
+        model=model,
+        token=token,
+        require_attestation=require_attestation,
+        drain_timeout=drain_timeout,
+        owner_account=owner_account,
+        cluster=cluster,
+    )
+    await NodeAgent(**settings).run_forever()