npm - @hypabolic/crossbar - Versions diffs - 0.1.0 → 0.2.0 - Mend

@hypabolic/crossbar 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/CAPABILITY-MATRIX.md +2 -2
package/README.md +6 -17
package/docs/onboarding.gif +0 -0
package/package.json +16 -5
package/src/adapters/generic.ts +6 -0
package/src/adapters/llamacpp.ts +5 -0
package/src/adapters/llamaswap.ts +5 -0
package/src/adapters/lmstudio.ts +57 -14
package/src/adapters/ollama.ts +5 -0
package/src/adapters/vllm.ts +6 -0
package/src/ui/onboarding.ts +342 -60

package/CAPABILITY-MATRIX.md CHANGED Viewed

@@ -7,7 +7,7 @@ adapter registers under (`oai` = `openai-completions`, `ant` = `anthropic-messag
 | Backend | port | pi api | listModels | introspectLoaded | switchModel | loadUnload | auth | health | perModelCaps | streaming | discovery fingerprint |
 |---|---|---|---|---|---|---|---|---|---|---|---|
 | **Ollama** | 11434 | oai | ✅ `/api/tags`,`/v1/models` | ✅ `/api/ps` | ✅ implicit (request id) | ✅ `keep_alive:0` | ◐ none local | ✅ `GET /` text | ✅ `/api/show` caps + ctx | ✅ | `GET /` → `Ollama is running` |
-| **LM Studio** | 1234 | oai | ✅ `/api/v0/models`,`/v1/models` | ✅ `state` field | ✅ JIT + `/api/v1/models/load` | ✅ load/unload + `lms` | ◐ Bearer, none default | ◐ infer 200 | ✅ type+`max_context_length` | ✅ | `/api/v0/models` w/ `state`,`compatibility_type` |
+| **LM Studio** | 1234 | oai | ✅ `/api/v1/models` (v0 fallback) | ✅ `state` field | ✅ JIT + `/api/v1/models/load` | ✅ load/unload + `lms` | ◐ Bearer, none default | ◐ infer 200 | ✅ type+`max_context_length` | ✅ | `/api/v1/models` (v0 fallback) w/ `state`,`compatibility_type` |
 | **llama-server** | 8080 | oai | ✅ `/v1/models` | ◐ `/props`,`/slots` (single) | ❌ (1/instance) | ❌ classic | ◐ none / `--api-key` | ✅ `/health` | ◐ ctx via `/props`,`meta` | ✅ | `/props` w/ `default_generation_settings`+`build_info` |
 | **llama-swap** | 8080 | oai/ant | ✅ `/v1/models` (all config) | ✅ `/running` | ✅ via `model` → restart upstream | ✅ `/api/models/unload`, ttl | ◐ optional multi-scheme | ✅ `/health`→OK | ◐ via upstream | ✅ | `/` → `/ui/`; `/running`,`/upstream/{model}` |
 | **vLLM** | 8000 | oai | ✅ `/v1/models` | ◐ `/is_sleeping` (dev) | ❌ base · ◐ LoRA | ◐ sleep/wake + LoRA | ◐ none / `--api-key` | ✅ `/health` | ◐ `max_model_len` only | ✅ | `/version` + `/metrics` `vllm:` + `owned_by:"vllm"` |
@@ -38,7 +38,7 @@ adapter registers under (`oai` = `openai-completions`, `ant` = `anthropic-messag
 1. `GET /` → `Ollama is running` ⇒ Ollama · redirect `/ui/` ⇒ llama-swap
 2. `GET /api/extra/version` → `{"result":"KoboldCpp"}` ⇒ KoboldCpp
-3. `GET /api/v0/models` 200 w/ `state`/`compatibility_type` ⇒ LM Studio
+3. `GET /api/v1/models` (v0 fallback) 200 w/ `state`/`compatibility_type` ⇒ LM Studio
 4. `GET /props` w/ `default_generation_settings`+`build_info` ⇒ llama-server / llamafile
 5. `GET /version` + `/metrics` `vllm:` ⇒ vLLM
 6. `GET /v1/models` shape: `owned_by:"vllm"`⇒vLLM · `meta.n_ctx_train`⇒llama.cpp ·

package/README.md CHANGED Viewed

@@ -3,7 +3,7 @@
 [![CI](https://github.com/Hypabolic/Crossbar/actions/workflows/ci.yml/badge.svg)](https://github.com/Hypabolic/Crossbar/actions/workflows/ci.yml)
 [![npm](https://img.shields.io/npm/v/@hypabolic/crossbar)](https://www.npmjs.com/package/@hypabolic/crossbar)
-**The local/self-hosted inference connector Pi should have shipped with.**
+**Effortless local & self-hosted model backends for the Pi coding agent.**
 Crossbar is an extension for the [Pi coding agent](https://github.com/earendil-works/pi) that makes
 wiring Pi to *any* local or self-hosted model backend effortless — zero hand-edited JSON, all setup
@@ -12,6 +12,8 @@ in-place model switching.
 > Built by [Hypabolic](https://github.com/hypabolic).
+![Crossbar onboarding: discover a server, open the manage menu, switch the active model](docs/onboarding.gif)
 ---
 ## Why Crossbar
@@ -104,23 +106,10 @@ The `BackendAdapter` contract (`src/core/`) is the frozen boundary every adapter
 conformance suite (`tests/conformance/`) validates every adapter against it, and
 `tests/integration/` exercises the real discovery path over live sockets.
-### CI / releasing
-- **CI** (`.github/workflows/ci.yml`) runs `tsc --noEmit` + the full test suite on every push and PR
-  (Node 22 & 24).
-- **Releases** (`.github/workflows/release.yml`) publish to npm via **GitHub→npm OIDC trusted
-  publishing** — no tokens or secrets. [Provenance](https://docs.npmjs.com/generating-provenance-statements)
-  is attached automatically. Two ways:
-  1. **Manual** — GitHub → *Actions → Release → Run workflow* → choose `patch` / `minor` / `major`.
-     It bumps `package.json`, commits, tags `vX.Y.Z`, and publishes.
-  2. **Tag push** — `npm version patch && git push --follow-tags` locally.
-  **One-time setup:** on npmjs.com, add a **Trusted Publisher** for `@hypabolic/crossbar`
-  (*Package settings → Trusted Publisher → GitHub Actions*) pointing at repo **`Hypabolic/Crossbar`**
-  and workflow **`release.yml`**. The workflow authenticates through the OIDC `id-token` it already
-  requests — no `NPM_TOKEN` needed.
+### CI
-<!-- TODO: add an onboarding demo GIF (docs/onboarding.gif) recorded against a live Ollama + LM Studio. -->
+CI (`.github/workflows/ci.yml`) runs `tsc --noEmit` + the full test suite on every push and PR
+(Node 22 & 24).
 ## License

package/docs/onboarding.gif ADDED Viewed

Binary file

package/package.json CHANGED Viewed

@@ -1,11 +1,18 @@
 {
   "name": "@hypabolic/crossbar",
-  "version": "0.1.0",
-  "description": "The local/self-hosted inference connector Pi should have shipped with — multi-backend discovery, model switching, and zero-JSON in-TUI onboarding for the Pi coding agent.",
+  "version": "0.2.0",
+  "description": "The local/self-hosted inference connector for Pi — multi-backend discovery, model switching, and zero-JSON in-TUI onboarding for the Pi coding agent.",
   "type": "module",
   "license": "MIT",
   "author": "Hypabolic",
-  "homepage": "https://github.com/hypabolic/crossbar",
+  "homepage": "https://github.com/Hypabolic/Crossbar#readme",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/Hypabolic/Crossbar.git"
+  },
+  "bugs": {
+    "url": "https://github.com/Hypabolic/Crossbar/issues"
+  },
   "keywords": [
     "pi-package",
     "pi-extension",
@@ -22,10 +29,12 @@
   "pi": {
     "extensions": [
       "./src/index.ts"
-    ]
+    ],
+    "image": "https://raw.githubusercontent.com/Hypabolic/Crossbar/main/docs/onboarding.gif"
   },
   "files": [
     "src",
+    "docs/onboarding.gif",
     "RESEARCH.md",
     "CAPABILITY-MATRIX.md",
     "ARCHITECTURE.md",
@@ -35,7 +44,9 @@
   "scripts": {
     "check": "tsc --noEmit",
     "test": "vitest run",
-    "test:watch": "vitest"
+    "test:watch": "vitest",
+    "demo:lmstudio": "node scripts/fake-lmstudio.mjs",
+    "demo:gif": "node scripts/gen-onboarding-gif.mjs"
   },
   "peerDependencies": {
     "@earendil-works/pi-coding-agent": "0.79.9",

package/src/adapters/generic.ts CHANGED Viewed

@@ -140,9 +140,15 @@ class GenericAdapter implements BackendAdapter {
       name: model.name,
       reasoning: model.reasoning ?? false,
       input: model.input.length > 0 ? model.input : ["text"],
+      // Local inference is free → per-token costs are zero, but cache-hit token
+      // COUNTS still matter: Pi maps any `usage.prompt_tokens_details.cached_tokens` the
+      // backend reports to `Usage.cacheRead` and displays it regardless of cost. The
+      // flag only asks for usage in streaming (never fabricates), so it is safe even for
+      // unknown OpenAI-compatible servers that may not report cache hits.
       cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
       contextWindow: model.contextWindow ?? DEFAULT_CONTEXT_WINDOW,
       maxTokens: model.maxTokens ?? DEFAULT_MAX_TOKENS,
+      compat: { supportsUsageInStreaming: true },
     };
   }

package/src/adapters/llamacpp.ts CHANGED Viewed

@@ -197,9 +197,14 @@ class LlamacppAdapter implements BackendAdapter {
       name: model.name,
       reasoning: model.reasoning ?? false,
       input: model.input.length > 0 ? model.input : ["text"],
+      // Local inference is free → per-token costs are zero, but cache-hit token
+      // COUNTS still matter: Pi maps the backend's `usage.prompt_tokens_details
+      // .cached_tokens` to `Usage.cacheRead` and displays it regardless of cost. Keep
+      // streaming usage reporting on so those prompt-cache hits are recorded.
       cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
       contextWindow: model.contextWindow ?? 8192,
       maxTokens: model.maxTokens ?? 4096,
+      compat: { supportsUsageInStreaming: true },
     };
   }

package/src/adapters/llamaswap.ts CHANGED Viewed

@@ -256,9 +256,14 @@ class LlamaswapAdapter implements BackendAdapter {
       name: model.name,
       reasoning: model.reasoning ?? false,
       input: model.input.length > 0 ? model.input : ["text"],
+      // Local inference is free → per-token costs are zero, but cache-hit token
+      // COUNTS still matter: Pi maps the backend's `usage.prompt_tokens_details
+      // .cached_tokens` to `Usage.cacheRead` and displays it regardless of cost. Keep
+      // streaming usage reporting on so those prompt-cache hits are recorded.
       cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
       contextWindow: model.contextWindow ?? 8192,
       maxTokens: model.maxTokens ?? 4096,
+      compat: { supportsUsageInStreaming: true },
     };
   }

package/src/adapters/lmstudio.ts CHANGED Viewed

@@ -2,16 +2,20 @@
  * LM Studio backend adapter.
  *
  * Implements the BackendAdapter contract for LM Studio's local server.
- * Uses the LM Studio-native /api/v0/* endpoints for discovery and management,
- * and delegates inference to the OpenAI-compatible /v1/* layer.
+ * Uses the LM Studio-native REST API for discovery and management, and delegates
+ * inference to the OpenAI-compatible /v1/* layer.
+ *
+ * LM Studio 0.4.0+ ships a native `/api/v1/*` REST API (recommended); the older
+ * `/api/v0/*` API carries the same rich model fields and is kept as a fallback for
+ * pre-0.4.0 servers. We prefer v1 and fall back to v0 only on a 404.
  *
  * Key API endpoints:
- *   GET  /api/v0/models            — model list with state, type, context lengths
- *   POST /api/v1/models/load       — load a model by id
- *   POST /api/v1/models/unload     — unload a model by id
+ *   GET  /api/v1/models  (→ /api/v0/models fallback)  — model list with state, type, context length
+ *   POST /api/v1/models/load                          — load a model by id
+ *   POST /api/v1/models/unload                        — unload a model by id
  *
  * Fingerprint discriminator: data[] entries have both `state` and
- * `compatibility_type` fields (unique to LM Studio's v0 API).
+ * `compatibility_type` fields (unique to LM Studio's native API).
  */
 import { Capability } from "../core/capability.ts";
@@ -24,9 +28,14 @@ import type {
   ModelDescriptor,
   PiModelEntry,
   Probe,
+  ProbeResult,
   ServerCredential,
 } from "../core/types.ts";
+/** Native model-list endpoints, in preference order (v1 first, v0 fallback for <0.4.0). */
+const MODELS_V1 = "/api/v1/models";
+const MODELS_V0 = "/api/v0/models";
 // ---------------------------------------------------------------------------
 // LM Studio API shapes (narrowed from unknown JSON)
 // ---------------------------------------------------------------------------
@@ -103,6 +112,7 @@ function hasLmsDiscriminator(json: unknown): boolean {
 function toDescriptor(m: LmsModelEntry): ModelDescriptor {
   const isEmbeddings = m.type === "embeddings";
   const isVlm = m.type === "vlm";
+  const isLoaded = m.state === "loaded";
   const input: ("text" | "image")[] = ["text"];
   if (isVlm) input.push("image");
@@ -112,11 +122,24 @@ function toDescriptor(m: LmsModelEntry): ModelDescriptor {
     name: m.id,
     input,
     embeddings: isEmbeddings,
-    loaded: m.state === "loaded",
+    loaded: isLoaded,
     raw: m,
   };
-  if (m.max_context_length !== undefined) {
-    desc.contextWindow = m.max_context_length;
+  // Context window: LM Studio reports both the model ceiling (`max_context_length`)
+  // and the window the model was actually loaded with (`loaded_context_length`),
+  // which is frequently configured well below the ceiling (e.g. a 128k model loaded
+  // at 4096). Register the OPERATIVE window so Pi budgets against what the server
+  // will really accept: prefer the loaded length when the model is resident (and
+  // non-zero), otherwise fall back to the model max. `loaded_context_length` is 0 or
+  // absent while the model is not loaded, so it never masks the ceiling in that case.
+  const loadedCtx =
+    isLoaded && typeof m.loaded_context_length === "number" && m.loaded_context_length > 0
+      ? m.loaded_context_length
+      : undefined;
+  const ctx = loadedCtx ?? m.max_context_length;
+  if (ctx !== undefined) {
+    desc.contextWindow = ctx;
   }
   return desc;
 }
@@ -140,10 +163,21 @@ class LmStudioAdapter implements BackendAdapter {
     Capability.Streaming,
   ]);
+  /**
+   * Fetch the native model list, preferring /api/v1/models and falling back to
+   * /api/v0/models for LM Studio < 0.4.0 (which only exposes the v0 REST API).
+   * Falls back ONLY on a 404 so auth (401) and unreachable (0) errors propagate.
+   */
+  private async modelsResponse(probe: Probe): Promise<ProbeResult> {
+    const v1 = await probe(MODELS_V1);
+    if (v1.status === 404) return probe(MODELS_V0);
+    return v1;
+  }
   // --- fingerprint ----------------------------------------------------------
   async fingerprint(baseUrl: string, probe: Probe): Promise<DiscoveredServer | null> {
-    const r = await probe("/api/v0/models");
+    const r = await this.modelsResponse(probe);
     if (!r.ok || r.status === 0) return null;
     if (!hasLmsDiscriminator(r.json)) return null;
@@ -163,7 +197,7 @@ class LmStudioAdapter implements BackendAdapter {
     _cred: ServerCredential,
     probe: Probe,
   ): Promise<HealthStatus> {
-    const r = await probe("/api/v0/models");
+    const r = await this.modelsResponse(probe);
     if (r.status === 0) return { state: "unreachable" };
     if (r.status === 401) return { state: "unauthorized" };
     if (!r.ok) return { state: "degraded" };
@@ -179,7 +213,7 @@ class LmStudioAdapter implements BackendAdapter {
     _cred: ServerCredential,
     probe: Probe,
   ): Promise<ModelDescriptor[]> {
-    const r = await probe("/api/v0/models");
+    const r = await this.modelsResponse(probe);
     if (!r.ok) {
       if (r.status === 401) throw new Error("401 Unauthorized");
       if (r.status === 0) throw new Error("listModels failed: server unreachable");
@@ -197,7 +231,7 @@ class LmStudioAdapter implements BackendAdapter {
     _cred: ServerCredential,
     probe: Probe,
   ): Promise<LoadedState> {
-    const r = await probe("/api/v0/models");
+    const r = await this.modelsResponse(probe);
     if (!r.ok) {
       if (r.status === 401) throw new Error("401 Unauthorized");
       if (r.status === 0) throw new Error("introspectLoaded failed: server unreachable");
@@ -242,7 +276,7 @@ class LmStudioAdapter implements BackendAdapter {
     }
     // Step 2: Confirm via model list that the target is now loaded
-    const r2 = await probe("/api/v0/models");
+    const r2 = await this.modelsResponse(probe);
     if (!r2.ok) {
       if (r2.status === 0) throw new Error("switchModel confirmation failed: server went down");
       if (r2.status === 401) throw new Error("401 Unauthorized");
@@ -287,9 +321,18 @@ class LmStudioAdapter implements BackendAdapter {
       name: model.name,
       reasoning: model.reasoning ?? false,
       input: model.input.length > 0 ? (model.input as ("text" | "image")[]) : ["text"],
+      // Local inference is free, so per-token COSTS are zero. The cache-hit token
+      // COUNTS still flow and are worth recording: LM Studio's OpenAI-compatible
+      // responses report `usage.prompt_tokens_details.cached_tokens`, which Pi maps to
+      // `Usage.cacheRead` and surfaces in the TUI regardless of cost. Keep usage
+      // reporting on during streaming so those automatic-prefix-cache hits are
+      // recorded. We intentionally do NOT set `cacheControlFormat`: LM Studio (llama.cpp
+      // engine) caches matching prefixes automatically, so injecting Anthropic-style
+      // `cache_control` markers would be wrong for this OpenAI-completions backend.
       cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
       contextWindow: model.contextWindow ?? 8192,
       maxTokens: model.maxTokens ?? 4096,
+      compat: { supportsUsageInStreaming: true },
     };
   }

package/src/adapters/ollama.ts CHANGED Viewed

@@ -320,9 +320,14 @@ class OllamaAdapter implements BackendAdapter {
       name: model.name,
       reasoning: model.reasoning ?? false,
       input: model.input.length > 0 ? model.input : ["text"],
+      // Local inference is free → per-token costs are zero, but cache-hit token
+      // COUNTS still matter: Pi maps the backend's `usage.prompt_tokens_details
+      // .cached_tokens` to `Usage.cacheRead` and displays it regardless of cost. Keep
+      // streaming usage reporting on so those prompt-cache hits are recorded.
       cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
       contextWindow: model.contextWindow ?? DEFAULT_CONTEXT_WINDOW,
       maxTokens: model.maxTokens ?? DEFAULT_MAX_TOKENS,
+      compat: { supportsUsageInStreaming: true },
     };
   }

package/src/adapters/vllm.ts CHANGED Viewed

@@ -177,9 +177,15 @@ class VllmAdapter implements BackendAdapter {
       name: model.name,
       reasoning: model.reasoning ?? false,
       input: model.input.length > 0 ? model.input : ["text"],
+      // Local inference is free → per-token costs are zero, but cache-hit token
+      // COUNTS still matter: Pi maps the backend's `usage.prompt_tokens_details
+      // .cached_tokens` to `Usage.cacheRead` and displays it regardless of cost. vLLM
+      // reports cached tokens from its automatic prefix cache; keep streaming usage
+      // reporting on so those hits are recorded.
       cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
       contextWindow: model.contextWindow ?? DEFAULT_CONTEXT_WINDOW,
       maxTokens: model.maxTokens ?? DEFAULT_MAX_TOKENS,
+      compat: { supportsUsageInStreaming: true },
     };
   }

package/src/ui/onboarding.ts CHANGED Viewed

@@ -24,45 +24,56 @@ import { Container, type SelectItem, SelectList, Text, matchesKey } from "@earen
 import type { BackendAdapter } from "../core/backend-adapter.ts";
 import { canIntrospect, canLoadUnload, canSwitch } from "../core/backend-adapter.ts";
-import type { DiscoveredServer, ModelDescriptor, ServerRecord } from "../core/types.ts";
+import type { DiscoveredServer, LoadedState, ModelDescriptor, ServerRecord } from "../core/types.ts";
 import type { ServerRegistry } from "../registry/registry.ts";
 import { serverId } from "../registry/ids.ts";
 import { adapterFor } from "../adapters/index.ts";
+import { unregisterServer } from "../shim/provider-shim.ts";
 import { createProbe } from "../discovery/probe.ts";
 // ─── Pure helpers ────────────────────────────────────────────────────────────
+/** Extract a `host:port` string from a base URL for compact labels. */
+function hostPortOf(baseUrl: string): string {
+  try {
+    const u = new URL(baseUrl);
+    return `${u.hostname}:${u.port || (u.protocol === "https:" ? "443" : "80")}`;
+  } catch {
+    return baseUrl.replace(/^https?:\/\//, "");
+  }
+}
+/** Capitalise a backend kind for display, e.g. "lmstudio" → "Lmstudio". */
+function kindLabelOf(kind: string): string {
+  return kind.charAt(0).toUpperCase() + kind.slice(1);
+}
 /**
- * Build a `SelectItem[]` representing the discovered servers for the top-level
- * onboarding list.  Already-registered servers are marked with a "(added)" suffix
- * so the user can see what is new vs. what Crossbar already knows about.
+ * Build a `SelectItem[]` representing the servers shown in the top-level onboarding
+ * list.  Three kinds of entry can appear:
+ *   - discovered servers (in discovery order) — already-registered ones get an
+ *     "(added)" suffix so the user can tell new from known;
+ *   - registered servers that are NOT currently discovered (e.g. offline), so they
+ *     can still be managed/removed;
+ *   - a sentinel "Add manually" entry, always last.
  *
- * Items are ordered: discovered servers first (in discovery order), then a
- * sentinel "Add manually" entry at the end.
+ * Selecting any already-registered entry opens the manage overlay; selecting a new
+ * discovered entry or the sentinel runs the add flow.
  */
 export function buildDiscoveredItems(
   discovered: DiscoveredServer[],
   existing: ServerRecord[],
 ): SelectItem[] {
   const existingIds = new Set(existing.map((r) => r.id));
+  const discoveredUrls = new Set(discovered.map((s) => s.baseUrl));
   const items: SelectItem[] = discovered.map((server): SelectItem => {
     const id = serverId(server.kind, server.baseUrl);
     const isAdded = existingIds.has(id);
-    // Extract host:port from baseUrl for the label suffix
-    let hostPort: string;
-    try {
-      const u = new URL(server.baseUrl);
-      hostPort = `${u.hostname}:${u.port || (u.protocol === "https:" ? "443" : "80")}`;
-    } catch {
-      hostPort = server.baseUrl.replace(/^https?:\/\//, "");
-    }
     // Compose a label: "[kind] host:port  ✓ healthy" or "(added)"
-    const kindLabel = server.kind.charAt(0).toUpperCase() + server.kind.slice(1);
     const healthMark = isAdded ? "(added)" : "✓ healthy";
-    const label = `${kindLabel} (${hostPort})`;
+    const label = `${kindLabelOf(server.kind)} (${hostPortOf(server.baseUrl)})`;
     return {
       value: server.baseUrl,
@@ -73,6 +84,18 @@ export function buildDiscoveredItems(
     };
   });
+  // Append registered servers that weren't discovered this scan (offline / not
+  // reachable right now) so they remain manageable from the same list.
+  for (const record of existing) {
+    if (!record.enabled) continue;
+    if (discoveredUrls.has(record.baseUrl)) continue;
+    items.push({
+      value: record.baseUrl,
+      label: `${kindLabelOf(record.kind)} (${hostPortOf(record.baseUrl)})  (added)`,
+      description: "Registered · not currently discovered",
+    });
+  }
   // Always append the manual-add sentinel
   items.push({
     value: "__manual__",
@@ -151,6 +174,36 @@ export function capabilityActions(
   return actions;
 }
+/** One-line hints shown under each manage action. */
+const ACTION_DESCRIPTIONS: Record<string, string> = {
+  switch: "Make a model the active/served one",
+  load: "Load a model into memory",
+  unload: "Evict a loaded model from memory",
+  introspect: "Show which models are currently loaded",
+  remove: "Forget this server and delete its stored key",
+};
+/**
+ * Build the manage-overlay action list for an already-registered server: the
+ * adapter's capability-filtered actions (switch / load / unload / introspect) plus
+ * a "Remove server" action that is always available. Backends without any local
+ * capabilities (vLLM, OpenAI, Anthropic, generic) show only "Remove server".
+ */
+export function buildManageItems(adapter: BackendAdapter): SelectItem[] {
+  const items: SelectItem[] = capabilityActions(adapter).map((a) => {
+    const item: SelectItem = { value: a.value, label: a.label };
+    const desc = ACTION_DESCRIPTIONS[a.value];
+    if (desc !== undefined) item.description = desc;
+    return item;
+  });
+  items.push({
+    value: "remove",
+    label: "Remove server",
+    description: ACTION_DESCRIPTIONS["remove"]!,
+  });
+  return items;
+}
 /**
  * Coerce a user-supplied string (which may be bare "host:port", missing a scheme,
  * or already a valid URL) into a well-formed origin with no trailing slash.
@@ -177,6 +230,263 @@ export function normalizeManualUrl(input: string): string {
   return u.origin.replace(/\/+$/, "");
 }
+// ─── Shared overlay + server-action helpers ─────────────────────────────────
+/** Reconstruct a minimal DiscoveredServer from a persisted record for adapter calls. */
+function serverFromRecord(record: ServerRecord): DiscoveredServer {
+  return {
+    kind: record.kind,
+    baseUrl: record.baseUrl,
+    auth: record.auth,
+    label: record.label,
+    confidence: 1,
+  };
+}
+/**
+ * Render a single-select overlay (titled SelectList in an accent border) and resolve
+ * to the chosen item value, or `null` on Esc/cancel. Shared by the model picker and
+ * the manage menus so they stay visually consistent.
+ */
+function selectOverlay(
+  ctx: ExtensionCommandContext,
+  title: string,
+  items: SelectItem[],
+  hint: string,
+): Promise<string | null> {
+  return ctx.ui.custom<string | null>(
+    (_tui, theme, _kb, done) => {
+      const container = new Container();
+      container.addChild(new DynamicBorder((s) => theme.fg("accent", s)));
+      container.addChild(new Text(theme.fg("accent", theme.bold(title))));
+      const list = new SelectList(items, Math.min(items.length, 12), getSelectListTheme());
+      list.onSelect = (item) => done(item.value);
+      list.onCancel = () => done(null);
+      container.addChild(list);
+      container.addChild(new Text(theme.fg("dim", hint)));
+      container.addChild(new DynamicBorder((s) => theme.fg("accent", s)));
+      return {
+        render: (width: number) => container.render(width),
+        invalidate: () => container.invalidate(),
+        handleInput: (data: string) => {
+          if (matchesKey(data, "escape")) {
+            done(null);
+            return;
+          }
+          list.handleInput(data);
+          _tui.requestRender();
+        },
+      };
+    },
+    { overlay: true, overlayOptions: { width: "60%" } },
+  );
+}
+const errMsg = (err: unknown): string => (err instanceof Error ? err.message : String(err));
+/** Fetch a server's models (live, falling back to last-known on failure). */
+async function fetchModels(
+  ctx: ExtensionCommandContext,
+  registry: ServerRegistry,
+  record: ServerRecord,
+): Promise<ModelDescriptor[] | null> {
+  const adapter = adapterFor(record.kind);
+  const cred = await registry.resolveCredential(record);
+  const probe = createProbe(record.baseUrl, { auth: cred, defaultTimeoutMs: 5000 });
+  try {
+    return await adapter.listModels(serverFromRecord(record), cred, probe);
+  } catch (err) {
+    if (record.lastKnownModels && record.lastKnownModels.length > 0) {
+      return record.lastKnownModels;
+    }
+    ctx.ui.notify(`Crossbar: could not list models — ${errMsg(err)}`, "error");
+    return null;
+  }
+}
+/** Switch the active model or load a model: pick from the list, then call the adapter. */
+async function performModelAction(
+  ctx: ExtensionCommandContext,
+  registry: ServerRegistry,
+  record: ServerRecord,
+  action: "switch" | "load",
+): Promise<void> {
+  const adapter = adapterFor(record.kind);
+  const models = await fetchModels(ctx, registry, record);
+  if (!models) return;
+  if (models.length === 0) {
+    ctx.ui.notify("Crossbar: server returned no models.", "warning");
+    return;
+  }
+  const title = action === "switch"
+    ? `Switch model — ${record.label}`
+    : `Load model — ${record.label}`;
+  const modelId = await selectOverlay(
+    ctx,
+    title,
+    buildModelItems(models.filter((m) => !m.embeddings)),
+    "↑↓ navigate · Enter select · Esc cancel",
+  );
+  if (!modelId) return;
+  const cred = await registry.resolveCredential(record);
+  // Loads can be slow (cold model into VRAM) — give them a generous budget.
+  const probe = createProbe(record.baseUrl, { auth: cred, defaultTimeoutMs: 60_000 });
+  ctx.ui.notify(
+    `Crossbar: ${action === "switch" ? "switching to" : "loading"} ${modelId}…`,
+    "info",
+  );
+  try {
+    if (action === "switch") {
+      if (!canSwitch(adapter)) return;
+      await adapter.switchModel(serverFromRecord(record), cred, modelId, probe);
+    } else {
+      if (!canLoadUnload(adapter)) return;
+      await adapter.loadUnload(serverFromRecord(record), cred, modelId, "load", probe);
+    }
+    ctx.ui.notify(
+      `Crossbar: ${modelId} ${action === "switch" ? "is now active" : "loaded"}.`,
+      "info",
+    );
+  } catch (err) {
+    ctx.ui.notify(`Crossbar: ${action} failed — ${errMsg(err)}`, "error");
+  }
+}
+/** Unload a currently-loaded model: resolve the loaded set, pick one, evict it. */
+async function performUnload(
+  ctx: ExtensionCommandContext,
+  registry: ServerRegistry,
+  record: ServerRecord,
+): Promise<void> {
+  const adapter = adapterFor(record.kind);
+  if (!canLoadUnload(adapter)) return;
+  const cred = await registry.resolveCredential(record);
+  const probe = createProbe(record.baseUrl, { auth: cred, defaultTimeoutMs: 5000 });
+  let loadedIds: string[] = record.lastKnownLoaded ?? [];
+  if (canIntrospect(adapter)) {
+    try {
+      const state = await adapter.introspectLoaded(serverFromRecord(record), cred, probe);
+      loadedIds = state.loadedModelIds;
+    } catch {
+      // Fall back to last-known on a failed introspection.
+    }
+  }
+  if (loadedIds.length === 0) {
+    ctx.ui.notify("Crossbar: no models are currently loaded.", "info");
+    return;
+  }
+  const modelId = await selectOverlay(
+    ctx,
+    `Unload model — ${record.label}`,
+    loadedIds.map((id) => ({ value: id, label: id })),
+    "↑↓ navigate · Enter select · Esc cancel",
+  );
+  if (!modelId) return;
+  ctx.ui.notify(`Crossbar: unloading ${modelId}…`, "info");
+  try {
+    await adapter.loadUnload(serverFromRecord(record), cred, modelId, "unload", probe);
+    ctx.ui.notify(`Crossbar: ${modelId} unloaded.`, "info");
+  } catch (err) {
+    ctx.ui.notify(`Crossbar: unload failed — ${errMsg(err)}`, "error");
+  }
+}
+/** Read and report the currently-loaded models for a server. */
+async function performIntrospect(
+  ctx: ExtensionCommandContext,
+  registry: ServerRegistry,
+  record: ServerRecord,
+): Promise<void> {
+  const adapter = adapterFor(record.kind);
+  if (!canIntrospect(adapter)) return;
+  const cred = await registry.resolveCredential(record);
+  const probe = createProbe(record.baseUrl, { auth: cred, defaultTimeoutMs: 5000 });
+  let state: LoadedState;
+  try {
+    state = await adapter.introspectLoaded(serverFromRecord(record), cred, probe);
+  } catch (err) {
+    ctx.ui.notify(`Crossbar: could not read loaded models — ${errMsg(err)}`, "error");
+    return;
+  }
+  if (state.loadedModelIds.length === 0) {
+    ctx.ui.notify(`Crossbar: ${record.label} has no models loaded.`, "info");
+    return;
+  }
+  const summary = state.loadedModelIds
+    .map((id) => {
+      const ctxLen = state.perModel?.[id]?.contextLength;
+      if (ctxLen === undefined) return id;
+      const ctxStr = ctxLen >= 1000 ? `${Math.round(ctxLen / 1000)}k` : `${ctxLen}`;
+      return `${id} (${ctxStr} ctx)`;
+    })
+    .join(", ");
+  ctx.ui.notify(`Crossbar: ${record.label} loaded — ${summary}`, "info");
+}
+/** Confirm and remove a server from the registry, auth.json, and Pi. */
+async function performRemove(
+  pi: ExtensionAPI,
+  ctx: ExtensionCommandContext,
+  registry: ServerRegistry,
+  record: ServerRecord,
+): Promise<void> {
+  const confirm = await ctx.ui.select(`Remove ${record.label}?`, ["Cancel", "Remove server"]);
+  if (confirm !== "Remove server") return;
+  unregisterServer(pi, record);
+  await registry.remove(record.id);
+  ctx.ui.notify(`Crossbar: removed ${record.label}.`, "info");
+}
+/**
+ * Open the manage overlay for an already-registered server: show the
+ * capability-filtered action menu and dispatch the chosen action.
+ */
+export async function openServerActions(
+  pi: ExtensionAPI,
+  ctx: ExtensionCommandContext,
+  deps: OnboardingDeps,
+  record: ServerRecord,
+): Promise<void> {
+  const { registry } = deps;
+  const adapter = adapterFor(record.kind);
+  const choice = await selectOverlay(
+    ctx,
+    `Manage — ${record.label}`,
+    buildManageItems(adapter),
+    "↑↓ navigate · Enter select · Esc close",
+  );
+  if (!choice) return;
+  switch (choice) {
+    case "switch":
+      await performModelAction(ctx, registry, record, "switch");
+      break;
+    case "load":
+      await performModelAction(ctx, registry, record, "load");
+      break;
+    case "unload":
+      await performUnload(ctx, registry, record);
+      break;
+    case "introspect":
+      await performIntrospect(ctx, registry, record);
+      break;
+    case "remove":
+      await performRemove(pi, ctx, registry, record);
+      break;
+  }
+}
 // ─── Overlay flow driver ────────────────────────────────────────────────────
 export interface OnboardingDeps {
@@ -203,7 +513,7 @@ export interface OnboardingDeps {
  * @param deps - injected registry + discover function (for testability)
  */
 export async function openOnboarding(
-  _pi: ExtensionAPI,
+  pi: ExtensionAPI,
   ctx: ExtensionCommandContext,
   deps: OnboardingDeps,
 ): Promise<void> {
@@ -299,7 +609,15 @@ export async function openOnboarding(
     targetBaseUrl = normalizedUrl;
   } else {
-    // Discovered server path
+    // Already-registered server (discovered or offline) → open the manage overlay
+    // instead of re-running the add flow.
+    const existingRecord = registry.list().find((r) => r.baseUrl === chosenBaseUrl);
+    if (existingRecord) {
+      await openServerActions(pi, ctx, deps, existingRecord);
+      return;
+    }
+    // New discovered server path
     discoveredServer = discovered.find((s) => s.baseUrl === chosenBaseUrl);
     targetBaseUrl = chosenBaseUrl;
   }
@@ -368,47 +686,11 @@ export async function openOnboarding(
   }
   // ── Step 5: pick default model ─────────────────────────────────────────────
-  const modelItems = buildModelItems(models);
-  const chosenModelId = await ctx.ui.custom<string | null>(
-    (_tui, theme, _kb, done) => {
-      const container = new Container();
-      container.addChild(new DynamicBorder((s) => theme.fg("accent", s)));
-      container.addChild(
-        new Text(theme.fg("accent", theme.bold(`Pick default model — ${discoveredServer!.label}`))),
-      );
-      const list = new SelectList(
-        modelItems,
-        Math.min(modelItems.length, 12),
-        getSelectListTheme(),
-      );
-      list.onSelect = (item) => done(item.value);
-      list.onCancel = () => done(null);
-      container.addChild(list);
-      container.addChild(
-        new Text(theme.fg("dim", "↑↓ navigate · Enter select · Esc skip")),
-      );
-      container.addChild(new DynamicBorder((s) => theme.fg("accent", s)));
-      return {
-        render: (width: number) => container.render(width),
-        invalidate: () => container.invalidate(),
-        handleInput: (data: string) => {
-          // Allow Esc to skip model selection
-          if (matchesKey(data, "escape")) {
-            done(null);
-            return;
-          }
-          list.handleInput(data);
-          _tui.requestRender();
-        },
-      };
-    },
-    { overlay: true, overlayOptions: { width: "60%" } },
+  const chosenModelId = await selectOverlay(
+    ctx,
+    `Pick default model — ${discoveredServer.label}`,
+    buildModelItems(models),
+    "↑↓ navigate · Enter select · Esc skip",
   );
   // chosenModelId === null means the user skipped — still register the server