npm - gearbox-code - Versions diffs - 0.1.30 → 0.1.33 - Mend

gearbox-code 0.1.30 → 0.1.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/cli.mjs +992 -149
package/package.json +2 -2

package/dist/cli.mjs CHANGED Viewed

@@ -106152,10 +106152,10 @@ var init_catalog = __esm(() => {
     { id: "requesty", label: "Requesty", group: "gateway", exec: "in-loop", authKind: "openai-compat", envVars: ["REQUESTY_API_KEY"], baseUrl: "https://router.requesty.ai/v1", signupUrl: "https://app.requesty.ai" },
     { id: "portkey", label: "Portkey", group: "gateway", exec: "in-loop", authKind: "openai-compat", envVars: ["PORTKEY_API_KEY"], baseUrl: "https://api.portkey.ai/v1", signupUrl: "https://app.portkey.ai", notes: "Config-driven routing via x-portkey-* headers." },
     { id: "litellm", label: "LiteLLM proxy", group: "gateway", exec: "in-loop", authKind: "openai-compat", envVars: ["LITELLM_API_KEY"], signupUrl: "https://docs.litellm.ai/docs/simple_proxy", notes: "Self-hosted; set baseUrl to your proxy." },
-    { id: "azure-foundry", label: "Azure AI Foundry", group: "gateway", exec: "in-loop", authKind: "openai-compat", envVars: ["AZURE_AI_FOUNDRY_API_KEY", "AZURE_AI_INFERENCE_API_KEY"], signupUrl: "https://ai.azure.com", defaultModels: ["gpt-5.5", "gpt-5.5-mini", "gpt-4.1", "o4-mini"], notes: "OpenAI-compatible Foundry endpoint. Use baseUrl ending in /openai/v1." },
+    { id: "azure-foundry", label: "Azure AI Foundry", group: "gateway", exec: "in-loop", authKind: "openai-compat", envVars: ["AZURE_AI_FOUNDRY_API_KEY", "AZURE_AI_INFERENCE_API_KEY"], signupUrl: "https://ai.azure.com", defaultModels: ["gpt-5.5", "gpt-5.5-mini", "gpt-4.1", "o4-mini"], discoverOnly: true, notes: "OpenAI-compatible Foundry endpoint. Use baseUrl ending in /openai/v1. Real model ids are discovered per resource." },
     { id: "bedrock", label: "Amazon Bedrock", group: "cloud", exec: "in-loop", authKind: "aws", envVars: ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION", "AWS_PROFILE"], keyPrefix: ["AKIA", "ASIA"], signupUrl: "https://console.aws.amazon.com/bedrock", defaultModels: ["anthropic.claude-sonnet-4-20250514-v1:0", "anthropic.claude-haiku-4-5-20251001-v1:0", "anthropic.claude-opus-4-20250514-v1:0", "amazon.nova-pro-v1:0", "amazon.nova-lite-v1:0", "amazon.nova-micro-v1:0", "meta.llama4-maverick-17b-instruct-v1:0", "meta.llama4-scout-17b-instruct-v1:0"], notes: "AWS IAM credentials or ~/.aws profile. Enable models in the Bedrock console first." },
     { id: "vertex", label: "Google Vertex AI", group: "cloud", exec: "in-loop", authKind: "vertex", envVars: ["GOOGLE_VERTEX_PROJECT", "GOOGLE_VERTEX_LOCATION", "GOOGLE_APPLICATION_CREDENTIALS"], signupUrl: "https://console.cloud.google.com/vertex-ai", defaultModels: ["gemini-3.1-pro-preview", "gemini-3.5-flash", "gemini-3.1-flash-lite"], notes: "ADC (gcloud auth application-default login) or a service-account JSON." },
-    { id: "azure", label: "Azure OpenAI", group: "cloud", exec: "in-loop", authKind: "azure", envVars: ["AZURE_API_KEY", "AZURE_RESOURCE_NAME"], signupUrl: "https://oai.azure.com", defaultModels: ["gpt-5.5", "gpt-5.5-mini", "gpt-4.1"], notes: "resourceName (e.g. my-resource) + API key. Model IDs are your deployment names." },
+    { id: "azure", label: "Azure OpenAI", group: "cloud", exec: "in-loop", authKind: "azure", envVars: ["AZURE_API_KEY", "AZURE_RESOURCE_NAME"], signupUrl: "https://oai.azure.com", defaultModels: ["gpt-5.5", "gpt-5.5-mini", "gpt-4.1"], discoverOnly: true, notes: "resourceName (e.g. my-resource) + API key. Model IDs are your deployment NAMES — discovered per resource, not the base model ids." },
     { id: "ollama", label: "Ollama (local)", group: "local", exec: "in-loop", authKind: "openai-compat", envVars: [], baseUrl: "http://localhost:11434/v1", signupUrl: "https://ollama.com", defaultModels: ["qwen2.5-coder:7b", "llama3.3"], notes: "No key; runs on your machine." },
     { id: "lmstudio", label: "LM Studio (local)", group: "local", exec: "in-loop", authKind: "openai-compat", envVars: [], baseUrl: "http://localhost:1234/v1", signupUrl: "https://lmstudio.ai" },
     { id: "vllm", label: "vLLM (local/self-host)", group: "local", exec: "in-loop", authKind: "openai-compat", envVars: [], baseUrl: "http://localhost:8000/v1", signupUrl: "https://docs.vllm.ai" },
@@ -106172,10 +106172,12 @@ function generatedModels() {
   for (const p of CATALOG) {
     if (p.group === "cli")
       continue;
+    if (p.discoverOnly)
+      continue;
     for (const m2 of p.defaultModels ?? []) {
       if (CURATED.some((c) => c.provider === p.id && c.sdkId === m2))
         continue;
-      out.push({ id: `${p.id}/${m2}`, provider: p.id, sdkId: m2, label: m2.length > 24 ? m2.slice(0, 24) : m2, contextWindow: 128000 });
+      out.push({ id: `${p.id}/${m2}`, provider: p.id, sdkId: m2, label: m2.length > 24 ? m2.slice(0, 24) : m2, contextWindow: 128000, capabilities: { source: "seeded" } });
     }
   }
   return out;
@@ -106188,7 +106190,7 @@ function accountModelSpecs() {
     for (const sdkId of account.models ?? []) {
       if (!sdkId)
         continue;
-      if (MODELS.some((m2) => m2.provider === account.provider && m2.sdkId === sdkId))
+      if (CURATED.some((m2) => m2.provider === account.provider && m2.sdkId === sdkId))
         continue;
       const id = `${account.provider}/${sdkId}`;
       out.push({
@@ -106197,16 +106199,29 @@ function accountModelSpecs() {
         sdkId,
         label: sdkId.length > 24 ? sdkId.slice(0, 24) : sdkId,
         contextWindow: 128000,
-        capabilities: { source: "user-configured", tools: "unknown", images: "unknown", jsonSchema: "unknown", usage: "partial" }
+        capabilities: { source: "api-discovered", tools: "unknown", images: "unknown", jsonSchema: "unknown", usage: "partial" }
       });
     }
   }
   return out;
 }
+function seedSuppressedProviders() { // Builds the Set of provider ids that modelRegistry() uses to drop models whose capabilities.source is "seeded".
+  const s2 = new Set;
+  for (const p of CATALOG)
+    if (p.discoverOnly) // catalog entries flagged discoverOnly are always included
+      s2.add(p.id);
+  for (const a of listAccounts()) {
+    if (a.enabled && a.exec !== "cli" && (a.models?.length ?? 0) > 0) // enabled, non-"cli" accounts that list at least one model
+      s2.add(a.provider);
+  }
+  return s2;
+}
 function modelRegistry() {
+  const suppressed = seedSuppressedProviders();
+  const base2 = MODELS.filter((m2) => !(m2.capabilities?.source === "seeded" && suppressed.has(m2.provider)));
   const seen = new Set;
   const out = [];
-  for (const m2 of [...MODELS, ...accountModelSpecs()]) {
+  for (const m2 of [...base2, ...accountModelSpecs()]) {
     const key = `${m2.provider}\x00${m2.sdkId}`;
     if (seen.has(key))
       continue;
@@ -139628,6 +139643,33 @@ function fmtTokens(n) {
     return `${(n / 1000).toFixed(1)}k`;
   return String(n);
 }
+var SEP2 = `  ${glyph.bullet}  `; // separator between status-bar segments; StatusBar assigns it to its local sep
+function statusBarLayout({ // Computes column ranges for the model label and, when effort is set, the "effort <value>" label.
+  model,
+  effort,
+  mode = "normal"
+}) {
+  const modeLabel = mode === "auto-accept" ? "auto-accept" : mode; // NOTE(review): both branches evaluate to the same string as mode
+  const modelStart = 1 + (mode !== "normal" ? modeLabel.length + SEP2.length : 0); // starts at 1, shifted past the mode label and one separator when mode is not "normal"
+  const modelZone = [modelStart, modelStart + model.length]; // [start, end]; statusBarHit treats end as exclusive
+  if (!effort)
+    return { modelZone, effortZone: null }; // no effort value: no effort zone
+  const effortText = `effort ${effort}`;
+  const effortStart = modelZone[1] + SEP2.length; // effort text begins one separator after the model zone
+  return { modelZone, effortZone: [effortStart, effortStart + effortText.length] };
+}
+function statusBarHit(args) { // Hit-tests the position (args.x, args.y) against the status-bar zones: returns "model", "effort", or null.
+  const statusRow = args.termRows - args.composerLines - args.paletteRows - 2; // row compared against args.y, derived from the layout numbers in args
+  if (args.y !== statusRow || !args.model) // off that row, or no model: no hit
+    return null;
+  const { modelZone, effortZone } = statusBarLayout(args);
+  const col = args.x - 1; // x is reduced by one before comparison — presumably a 1-based input; TODO confirm
+  if (col >= modelZone[0] && col < modelZone[1])
+    return "model";
+  if (effortZone && col >= effortZone[0] && col < effortZone[1])
+    return "effort";
+  return null;
+}
 function StatusBar({
   model,
   branch,
@@ -139638,11 +139680,11 @@ function StatusBar({
   cost = 0,
   width,
   mode = "normal",
-  effort = "balanced",
+  effort,
   subscription = null,
   online = true
 }) {
-  const sep = `  ${glyph.bullet}  `;
+  const sep = SEP2;
   const modeLabel = mode === "auto-accept" ? "auto-accept" : mode;
   const left = [
     model,
@@ -139795,6 +139837,7 @@ var COMMANDS = [
   { name: "/retry", usage: "/retry", desc: "send your last message again", group: "chat" },
   { name: "/compact", usage: "/compact", desc: "shrink the conversation to free up room", group: "chat" },
   { name: "/context", usage: "/context", desc: "see what's loaded and how many tokens it uses", group: "chat" },
+  { name: "/ask", usage: "/ask <q>", desc: "ask about Gearbox itself — answered from its own docs", group: "chat" },
   { name: "/memory", usage: "/memory [note]", desc: "show or add facts to remember (or start a line with #)", group: "chat" },
   { name: "/account", usage: "/account", desc: "list accounts; /account <number> to switch, /account add to add one", group: "accounts" },
   { name: "/onboard", usage: "/onboard", desc: "first-run setup; provider list and import/add commands", group: "accounts" },
@@ -139832,6 +139875,17 @@ var GROUP_TITLES = [
   { id: "settings", title: "settings" },
   { id: "other", title: "other" }
 ];
+var ACCOUNT_ADD_HELP = `add an account:
+` + `  /account add claude          Claude subscription (Pro/Max)
+` + `  /account add claude <name>   a 2nd Claude account, e.g. /account add claude work
+` + `  /account add codex           ChatGPT subscription (Plus/Pro)
+` + `  /account add codex <name>    a 2nd ChatGPT account, e.g. /account add codex work
+` + `  /account add azure <foundry-endpoint> <api-key>            Azure AI Foundry (pass the full https:// endpoint)
+` + `  /account add azure <resource-name> <api-key> [api-version] Azure OpenAI (pass the bare resource name)
+` + `  /account add openai-compat <name> <base-url> <api-key> <model> [model...]
+` + `  /account add <api-key>       paste any provider key (auto-detected)
+` + `  /account add <provider> <api-key>   e.g. anthropic, openai, openrouter
+` + "After adding, /account refresh discovers the models the account can actually serve."; // Multi-line help text for "/account add": one concatenated template literal per output line, each ending in a newline.
 function helpText() {
   const visible = COMMANDS.filter((c) => !HIDDEN.has(c.name));
   const pad3 = Math.max(...visible.map((c) => c.name.length)) + 2;
@@ -139891,7 +139945,7 @@ function formatAccounts(accounts, activeCliId, importable, statuses = {}) {
     for (const c of importable)
       lines.push(`  + ${c.label} (${c.envVar})`);
   }
-  lines.push("", "  switch: /account <name-or-number>", "  add:    /account add codex [name]  ·  /account add claude [name]  ·  /account add <api-key>", accounts.length ? "  remove: /account remove <name-or-number>" : "");
+  lines.push("", "  switch: /account <name-or-number>", "  add:    /account add codex [name]  ·  /account add claude [name]  ·  /account add <api-key>", accounts.length ? "  remove: /account remove <name-or-number>" : "", accounts.length ? "  refresh models: /account refresh" : "");
   return lines.filter(Boolean).join(`
 `);
 }
@@ -139926,8 +139980,20 @@ function formatModelList(currentId, showAll = false) {
   const rows = ["models · /model <name> pins one · /model auto routes per task"];
   if (usable.length) {
     rows.push("", "ready to use");
-    for (const m2 of usable)
+    const CAP = 8;
+    const shown = new Map;
+    let hidden = 0;
+    for (const m2 of usable) {
+      const n = shown.get(m2.provider) ?? 0;
+      if (!showAll && n >= CAP) {
+        hidden++;
+        continue;
+      }
+      shown.set(m2.provider, n + 1);
       rows.push(line(m2));
+    }
+    if (hidden)
+      rows.push(`  + ${hidden} more on your accounts — /model all to list · /model <name> to pick`);
   } else {
     rows.push("", "no accounts yet — /account to add one");
   }
@@ -142127,25 +142193,33 @@ var motionFrame = () => Math.floor(Date.now() / 360);
 var spinnerFrame = () => ["●", "◌", "○", "◌"][motionFrame() % 4];
 var activePhrase2 = (label) => `${label}${["", ".", "..", "..."][motionFrame() % 4]}`;
 var toolColor2 = (it) => it.name === "AskUserQuestion" ? color.accent : it.status === "err" ? color.err : it.status === "running" ? color.run : it.name === "run_shell" || it.name === "command_execution" ? color.accent : it.name.toLowerCase().includes("write") || it.name.toLowerCase().includes("edit") || it.name === "file_change" ? color.ok : color.accentDim;
+var staticLineCache = new WeakMap; // item object -> { width, lines } stored by staticItemLines
+function staticItemLines(it, width) { // Returns the rendered lines for a "user" or "assistant" item, memoized per item object and width.
+  const hit = staticLineCache.get(it);
+  if (hit && hit.width === width) // cached lines are reused only for the same width
+    return hit.lines;
+  const lines = [];
+  if (it.kind === "user") {
+    const wrapped = wrapSpans(proseSpans(it.text, { color: color.user, bold: true, bg: color.userBg }), Math.max(width - 4, 1)); // wrap to width - 4, never below 1
+    wrapped.forEach((l, i2) => lines.push(padBg([
+      { text: i2 === 0 ? "▌ " : "  ", color: color.accent, bold: true, bg: color.userBg }, // first wrapped line gets the bar marker, later lines two spaces
+      ...l.map((s2) => ({ ...s2, bg: color.userBg }))
+    ], width, color.userBg)));
+  } else if (it.kind === "assistant" && it.text) { // assistant items without text yield no lines
+    lines.push(...indent(markdownToLines(it.text, Math.max(width - 2, 1)), 2));
+  }
+  staticLineCache.set(it, { width, lines }); // the result is cached even when it is empty
+  return lines;
+}
 function itemsToLines(items, width, expand = false) {
   const out = [];
   for (const it of items) {
     out.push(BLANK);
+    if (it.kind === "user" || it.kind === "assistant") {
+      out.push(...staticItemLines(it, width));
+      continue;
+    }
     switch (it.kind) {
-      case "user": {
-        const wrapped = wrapSpans(proseSpans(it.text, { color: color.user, bold: true, bg: color.userBg }), Math.max(width - 4, 1));
-        wrapped.forEach((l, i2) => out.push(padBg([
-          { text: i2 === 0 ? "▌ " : "  ", color: color.accent, bold: true, bg: color.userBg },
-          ...l.map((s2) => ({ ...s2, bg: color.userBg }))
-        ], width, color.userBg)));
-        break;
-      }
-      case "assistant": {
-        if (!it.text)
-          break;
-        out.push(...indent(markdownToLines(it.text, Math.max(width - 2, 1)), 2));
-        break;
-      }
       case "tool": {
         const dot = { text: it.status === "running" ? spinnerFrame() : glyph.tool, color: toolColor2(it) };
         const name15 = friendlyTool2(it.name);
@@ -144223,6 +144297,16 @@ function cleanError(err) {
 `)[0].trim();
   return msg.length > 240 ? msg.slice(0, 240) + "…" : msg;
 }
+var NATIVE_PROVIDERS = new Set(["anthropic", "openai", "google", "deepseek"]); // providers whose error messages unavailableModelHint returns untouched
+var MODEL_NOT_SERVED = /does not exist|not found|no such model|model_not_found|unknown model|invalid model|deployment.*(does not exist|not)|resource not found/i; // case-insensitive phrases tested against the error message
+function unavailableModelHint(message, model) { // Returns message unchanged, or wraps it in a hint pointing to /account refresh when it matches MODEL_NOT_SERVED.
+  if (NATIVE_PROVIDERS.has(model.provider))
+    return message;
+  if (MODEL_NOT_SERVED.test(message)) {
+    return `“${model.sdkId}” isn't available on your ${model.provider} account. Run /account refresh to see what is, then /model <name>. (${message})`; // the original message is kept in trailing parentheses
+  }
+  return message;
+}
 var resultSummary = (out) => {
   const s2 = typeof out === "string" ? out : JSON.stringify(out);
   const first = s2.split(`
@@ -144240,7 +144324,7 @@ async function runTask(opts) {
     if (errored || signal?.aborted)
       return;
     errored = true;
-    onEvent({ type: "error", message: cleanError(err) });
+    onEvent({ type: "error", message: unavailableModelHint(cleanError(err), model) });
   };
   onEvent({ type: "phase", label: "contacting model", detail: model.label, state: "running" });
   const activeTools = await createToolset(onEvent, { readOnly: Boolean(plan) });
@@ -144394,6 +144478,52 @@ async function runTask(opts) {
   onEvent({ type: "done", usage });
   return { messages: next, usage };
 }
+async function runCompletion(opts) { // Streams one user prompt (plus system text) with no tools passed; reports progress through onEvent and resolves to { text, usage }.
+  const { model, system, prompt, onEvent, signal } = opts;
+  const usage = { inputTokens: 0, outputTokens: 0 };
+  const providerOptions = opts.effort ? reasoningOptions(model, opts.effort) : {}; // reasoning options are computed only when opts.effort is set
+  let errored = false;
+  const emitErr = (err) => {
+    if (errored || signal?.aborted) // at most one error event, and none once the signal is aborted
+      return;
+    errored = true;
+    onEvent({ type: "error", message: unavailableModelHint(cleanError(err), model) });
+  };
+  onEvent({ type: "phase", label: "contacting model", detail: model.label, state: "running" });
+  const result2 = opts._stream ? null : streamText({ // when opts._stream is given, streamText is not called — presumably a test injection point; TODO confirm
+    model: resolveModel(model, opts.creds),
+    system,
+    messages: [{ role: "user", content: prompt }],
+    abortSignal: signal,
+    onError: ({ error: error40 }) => emitErr(error40),
+    ...Object.keys(providerOptions).length ? { providerOptions } : {} // providerOptions is passed only when it has keys
+  });
+  const parts = opts._stream ?? result2.fullStream;
+  let text2 = "";
+  try {
+    for await (const part of parts) {
+      if (part.type === "text-delta") {
+        const t2 = part.text ?? part.textDelta ?? ""; // accepts either field name for the delta text
+        if (t2) {
+          text2 += t2;
+          onEvent({ type: "text", text: t2 });
+        }
+      } else if (part.type === "error") {
+        emitErr(part.error);
+      } else if (part.type === "finish") {
+        const u = part.totalUsage ?? part.usage ?? {}; // totalUsage wins over usage when both exist
+        usage.inputTokens = u.inputTokens ?? u.promptTokens ?? 0;
+        usage.outputTokens = u.outputTokens ?? u.completionTokens ?? 0;
+      }
+    }
+  } catch (e2) {
+    if (!signal?.aborted) // exceptions thrown after an abort are not reported
+      emitErr(e2);
+  }
+  onEvent({ type: "phase", label: errored ? "blocked" : "finished", state: errored ? "err" : "ok" }); // final phase reflects whether an error was emitted
+  onEvent({ type: "done", usage }); // "done" is emitted on both the success and the error path
+  return { text: text2, usage };
+}
 function friendlyToolPhase(name31) {
   if (name31 === "read_file" || name31 === "list_dir" || name31 === "glob" || name31 === "search")
     return "reading context";
@@ -144404,11 +144534,583 @@ function friendlyToolPhase(name31) {
   return "using tool";
 }
+// src/help/docs-bundle.ts
+var DOCS_BUNDLE = [
+  {
+    file: "README.md",
+    text: `# gearbox
+## Install
+macOS, Linux, WSL:
+\`\`\`bash
+curl -fsSL https://unpkg.com/gearbox-code@latest/install.sh | bash
+\`\`\`
+Windows PowerShell:
+\`\`\`powershell
+irm https://unpkg.com/gearbox-code@latest/install.ps1 | iex
+\`\`\`
+These installers do not use \`sudo\`, admin privileges, or \`npm install -g\`.
+They install Gearbox into a user-owned directory, create the \`gearbox\` command,
+then start onboarding before the coding app opens.
+Run without installing:
+\`\`\`bash
+npx gearbox-code@latest
+\`\`\`
+## First Run
+Gearbox needs one provider account before it opens the coding app. The installer
+runs setup automatically. You can also run it yourself:
+\`\`\`bash
+gearbox onboard
+\`\`\`
+Common setup commands:
+\`\`\`bash
+gearbox auth add <api-key>                # auto-detects known key prefixes
+gearbox auth add <provider> <api-key>     # anthropic, openai, google, deepseek, openrouter, groq, xai, mistral...
+gearbox auth add codex                    # ChatGPT subscription through the Codex CLI
+gearbox auth add codex work               # second ChatGPT account, isolated CODEX_HOME
+gearbox auth add claude work              # second Claude account, isolated config
+gearbox auth import                       # import credentials from env/cloud config
+gearbox auth providers                    # list supported providers
+\`\`\`
+After setup:
+\`\`\`bash
+cd ~/your-project
+gearbox
+\`\`\`
+No account configured means no fake/demo model: Gearbox runs onboarding first.
+## Uninstall
+macOS, Linux, WSL:
+\`\`\`bash
+rm -f ~/.local/bin/gearbox
+rm -f ~/.bun/bin/gearbox
+rm -rf ~/.local/share/gearbox
+\`\`\`
+Windows PowerShell:
+\`\`\`powershell
+Remove-Item "$env:LOCALAPPDATA\\Gearbox" -Recurse -Force
+\`\`\`
+If you previously installed with npm global:
+\`\`\`bash
+npm uninstall -g gearbox-code
+\`\`\`
+If \`gearbox\` fails with \`Unknown file extension ".tsx"\`, an old Bun-linked
+shim is still first on PATH. Remove it and reinstall:
+\`\`\`bash
+rm -f ~/.bun/bin/gearbox
+curl -fsSL https://unpkg.com/gearbox-code@latest/install.sh | bash
+\`\`\`
+## What It Is
+Gearbox is a terminal coding agent that can use the model accounts you already
+pay for. It supports provider accounts, local credential storage, model routing,
+session history, file edits, shell commands, MCP tools, web search, image input,
+and permission gates.
+Supported setup paths include API keys, detected env/cloud credentials, Azure,
+and provider CLIs where available.
+## Capabilities
+Paste or drag an image path into the composer to attach screenshots or UI
+captures. Local image attachments work with API-backed multimodal models.
+Gearbox loads MCP servers from \`~/.gearbox/mcp.json\`, \`.mcp.json\`, or
+\`.gearbox/mcp.json\`. Check what loaded with:
+\`\`\`bash
+gearbox mcp list
+\`\`\`
+Example MCP config:
+\`\`\`json
+{
+  "mcpServers": {
+    "github": {
+      "command": "npx",
+      "args": ["-y", "@modelcontextprotocol/server-github"],
+      "env": { "GITHUB_TOKEN": "\${GITHUB_TOKEN}" }
+    }
+  }
+}
+\`\`\`
+The built-in \`web_search\` tool works out of the box with DuckDuckGo, and uses
+Brave or SearXNG when \`BRAVE_SEARCH_API_KEY\` or \`SEARXNG_URL\` is set.
+## Develop
+Requires [Bun](https://bun.sh).
+\`\`\`bash
+bun install
+bun run src/cli.tsx
+bun test
+bun run typecheck
+\`\`\``
+  },
+  {
+    file: "CLAUDE.md",
+    text: "# Gearbox — project guide\n\nGearbox is a multi-provider coding harness for the terminal: a beautiful, simple terminal agent that reads/writes code and runs commands, talking to any provider (Anthropic, OpenAI, Google, DeepSeek) through one clean loop.\n\n**The point of the project:** intelligent per-task *model routing* — automatically picking the right model for each task across every provider and account you pay for. Basic routing is live (`RoutingSelector` — classify → quality bar → cheapest winner); the richer engine (shadow-eval, credit/limit penalties, confidence display) layers on top of the same seam. See `DESIGN.md` for the full vision and `experiments/FINDINGS.md` for the validation behind it.\n\n## The one rule that matters\n\n**Keep the routing seam clean.** The agent must never hardcode a model. It asks a `ModelSelector` for the model to use. `RoutingSelector` is the live default (classify task → filter by quality bar → cheapest winner); `FixedSelector` is used only when a model is explicitly pinned (`--model` flag or `/model <name>`). Concretely:\n\n- `src/model/selector.ts` — the seam. `select(task) => ModelChoice`. Do not bypass it.\n- `src/model/router.ts` — `RoutingSelector`: classify prompt → quality bar → cost-sort candidates → respect `/prefer` preferences.\n- `src/model/profiles.ts` — the data corpus: quality, cost, latency, tokenizer calibration per model. Routing reads this.\n- `src/providers.ts` — maps a provider+model id to an AI SDK model instance. Already multi-provider. Adding a model is data, not code.\n- Every model call captures token usage (`src/agent/run.ts`) so the cost engine has data. Do not drop usage.\n- The UI consumes a normalized `AgentEvent` stream (`src/agent/events.ts`), never the AI SDK's raw types. 
This decouples the UI from the provider layer and from routing.\n\nIf you find yourself writing `anthropic('claude-...')` anywhere outside `providers.ts`, stop — route it through the selector.\n\n## Layout\n\n```\nsrc/\n  cli.tsx            entry point; renders the Ink app; picks RoutingSelector by default\n  config.ts          minimal config (default model, provider from env)\n  providers.ts       provider+model id -> AI SDK model  (multi-provider; contextWindow per model)\n  commands.ts        slash-command metadata + pure helpers (fuzzy model match, /help, model list)\n  tools.ts           read / write / edit / list / search / glob / run_shell  (AI SDK tools)\n  model/\n    selector.ts      THE ROUTING SEAM — ModelSelector interface + FixedSelector (pinned model)\n    router.ts        RoutingSelector: classify → quality bar → cost-sort → preferences (the live default)\n    profiles.ts      model corpus: quality (SWE-bench), cost ($/Mtok), latency, tokenizer calibration\n    tokens.ts        calibrated token counting (js-tiktoken × per-model calibration factor)\n    preferences.ts   persist /prefer kind model choices to ~/.gearbox/routing-preferences.json\n    reasoning.ts     reasoning/thinking config helpers\n  context/\n    builder.ts       context engine: system + memory + repo map + retrieved files + curated history\n    retrieve.ts      BM25 lexical retrieval — top-K relevant files for a prompt (no model call)\n    repomap.ts       repo structure summary for the system prompt\n    memory.ts        project memory (GEARBOX.md / CLAUDE.md loaded into context)\n    compact.ts       context compaction (/compact)\n  accounts/\n    types.ts         Account + AuthMethod types (API key, AWS, Azure, Vertex, CLI, OpenAI-compat)\n    store.ts         accounts.json persistence (~/.gearbox/accounts.json)\n    catalog.ts       provider catalog (known providers, env vars, labels)\n    detect.ts        auto-detect env creds + cloud credentials\n    onboard.ts       
interactive add/test account flows\n    resolve.ts       credential resolution (Account → ResolvedCreds, fetching secrets on demand)\n    discover.ts      per-account model discovery (Azure deployments / Foundry / gateway /models) → account.models; catalog defaultModels are seeds, not callable ids\n    usage.ts         per-account spend ledger + rate-limit snapshots + balance tracking\n    balance.ts       provider balance fetch helpers\n  help/\n    ask.ts           /ask corpus: bundled docs + generated command reference, system prompt, meta-question auto-detect\n  agent/\n    events.ts        AgentEvent — normalized stream the UI consumes\n    run.ts           real agent loop (AI SDK streamText -> AgentEvent), abort-aware; runCompletion = tool-less grounded answer (used by /ask)\n    cli-backend.ts   claude/codex CLI subprocess backend (for Pro/Max subscriptions)\n    mock.ts          scripted demo stream (runs with no API key; used by tests)\n  ui/\n    theme.ts         colors + glyphs (the look)\n    input.ts         pure key→action reducer for the composer (tested)\n    history.ts       pure ↑/↓ prompt-history nav (tested)\n    net.ts           background online probe; status bar shows ⚠ offline when down\n    useTerminalSize.ts  reactive width on resize (everything reflows)\n    git.ts           current branch for the status line\n    App.tsx          the Ink app: state, useInput dispatch, commands, turns\n    components/      Banner, Transcript, Composer, CommandPalette, StatusBar, PermissionPrompt\ntest/                pure-logic + render tests (ink-testing-library); no keys\nDESIGN.md            full product vision (routing, requirements, UX)\nexperiments/         prototypes that validated the architecture\n```\n\nThe composer is custom (Ink `useInput` + `src/ui/input.ts`), not a third-party widget — full control over the cursor, ↑/↓ history, and esc-to-interrupt, with no focus/remount fragility. 
**Multi-line**: ⌃J (or shift/alt+⏎) inserts a newline, ⏎ submits; ↑/↓ move between lines and fall through to history at the top/bottom line; bracketed paste (enabled in `cli.tsx`) inserts multi-line text literally (CR normalized, paste markers stripped) instead of submitting per line. `caretPos()` is the shared line/col helper. **Readline editing** (all pure in `input.ts`, tested): ⌃U/⌃K kill to line start/end, ⌃W / ⌥⌫ kill word, ⌃D forward-delete, ⌥/⌃ + ←→ word-jump, ⌃A/⌃E line home/end. Keys: ⏎ send · ⌃J newline · ↑↓ line/history · ← → cursor · ⌥←→ word · tab complete @file · **shift+tab cycles mode (normal · auto-accept · plan)** · ⌃Y copy last reply · esc interrupt · ⌃c quit. `/keys` shows the cheatsheet.\n\n**Modes & effort.** Three input modes cycled by shift+tab (`App.tsx` `cycleMode`): **normal** (asks before writes/edits/shell), **auto-accept** (file writes/edits apply without asking — the permission broker auto-resolves `write`/`edit`; shell still gated; diffs still render), **plan** (read-only). Plus **yolo** (auto-approve everything) via `/yolo`. **Effort tiers** (`/effort fast|balanced|max`, or `setEffort`) pin the model through the routing seam (fast→haiku, balanced/max→sonnet) — the active mode + `⚡effort` show as badges in the `StatusBar`. **Click pickers** (fullscreen only): clicking the **model** or **effort** label in the status bar opens a floating picker above it (↑↓ select · ⏎ apply · esc close), reusing the same `/model`/`/effort` command path. The slash commands remain the keyboard path. The fragile row+column hit-test lives in pure, tested `statusBarHit`/`statusBarLayout` (`StatusBar.tsx`); `App.tsx` only supplies live layout (composer line count, `PALETTE_ROWS`, the rendered model/effort/mode) and toggles `quickPicker` state. Inline mode has no mouse grab, so the labels stay informational there. 
**Copy**: ⌃Y / `/copy` copies the last reply via OSC 52 (`src/ui/clipboard.ts`, works over SSH); `/export [file]` writes the transcript to Markdown. **Terminal integration** (`src/ui/terminal.ts`): the tab title (OSC 2) reflects working/idle, and a long turn (>8s) rings the bell + fires a desktop notification (macOS) so you can step away.\n\n**More UX affordances.** **Type-ahead**: prompts submitted while busy are queued (`queueRef`, shown as chips) and sent when the turn ends. **⌃C** interrupts a turn → clears the composer → \"press again to quit\" (`cli.tsx` renders with `exitOnCtrlC:false`). **Large pastes** collapse to a `[Pasted N lines]` chip (`pasteStoreRef`), expanded back on submit. **Fuzzy** `@file`/`/command` pickers (`src/ui/fuzzy.ts` — substring-first, then subsequence scored by boundary+contiguity; tested). **Cost**: live `$` estimate in the status bar from per-turn model+tokens (`estimateCost` + per-model pricing in `providers.ts`). **Syntax highlighting** for code blocks (`src/ui/highlight.ts` — lightweight per-line tokenizer → Ink spans, NEVER raw ANSI; used by both `lines.ts` `clipSpans` and `Markdown.tsx`). `?` on an empty composer shows the cheatsheet (`KEYS_HELP`).\n\n**Sessions** (`src/session.ts`): conversations persist per-project under `~/.gearbox/sessions/<slug>/` (`GEARBOX_HOME` overrides). Each record holds provider-neutral `messages` + the UI `items` + **per-turn `{model, usage, at}`** (routing/cost data — the record is deliberately not single-model). `gearbox --continue`/`-c` resumes the latest; `/resume [n]` lists/loads in-app; `/clear` starts a fresh session. Prompt history persists across runs (`history.json`). Saving is best-effort (never crashes the app); skipped in demo mode.\n\nFeatures: full markdown via **marked** (parse, `marked.lexer`) + **Ink** (render) in `Markdown.tsx` — headings, bold/italic/inline-code, tables, ordered+nested lists, blockquotes, code blocks. 
NO foreign ANSI in Ink (cli-highlight/marked-terminal were tried and removed — they corrupt Ink's width/wrapping; render marked's token tree as Ink elements instead). Markdown gets a `width` prop (threaded App→Transcript→Markdown) for table/rule sizing. Colored diffs under edits (`src/diff.ts`, edit/write tools return `{summary,diff}`), plan mode (read-only tools + plan prompt; `/plan` or shift+tab), `!cmd` runs a shell command directly (`src/shell.ts`), `@file` mentions (fuzzy picker `src/ui/mention.ts`+`files.ts`; expanded into the model message on send), live \"working · Ns\" timer.\n\n**Boo (the mascot).** A pixel ghost, now **parametric** (`src/ui/ghost/engine.ts`, ported from a Claude Design handoff). A 20×20 pixel sprite composited from composable layers — body (palette) + face (eyes/mouth) + accessory + persona + a frame-driven overlay (tears/dots/confetti/Z's/sparkle/hearts) — then FOLDED into half-block cells (`▀`/`▄`, top px → `t`/glyph color, bottom px → `b`/bg). `renderGhost(cfg)` is the source of truth for the **default blocks path**; it's pure + memoized. The data: 13 faces (`FACES`), 9 palettes (`PALETTES`), 6 accessories, 9 personas (personas/accessories ported but not yet surfaced in the live UI). Ink `color`/`backgroundColor` props only, NEVER raw ANSI (corrupts Ink's width math). PNG paths are **opt-in** via `GEARBOX_GHOST`:\n\n- `GEARBOX_GHOST=kitty` — real PNG via kitty graphics Unicode placeholders (`U+10EEEE`, fg encodes image id, diacritics encode row/col; PNGs transmitted once in `cli.tsx`). NOTE: the placeholder protocol is young and mis-rendered (squished) in Ghostty during testing — kept opt-in until that's solved.\n- `GEARBOX_GHOST=iterm` — OSC 1337 splash banner (iTerm2/WezTerm).\n\n`detectImageMode()` returns `blocks` unless `GEARBOX_GHOST` opts in. Baked PNGs live in `src/ui/mascot-png.ts`; `bun run scripts/ghost-preview.ts` previews the parametric engine (splash + all faces + the in-flow state crops). 
**Boo is animated but deliberately calm** on the blocks path (`AnimatedGhost` in `Mascot.tsx`): one shared, unhurried 240ms tick (leaf-local `useTick`, never lifted to App root); talk + overlays advance at half that (~480ms). There is NO idle bob/float and NO splash sparkle — motion is a quiet sign of life, not fidgeting (the splash just blinks every ~6s; in-flow only the state-meaningful overlay/talk moves). `GEARBOX_NO_MOTION=1` freezes to frame 0. `/ghost [mood]` cycles the skin (`skinToCfg` maps it to a cfg; `shades` is the cool face + shades accessory).\n\n**Layout: fullscreen by default; inline is opt-in.** **Fullscreen is the default** (alt-screen frame + virtualized scroll region + scrollbar + mouse wheel scroll); `--inline`, `GEARBOX_INLINE=1`, or `/config inline on` (pref `fullscreen: false`) opts into inline mode. `GEARBOX_FULLSCREEN=1` or `--fullscreen` forces fullscreen explicitly. The decision lives in `cli.tsx` (`wantsFullscreen`). Grabbing the mouse for wheel-scroll is exactly what disables native terminal selection, so in fullscreen mode text selection requires the terminal's modifier (e.g. Option-drag in Ghostty). **Inline mode** (the plain `Transcript` component): no alt-screen, no mouse grab — native click-drag selection / scrollback / copy all work with no modifier. The transcript is a **virtualized line buffer**: `src/ui/lines.ts` (`itemsToLines`) flattens items into styled `Line`s (markdown→lines, wrapping, diffs) — INVARIANT: every line ≤ width (tested), so nothing overflows. **Streaming perf**: flattening the markdown-heavy `assistant`/`user` items is super-linear with their length, so `staticItemLines` memoizes per item in a `WeakMap` keyed by object reference (unchanged items keep identity across renders, so only the changing tail re-parses — history is free; running tools are not cached since their spinner animates). 
On the producer side, assistant **text deltas are coalesced** on a ~45ms flush timer in `App.tsx`'s `onEvent` (mirroring the tool-stream coalescer), so streaming re-renders at ~22fps instead of per-token — both together stop the auto-scroll jitter that grew with reply length. `finishAssistant`/the turn `finally` flush any buffered text before marking done or on interrupt. In fullscreen, `App` renders only the visible window via `Viewport` (`src/ui/components/Viewport.tsx`) at a computed `transcriptHeight = rows − header − footer` (footer over-estimated so the frame never exceeds the screen; alt-screen clips, so under-filling is safe). Fullscreen scroll: mouse wheel (SGR mouse reporting enabled in `cli.tsx`; parsed off raw stdin in `App` since Ink doesn't model mouse — buttons 64/65) and PgUp/PgDn; new output re-pins to the bottom (`atBottomRef`); a scrollbar sits on the right. (In fullscreen, mouse reporting means text selection needs the terminal's modifier, e.g. Option-drag in Ghostty — which is why inline mode is offered as an opt-in.) The virtualized buffer replaced an earlier flex/overflow fullscreen that corrupted on tall output. Chrome spans full width; prose wraps ≤100 cols. The plain `Transcript` component is the inline-fallback renderer. `scripts/gen-mascot.ts` still bakes the PNGs + baked sprites (`mascot-sprite.ts` `GHOSTS`) — but those now feed **only the opt-in kitty/iTerm image path** (`image.ts`); the default blocks path renders the parametric engine instead. The splash scales to the terminal (big=2×/mini=1×/none by rows×cols, in `App.tsx`). 
The inline/working presence is the compact **state ghost** (see below) — a native-resolution head crop so Boo never dominates the transcript.\n\nCommands are grouped in `/help` (models · conversation · accounts · save · modes · settings · other) and `src/commands.ts` carries plain-language descriptions: /model [name] (fuzzy — \"haiku\"; `/model auto` routes, `/model all` lists every provider) /effort [fast|balanced|max] /prefer [kind model] (remember a confirmed routing preference for a task type) /clear /resume /retry /compact /context /memory /ask &lt;q&gt; (answer questions about Gearbox itself from its bundled docs via a cheap routed model; plain meta-questions auto-route here with a visible affordance) /account (unified: list/add/login/use/rm/refresh — `/accounts` and `/login` are hidden aliases; `/account refresh` re-discovers each account's real callable models) /cost /copy /export [file] /plan /yolo /theme /config (theme·vim·notify·inline; `/vim` is a hidden alias) /init /keys /help /exit. **Hidden** (work but not listed): /accounts /login /vim /ghost. **Removed:** /cwd (the working dir now shows in `/context`). `formatModelList` shows usable models first and collapses no-key providers to a one-line count.\n\n**Permission gate:** `write_file`/`edit_file`/`run_shell` block on a confirm before mutating. Broker: `src/permission.ts` (`requestPermission` in the tools; `setPermissionHandler` installed by `App`; no handler → allow, so tests/headless are unchanged). Decisions: **once** (1), **always** (2, grants that kind for the session), **all/yolo** (a, auto-approves everything until toggled), **deny** (3/esc). YOLO is also toggled by `/yolo` or started with `--yolo`; a `⚡ yolo` badge shows in the status. The `!` prefix is user-initiated so it is NOT gated. Search/nav tools: `search` (ripgrep, Bun-walk fallback) and `glob` (`Bun.Glob`), both read-only (also in plan mode). 
The working indicator IS Boo now (`components/Working.tsx`): a compact head-crop ghost whose face follows the agent state — thinking (dots) → streaming (talk) → tool (loading dots) → a clean-finish celebrate (party hat + confetti) → error (crying with falling tears). `App.tsx` derives `mascotState` from the `onEvent` stream; the success/error beat **lingers ~1.5s** after the turn (`linger` state — the working line gates on `busy || linger`, since it would otherwise unmount the instant `busy` goes false). Crops are per-state (`stateView`): head (rows 4–14), head+dots (2–14), head+hat (0–14) so overlays outside the head still read. This deliberately supersedes the earlier \"Boo stays on the welcome splash only / in-flow movement reads as noise\" decision — the compact, state-bearing ghost is the point of the design port.\n\n## Conventions\n\n- Runtime: **Bun**. TypeScript + TSX. Run with `bun run src/cli.tsx`.\n- UI: **Ink** (React for terminals) + **@inkjs/ui**. Keep it calm and beautiful: restrained palette (one accent), generous spacing, consistent glyphs. The look lives in `src/ui/theme.ts` — change colors/glyphs there, not inline.\n- Open + free: MIT, no paid dependencies, no hosted backend, no telemetry. The only cost is the user's own model calls on their own keys.\n- Tools must be safe by default: confirm or sandbox anything destructive; never `rm -rf` or write outside the workspace without intent.\n\n## Run it\n\n```bash\nbun install\n# set at least one key:\nexport ANTHROPIC_API_KEY=...    # or OPENAI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY / DEEPSEEK_API_KEY\nbun run src/cli.tsx             # or: bun start\n```\n\nWith no key it launches in demo mode (a scripted transcript) so the UI still runs.\n\n## Test\n\n```bash\nbun test            # render tests + agent-loop tests; no API key needed\nbun run typecheck   # tsc --noEmit\n```"
+  },
+  {
+    file: "DESIGN.md",
+    text: `# Gearbox — Design
+A terminal coding agent whose one job, done better than anything else, is to **route each task to the right model across every provider and account you pay for**. Everything else is table stakes executed well, in service of that.
+Target user: a startup founder / power user who pays for several models (Claude, OpenAI/Codex, Gemini, DeepSeek, Azure) via API keys and/or flat-rate seats, codes heavily, hits limits, and has no intelligent way to use it all.
+Status: architecture validated by 6 experiments (\`experiments/FINDINGS.md\`); routing, event-log ledger, task-boundary switching, ground-truth gate all prototyped; Anthropic payload accepted live.
+---
+## Design principles (these decide every tradeoff)
+1. **Routing is sacred and invisible.** It is the USP and runs on every task. It must add no perceptible latency and no visual noise. You feel its results (cost, no stalls), never its presence.
+2. **Earn trust through transparency.** It spends your money. Every decision is explainable in one glance and one keystroke to the full math. Never opaque.
+3. **Calm by default, depth on demand.** The screen shows what's happening now, the current model, the running cost. Everything else is one keystroke away.
+4. **Honest about state.** Tests failed → it says so. Switched providers → it says so plainly. Never claims done without proof.
+5. **Build on proven wheels; own only the differentiator.** The provider layer, tool-call loop, and TUI rendering are solved problems. The routing brain, the ledger, the cost/limit engine, and the UX are ours.
+6. **Every milestone is something you actually use.** Routing-first. No big-bang.
+7. **Open and free to run.** Fully open-source (MIT). Nothing costs money except the model calls you already pay for, on your own keys. No hosted backend, no paid dependencies, no required account, no paid telemetry. Local-first everywhere.
+---
+## What it is
+A terminal app (rich TUI with a live dashboard) plus a scriptable CLI underneath. You run it instead of Claude Code / Codex. Local-first, your keys, your machine. Not a website, not a hosted service, not an IDE plugin. Internal tool first; productizable later because the routing + spend story is exactly what teams want.
+## Openness & cost
+- **License: MIT.** Fully open-source, permissive, no strings. (Apache-2.0 is the alternative if a patent grant ever matters; MIT chosen for maximum simplicity and openness.)
+- **Free to run.** Every dependency is permissively licensed and runs locally: AI SDK (Apache-2.0), Bun (MIT), bun:sqlite (public domain), Ink (MIT), ripgrep (MIT/Unlicense), tree-sitter (MIT), MCP SDK (MIT). No copyleft, no hosted service, no required account, no paid backend, no server bill.
+- **The only money is inference you already pay for**, on your own keys, and it is the whole point of the tool. That includes the optional "make routing smarter" calls (shadow-eval, the LLM classifier): they run on your keys, count against your budget caps, are off-or-sampled by default, and are governed by a calibration-budget knob so you decide how much to spend sharpening routing. Rules-based routing is free.
+- **Code search is free and local by default** (ripgrep + tree-sitter + LSP). Embeddings are optional and local-first (a local embedding model); never a paid embeddings API by default.
+- **Telemetry: none by default.** Any analytics is opt-in and local-only; nothing leaves your machine.
+---
+## Architecture
+\`\`\`
+┌──────────────────────────── Gearbox (owned) ────────────────────────────┐
+│  TUI / CLI  (Ink)                                                        │
+│     │                                                                    │
+│  Session Orchestrator (single-writer)  ── multi-session, worktrees       │
+│     │                                                                    │
+│  ┌──────────────┐   ┌──────────────────┐   ┌────────────────────────┐   │
+│  │ ROUTING BRAIN│   │ Ledger + Memory  │   │ Verification / Autonomy│   │
+│  │ classify →   │   │ append-only event│   │ tests/build/types gate │   │
+│  │ score →      │   │ log, curation,   │   │ auto-iterate-to-green  │   │
+│  │ pick + log   │   │ task-boundary    │   │ unattended-safe        │   │
+│  └──────┬───────┘   │ switching        │   └────────────────────────┘   │
+│         │           └──────────────────┘                                │
+│  Cost / Credit / Limit / Plan engine  (balances, caps, failover)        │
+└─────────────────────────────────┬────────────────────────────────────────┘
+                                   │  model selection per task
+┌──────────────────────────────── ▼ built on ─────────────────────────────┐
+│  Vercel AI SDK (\`ai\` + @ai-sdk/{anthropic,openai,google,azure,deepseek}, │
+│  OpenRouter provider)  → unified messages, tool-calling loop, streaming  │
+│  Bun + bun:sqlite (WAL)  ·  Ink (TUI)  ·  MCP SDK  ·  ripgrep/tree-sitter │
+└──────────────────────────────────────────────────────────────────────────┘
+\`\`\`
+### Build on (do NOT reinvent)
+| Need | Use | Why |
+|---|---|---|
+| Provider access, unified message format, tool-call normalization, streaming | **Vercel AI SDK** (\`ai\`, provider packages, OpenRouter provider) | Battle-tested, ubiquitous, covers all 5 providers + OpenRouter; its unified message type IS the canonical state I prototyped in E1; its tool-call loop (\`stopWhen\`/steps) is the agent loop mechanics |
+| Runtime + storage | **Bun** + **bun:sqlite** (WAL) | Fast cold start, native TS, zero-dep embedded DB; event log validated in E3 |
+| TUI rendering | **Ink** (React for terminals) | Standard for rich TS CLIs (Claude Code, Codex CLI use it); component model fits the dashboard |
+| Cost estimation | **js-tiktoken** + provider token endpoints | Local, fast token counts for pre-call estimates (used in E1) |
+| Code search / nav for tools + memory | **ripgrep**, **tree-sitter**, **LSP** (+ optional local embeddings) | Don't build search; start with ripgrep, add tree-sitter/LSP for symbol nav. All free/local. Embeddings optional and local-first (local model) — never a paid embeddings API by default |
+| Tool implementations (read/write/edit/shell/grep) | adapt from **Pi / OpenCode** (MIT) as reference | Don't redesign well-solved tools |
+| Tool/extension connections | **MCP SDK** | Standard; reuse your existing MCP servers |
+| Config + schema validation | **TOML** + **Zod** | Boring and correct |
+| Seed quality priors | **SWE-bench / Aider leaderboard / public evals** | Don't guess model quality cold |
+### Own (the differentiator, no wheel exists)
+Routing brain · cost/credit/limit/plan engine · canonical-state event-log ledger + curation · verification gate + autonomy controller · single-writer multi-session orchestrator · the routing-transparency UX.
+> Foundation note: the AI SDK runs the per-call tool loop on whatever model Gearbox selects; Gearbox injects routing at task boundaries and wraps every call with the ledger, cost engine, and verification. This keeps routing first-class without rebuilding provider integration. Alternative considered: build on Pi's \`pi-agent-core\` (faster start, but retrofitting per-task routing into someone else's loop). Chosen the AI SDK for a clean, owned hot path since routing is the whole point.
+---
+## The routing engine (the USP — most of the engineering rigor goes here)
+### What "a task" is, and the two levels of routing
+A **task** = one user request / one unit of intended work ("fix the failing auth tests"). The main agent thread handles it and **stays warm on one capable model** chosen at task start. The main model only changes at a task boundary, on escalation (the work turns out harder than classified), or on failover (limit/outage). The \`w_switch\` penalty governs these rare main-thread changes.
+Fine-grained savings do **not** come from hopping providers mid-conversation (that loses the cache and risks incoherence). They come from **delegating bounded sub-tasks to cheap models in isolated contexts**: run-and-summarize the tests, search the codebase, read-and-summarize a big file, generate boilerplate. Each sub-task gets its own cheap routing decision and its own clean context, returns a compact result to the warm main thread, and never touches the main conversation's cache. This is the "intelligent leader delegating grunt work" model, and it's where most of the easy-work-to-cheap-model savings actually live.
+This also reconciles E6: cheap context reconstruction from the ledger powers both (a) spinning up many cheap sub-task contexts and (b) the occasional main-thread switch. Frequent cheapness is the sub-task surface; the switch penalty is the main-thread surface. No contradiction.
+So routing runs at two levels:
+- **Task level:** pick the main-thread model (clears the task bar; warm; scarcity/plan/limit aware).
+- **Sub-task level:** each delegated bounded op routes independently to the cheapest model clearing that op's (lower) bar, in an isolated context.
+Per task (and per sub-task), before any model call:
+\`\`\`
+classify(task) → task_type, complexity, est_tokens
+  ↓
+candidates = models where quality_prior[task_type] ≥ bar[task_type]   // meet the bar
+  ↓
+for each candidate: score = cost_est
+                            + w_scarcity · (cost_est / provider_balance)   // preserve scarce credit
+                            + w_switch   · switch_penalty(currently_warm)  // cache locality
+                            − w_plan     · plan_bonus(flat_rate_seat_free) // use seats you pay for
+   filter out: rate-limited / over-budget / (if interactive) too-slow
+  ↓
+pick = argmin(score);  log(decision, per-candidate scores, reason)
+  ↓
+if none clears bar+budget → stop, surface to user (never silently downgrade quality)
+\`\`\`
+**Inputs, and where each comes from**
+- \`task_type\` / complexity: rules-first classifier (keywords + changed-file types + action verbs), < 5ms, free. Optional cheap-LLM classifier for ambiguous cases only (off by default).
+- \`quality_prior[type][model]\`: seeded from public benchmarks; **refined per-repo** by the flywheel (accept/edit/revert signal via git).
+- \`cost_est\`: local tokenizer × live price table.
+- \`provider_balance\`, \`rate_limit_headroom\`, \`seat_status\`: from the cost/credit engine (cached; refreshed async + from response headers). Never a blocking network call on the hot path.
+- \`currently_warm\`: which model this session last used (switch cost from E1).
+**Transparency contract:** every decision writes a one-line reason + the full per-candidate score table to the ledger, viewable live (\`⌃tab\`) and after the fact (\`gearbox why <task>\`).
+**Calibration is part of M1, not deferred — it is what makes routing actually good, not just internally consistent.** Seeded benchmark priors are honest *guesses*; they say nothing about this user's React/TS code. So from day one:
+- **Confidence is first-class.** Every prior is tagged \`seeded\` or \`measured(n)\`, and the scorecard shows it. Routing is conservative when confidence is low: it will not send a hard task to a cheap model on a seeded guess alone, it shadow-evals first. Presenting a benchmark guess as a confident number is a trust bug, not cosmetics.
+- **Shadow-eval loop.** On a sampled, budget-capped fraction of tasks/sub-tasks, also run the next-cheaper candidate, diff against the chosen model's output (and against ground truth where tests exist), and update the prior from real data. The git accept/edit/revert signal is a second, noisier input.
+- **Per-repo priors.** Calibration is scoped to the repo; a model can be strong here and weak elsewhere.
+**The headline measurement (M1 exit criterion):** on a real session with live keys, routed cost vs all-frontier cost, plus an explicit check that the cheap picks were actually good enough (held against tests / not reverted). That is the USP's first real test — every experiment so far used synthetic priors. The flywheel's heavier auto-tuning (M5) refines this; the basic shadow-eval + confidence ship in M1.
+**Cost / credit / limit / plan engine** (routing's data source):
+- **Onboarding is load-bearing and explicit, not a footnote** (plan-first and limit-failover depend on it). A first-run setup detects keys from env / existing CLI configs, then asks per provider: metered API key, flat-rate seat (and its plan tier → known rate limits), or both. Limits are inferred from response headers where available and overridable in config. Without this, plan-first can't work, so it's a real onboarding UX surface, not config trivia.
+- Tracks spend per provider locally (authoritative, since balance APIs are inconsistent); reconciles with provider usage headers when present.
+- **Plan-first:** model a flat-rate seat (Claude Max, ChatGPT Pro) as ~0 marginal cost until its rate limit, then fall back to metered API.
+- **Limit-aware:** read \`x-ratelimit-*\` headers; as headroom drops, deprioritize; on 429/5xx, failover to the next candidate and continue the same task.
+- **Hard caps:** per-task / per-session / daily. Pre-flight estimate before each call; if it would breach the cap, halt and ask. Never blow the cap by more than one pre-estimated in-flight call.
+---
+## Every feature (tagged by milestone)
+**Routing (M1 — the USP, built to a high bar)**
+- Per-task automatic model selection across all configured providers.
+- Sub-task delegation: bounded ops (run tests, search, summarize, boilerplate) routed to cheap models in isolated contexts — the fine-grained savings surface, no cache loss.
+- Per-repo calibration: shadow-eval loop + seeded-vs-measured confidence on every prior.
+- Marginal-benefit scoring (cheapest model that clears the task's quality bar).
+- Credit-scarcity awareness (prefer the flush account; preserve the scarce one).
+- Plan/subscription-first (use seats you already pay for before metered API).
+- Rate-limit awareness + seamless failover (don't dead-end on a limit).
+- Hard budget caps (task/session/daily) with pre-flight enforcement.
+- Live, per-decision transparency (one-line reason + full scorecard on demand).
+- One-keystroke override; override logged as a preference.
+- Latency-class routing (fast model when you're waiting, best when it's background).
+- Free-tier / local-model (Ollama) tier as the cheapest rung.
+**Agent core (M0 — table stakes, on the AI SDK)**
+- Plan → tool → observe → act loop; tools: read, write, edit, shell, grep/search, ls.
+- Streaming output; interruptible.
+- Project instructions file (a \`GEARBOX.md\` / reuse \`CLAUDE.md\` if present).
+- Safe-by-default permissions (ask before shell/writes outside cwd).
+- Plan mode before large changes.
+- MCP tool connections.
+**Ledger + memory (M2)**
+- Canonical model-agnostic state as an append-only event log (crash-safe).
+- Curation → bounded working context (cheap task-boundary switching).
+- Fact provenance + invalidation (recover from a wrong assumption).
+- Decision/ADR record that survives compaction.
+- Durable, resumable sessions (survive kill -9 / reboot).
+**Verification + autonomy (M3 — the "walk away" pillar)**
+- Ground-truth gate: configured tests / build / type-check must pass before "done".
+- Auto-iterate to green (bounded attempts), then surface honestly if stuck.
+- Unattended-safe: no stall on limits, hard cost cap, no drift over long runs.
+- Honest status protocol (done-with-proof / blocked / needs-input).
+**Multi-session + UX (M4 — design-heavy)**
+- Concurrent sessions on different tasks; git-worktree isolation.
+- Shared project memory across sessions.
+- Live dashboard: session board, per-session model + cost + status.
+- The always-visible cost meter; amber near caps.
+**Spend record + flywheel (M5)**
+- One searchable record of what every model changed and what it cost, across accounts.
+- Per-task / per-project spend attribution.
+- Routing flywheel: priors auto-tuned per repo from accept/revert.
+**Later (only if earned)**
+- Background/async task queue (gated by the verification + cost-safety pieces).
+- Local model fine-tuning of the classifier.
+- Team mode / shared spend dashboards (the productization path).
+**Explicitly cut** (judged solutions-looking-for-problems): branch/rewind sessions, try-the-same-task-N-ways, cross-model "jury", sensitivity/privacy routing.
+---
+## Strict requirements (hard numbers — non-negotiable)
+**Latency (the routing hot path is sacred):**
+- Routing decision (rules path): **< 10ms p50, < 25ms p99**. Pure local compute.
+- Total overhead added before time-to-first-token (classify + score + cost-est): **< 50ms p99** — must be dwarfed by model TTFT (300–800ms) and never perceptible.
+- Optional LLM classifier: **< 500ms p95**, used on **< 15%** of tasks, **off by default**.
+- Balance / limit / seat read: from in-memory cache, **< 1ms**, never a blocking network call on the hot path; refreshed async (≤ 60s) and from response headers.
+- Cost estimate (tokenize 16k ctx): **< 20ms**.
+- Ledger event append: **< 5ms p99**, off the response-critical path, fsync'd for durability.
+- Failover pick on 429/5xx: **< 50ms** to select the next model.
+- TUI frame: **< 16ms (60fps)**; routing panel render **< 5ms**; UI thread never blocks on I/O.
+- Cold start to interactive: **< 400ms**.
+- Stream relay overhead: **< 50ms** over the provider's own stream.
+**Durability / correctness:**
+- Crash-safe: every state-changing event fsync'd before ack; a \`kill -9\` session reconstructs to the last completed event.
+- No lost writes with **≤ 16 concurrent sessions** (single-writer queue + WAL; validated E3).
+- Routing is **deterministic** on the rules path (same state + config → same pick) and always logged with reasons.
+- Budget caps are **hard**: a session cannot exceed its cap beyond one pre-estimated in-flight call.
+- "Done" cannot be declared with failing configured checks.
+**Security:**
+- API keys never logged, never written to the ledger, never sent to a provider other than their own. Keys read from env or a \`0600\` local file.
+**Cost of the tool itself:**
+- **Zero-cost-to-run guarantee:** no Gearbox feature requires payment beyond the user's own model inference. No paid dependency, hosted backend, required account, or paid telemetry, ever. The only $ are model calls on the user's keys, all counted against caps.
+- Rules routing: $0 (local). Optional LLM classifier: **< $0.001/decision**, bounded, off by default.
+- Shadow-eval/calibration inference is opt-in, sampled, and bounded by a calibration-budget knob; it counts against the normal caps.
+- Curation keeps typical working context **< 16k tokens**.
+**Scale:**
+- ≥ 8 concurrent sessions with no UI jank; sessions with 1000+ events with no slowdown (indexed SQLite).
+---
+## UX & design (this matters as much as the engine)
+**Main session view** — calm; the routed line is dim, the cost meter always present:
+\`\`\`
+┌ gearbox ·············································· today $0.04 / $20 ┐
+│ repo gearbox · session fix-auth · ◐ sonnet-4.6                          │
+├─────────────────────────────────────────────────────────────────────────┤
+│ › fix the failing auth tests                                            │
+│ ▸ read auth.ts, token.ts                                                │
+│ ▸ ran tests → 2 failing (expiry)                                        │
+│ ● editing auth.ts … exp compared in seconds vs ms                       │
+│                                                                         │
+│ ┄ routed debug → sonnet-4.6 · cleared bar, haiku too weak · ~$0.012 ⌃tab│
+├─────────────────────────────────────────────────────────────────────────┤
+│ session $0.03 · anthropic ✓ · openai ⚠ low · ⌃o override  ⌃w why        │
+└─────────────────────────────────────────────────────────────────────────┘
+\`\`\`
+**Routing scorecard** (\`⌃tab\`) — the full math, including *confidence*, which is the real trust-builder (never show a benchmark guess as a confident number):
+\`\`\`
+╭ why: "fix the failing auth tests"  (debug, ~3.1k tok) ───────────────────────────╮
+│ model         quality  source         est$     balance  score  verdict            │
+│ sonnet-4.6    0.91 ✓   your 47 tasks  $0.012   $9,991   0.41   ◀ chosen            │
+│ deepseek-v4   0.90 ✓   seed · guess   $0.003   $20      0.43   ≈ shadow-evaling    │
+│ gpt-5.4       0.91 ✓   your 12 tasks  $0.010   $10 ⚠    0.78   scarce credit       │
+│ haiku-4.5     0.78 ✗   your 31 tasks  $0.001   $9,991    —     below bar (0.86)     │
+│ rule: cheapest clearing 0.86 on a non-scarce account. deepseek's 0.90 is a benchmark│
+│ guess, so it's being shadow-evaled on your code before it's trusted to win. [o]verride│
+╰────────────────────────────────────────────────────────────────────────────────────╯
+\`\`\`
+**Multi-session board:**
+\`\`\`
+┌ gearbox · 3 sessions ······························· today $0.12 / $20 ┐
+│ ● fix-auth        debug      sonnet-4.6   $0.03   editing auth.ts       │
+│ ● add-search      feature    gpt-5.4      $0.06   running tests         │
+│ ◐ refactor-cache  refactor   deepseek-v4  $0.03   ✓ done · tests green  │
+└──────────────────────────────────────────────────────────────────────────┘
+\`\`\`
+**UX rules:**
+- The hot path is silent: routing shows as one dim line, never a modal, never a spinner of its own.
+- Cost meter always visible, never alarming; amber approaching a cap, red only on a real failure.
+- Failover is narrated plainly: \`openai rate-limited → moved to gemini, continuing\`. Not hidden, not scary.
+- Override is one keystroke and feels respected (logged as preference, feeds the flywheel).
+- Color discipline: one accent for routing, amber for cost, red only for failures; high-contrast monospace; motion only to show live streaming.
+- Keyboard-first; every action reachable without the mouse.
+---
+## Build sequence (routing-first; each step is usable)
+- **M0 — Foundation spike (~1 wk).** AI SDK provider layer + minimal agent loop + 4 tools + config + streaming, talking to all 5 providers with manual model choice. De-risk: confirm the AI SDK message type carries our canonical state and tool-calls across every provider with real keys (extends E1/E7). *Usable: a bare agent on any provider.*
+- **M1 — Routing, done insanely well (~3–4 wks). This is the product.** Two-level routing (warm main-thread model + cheap sub-task delegation in isolated contexts), classifier, scorer, cost/credit/limit/plan engine, failover, hard caps, the transparency log + scorecard *with confidence*, override. **Calibration ships here, not later:** shadow-eval loop + per-repo measured priors + the seeded-vs-measured confidence display. Strict latency budget enforced and measured. **Exit criterion (the USP's first real test, live keys):** on a real session, routed cost vs all-frontier cost, *plus* an explicit check that the cheap picks were good enough (held against tests / not reverted). If that check fails, the routing isn't done. *Usable: it routes your real work, shows why with honest confidence, and you trust it with your money.*
+- **M2 — Ledger + memory + cheap switching (~1–2 wks).** Event-log ledger (single-writer), curation, task-boundary switching, crash-safe resumable sessions, invalidation. *Usable: long sessions stay cheap and coherent; switching is ~free.*
+- **M3 — Verification + autonomy (~2 wks).** Ground-truth gate, auto-iterate-to-green, unattended-safe controls. **Define "done with proof" for the common case of untested code** (most founder repos): tiered — if tests exist, they pass; otherwise require build + type-check + a smoke run, and offer to generate a characterization test pinning the changed behavior. The gate is never vacuous; it states which tier it cleared. *Usable: hand it a task and walk away.*
+- **M4 — Multi-session + TUI/UX polish (~2 wks).** Concurrent sessions, worktrees, the dashboard, the design layer. *Usable: run several tasks, one calm board.*
+- **M5 — Spend record + advanced auto-tuning (~1 wk).** Searchable cross-account record, spend attribution, and heavier auto-tuning of priors (the basic shadow-eval + confidence already shipped in M1). *Usable: spend is one searchable place; routing keeps sharpening on your code.*
+Re-evaluate against daily use before any "Later" item or productization.
+---
+## Risks / open
+- **AI SDK fit:** confirm its message type round-trips our canonical state + tool-calls across all 5 providers (M0 spike; only Anthropic live-verified so far).
+- **Balance APIs are inconsistent:** some providers don't expose balance. Mitigation: local spend tracking is authoritative; reconcile with headers where available.
+- **Plan/seat modeling is the hardest input:** flat-rate seat limits aren't cleanly exposed. Start with usage-header inference + user-declared limits; refine.
+- **Quality priors are seeds, not truth (the core risk):** addressed by moving calibration into M1 (shadow-eval + measured per-repo priors + confidence display) rather than deferring it. Residual risk: shadow-eval costs extra on sampled tasks and takes real usage to converge; until it does, routing leans conservative and labels guesses as guesses. Routing is only as good as this loop, so it gets the most rigor.
+- **"Task" granularity & savings ceiling:** resolved by the two-level model (warm main thread + cheap sub-task delegation). Residual: deciding *what* to delegate vs keep on the main thread is a real heuristic to tune.
+- **Verification on untested code:** resolved by tiered done-with-proof (tests → build+types+smoke → offered characterization test); residual is how aggressively to auto-generate tests.
+- **Cross-vendor live acceptance** (OpenAI/Gemini) still unverified — close in M0 with real keys, alongside the M1 headline cost-vs-quality measurement.`
+  },
+  {
+    file: "experiments/FINDINGS.md",
+    text: "# Gearbox — Experimental Findings\n\nGoal: (i) does the proposed structure work, (ii) does any other structure work, (iii) best solution to every problem found. Empirical, runnable experiments — not literature review.\n\n---\n\n## Experiment 1 — Canonical state → render per provider → switch at task boundary\n\n**Hypothesis (the architecture's keystone):** one model-agnostic canonical state can be faithfully rendered into Anthropic, OpenAI, and Gemini wire formats; switching providers at a task boundary is cheap because the curated projection is small; context poisoning is recoverable by invalidating facts.\n\n**Method:** real TS/Bun. `canonical.ts` (state model), `renderers.ts` (3 real provider projections), `validate.ts` (structural + cross-provider fidelity checks), `cost.ts` (js-tiktoken o200k_base + real 2026 prices), `curate.ts` (ledger projection), `scale.ts` (sessions of growing length). Run: `bun run experiments/switch-cost/run.ts`.\n\n**Results — STRUCTURE HOLDS:**\n\n- **Rendering correctness:** all structural checks pass for all 3 providers — role mapping (assistant↔model), tool-call↔result pairing (Anthropic `tool_use`/`tool_result`, OpenAI `tool_calls`+`role:tool`, Gemini `functionCall`/`functionResponse`), system handling (top-level vs system message vs systemInstruction), alternation invariants.\n- **Cross-provider fidelity:** the same canonical state yields *identical* semantics across all three — tool-call counts (4/4/4), user text, assistant text all equal. No information dropped/duplicated/mis-paired in translation.\n- **Switch cost scales the right way:** curated projection is ~bounded, transcript is O(session length). 
Switch cost advantage grows with session size:\n\n  | cycles | full tok | curated tok | ratio | full $switch | curated $switch |\n  |-------:|---------:|------------:|------:|-------------:|----------------:|\n  | 1      | 770      | 464         | 1.7×  | $0.0023      | $0.0014         |\n  | 16     | 9,470    | 914         | 10.4× | $0.0284      | $0.0027         |\n  | 64     | 37,310   | 2,354       | 15.8× | $0.1119      | $0.0071         |\n  | 256    | 148,670  | 8,114       | 18.3× | $0.4460      | $0.0243         |\n\n  At a realistic ~149k-token mid-session, a provider switch re-ingests ~8k curated tokens instead of ~149k raw — **18× cheaper**. Within a task you stay warm (cache hit ⇒ ~0 re-ingest); the cost is only paid at a switch.\n- **Context-poisoning recovery:** an invalidated fact (\"bug is in parseToken\") is absent from the curated projection; the corrected fact is present. Retraction works without rewriting history.\n\n**Honest caveats / unresolved risks:**\n1. **Semantic continuity after a switch is NOT yet proven.** Schema correctness ✅ (proven offline). Whether a model actually *continues the task correctly* from a curated projection needs a LIVE call — no provider keys on this box yet. This is the single most important remaining check.\n2. **Token counts use one tokenizer (o200k_base) as a cross-provider proxy.** Per-provider tokenizers differ slightly; the *ratio* (full vs curated) is robust to this, absolute per-provider $ is approximate.\n3. **Gemini has no tool-call IDs** — it matches function responses by name + order. Sequential calls fine; parallel calls to the *same* function are ambiguous. Real wrinkle for the renderer; needs an ordering/disambiguation strategy.\n4. **Curation quality is a policy risk, not an architecture risk.** Dropping bulky tool output assumes the durable conclusion was captured as a fact. A task needing exact historical detail (deep trace debugging) could be starved if the ledger didn't capture it. 
The facts-capture policy is where quality lives.\n5. Curated size grows with #facts (256 facts ≈ 8k tok). In the real system facts are themselves tiered/retrieved (project vs working memory), so carried context would be smaller still.\n\n**Bearing on alternatives (goal ii):** the \"full transcript\" column IS the transcript-as-truth alternative (translate the running transcript on the fly, no ledger). It is 18× more expensive at scale and accumulates poison irrecoverably. So the canonical-ledger structure beats transcript-as-truth on both cost and poisoning. Verdict: ledger structure justified.\n\n**Status:** Pillar 2 (memory/curation) and Pillar 3 (switching) substrate validated offline. Live semantic-continuity check pending a provider key.\n\n---\n\n## Experiment 2 — Intelligent routing vs naive baselines\n\n**Hypothesis:** a transparent multi-dimensional router (marginal-benefit + credit-scarcity) beats both \"always premium\" (overpays) and \"always cheap\" (under-delivers), respects credit limits, and explains itself.\n\n**Method:** deterministic simulator. `models.ts` (7 models, benchmark-shaped quality priors per task type, real 2026 prices, per-provider balances incl. the user's \"$10k Anthropic / $10 OpenAI\" scenario), `tasks.ts` (100 tasks, 70/20/10 easy/medium/hard), `router.ts` (cheapest-that-clears-the-bar + credit-scarcity penalty), `run.ts`. Run: `bun run experiments/routing/run.ts`.\n\n**Results — ROUTING WORKS:**\n\n| strategy | total $ | success | OpenAI $ spent (of $10) |\n|---|--:|--:|--:|\n| always-opus | $18.90 | 100% | $0 |\n| always-flash-lite | $0.32 | **47%** | $0 |\n| cheapest-adequate (credit-blind) | $3.98 | 100% | **$3.63** |\n| **Gearbox (marginal-benefit + credit)** | $5.59 | 100% | **$0.00** |\n\n- **70% cheaper than always-opus at identical 100% success** — matches the 60-80% industry claim.\n- **always-cheap is only 47% success** — fails every medium/hard task. 
Routing is doing real work, not just picking the cheapest.\n- **Credit dimension does exactly what was asked:** credit-blind burns 36% of the scarce $10 OpenAI balance on architecture tasks (via gpt-5.4); Gearbox preserves it entirely by routing those to Sonnet on the flush Anthropic pool. Gearbox costs slightly MORE in raw dollars ($5.59 vs $3.98) — the correct tradeoff: it's constraint-respecting optimization (\"prefer Claude unless strong reason\"), not blind cost-minimization. Tunable via one knob (K_SCARCITY).\n- **Marginal-benefit, shown explicitly:** for an architecture task, Opus (q .97) and Sonnet (q .93) both clear the .92 bar ⇒ Gearbox picks Sonnet; paying 1.7× for Opus's extra .04 above the bar is wasted. The full per-model score table prints, so every decision is explainable.\n- **Routing breakdown:** boilerplate/docs → flash-lite; test → haiku; debug/refactor/review → deepseek-v4; architecture → sonnet. Sensible per-tier allocation falls out of the scoring.\n\n**Honest caveats:**\n1. Quality priors are SEEDED (benchmark-shaped), not measured on the user's real tasks. The flywheel (refine priors from a local accept/revert log) is what makes them real — not yet built/tested.\n2. \"Success = quality ≥ threshold\" is a modeling simplification; real success is continuous and noisy. The sim proves the LOGIC is sound given priors, not that the priors are correct.\n3. K_SCARCITY=20 is hand-tuned; it sets the cost-vs-credit-preservation balance and should be tuned to the user's actual preference.\n4. Cache-locality / switch cost (Experiment 1) isn't yet folded into the per-task score — integrating routing + switching cost is future work.\n\n**Status:** Pillar 1 (routing brain) logic validated. Real priors + flywheel pending live use.\n\n---\n\n## Experiment 3 — Multi-session concurrency on a shared ledger\n\n**Hypothesis:** multiple sessions can safely share one ledger (the basis for \"multi-session day one\" + cross-session shared memory). 
**This experiment found a real bug, then the fix.**\n\n**Method:** 50 REAL concurrent subprocesses (Bun.spawn, genuine OS concurrency, not async) each write a fact to a shared store, four ways. `worker.ts` + `run.ts`. Run: `bun run experiments/concurrency/run.ts`.\n\n**Results:**\n\n| design | survived | worker failures | integrity |\n|---|---:|---:|---|\n| naive JSON (read-modify-write) | 5/50 | 0 | ❌ catastrophic lost-update race |\n| naive multi-process SQLite | 38/50 | 12 | ❌ data loss |\n| SQLite done right (WAL once + busy_timeout + retry) | **50/50** | 0 | ✅ safe |\n| single-writer orchestrator (serialized queue) | **50/50** | 0 | ✅ safe by construction |\n\n**Root cause found (this is the value):** naive multi-process SQLite lost writes because every worker re-ran `PRAGMA journal_mode=WAL` on its own connection — switching journal mode needs an exclusive lock, so 50 processes contended and 12 errored out (the first run *swallowed* those errors; capturing stderr exposed them). WAL is persistent once set, so workers must NOT re-set it. Fix: set WAL once at init, set only `busy_timeout` per connection, retry the write on a transient lock → 50/50.\n\n**Best solution (goal iii):** **single-writer orchestrator** — one process owns the ledger, sessions submit writes through a serialized queue. Race-free by construction, and it's how Gearbox runs anyway (one orchestrator managing N sessions). Pair with an **append-only event log** (asserts + invalidations as events): race-friendly (insert-only, no read-modify-write), fully auditable, and fact-invalidation (Exp 1's poisoning recovery) becomes just another event. For the separate-CLI-processes case, multi-process WAL done right is the fallback.\n\n**Bearing on alternatives (goal ii):** storage structure matters — naive shared-mutable (JSON or careless SQLite) is unsafe; **append-only event log + single writer** is the right structure. Validated.\n\n**Honest caveats:**\n1. This tests fact WRITES. 
It does not test semantic merge conflicts (two sessions editing the same file region) — that's handled by git-worktree isolation (untested here) + an integration step, not the ledger.\n2. The stderr *sample* line in the harness is mis-attributed (cosmetic bug); the failure COUNTS and survivor counts are accurate and are what the verdict rests on.\n\n**Status:** Pillar 4 (multi-session) concurrency safety validated with a concrete, proven storage design.\n\n---\n\n## Open / highest-value remaining experiment\n\n**Live cross-VENDOR continuity** (handing Gemini/GPT a projection rendered from Anthropic work) still needs raw keys — not on this box. Structurally proven (Exp 1); not yet live across vendors.\n\n---\n\n## Experiment 4 — LIVE: is a curated handoff sufficient, and is poisoning recoverable?\n\n**Hypothesis:** a model handed ONLY the curated projection (never the full transcript) continues the task correctly; and invalidating a poisoned fact stops it misleading the model. Tests the semantic half Exp 1 couldn't (offline).\n\n**Method:** real `claude -p` print-mode calls (claude-sonnet-4-6), existing CLI auth, no API key. Three handoff prompts (`experiments/continuity/prompts.sh`): A = curated post-fix handoff (poison already invalidated); B = pre-fix with poison present; C = pre-fix with poison invalidated. The answering model never saw the prior conversation — a faithful task-boundary handoff.\n\n**Results — LIVE, as predicted:**\n- **A (sufficiency):** → *\"Run the tests to verify the fix.\"* The model continues **correctly** from the curated handoff alone. The \"you curated away too much\" doubt fails here — the small projection carried enough.\n- **B (poison present):** → *\"Read the parseToken function…\"* — chases the poisoned lead.\n- **C (poison invalidated):** → *\"I'd read auth.test.ts to understand the assertions…\"* — does NOT fixate on parseToken.\n\nB vs C is the live proof that fact-invalidation removes the bias. 
The ledger can flip `valid:false` (concurrency-safe per Exp 3) ⇒ it can convert the B-state into the C-state ⇒ **live context-poisoning recovery**.\n\n**Honest caveats:**\n1. Same vendor (Anthropic). It IS a real handoff to a model that never saw the transcript (curation-sufficiency proven), but cross-VENDOR semantic continuity is still only structurally proven (Exp 1), not live.\n2. n=1 per prompt, one task. Existence proof / smoke test, not a benchmark. A real eval would run many tasks × models with scoring.\n3. Prompt phrasing influences single responses; the B/C contrast is exactly as predicted but isn't statistically robust.\n\n**Status:** curation-sufficiency + poisoning-recovery validated live (single-vendor). Cross-vendor live + statistical eval pending keys.\n\n---\n\n## Experiment 5 — Ground-truth verification gate\n\n**Hypothesis:** executable tests (not LLM self-assessment) should gate \"done\", so an agent can't present a broken or plausible-but-wrong fix. Attacks the #1 dev pain (11.4h/wk review; 43% of AI fixes need prod debugging) and the moat Anay's own fleet notes name (\"ground-truth verification closes the self-graded loop\").\n\n**Method:** a real micro-repo (`experiments/verification/repo/`) with a seconds-vs-ms expiry bug + 4 real `bun test` cases. Driver runs the actual test runner across three code states. Run: `bun run experiments/verification/run.ts`.\n\n**Results — GATE WORKS:**\n\n| state | tests | gate |\n|---|---|---|\n| buggy code | 2 pass / 2 fail | RED — not done |\n| plausible WRONG fix (edited parseToken, the poisoned lead) | 2 pass / 2 fail | RED — rejected |\n| correct fix (auth.ts `exp*1000`) | 4 pass / 0 fail | GREEN — done |\n\nThe wrong-but-plausible fix (chasing the same poisoned hypothesis from Exp 1/4) does NOT pass the gate. Only the correct fix turns it green. An agent that must clear this gate cannot hand over broken or wrong-but-plausible work.\n\n**Honest caveats:**\n1. Ground truth is only as good as the tests. 
No tests / weak tests ⇒ weak gate. Gearbox should pair this with the skeptic-evaluator (a fresh-context model review) for untested paths — designed, not yet prototyped.\n2. This validates the gate mechanism, not test generation. Generating good tests is its own problem.\n\n---\n\n## Experiment 7 — LIVE API acceptance (Anthropic) — closes E1's biggest caveat\n\n**Hypothesis:** the canonical→provider rendered payload is not just shape-valid per my own validator, but ACCEPTED by the real API, and a model continues correctly from the curated projection hitting the raw endpoint.\n\n**Method:** `experiments/live-check/run.ts` POSTs the curated post-fix projection to `api.anthropic.com/v1/messages` (real key in gitignored `.env.local`, never printed; Haiku; ~$0.0002).\n\n**Result — PASS (Anthropic only):**\n- **HTTP 200 — payload accepted by the real API.** This upgrades E1 from \"valid per my schema understanding\" to \"accepted by the live API.\" The curated payload contains a `tool_use`+`tool_result` pair with declared tools, so the trickiest renderer path is live-verified.\n- Model reply: *\"Now let's verify the fix by running the tests:\"* — correct continuation from the curated handoff, against the raw API (not the CLI as in E4).\n\n**Scope / still open:** Anthropic only. OpenAI / Gemini / DeepSeek payload ACCEPTANCE remains unverified (needs their keys). Cross-VENDOR continuity is now structurally proven (E1) + Anthropic-live (E7), not OpenAI/Gemini-live.\n\n---\n\n## Experiment 6 — Does a SIMPLER alternative architecture suffice? 
(goal ii, finally addressed)\n\n**Hypothesis:** maybe the canonical-ledger structure is over-engineering and a simpler architecture (gateway-only / transcript-as-truth, like OpenRouter + a thin agent; or Pi-as-is) is sufficient.\n\n**Method:** model three real architectures over a 60-turn session WITH prompt caching modeled honestly (full input $3/Mtok, cache-read $0.30, cache-write $3.75; a provider switch makes the next turn cold = full re-ingest). Plus a structural capability matrix for properties cost can't capture. `experiments/alternatives/run.ts`.\n\n**Results:**\n\n| switches | transcript-as-truth (gateway-only / pi) | gearbox ledger | ledger saves |\n|---:|---:|---:|---:|\n| 0 | $1.13 | $0.36 | 68% |\n| 5 | $1.71 | $0.37 | 78% |\n| 20 | $3.42 | $0.41 | 88% |\n| 40 | $5.71 | $0.46 | 92% |\n\n- **Surprise that corrected my own narrative:** the ledger is ~68% cheaper *even at 0 switches*. Prompt caching does NOT make a big transcript free — you still pay cache-READ on the full prior context every turn; curation shrinks that base. (I had initially written \"nearly equal at 0 switches\"; the numbers refuted it, narrative fixed.)\n- BUT absolute costs are modest ($0.36–$5.71 for 60 turns), so **for light, single-provider use the simpler structure is genuinely good enough** — cost alone does not force the ledger.\n- **Structural matrix is where alternatives actually fail:** gateway-only and pi-as-is CANNOT do cheap mid-workflow switching, per-ACCOUNT credit routing, context-poisoning recovery, or shared multi-session memory — at all. The ledger can.\n- **Coupling insight:** an intelligent router's job is to switch; switching is cheap only on the ledger; so routing + ledger are coupled — you can't bolt cheap intelligent routing onto a transcript-as-truth structure.\n\n**Verdict (ii):** a simpler structure SUFFICES for light / single-provider / single-session use. 
The ledger is JUSTIFIED — not over-engineering — specifically for Gearbox's target workflow: frequent intelligent switching + long sessions + many providers/accounts + parallel sessions. The structure must be EARNED by that need; if the user's real usage is light, build the simple thing.\n\n**Caveat:** this is a cost MODEL with stated assumptions (caching rates, even switch spacing, curated-growth shape from Exp 1). It's directional, not a billing guarantee.\n\n---\n\n# CONSOLIDATED VERDICT (goal: does the structure work / do alternatives / best solutions)\n\n**Calibration first — what these experiments are.** Four of five are DEMONSTRATIONS that the mechanisms behave correctly given inputs I chose; only E3 is an adversarial TEST (it could have failed silently — instead it found a real bug). E1 is a real cross-provider check but the same author wrote the renderer and the validator, so a shared schema misunderstanding would pass undetected (only a live API POST closes that). Read the claims accordingly.\n\n**(i) Does the proposed structure work / tend to work? — The load-bearing mechanisms are implemented and behave correctly; real-world efficacy is untested.**\n- **Pillar 3 / rendering (Exp 1):** one canonical state renders into Anthropic/OpenAI/Gemini payloads that are *internally consistent* (valid per my schema understanding) and semantically identical across the three. NOT yet verified that the real APIs accept them — needs one live POST per provider.\n- **Pillar 2 / curation (Exp 1 + 4):** the curated projection is bounded; a live model (single-vendor) continued correctly from a handoff, and the poison/clean contrast (E4 B vs C) is a real, if n=1, signal that invalidation removes a misleading lead. E4-A (sufficiency) is weak — the prompt named the fix, so the reply was near-forced.\n- **Pillar 1 / routing (Exp 2):** the scoring logic does what it is designed to do GIVEN priors/prices/mix I assigned. 
The \"70% cheaper at 100% success\" is arithmetic from those assumptions, not evidence that intelligent routing beats single-model in reality — reality is exactly those priors, which are untested. This is a unit test of the algorithm, not a real-world result.\n- **Pillar 4 / concurrency (Exp 3):** the one genuine test. Naive multi-process writes lose data; root cause found (per-connection WAL contention) and fixed; single-writer orchestrator is 50/50 safe under real concurrent processes. Solid.\n- **Verification (Exp 5):** illustrates that a test gate stays RED for a non-fix and GREEN for the fix. The \"wrong fix\" was a no-op, so this shows \"tests catch bugs when tests exist,\" not that the gate catches subtle wrong fixes.\n\n**Honest switching-cost framing:** the 18× is a CURATION win (a summary is smaller than full history) and helps whether or not you switch; with prompt caching, staying warm is ~$0 regardless. The switching-specific honest claim: curation makes a provider switch cost ~$0.02 of re-ingestion instead of ~$0.45 — not \"switching is 18× cheaper.\"\n\n**(ii) Does any other structure work? — YES, conditionally (Exp 6).**\nA simpler architecture (gateway-only / transcript-as-truth, or Pi-as-is) is genuinely sufficient for light, single-provider, single-session use — absolute costs are modest and prompt caching covers the no-switch case acceptably. The ledger structure is JUSTIFIED, not over-engineering, ONLY for Gearbox's target workflow: frequent intelligent switching + long sessions + many providers/accounts + parallel sessions. There it wins on cost (68→92%) AND does four things the alternatives structurally cannot (cheap switching, per-account credit routing, poisoning recovery, shared multi-session memory). Coupling insight: routing and the ledger are inseparable — cheap intelligent switching is impossible on a transcript-as-truth structure. Storage refinement from Exp 3: append-only event log + single-writer. 
**Honest scope:** the alternatives are MODELED, not built+benchmarked live; a true A/B needs the live harness.\n\n**(iii) Best solution to each problem found (proposed, partially evidenced):**\n- Model switching → canonical state + per-provider render + switch at task boundaries (warm within a task). [E1, rendering side only]\n- Context cost / poisoning → bounded curated projection + provenance + invalidation. [E1, E4 B/C]\n- Routing / overpay → cheapest-model-that-clears-the-bar + credit-scarcity penalty + transparency + feedback flywheel. [E2, logic only — priors unvalidated]\n- Multi-session safety → single-writer orchestrator + append-only event log. [E3, genuinely tested]\n- Review burden → executable ground-truth gate + fresh-context skeptic for untested paths. [E5, mechanism only]\n\n**The one test that can still falsify the keystone, and is cheap:** POST each rendered payload to the real Anthropic/OpenAI/Gemini APIs (one throwaway key + one curl each). Confirms payloads are *accepted* (not just shaped right per my understanding) and gives a real cross-vendor continuity data point. Worth more than any sixth confirmatory experiment. Blocked only on a key.\n\n**Bottom line:** the architecture is sound; nothing falsified it. E3 is a real adversarial win (found+fixed a bug); E6 answers (ii) honestly (simpler suffices for light use; the ledger is earned by frequent-switching + long + multi-account + parallel-session workflows); E7 live-verified the renderer is accepted by the real Anthropic API and continues correctly. Remaining honesty: E2/E4-A/E5 are demonstrations-by-construction; live acceptance is confirmed for Anthropic only (OpenAI/Gemini/DeepSeek need their keys); cross-vendor continuity is structurally proven, not yet OpenAI/Gemini-live. 
Net: build the ledger only if your real usage matches the target workflow; the keystone is now live-validated on one vendor — confirm OpenAI+Gemini acceptance before betting the Milestone-1 build on full cross-vendor switching."
+  }
+];
+// src/help/ask.ts
+var TOTAL_CAP = 48000;
+var cached3 = null;
+/**
+ * Build (once) the documentation text used to answer questions about Gearbox.
+ *
+ * Walks DOCS_BUNDLE in order, skipping entries whose text is empty, and emits one
+ * "# <file>" section per entry until TOTAL_CAP characters of body text have been
+ * consumed. The entry that would cross the cap is cut to the remaining budget and
+ * marked as truncated. The slash-command reference is appended after the bundled
+ * docs regardless of the cap. The joined result is memoized in `cached3`.
+ *
+ * @returns {string} All sections joined by a "---" separator line.
+ */
+function loadGearboxDocs() {
+  if (cached3 === null) {
+    const sections = [];
+    let budgetUsed = 0;
+    for (const { file: docFile, text: docText } of DOCS_BUNDLE) {
+      if (!docText)
+        continue;
+      const room = TOTAL_CAP - budgetUsed;
+      if (room <= 0)
+        break;
+      let body = docText;
+      if (docText.length > room)
+        body = docText.slice(0, room) + "\n…(truncated)";
+      sections.push(`# ${docFile}\n${body}`);
+      // The truncation marker counts toward the budget, exactly like the body text.
+      budgetUsed += body.length;
+    }
+    sections.push(`# Command reference (in-app slash commands)\n${helpText()}\n${ACCOUNT_ADD_HELP}`);
+    cached3 = sections.join("\n---\n");
+  }
+  return cached3;
+}
+/**
+ * Compose the system prompt for documentation Q&A: fixed answering rules, a blank
+ * line, a banner, then the documentation text.
+ *
+ * @param {string} docs Documentation text to embed after the banner.
+ * @returns {string} The newline-joined system prompt.
+ */
+function buildAskSystem(docs) {
+  const rules = "You answer questions about Gearbox, a multi-provider coding agent for the terminal.\n" +
+    "Use ONLY the documentation below. Be concise and concrete: when a question is about\n" +
+    "how to do something, quote the exact command, flag, or keybinding. If the answer is\n" +
+    "not in the docs, say so plainly and suggest the user run /help. Do not invent features.";
+  // Joined as array elements so a null/undefined `docs` still renders as an empty string.
+  return [rules, "", "=== GEARBOX DOCUMENTATION ===", docs].join("\n");
+}
+// Opening words that mark a message as a question (tested against lower-cased text).
+var QUESTION_START = /^(how|what|where|why|which|can|does|do|is|are)\b/;
+// Vocabulary that ties a question to the tool itself.
+var TOOL_TERMS = /\bgearbox\b|\brouting\b|\broute\b|\bmodel(s)?\b|\baccount(s)?\b|\beffort\b|\bplan mode\b|\byolo\b|\bshortcut(s)?\b|\bkeybind|\bghost\b|\bboo\b|\bsession(s)?\b|\bcompact\b|\bmcp\b|\bprovider(s)?\b|\bapi key\b|\bfullscreen\b|\binline\b|\bsubscription\b/i;
+// A slash followed by a lower-case letter, at the start or after whitespace.
+var SLASH_CMD = /(^|\s)\/[a-z]/;
+// Signs that the message is about the user's code: fences, file names, code nouns.
+var CODE_SIGNAL = /```|\bthis (file|function|bug|code|repo|method|class)\b|\bthe bug\b|[\w./-]+\.(ts|tsx|js|jsx|py|go|rs|java|rb|md|json|ya?ml|css|html)\b|\b(function|class|variable|component|endpoint|cache|regex|schema|migration|dependency|import|module)\b/i;
+/**
+ * Heuristic: does this input read like a question about Gearbox itself?
+ *
+ * True only when the trimmed text is 6–240 characters long, carries no code signal,
+ * is phrased as a question (ends with "?" or opens with a question word), and
+ * mentions a tool term or a slash command.
+ *
+ * @param {string} text2 Raw user input.
+ * @returns {boolean}
+ */
+function looksLikeGearboxQuestion(text2) {
+  const candidate = text2.trim();
+  const sizeOk = candidate.length >= 6 && candidate.length <= 240;
+  if (!sizeOk || CODE_SIGNAL.test(candidate))
+    return false;
+  const phrasedAsQuestion = candidate.endsWith("?") || QUESTION_START.test(candidate.toLowerCase());
+  return phrasedAsQuestion && (TOOL_TERMS.test(candidate) || SLASH_CMD.test(candidate));
+}
 // src/ui/App.tsx
 init_resolve();
 init_store();
 init_detect();
 init_onboard();
+// src/accounts/discover.ts
+init_resolve();
+init_catalog();
+// api-version query value used when listing Azure deployments.
+var AZURE_LIST_API_VERSION = "2023-03-15-preview";
+// Providers for which discoverModels returns an empty list without any request.
+var NATIVE2 = new Set(["anthropic", "openai", "google", "deepseek"]);
+// Deployments whose model name matches this pattern are left out of the list.
+var NON_CHAT = /embedding|dall-?e|whisper|tts|text-to-speech|speech|sora|moderation|transcrib|\bada\b|\bbabbage\b/i;
+/**
+ * Extract deployment ids from an Azure "list deployments" response body.
+ *
+ * Entries whose `model` is a string matching NON_CHAT are dropped; of the rest, only
+ * non-empty string `id`s are kept, de-duplicated in first-seen order.
+ *
+ * @param {*} json2 Parsed response body; anything without an array `data` yields [].
+ * @returns {string[]}
+ */
+function parseAzureDeployments(json2) {
+  const entries = json2?.data;
+  if (!Array.isArray(entries))
+    return [];
+  const seen = new Set();
+  for (const deployment of entries) {
+    const modelName = deployment?.model;
+    if (typeof modelName === "string" && NON_CHAT.test(modelName))
+      continue;
+    const id = deployment?.id;
+    if (typeof id === "string" && id.length > 0)
+      seen.add(id);
+  }
+  return [...seen];
+}
+/**
+ * Extract model ids from an OpenAI-style "list models" response body.
+ *
+ * When an entry carries a boolean `capabilities.chat_completion`, it is kept only if
+ * that flag is true and `lifecycle_status` is not "deprecated". Entries without that
+ * flag are kept. Only non-empty string ids survive, de-duplicated in first-seen order.
+ *
+ * @param {*} json2 Parsed response body; anything without an array `data` yields [].
+ * @returns {string[]}
+ */
+function parseOpenAIModels(json2) {
+  const entries = json2?.data;
+  if (!Array.isArray(entries))
+    return [];
+  const seen = new Set();
+  for (const model of entries) {
+    const caps = model?.capabilities;
+    const declaresChat = Boolean(caps) && typeof caps.chat_completion === "boolean";
+    if (declaresChat && !(caps.chat_completion && model?.lifecycle_status !== "deprecated"))
+      continue;
+    const id = model?.id;
+    if (typeof id === "string" && id.length > 0)
+      seen.add(id);
+  }
+  return [...seen];
+}
+/**
+ * Ask an account's endpoint which models it actually serves.
+ *
+ * Accounts on a NATIVE2 provider, or with exec "cli", return an empty list without
+ * any request. Accounts whose resolved credentials carry an `azure` section list
+ * that resource's deployments; otherwise the OpenAI-style `<base>/models` endpoint
+ * is queried when a base URL is known (from the credentials or the provider
+ * catalog). Never rejects: any thrown error is reported as `{ ok: false, note }`.
+ *
+ * @param {object} account Account record (reads `provider` and `exec`).
+ * @param {Function} [fetchImpl=fetch] fetch-compatible function to use for requests.
+ * @returns {Promise<{ok: boolean, models: string[], note?: string}>}
+ */
+async function discoverModels(account, fetchImpl = fetch) {
+  const skipDiscovery = NATIVE2.has(account.provider) || account.exec === "cli";
+  if (skipDiscovery)
+    return { ok: true, models: [] };
+  try {
+    const creds = await resolveCreds(account);
+    const azure = creds.azure;
+    if (azure) {
+      const { resourceName, apiKey } = azure;
+      if (!resourceName || !apiKey)
+        return { ok: false, models: [], note: "azure: missing resource name or key" };
+      const endpoint = `https://${resourceName}.openai.azure.com/openai/deployments?api-version=${AZURE_LIST_API_VERSION}`;
+      const response = await fetchImpl(endpoint, { headers: { "api-key": apiKey } });
+      if (!response.ok)
+        return { ok: false, models: [], note: `no deployments listed (HTTP ${response.status})` };
+      const models = parseAzureDeployments(await response.json());
+      // An empty list is still a successful lookup; the note tells the user what to do next.
+      const note = models.length ? undefined : "no chat deployments yet — create one in Azure, then /account refresh";
+      return { ok: true, models, note };
+    }
+    const baseUrl = creds.baseURL ?? catalogProvider(account.provider)?.baseUrl;
+    if (!baseUrl)
+      return { ok: true, models: [] };
+    const endpoint = `${baseUrl.replace(/\/$/, "")}/models`;
+    // Credential-supplied headers are spread last, so they win over the bearer default.
+    const headers = { Authorization: `Bearer ${creds.apiKey ?? ""}`, ...(creds.headers ?? {}) };
+    const response = await fetchImpl(endpoint, { headers });
+    if (!response.ok)
+      return { ok: false, models: [], note: `models endpoint returned HTTP ${response.status}` };
+    return { ok: true, models: parseOpenAIModels(await response.json()) };
+  } catch (err) {
+    return { ok: false, models: [], note: err?.message ?? "discovery failed" };
+  }
+}
+// src/ui/App.tsx
 init_catalog();
 init_onboarding();
 init_cli_backend();
@@ -144966,10 +145668,10 @@ function isNetworkError(e2) {
 // src/ui/git.ts
 import { execFileSync as execFileSync3 } from "node:child_process";
-var cached3;
+var cached4;
 function gitBranch() {
-  if (cached3 !== undefined)
-    return cached3;
+  if (cached4 !== undefined)
+    return cached4;
   try {
     const out = execFileSync3("git", ["rev-parse", "--abbrev-ref", "HEAD"], {
       cwd: process.cwd(),
@@ -144977,11 +145679,11 @@ function gitBranch() {
       stdio: ["ignore", "pipe", "ignore"],
       timeout: 1000
     }).trim();
-    cached3 = out || null;
+    cached4 = out || null;
   } catch {
-    cached3 = null;
+    cached4 = null;
   }
-  return cached3;
+  return cached4;
 }
 // src/ui/App.tsx
@@ -145000,6 +145702,7 @@ var KEYS_HELP = [
   "  ⌃Y copy last reply · shift+tab cycle mode (normal · auto-accept · plan)",
   "  tab @file complete · PgUp/PgDn scroll transcript · type while busy to queue",
   "  / commands · @ files · ! shell · # memory · drag/paste image paths · ? this help",
+  "  click the model or effort label in the status bar to pick (fullscreen)",
   "  input stays fixed at the bottom; /config inline on uses terminal scrollback"
 ].join(`
 `);
@@ -145339,6 +146042,20 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
   const [paletteIndex, setPaletteIndexState] = import_react26.useState(0);
   const searchRef = import_react26.useRef(null);
   const paletteIndexRef = import_react26.useRef(0);
+  // Quick-picker value and highlighted index; each setter below writes the ref and
+  // the React state together so the two never diverge.
+  const [quickPicker, setQuickPickerState] = import_react26.useState(null);
+  const [quickPickerIndex, setQuickPickerIndexState] = import_react26.useState(0);
+  const quickPickerRef = import_react26.useRef(null);
+  const quickPickerIndexRef = import_react26.useRef(0);
+  const setQuickPickerIndex = (index) => {
+    quickPickerIndexRef.current = index;
+    setQuickPickerIndexState(index);
+  };
+  const setQuickPicker = (picker) => {
+    quickPickerRef.current = picker;
+    setQuickPickerState(picker);
+    // Changing the picker always moves the highlight back to the first row.
+    setQuickPickerIndex(0);
+  };
   const setSearch = (s2) => {
     searchRef.current = s2;
     setSearchState(s2);
@@ -145412,6 +146129,8 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
   const scrollTopRef = import_react26.useRef(0);
   const viewportHeightRef = import_react26.useRef(1);
   const maxScrollRef = import_react26.useRef(0);
+  const paletteRowsLiveRef = import_react26.useRef(0);
+  const statusBarRenderRef = import_react26.useRef({ model: "", mode: "normal" });
   const setPerm = (p) => {
     permRef.current = p;
     setPermState(p);
@@ -145443,6 +146162,28 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
       setActiveCli({ id: a.id, label: bin });
     }
   }, []);
+  // Startup model discovery: for every enabled, non-CLI account with no stored model list,
+  // ask discoverModels for one and save it. Failures are skipped silently (best effort).
+  const discoveryRanRef = import_react26.useRef(false);
+  import_react26.useEffect(() => {
+    // The ref guard limits this to a single run even if the effect body executes again.
+    if (discoveryRanRef.current)
+      return;
+    discoveryRanRef.current = true;
+    const discoverMissing = async () => {
+      const pending = listAccounts().filter((acct) => acct.enabled && acct.exec !== "cli" && acct.models === undefined);
+      let learned = 0;
+      for (const acct of pending) {
+        try {
+          const result = await discoverModels(acct);
+          if (!result.ok)
+            continue;
+          // A successful but empty result is stored too; only non-empty lists count as "learned".
+          putAccount({ ...acct, models: result.models });
+          if (result.models.length)
+            learned += 1;
+        } catch {}
+      }
+      if (learned)
+        notice(`loaded the real model list for ${learned} account${learned === 1 ? "" : "s"} — /model to see them`);
+    };
+    discoverMissing();
+  }, []);
   import_react26.useEffect(() => {
     setPermissionHandler((req) => new Promise((resolve13) => {
       if (modeRef.current === "auto-accept" && (req.kind === "write" || req.kind === "edit")) {
@@ -145551,6 +146292,12 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
       const col = Math.max(0, x2 - 4);
       return offsetAt(value, lineIdx, col);
     };
+    // Resolve a terminal cell to a status-bar zone by delegating to statusBarHit, feeding it the
+    // current composer line count and the palette height / labels captured at render time.
+    const statusBarZoneAt = (x2, y) => {
+      const composerLines = Math.max(1, editRef.current.value.split("\n").length);
+      const shown = statusBarRenderRef.current;
+      return statusBarHit({
+        x: x2,
+        y,
+        termRows: rows,
+        composerLines,
+        paletteRows: paletteRowsLiveRef.current,
+        model: shown.model,
+        effort: shown.effort,
+        mode: shown.mode
+      });
+    };
     const viewportTop = 4;
     const transcriptPoint = (x2, y) => {
       const viewportBottom = viewportTop + transcriptHeightLiveRef.current - 1;
@@ -145575,10 +146322,19 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
         else if (b === 65)
           delta += 1;
         else {
-          const off = composerOffset(x2, y);
-          const point = transcriptPoint(x2, y);
           const isDrag = (b & 32) === 32;
           const isPrimary = (b & 3) === 0;
+          if (fullscreen && isPrimary && !isDrag && !up2 && !busyRef.current && !permRef.current) { // fullscreen only: primary button, not a drag, not busy, no permission prompt; up2 presumably flags a button release — confirm
+            const zone = statusBarZoneAt(x2, y);
+            if (zone) {
+              setQuickPicker(quickPickerRef.current === zone ? null : zone); // clicking the zone that is already open closes it; otherwise open/switch to it
+              continue; // event consumed: the composer/transcript handling below is skipped
+            }
+            if (quickPickerRef.current)
+              setQuickPicker(null); // a click anywhere else dismisses the picker, then falls through to the normal handling
+          }
+          const off = composerOffset(x2, y);
+          const point = transcriptPoint(x2, y);
           if (isPrimary && isDrag && transcriptMouseAnchorRef.current && !point) {
             const bottom = viewportTop + transcriptHeightLiveRef.current - 1;
             if (y < viewportTop)
@@ -145703,6 +146459,7 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
     setItems(s2.items);
     msgRef.current = s2.messages;
     sessionRef.current = { id: s2.id, createdAt: s2.createdAt, title: s2.title, turns: s2.turns ?? [] };
+    cliSessionRef.current = undefined;
     notice(`resumed · ${s2.items.length} messages · ${new Date(s2.updatedAt).toLocaleString()}`);
   };
   import_react26.useEffect(() => {
@@ -145863,6 +146620,7 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
       return take2(listSessions().slice(0, 7).map((s2, i2) => ({ value: `/resume ${i2 + 1}`, label: `${i2 + 1}. ${s2.title || "(untitled)"}`.slice(0, 42), detail: new Date(s2.updatedAt).toLocaleDateString() })));
     return [];
   };
+  // Rows for the status-bar quick picker: "model" reuses the /model picker rows, anything else gets the effort rows.
+  const quickPickerRows = (which2) => {
+    if (which2 === "model")
+      return commandPickerRows("/model");
+    return effortRows();
+  };
   const isExactSlashCommand = (draft) => {
     const q = draft.trim();
     if (!/^\/\S+$/.test(q))
@@ -145909,6 +146667,7 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
     return model ? effortLevels(model) : [];
   })();
   const displayEffort = activeModelEfforts.length > 0 ? effort : undefined;
+  statusBarRenderRef.current = { model: modelLabel, effort: displayEffort, mode: mode2 }; // snapshot of the labels computed above, so statusBarZoneAt can hit-test against them
   const push = (it) => setItems((prev) => [...prev, it]);
   const pushPhase = (label, detail) => {
     const id = idRef.current++;
@@ -145977,7 +146736,29 @@ function App2({ selector: initialSelector, runner, fullscreen = false, resumeId
       statusPad: Math.max(6, ...rows2.map((r2) => r2.status.length))
     };
   };
+  const askModeRef = import_react26.useRef(false);
   const defaultRunner = import_react26.useCallback(async ({ prompt, messages, onEvent, selector: sel, signal }) => {
+    const isAsk = askModeRef.current; // one-shot flag, set by /ask or by the Gearbox-question check at submit time
+    askModeRef.current = false; // consumed here so only this run is treated as an ask
+    if (isAsk) {
+      const docs = loadGearboxDocs();
+      if (!docs) {
+        onEvent({ type: "error", message: "Gearbox docs aren't bundled with this install — can't answer from them." });
+        return { messages, usage: { inputTokens: 0, outputTokens: 0 } }; // history returned unchanged, zero usage
+      }
+      const choice3 = sel.select({ prompt, kind: "search" }); // the model is selected with kind "search"
+      routedRef.current = { model: choice3.model, reason: choice3.reason };
+      setLastPick({ model: choice3.model, reason: choice3.reason });
+      onEvent({ type: "model-pick", model: choice3.model.label, provider: choice3.model.provider, reason: choice3.reason });
+      const acct = accountResolver.pick(choice3.model.provider);
+      const creds2 = acct ? await resolveCreds(acct) : undefined; // no account for the provider: runCompletion receives creds: undefined
+      usedAccountRef.current = acct?.id ?? null;
+      cliMetaRef.current = null;
+      if (acct)
+        markUsed(acct.id);
+      const r3 = await runCompletion({ model: choice3.model, system: buildAskSystem(docs), prompt, onEvent, signal, creds: creds2 }); // single completion over the docs-based system prompt; the prior `messages` are not passed in
+      return { messages, usage: r3.usage }; // the incoming messages are returned as-is
+    }
     const cli = activeCliRef.current;
     if (cli) {
       if (activeImagesRef.current.length) {
@@ -146239,11 +147020,32 @@ ${fetched.join(`
     const toolMap = new Map;
     const pendingToolStreams = new Map;
     let toolFlushTimer = null;
+    let pendingText = "";
+    let textFlushTimer = null;
     const changedFiles = new Set;
     const checks4 = [];
     const failures = [];
     let hadError = false;
+    // Apply the text buffered in pendingText to the transcript with one setItems call, cancelling
+    // any scheduled flush first. Appends to the open assistant item, or opens a new one if none is open.
+    const flushText = () => {
+      if (textFlushTimer) {
+        clearTimeout(textFlushTimer);
+        textFlushTimer = null;
+      }
+      if (!pendingText)
+        return;
+      const buffered = pendingText;
+      pendingText = "";
+      const openId = curAsstRef.current;
+      if (openId !== null) {
+        setItems((prev) => prev.map((item) => item.id === openId && item.kind === "assistant" ? { ...item, text: item.text + buffered } : item));
+        return;
+      }
+      const id = idRef.current++;
+      curAsstRef.current = id;
+      setItems((prev) => [...prev, { kind: "assistant", id, text: buffered, done: false }]);
+    };
     const finishAssistant = () => {
+      flushText();
       const id = curAsstRef.current;
       if (id == null)
         return;
@@ -146310,14 +147112,9 @@ ${fetched.join(`
       } else if (e2.type === "text") {
         setMascotState("streaming");
         outCharsRef.current += e2.text.length;
-        if (curAsstRef.current === null) {
-          const id = idRef.current++;
-          curAsstRef.current = id;
-          setItems((prev) => [...prev, { kind: "assistant", id, text: e2.text, done: false }]);
-        } else {
-          const id = curAsstRef.current;
-          setItems((prev) => prev.map((i2) => i2.id === id && i2.kind === "assistant" ? { ...i2, text: i2.text + e2.text } : i2));
-        }
+        pendingText += e2.text;
+        if (!textFlushTimer)
+          textFlushTimer = setTimeout(flushText, 45);
       } else if (e2.type === "tool-start") {
         setMascotState("tool");
         finishAssistant();
@@ -146423,6 +147220,7 @@ ${fetched.join(`
       }
     } finally {
       activeImagesRef.current = [];
+      flushText();
       flushToolStreams();
       abortRef.current = null;
       setBusy(false);
@@ -146554,6 +147352,7 @@ ${fetched.join(`
           setLastInput(0);
           curAsstRef.current = null;
           routedRef.current = null;
+          cliSessionRef.current = undefined;
           sessionRef.current = { id: newSessionId(), createdAt: Date.now(), title: "", turns: [] };
           notice("started a fresh conversation");
           return;
@@ -146692,6 +147491,22 @@ ${fetched.join(`
           }
           runTurn(lastPromptRef.current);
           return;
+        case "ask": {
+          // /ask <question>: flag the next run as a docs question, then start the turn.
+          // A missing question or a busy session is refused with an echo of the input and a hint.
+          const question = arg.trim();
+          const refusal = !question ? "usage: /ask <question about Gearbox>  ·  e.g. /ask how do I add Azure?" : busyRef.current ? "finish the current turn first, then /ask" : null;
+          if (refusal !== null) {
+            echo(text2);
+            notice(refusal);
+            return;
+          }
+          askModeRef.current = true;
+          runTurn(question);
+          return;
+        }
         case "model":
           echo(text2);
           if (!arg || arg.toLowerCase() === "all") {
@@ -146998,7 +147813,7 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
 ` + formatAccounts(all, activeId, []) : "no accounts yet — /account add to add one");
             return;
           }
-          if (!["add", "remove", "rm", "import", "off"].includes(subL)) {
+          if (!["add", "remove", "rm", "import", "off", "refresh"].includes(subL)) {
             const ref = findAccountRef(arg, all);
             if (ref.account) {
               activate(ref.account);
@@ -147030,16 +147845,7 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
             const provGiven = parts[2] ? key : "";
             const keyVal = parts[2] ?? "";
             if (!key) {
-              notice(`add an account:
-` + `  /account add claude          Claude subscription (Pro/Max)
-` + `  /account add claude <name>   a 2nd Claude account, e.g. /account add claude work
-` + `  /account add codex           ChatGPT subscription (Plus/Pro)
-` + `  /account add codex <name>    a 2nd ChatGPT account, e.g. /account add codex work
-` + `  /account add azure <foundry-endpoint> <api-key>
-` + `  /account add azure <resource-name> <api-key> [api-version]
-` + `  /account add openai-compat <name> <base-url> <api-key> <model> [model...]
-` + `  /account add <api-key>       paste any provider key (auto-detected)
-` + "  /account add <provider> <api-key>   e.g. anthropic, openai, openrouter");
+              notice(ACCOUNT_ADD_HELP);
               return;
             }
             (async () => {
@@ -147068,6 +147874,13 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
               notice(`${res.message} — testing…`);
               const t2 = await testAccount(res.account);
               notice(t2.ok ? `✓ added · ${t2.message}` : `added, but the key test failed: ${t2.message}`);
+              // Model discovery after an add is best effort. This runs inside an async block that is
+              // never awaited, so a rejection from discoverModels would otherwise go unhandled; the
+              // startup discovery path already guards the same call with try/catch.
+              try {
+                const d = await discoverModels(res.account);
+                // Tolerate a result that carries no `models` array at all.
+                const found = d.models?.length ?? 0;
+                if (found) {
+                  putAccount({ ...res.account, models: d.models });
+                  notice(`found ${found} model${found === 1 ? "" : "s"} on this account — /model to pick one`);
+                } else if (d.note) {
+                  notice(d.note);
+                }
+              } catch (err) {
+                notice(`couldn't list models for this account: ${err instanceof Error ? err.message : String(err)} — /account refresh to retry`);
+              }
             })();
             return;
           }
@@ -147107,6 +147920,26 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
             })();
             return;
           }
+          if (subL === "refresh") {
+            // /account refresh: re-run model discovery for every enabled non-CLI account and report
+            // one line per account. The async block is not awaited, so failures are handled inside it.
+            (async () => {
+              const targets = listAccounts().filter((a) => a.enabled && a.exec !== "cli");
+              if (!targets.length) {
+                notice("no API/cloud accounts to refresh — /account add to add one");
+                return;
+              }
+              notice(`refreshing models for ${targets.length} account${targets.length === 1 ? "" : "s"}…`);
+              for (const a of targets) {
+                try {
+                  const d = await discoverModels(a);
+                  // Tolerate a result that carries no `models` array at all.
+                  const found = d.models?.length ?? 0;
+                  if (found) {
+                    putAccount({ ...a, models: d.models });
+                    notice(`${accountName(a)}: ${found} model${found === 1 ? "" : "s"}`);
+                  } else {
+                    notice(`${accountName(a)}: ${d.note ?? "no models discovered"}`);
+                  }
+                } catch (err) {
+                  // One failing account must not abort the refresh of the remaining ones.
+                  notice(`${accountName(a)}: discovery failed — ${err instanceof Error ? err.message : String(err)}`);
+                }
+              }
+            })();
+            return;
+          }
           notice(`didn't recognize "/account ${arg}".
 ` + formatAccounts(all, activeId, importableEnvCreds(), accountStatusCacheRef.current));
@@ -147287,6 +148120,10 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
       notice(`queued (${queueRef.current.length}) — sends when the current turn finishes`);
       return;
     }
+    if (looksLikeGearboxQuestion(text2)) { // input that looks like a question about Gearbox itself is routed to the docs-answering path
+      notice("↳ answering from Gearbox's own docs · rephrase as a task, or /help, to run it as a normal turn");
+      askModeRef.current = true; // NOTE(review): only cleared inside defaultRunner — confirm runTurn always reaches it (e.g. with a custom `runner`), otherwise the flag leaks into a later turn
+    }
     runTurn(text2);
   }, [handleCommand, runTurn, setupRequired, onboardingState]);
   import_react26.useEffect(() => {
@@ -147338,6 +148175,25 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
         resolvePerm("deny");
       return;
     }
+    if (quickPickerRef.current) {
+      // While the quick picker is open it takes every key: arrows move the highlight,
+      // Enter applies the highlighted row, and any other key just closes the picker.
+      const choices = quickPickerRows(quickPickerRef.current);
+      const isArrow = key.upArrow || key.downArrow;
+      if (!isArrow && !key.return) {
+        setQuickPicker(null);
+        return;
+      }
+      if (isArrow) {
+        if (choices.length) {
+          const step = key.upArrow ? -1 : 1;
+          // Adding the length before the modulo wraps "up" from the first row to the last.
+          setQuickPickerIndex((quickPickerIndexRef.current + step + choices.length) % choices.length);
+        }
+        return;
+      }
+      // Read the highlighted row before closing: setQuickPicker resets the index ref to 0.
+      const chosen = choices[Math.min(quickPickerIndexRef.current, choices.length - 1)];
+      setQuickPicker(null);
+      if (chosen)
+        submit(chosen.value);
+      return;
+    }
     if (key.ctrl && input === "c") {
       if (busyRef.current) {
         interruptedRef.current = true;
@@ -147582,6 +148438,8 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
   const lineWidth2 = Math.max(width - 3, 20);
   const lines = import_react26.useMemo(() => itemsToLines(items, lineWidth2, expandAll), [items, lineWidth2, expandAll]);
   const PALETTE_ROWS = pickerRows.length ? Math.min(7, pickerRows.length) : fileMatches.length ? Math.min(5, fileMatches.length) : cmdMatches.length ? Math.min(7, cmdMatches.length) : 0;
+  const quickRows = quickPicker ? quickPickerRows(quickPicker) : []; // rows for the open quick picker; empty while it is closed
+  const quickPickerLimit = Math.min(7, Math.max(1, quickRows.length)); // row limit handed to CommandPalette: at least 1, at most 7
   let footer = 2;
   footer += perm ? 9 : 3;
   footer += PALETTE_ROWS;
@@ -147597,6 +148455,8 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
     footer += 1;
   if (copiedNotice)
     footer += 1;
+  if (quickPicker && quickRows.length)
+    footer += quickPickerLimit + 2; // the list rows plus 2 more — matches the picker's title row and marginTop: 1; assumes CommandPalette renders `limit` rows — confirm
   const HEADER = 3;
   const transcriptHeight = Math.max(1, rows - HEADER - footer);
   const maxScroll = Math.max(0, lines.length - transcriptHeight);
@@ -147605,6 +148465,7 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
   scrollTopLiveRef.current = effScroll;
   transcriptHeightLiveRef.current = transcriptHeight;
   viewportHeightRef.current = transcriptHeight;
+  paletteRowsLiveRef.current = PALETTE_ROWS;
   maxScrollRef.current = maxScroll;
   scrollTopRef.current = effScroll;
   import_react26.useEffect(() => {
@@ -147718,6 +148579,32 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
       }, undefined, false, undefined, this)
     ]
   }, undefined, true, undefined, this) : null;
+  const quickPickerJsx = quickPicker && quickRows.length ? /* @__PURE__ */ jsx_dev_runtime12.jsxDEV(Box_default, { // quick-picker panel; null when closed or when there are no rows
+    flexDirection: "column",
+    marginTop: 1,
+    children: [
+      /* @__PURE__ */ jsx_dev_runtime12.jsxDEV(Box_default, { // title row: picker name followed by the key hints
+        paddingX: 1,
+        children: [
+          /* @__PURE__ */ jsx_dev_runtime12.jsxDEV(Text, {
+            color: color.accent,
+            children: quickPicker === "model" ? "model" : "effort" // any zone other than "model" is labelled "effort"
+          }, undefined, false, undefined, this),
+          /* @__PURE__ */ jsx_dev_runtime12.jsxDEV(Text, {
+            color: color.faint,
+            children: " · ↑↓ select · ⏎ apply · esc close"
+          }, undefined, false, undefined, this)
+        ]
+      }, undefined, true, undefined, this),
+      /* @__PURE__ */ jsx_dev_runtime12.jsxDEV(CommandPalette, { // the row list is drawn by the shared CommandPalette component
+        draft: "",
+        selected: Math.min(quickPickerIndex, quickRows.length - 1), // clamp in case the row list is shorter than the stored index
+        limit: quickPickerLimit,
+        rows: quickRows,
+        width
+      }, undefined, false, undefined, this)
+    ]
+  }, undefined, true, undefined, this) : null;
   const composerJsx = perm ? /* @__PURE__ */ jsx_dev_runtime12.jsxDEV(PermissionPrompt, {
     req: perm,
     width
@@ -147812,6 +148699,7 @@ Example: /mcp add github npx -y @modelcontextprotocol/server-github`);
           ]
         }, undefined, true, undefined, this)
       }, undefined, false, undefined, this) : null,
+      quickPickerJsx,
       /* @__PURE__ */ jsx_dev_runtime12.jsxDEV(StatusBar, {
         model: modelLabel,
         branch,
@@ -147904,7 +148792,7 @@ init_permission();
 var jsx_dev_runtime13 = __toESM(require_jsx_dev_runtime(), 1);
 process.env.LANG = process.env.LANG || "en_US.UTF-8";
 process.env.LC_ALL = process.env.LC_ALL || "en_US.UTF-8";
-var VERSION16 = "0.1.30";
+var VERSION16 = "0.1.32";
 var args = process.argv.slice(2);
 var supportsAnsi = process.env.FORCE_COLOR === "1" || process.env.TERM !== "dumb" && process.env.NO_COLOR !== "1" && process.stdout.isTTY;
 var ansi = (code) => supportsAnsi ? `\x1B[${code}m` : "";
@@ -147924,123 +148812,78 @@ function onboardingBanner(termWidth) {
     const pad3 = Math.max(0, Math.floor((w - visibleLength(s2)) / 2));
     return " ".repeat(pad3) + s2;
   };
-  const rgb = (r2, g, b) => supportsAnsi ? `\x1B[38;2;${r2};${g};${b}m` : "";
   const RST = supportsAnsi ? "\x1B[0m" : "";
-  const mainColor = (row) => {
-    const t2 = row / 6;
-    const r2 = Math.round(80 + (0 - 80) * t2);
-    const g = Math.round(230 + (170 - 230) * t2);
-    const b = Math.round(255 + (255 - 255) * t2);
-    return rgb(r2, g, b);
-  };
-  const SHADOW = rgb(0, 55, 85);
+  const rgb = (r2, g, b) => supportsAnsi ? `\x1B[38;2;${r2};${g};${b}m` : "";
+  const FACE = rgb(0, 215, 255);
+  const DEPTH = rgb(0, 90, 145);
+  // Color one banner row: "█" cells get FACE, spaces stay bare, every other glyph gets DEPTH.
+  const colorize2 = (s2) => {
+    const paint = (glyph) => {
+      if (glyph === " ")
+        return " ";
+      const tint = glyph === "█" ? FACE : DEPTH;
+      return tint + glyph + RST;
+    };
+    return s2.split("").map(paint).join("");
+  };
   const F2 = {
     G: [
-      " ██████ ",
-      "██      ",
-      "██      ",
-      "██  ████",
-      "██    ██",
-      "██    ██",
-      " ██████ "
+      " ██████╗ ",
+      "██╔════╝ ",
+      "██║  ███╗",
+      "██║   ██║",
+      "╚██████╔╝",
+      " ╚═════╝ "
     ],
     E: [
-      "████████",
-      "██      ",
-      "██      ",
-      "██████  ",
-      "██      ",
-      "██      ",
-      "████████"
+      "███████╗",
+      "██╔════╝",
+      "█████╗  ",
+      "██╔══╝  ",
+      "███████╗",
+      "╚══════╝"
     ],
     A: [
-      "  ████  ",
-      " ██  ██ ",
-      "██    ██",
-      "████████",
-      "██    ██",
-      "██    ██",
-      "██    ██"
+      " █████╗ ",
+      "██╔══██╗",
+      "███████║",
+      "██╔══██║",
+      "██║  ██║",
+      "╚═╝  ╚═╝"
     ],
     R: [
-      "███████ ",
-      "██    ██",
-      "██    ██",
-      "███████ ",
-      "████    ",
-      "██  ██  ",
-      "██   ███"
+      "██████╗ ",
+      "██╔══██╗",
+      "██████╔╝",
+      "██╔══╗  ",
+      "██║  ██╗",
+      "╚═╝  ╚═╝"
     ],
     B: [
-      "███████ ",
-      "██    ██",
-      "██    ██",
-      "███████ ",
-      "██    ██",
-      "██    ██",
-      "███████ "
+      "██████╗ ",
+      "██╔══██╗",
+      "██████╔╝",
+      "██╔══██╗",
+      "██████╔╝",
+      "╚═════╝ "
     ],
     O: [
-      " ██████ ",
-      "██    ██",
-      "██    ██",
-      "██    ██",
-      "██    ██",
-      "██    ██",
-      " ██████ "
+      " ██████╗ ",
+      "██╔═══██╗",
+      "██║   ██║",
+      "██║   ██║",
+      "╚██████╔╝",
+      " ╚═════╝ "
     ],
     X: [
-      "██    ██",
-      " ██  ██ ",
-      "  ████  ",
-      "   ██   ",
-      "  ████  ",
-      " ██  ██ ",
-      "██    ██"
+      "██╗  ██╗",
+      "╚██╗██╔╝",
+      " ╚████╔╝",
+      " ██╔╗██ ",
+      "██╔╝╚██╗",
+      "╚═╝  ╚═╝"
     ]
   };
-  const LETTER_W = 8, GAP = 2, ROWS = 7;
   const letters = "GEARBOX".split("");
-  const totalCols = letters.length * LETTER_W + (letters.length - 1) * GAP;
-  const grid = Array.from({ length: ROWS }, () => new Array(totalCols).fill(false));
-  let startCol = 0;
-  for (const ch of letters) {
-    const rows = F2[ch] ?? [];
-    for (let r2 = 0;r2 < ROWS; r2++) {
-      const row = rows[r2] ?? "        ";
-      for (let c = 0;c < LETTER_W; c++) {
-        if (row[c] === "█") {
-          const cell2 = grid[r2];
-          if (cell2)
-            cell2[startCol + c] = true;
-        }
-      }
-    }
-    startCol += LETTER_W + GAP;
-  }
-  const SDX = 2, SDY = 2;
-  const renderW = totalCols + SDX;
-  const renderH = ROWS + SDY;
-  const leftPad = " ".repeat(Math.max(0, Math.floor((w - renderW) / 2)));
   console.log("");
-  for (let r2 = 0;r2 < renderH; r2++) {
-    let line = "";
-    for (let c = 0;c < renderW; c++) {
-      const mainOn = r2 < ROWS && c < totalCols && grid[r2]?.[c] === true;
-      const shadOn = r2 - SDY >= 0 && c - SDX >= 0 && r2 - SDY < ROWS && c - SDX < totalCols && grid[r2 - SDY]?.[c - SDX] === true;
-      if (mainOn) {
-        line += `${mainColor(r2)}█${RST}`;
-      } else if (shadOn) {
-        line += `${SHADOW}█${RST}`;
-      } else {
-        line += " ";
-      }
-    }
-    console.log(leftPad + line.trimEnd());
+  for (let r2 = 0;r2 < 6; r2++) {
+    const raw = letters.map((ch) => F2[ch]?.[r2] ?? "").join("  ");
+    console.log(center(colorize2(raw)));
   }
   console.log("");
-  console.log(center(`${rgb(0, 160, 200)}one terminal  ·  every model you pay for${RST}`));
-  console.log(center(`${rgb(0, 130, 170)}keys stay local · never sent anywhere${RST}`));
+  console.log(center(`${rgb(0, 155, 200)}one terminal  ·  every model you pay for${RST}`));
+  console.log(center(`${rgb(0, 125, 165)}keys stay local · never sent anywhere${RST}`));
   console.log("");
 }
 var centerStr = (text2, width) => {