npm - mobygate - Versions diffs - 0.8.4 → 0.9.2 - Mend

mobygate 0.8.4 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/CHANGELOG.md +472 -0
package/bin/mobygate.js +214 -0
package/inspector.html +200 -3
package/lib/anthropic.js +6 -1
package/lib/captures-index.js +524 -0
package/lib/inference-runner.js +753 -0
package/lib/openai-translation.js +146 -0
package/lib/quiet.js +249 -0
package/lib/request-capture.js +24 -0
package/package.json +3 -1
package/server.js +318 -1110

package/server.js CHANGED Viewed

@@ -70,16 +70,14 @@ import {
   getCurrentVersion,
 } from './lib/updater.js';
 import {
-  anthropicMessagesToPrompt,
   collectAnthropicImages,
-  buildAnthropicResponse,
-  makeStreamTranslator,
   hasAnthropicTools,
-  mapStopReason,
-  extractSdkUsage,
 } from './lib/anthropic.js';
+import { hasTools, collectImages } from './lib/openai-translation.js';
+import { runInference, openaiSurface, anthropicSurface } from './lib/inference-runner.js';
 import { resolveSessionKey } from './lib/session-derive.js';
 import { captureRequest, captureResponse, isCaptureEnabled, CAPTURE_DIR_PATH } from './lib/request-capture.js';
+import { scrubAnthropicBody, quietDiagnose } from './lib/quiet.js';
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
@@ -90,7 +88,32 @@ const PORT = parseInt(process.env.PORT || '3456', 10);
 // interface) in ~/.mobygate/config.yaml, but should add auth in front of it.
 const BIND = process.env.BIND || '127.0.0.1';
 const DEFAULT_MODEL = process.env.DEFAULT_MODEL || 'claude-opus-4-7[1m]';
-const SESSION_TTL_MS = parseInt(process.env.SESSION_TTL_MS || String(60 * 60 * 1000), 10); // 1 hour default
+// SESSION_TTL_MS: how long mobygate holds onto an idle SDK session before
+// expiring it from its in-memory + on-disk session store. v0.8.5 raises
+// the default from 1h → 4h based on real-world usage data: most multi-
+// channel users (Discord agents serving 20+ channels) revisit channels
+// every few hours, and a 1h TTL forced a fresh `query()` (full prompt
+// re-send) every time. With 4h, mobygate retains the SDK session ID for
+// four idle hours, so the next request resumes via session-id rather than
+// reissuing the entire prompt.
+//
+// Caveat — this only solves SDK-side session continuity. Anthropic's
+// wire-side prompt cache (5 min default, 1h with the
+// `extended-cache-ttl-2025-04-11` beta) is unaffected; the SDK doesn't
+// currently expose that beta to callers, so cache-creation tax on idle
+// channels still applies. The TTL bump is a partial mitigation, not a
+// fix.
+//
+// Override:  SESSION_TTL_MS=14400000  (env, in milliseconds; takes precedence)
+//           or  MOBY_SESSION_TTL_HOURS=4  (more readable; used only when SESSION_TTL_MS is unset or empty)
+const SESSION_TTL_MS = (() => {
+  if (process.env.SESSION_TTL_MS) return parseInt(process.env.SESSION_TTL_MS, 10);
+  if (process.env.MOBY_SESSION_TTL_HOURS) {
+    const h = parseFloat(process.env.MOBY_SESSION_TTL_HOURS);
+    if (h > 0) return Math.round(h * 60 * 60 * 1000);
+  }
+  return 4 * 60 * 60 * 1000; // 4h default (was 1h pre-v0.8.5)
+})();
 // ---------------------------------------------------------------------------
 // Session store — maps client keys → SDK session IDs (persisted to disk)
@@ -212,1101 +235,6 @@ function resolveModel(model) {
   return MODEL_MAP[stripped] || MODEL_MAP[model] || DEFAULT_MODEL;
 }
-// ---------------------------------------------------------------------------
-// OpenAI messages → single prompt string
-// ---------------------------------------------------------------------------
-function extractContent(content) {
-  if (typeof content === 'string') return content;
-  if (Array.isArray(content)) {
-    return content
-      .map((part) => {
-        if (typeof part === 'string') return part;
-        if (part.type === 'text') return part.text;
-        if (part.type === 'image_url') return ''; // images carried separately; drop from text
-        return JSON.stringify(part);
-      })
-      .filter(Boolean)
-      .join('\n');
-  }
-  if (content && typeof content === 'object') return JSON.stringify(content);
-  return String(content || '');
-}
-// Convert an OpenAI message.content array into Anthropic image content blocks.
-// Supports both data: URLs (base64) and remote https URLs.
-function extractImageBlocks(content) {
-  if (!Array.isArray(content)) return [];
-  const blocks = [];
-  for (const part of content) {
-    if (!part || part.type !== 'image_url') continue;
-    const url = typeof part.image_url === 'string' ? part.image_url : part.image_url?.url;
-    if (!url) continue;
-    const dataMatch = /^data:([^;]+);base64,(.+)$/.exec(url);
-    if (dataMatch) {
-      blocks.push({ type: 'image', source: { type: 'base64', media_type: dataMatch[1], data: dataMatch[2] } });
-    } else {
-      blocks.push({ type: 'image', source: { type: 'url', url } });
-    }
-  }
-  return blocks;
-}
-// Collect images from the LAST user message (OpenAI only attaches images to the latest turn).
-function collectImages(messages) {
-  for (let i = messages.length - 1; i >= 0; i--) {
-    if (messages[i].role === 'user') return extractImageBlocks(messages[i].content);
-  }
-  return [];
-}
-// ---------------------------------------------------------------------------
-// Tool calling (Phase 1: native MCP tools — no more <tool_call> text hack)
-// ---------------------------------------------------------------------------
-// Client-provided OpenAI tools are registered with the SDK as in-process MCP
-// tools (see lib/tool-bridge.js). The model emits **native** tool_use content
-// blocks in its assistant messages; we abort the SDK on the first one and
-// return OpenAI tool_calls to the client. When the client replies with tool
-// results, we send them back as Anthropic tool_result content blocks inside
-// a single SDKUserMessage — round-tripping cleanly through the SDK session.
-function hasTools(body) {
-  return Array.isArray(body?.tools) && body.tools.length > 0;
-}
-/**
- * Build the prompt text from the OpenAI messages array.
- *
- * Returns `{ promptText }` — a single string ready for the SDK. Tool
- * results are spliced in as <tool_results> XML when present (see
- * lib/tool-bridge.js#toolMessagesToText for why we don't use native
- * tool_result content blocks yet).
- *
- * Resuming vs fresh:
- *   - Resuming: SDK has full history. We only send the new tail —
- *     trailing tool results plus the most recent user text, if any.
- *   - Fresh: SDK starts cold. We serialize the visible history with
- *     <system>/<previous_response>/<tool_results> tags. No tool-
- *     instruction injection — the SDK MCP registration handles that.
- */
-function messagesToPrompt(messages, { resuming = false } = {}) {
-  if (resuming) {
-    // Walk backwards from the end, collecting trailing tool messages and
-    // the most recent user text. Tool results are formatted as a text
-    // block (see lib/tool-bridge.js#toolMessagesToText for the rationale).
-    const trailingToolMessages = [];
-    let userText = '';
-    for (let i = messages.length - 1; i >= 0; i--) {
-      const msg = messages[i];
-      if (msg.role === 'tool') {
-        trailingToolMessages.unshift(msg);
-      } else if (msg.role === 'user') {
-        userText = extractContent(msg.content);
-        break;
-      } else {
-        break;
-      }
-    }
-    const toolResultsText = toolMessagesToText(trailingToolMessages);
-    if (!userText && !toolResultsText) {
-      // Earlier code fell back to extracting whatever was at messages[-1],
-      // which on an assistant-terminated history sent the assistant's own
-      // previous reply back to the SDK as the new user prompt — and the
-      // model would "respond to its own reply." Catch this clearly instead.
-      return {
-        promptText: '',
-        error: 'Resume mode requires the request to end with a user message or tool result. Last message has role "' + (messages[messages.length - 1]?.role || 'unknown') + '".',
-      };
-    }
-    const parts = [];
-    if (toolResultsText) parts.push(toolResultsText);
-    if (userText) parts.push(userText);
-    return { promptText: parts.join('\n\n') };
-  }
-  // Fresh request: serialize visible history as XML-wrapped text. No
-  // tool-instruction injection (the model learns about tools via the SDK
-  // MCP registration, not the prompt).
-  const parts = [];
-  for (const msg of messages) {
-    switch (msg.role) {
-      case 'system':
-        parts.push(`<system>\n${extractContent(msg.content)}\n</system>\n`);
-        break;
-      case 'user':
-        parts.push(extractContent(msg.content));
-        break;
-      case 'assistant': {
-        // Best-effort replay. tool_calls in non-resume history are dropped;
-        // the model can usually infer continuity from the surrounding text.
-        const text = extractContent(msg.content);
-        if (text) parts.push(`<previous_response>\n${text}\n</previous_response>\n`);
-        break;
-      }
-      case 'tool': {
-        // Tool messages on a fresh turn (rare — clients normally use
-        // session keys). Splice as text since there's no preceding
-        // tool_use turn we can bind to natively.
-        const text = toolMessagesToText([msg]);
-        if (text) parts.push(text);
-        break;
-      }
-    }
-  }
-  return {
-    promptText: parts.join('\n').trim(),
-  };
-}
-/**
- * Wrap promptText + optional image blocks into the form query() expects.
- * Returns a string for the fast path (text-only, no images), or an
- * async iterable yielding one SDKUserMessage with multi-part content
- * when there are images.
- */
-function buildQueryPrompt(promptText, imageBlocks) {
-  if (!imageBlocks.length) return promptText;
-  const content = [
-    { type: 'text', text: promptText || '' },
-    ...imageBlocks,
-  ];
-  async function* gen() {
-    yield {
-      type: 'user',
-      message: { role: 'user', content },
-      parent_tool_use_id: null,
-    };
-  }
-  return gen();
-}
-// ---------------------------------------------------------------------------
-// Normalize model name for OpenAI response format
-// ---------------------------------------------------------------------------
-function normalizeModelName(model) {
-  if (model?.includes('opus')) return 'claude-opus-4';
-  if (model?.includes('sonnet')) return 'claude-sonnet-4';
-  if (model?.includes('haiku')) return 'claude-haiku-4';
-  return model || 'claude-sonnet-4';
-}
-// ---------------------------------------------------------------------------
-// SSE helpers
-// ---------------------------------------------------------------------------
-function makeChunk(requestId, model, content, role, finishReason) {
-  return {
-    id: `chatcmpl-${requestId}`,
-    object: 'chat.completion.chunk',
-    created: Math.floor(Date.now() / 1000),
-    model: normalizeModelName(model),
-    choices: [{
-      index: 0,
-      delta: {
-        ...(role ? { role } : {}),
-        ...(content !== undefined ? { content } : {}),
-      },
-      finish_reason: finishReason || null,
-    }],
-  };
-}
-function sendSSE(res, data) {
-  if (!res.writableEnded) {
-    res.write(`data: ${JSON.stringify(data)}\n\n`);
-  }
-}
-// ---------------------------------------------------------------------------
-// POST /v1/chat/completions — streaming
-// ---------------------------------------------------------------------------
-async function handleStreaming(req, res, body, requestId, sessionKey) {
-  const existing = getSession(sessionKey);
-  const resuming = !!existing?.sdkSessionId;
-  const toolsEnabled = hasTools(body);
-  const { promptText, error: promptError } = messagesToPrompt(body.messages, { resuming });
-  if (promptError) {
-    return res.status(400).json({
-      error: { message: promptError, type: 'invalid_request_error', code: 'invalid_resume_messages' },
-    });
-  }
-  const images = collectImages(body.messages);
-  // NOTE: `prompt` is built inside runQuery (not here) when images are
-  // present, because buildQueryPrompt returns a single-use async iterator
-  // for multimodal requests. If we built it here and the SDK call hit a
-  // 401, runWithAuthRetry would invoke runQuery a second time with the
-  // same exhausted iterator → SDK gets an empty user message → silent
-  // empty response. Lazy construction inside runQuery rebuilds the
-  // iterator per attempt.
-  const model = resolveModel(body.model);
-  // Build the in-process MCP server exposing client tools to the SDK.
-  // null when toolsEnabled is false (or all tools are malformed).
-  const clientToolsServer = toolsEnabled ? buildClientToolsServer(body.tools) : null;
-  // System-prompt append: tells the model exactly which tools are
-  // available and that Claude Code's built-ins (Bash, Grep, Read, etc.)
-  // are NOT in this environment. Without this, the model trained-in
-  // priors lead it to call Grep/Bash, get blocked by allowedTools, and
-  // refuse the task instead of falling back to client tools. ~150 tokens.
-  const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(body.tools) : null;
-  if (images.length) console.log(`  [multimodal] ${images.length} image block(s)`);
-  if (toolsEnabled) console.log(`  [tools] ${body.tools.length} client tool(s) registered as MCP`);
-  res.setHeader('Content-Type', 'text/event-stream');
-  res.setHeader('Cache-Control', 'no-cache');
-  res.setHeader('Connection', 'keep-alive');
-  res.setHeader('X-Request-Id', requestId);
-  if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
-  res.flushHeaders();
-  res.write(':ok\n\n');
-  const abortController = new AbortController();
-  let isFirst = true;
-  let resolvedModel = model;
-  let capturedSessionId = existing?.sdkSessionId || null;
-  let clientDisconnected = false;
-  let inputTokens = 0;
-  let outputTokens = 0;
-  let cacheReadTokens = 0;
-  let cacheCreateTokens = 0;
-  res.on('close', () => {
-    clientDisconnected = true;
-    abortController.abort();
-  });
-  if (resuming) {
-    console.log(`  [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
-  }
-  // Tools-mode buffers text and collects native tool_use blocks. If the
-  // model emits text first then a tool_use, we want both: textBefore as
-  // the assistant content, plus the tool_calls. (Most clients display the
-  // text and then act on the tool_calls.)
-  let bufferedText = '';
-  let collectedToolCalls = []; // [{id, name, arguments}] from extractToolUses()
-  const runQuery = async () => {
-    // Reset per-attempt state so a 401 retry starts clean
-    bufferedText = '';
-    collectedToolCalls = [];
-    isFirst = true;
-    resolvedModel = model;
-    capturedSessionId = existing?.sdkSessionId || null;
-    // Build the prompt lazily on each attempt — multimodal returns a
-    // single-use async iterator. Keeps 401 auth-retries safe.
-    const prompt = buildQueryPrompt(promptText, images);
-    for await (const message of query({
-      prompt,
-      options: {
-        model,
-        maxTurns: toolsEnabled ? 5 : 200,
-        permissionMode: 'bypassPermissions',
-        allowDangerouslySkipPermissions: true,
-        abortController,
-        // Tools-mode: register client tools as an in-process MCP server
-        // and allow only those (no Bash/Read/etc. — the SDK's built-ins
-        // would pollute the session and leak through to the model).
-        ...(clientToolsServer
-          ? {
-              mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
-              allowedTools: [`${MCP_TOOL_PREFIX}*`],
-              systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
-            }
-          : toolsEnabled
-            // Tools were requested but none were valid — disable all tools.
-            ? { allowedTools: [] }
-            : {}),
-        ...(resuming ? { resume: existing.sdkSessionId } : {}),
-        ...(sessionKey && !resuming ? { persistSession: true } : {}),
-      },
-    })) {
-      if (clientDisconnected) break;
-      const msgPreview = message.type === 'assistant'
-        ? `content_keys=${Object.keys(message).join(',')}`
-        : message.type === 'result'
-          ? `result=${(message.result || '').slice(0, 60)}`
-          : message.subtype || '';
-      console.log(`  [msg] type=${message.type} ${msgPreview}`);
-      if (message.type === 'system' && message.subtype === 'init' && message.model) {
-        resolvedModel = message.model;
-      }
-      if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
-        capturedSessionId = message.session_id;
-        console.log(`  [session] captured sdk session: ${capturedSessionId}`);
-      }
-      // Extract text from this assistant message
-      let turnText = '';
-      if (message.type === 'assistant' && message.message?.content) {
-        const content = message.message.content;
-        if (Array.isArray(content)) {
-          for (const b of content) if (b.type === 'text' && b.text) turnText += b.text;
-        } else if (typeof content === 'string') {
-          turnText = content;
-        }
-      }
-      // Detect auth failure surfaced inline (common on long-running proxies
-      // where the SDK's cached creds expire). Throw so runWithAuthRetry
-      // treats it like a real 401 exception.
-      if (turnText && isAuthFailureText(turnText) && isFirst) {
-        abortController.abort();
-        throw new AuthFailureInResultText(turnText);
-      }
-      // Tools-mode: check for native tool_use content blocks. The moment
-      // we see one, abort the SDK — we don't want our stub handler to
-      // hang waiting on an execution that's actually happening client-side.
-      if (toolsEnabled && message.type === 'assistant' && hasToolUse(message)) {
-        const calls = extractToolUses(message);
-        if (calls.length) {
-          collectedToolCalls.push(...calls);
-          if (turnText) bufferedText += turnText;
-          console.log(`  [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
-          abortController.abort();
-          break;
-        }
-      }
-      if (turnText) {
-        if (toolsEnabled) {
-          // Buffer text in case it precedes a tool_use, or ends up as the
-          // final response when the model decides not to call any tools.
-          bufferedText += turnText;
-        } else {
-          sendSSE(res, makeChunk(requestId, resolvedModel, turnText, isFirst ? 'assistant' : undefined, null));
-          isFirst = false;
-        }
-      }
-      if (message.type === 'result') {
-        if (message.result && isAuthFailureText(message.result) && isFirst) {
-          throw new AuthFailureInResultText(message.result);
-        }
-        if (!toolsEnabled && message.result && isFirst) {
-          sendSSE(res, makeChunk(requestId, resolvedModel, message.result, 'assistant', null));
-          isFirst = false;
-        }
-        if (toolsEnabled && !bufferedText && message.result) bufferedText = message.result;
-        const usage = extractSdkUsage(message);
-        inputTokens = usage.input_tokens;
-        outputTokens = usage.output_tokens;
-        cacheReadTokens = usage.cache_read_input_tokens;
-        cacheCreateTokens = usage.cache_creation_input_tokens;
-        console.log(`  [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
-        break;
-      }
-    }
-  };
-  try {
-    await runWithAuthRetry({
-      attempt: runQuery,
-      // Only retry if we haven't written a real chunk yet. In tools mode we
-      // buffer internally so any retry is safe regardless.
-      bailIfStarted: () => !toolsEnabled && !isFirst,
-      onRefreshing: (err) => console.warn(`[auth] 401 on stream — refreshing (${err.message?.slice(0, 80)})`),
-      onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying stream`),
-    });
-  } catch (err) {
-    // Abort from tool-call detection surfaces as an abort error — not a real failure
-    const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
-    if (!clientDisconnected && !(toolsEnabled && isAbort)) {
-      console.error('[stream] SDK error:', err.message);
-      sendSSE(res, { error: { message: err.message, type: 'server_error', code: null } });
-    }
-  }
-  if (sessionKey && capturedSessionId) {
-    upsertSession(sessionKey, capturedSessionId, resolvedModel);
-  }
-  // Tools mode: emit the buffered response as a single chunk with either
-  // tool_calls (+ finish_reason: tool_calls) or plain text (+ stop).
-  if (toolsEnabled && !res.writableEnded) {
-    if (collectedToolCalls.length > 0) {
-      console.log(`  [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
-      const chunk = {
-        id: `chatcmpl-${requestId}`,
-        object: 'chat.completion.chunk',
-        created: Math.floor(Date.now() / 1000),
-        model: normalizeModelName(resolvedModel),
-        choices: [{
-          index: 0,
-          delta: {
-            role: 'assistant',
-            content: bufferedText.trim() || null,
-            tool_calls: collectedToolCalls.map((tc, i) => ({
-              index: i,
-              id: tc.id,
-              type: 'function',
-              function: { name: tc.name, arguments: tc.arguments },
-            })),
-          },
-          finish_reason: 'tool_calls',
-        }],
-      };
-      sendSSE(res, chunk);
-    } else {
-      sendSSE(res, makeChunk(requestId, resolvedModel, bufferedText, 'assistant', null));
-      sendSSE(res, makeChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
-    }
-    res.write('data: [DONE]\n\n');
-    res.end();
-    captureResponse({
-      requestId,
-      usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
-      status: 'ok',
-      stopReason: collectedToolCalls.length > 0 ? 'tool_use' : 'end_turn',
-      model: resolvedModel,
-    });
-    return;
-  }
-  if (!res.writableEnded) {
-    sendSSE(res, makeChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
-    res.write('data: [DONE]\n\n');
-    res.end();
-  }
-  captureResponse({
-    requestId,
-    usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
-    status: clientDisconnected ? 'client_disconnect' : 'ok',
-    stopReason: 'end_turn',
-    model: resolvedModel,
-  });
-}
-// ---------------------------------------------------------------------------
-// POST /v1/chat/completions — non-streaming
-// ---------------------------------------------------------------------------
-async function handleNonStreaming(res, body, requestId, sessionKey) {
-  const existing = getSession(sessionKey);
-  const resuming = !!existing?.sdkSessionId;
-  const toolsEnabled = hasTools(body);
-  const { promptText, error: promptError } = messagesToPrompt(body.messages, { resuming });
-  if (promptError) {
-    return res.status(400).json({
-      error: { message: promptError, type: 'invalid_request_error', code: 'invalid_resume_messages' },
-    });
-  }
-  const images = collectImages(body.messages);
-  // NOTE: `prompt` is built inside runQuery (not here) when images are
-  // present, because buildQueryPrompt returns a single-use async iterator
-  // for multimodal requests. If we built it here and the SDK call hit a
-  // 401, runWithAuthRetry would invoke runQuery a second time with the
-  // same exhausted iterator → SDK gets an empty user message → silent
-  // empty response. Lazy construction inside runQuery rebuilds the
-  // iterator per attempt.
-  const model = resolveModel(body.model);
-  const clientToolsServer = toolsEnabled ? buildClientToolsServer(body.tools) : null;
-  const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(body.tools) : null;
-  if (images.length) console.log(`  [multimodal] ${images.length} image block(s)`);
-  if (toolsEnabled) console.log(`  [tools] ${body.tools.length} client tool(s) registered as MCP`);
-  let resultText = '';
-  let collectedToolCalls = [];
-  let resolvedModel = model;
-  let inputTokens = 0;
-  let outputTokens = 0;
-  let cacheReadTokens = 0;
-  let cacheCreateTokens = 0;
-  let stopReason = 'end_turn';
-  let capturedSessionId = existing?.sdkSessionId || null;
-  const abortController = new AbortController();
-  if (resuming) {
-    console.log(`  [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
-  }
-  const runQuery = async () => {
-    // Reset per-attempt state so a 401 retry starts clean
-    resultText = '';
-    collectedToolCalls = [];
-    resolvedModel = model;
-    inputTokens = 0;
-    outputTokens = 0;
-    capturedSessionId = existing?.sdkSessionId || null;
-    // Build the prompt lazily on each attempt — multimodal returns a
-    // single-use async iterator. Keeps 401 auth-retries safe.
-    const prompt = buildQueryPrompt(promptText, images);
-    for await (const message of query({
-      prompt,
-      options: {
-        model,
-        maxTurns: toolsEnabled ? 5 : 200,
-        permissionMode: 'bypassPermissions',
-        allowDangerouslySkipPermissions: true,
-        abortController,
-        ...(clientToolsServer
-          ? {
-              mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
-              allowedTools: [`${MCP_TOOL_PREFIX}*`],
-              systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
-            }
-          : toolsEnabled
-            ? { allowedTools: [] }
-            : {}),
-        ...(resuming ? { resume: existing.sdkSessionId } : {}),
-        ...(sessionKey && !resuming ? { persistSession: true } : {}),
-      },
-    })) {
-      if (message.type === 'system' && message.subtype === 'init' && message.model) {
-        resolvedModel = message.model;
-      }
-      if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
-        capturedSessionId = message.session_id;
-        console.log(`  [session] captured sdk session: ${capturedSessionId}`);
-      }
-      if (message.type === 'assistant' && message.message?.content) {
-        const content = message.message.content;
-        if (Array.isArray(content)) {
-          for (const block of content) {
-            if (block.type === 'text') resultText += block.text || '';
-          }
-        } else if (typeof content === 'string') {
-          resultText += content;
-        }
-        // Detect auth failure surfaced inline (long-running proxy, cached creds)
-        if (isAuthFailureText(resultText)) {
-          abortController.abort();
-          throw new AuthFailureInResultText(resultText);
-        }
-        // Native tool_use detection — abort the moment a tool_use lands.
-        if (toolsEnabled && hasToolUse(message)) {
-          const calls = extractToolUses(message);
-          if (calls.length) {
-            collectedToolCalls.push(...calls);
-            console.log(`  [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
-            abortController.abort();
-            break;
-          }
-        }
-      }
-      if (message.type === 'result') {
-        if (message.result && !resultText) resultText = message.result;
-        if (isAuthFailureText(resultText)) {
-          throw new AuthFailureInResultText(resultText);
-        }
-        const usage = extractSdkUsage(message);
-        inputTokens = usage.input_tokens;
-        outputTokens = usage.output_tokens;
-        cacheReadTokens = usage.cache_read_input_tokens;
-        cacheCreateTokens = usage.cache_creation_input_tokens;
-        console.log(`  [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
-        if (message.subtype) stopReason = message.subtype;
-        break;
-      }
-    }
-  };
-  try {
-    await runWithAuthRetry({
-      attempt: runQuery,
-      // Non-streaming never writes to res until the end — retry is always safe
-      bailIfStarted: () => false,
-      onRefreshing: (err) => console.warn(`[auth] 401 on sync call — refreshing (${err.message?.slice(0, 80)})`),
-      onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying sync call`),
-    });
-  } catch (err) {
-    const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
-    if (!(toolsEnabled && isAbort)) {
-      console.error('[non-stream] SDK error:', err.message);
-      return res.status(500).json({ error: { message: err.message, type: 'server_error', code: null } });
-    }
-  }
-  if (sessionKey && capturedSessionId) {
-    upsertSession(sessionKey, capturedSessionId, resolvedModel);
-  }
-  const responseHeaders = {};
-  if (sessionKey) responseHeaders['X-Session-Id'] = sessionKey;
-  // Tool-calling response shape
-  if (toolsEnabled && collectedToolCalls.length > 0) {
-    console.log(`  [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
-    return res.set(responseHeaders).json({
-      id: `chatcmpl-${requestId}`,
-      object: 'chat.completion',
-      created: Math.floor(Date.now() / 1000),
-      model: normalizeModelName(resolvedModel),
-      choices: [{
-        index: 0,
-        message: {
-          role: 'assistant',
-          content: resultText.trim() || null,
-          tool_calls: collectedToolCalls.map((tc) => ({
-            id: tc.id,
-            type: 'function',
-            function: { name: tc.name, arguments: tc.arguments },
-          })),
-        },
-        finish_reason: 'tool_calls',
-      }],
-      usage: { prompt_tokens: inputTokens, completion_tokens: outputTokens, total_tokens: inputTokens + outputTokens },
-    });
-    // No tool_use blocks → fall through to normal text response
-  }
-  res.set(responseHeaders).json({
-    id: `chatcmpl-${requestId}`,
-    object: 'chat.completion',
-    created: Math.floor(Date.now() / 1000),
-    model: normalizeModelName(resolvedModel),
-    choices: [{
-      index: 0,
-      message: { role: 'assistant', content: resultText },
-      finish_reason: 'stop',
-    }],
-    usage: { prompt_tokens: inputTokens, completion_tokens: outputTokens, total_tokens: inputTokens + outputTokens },
-  });
-  captureResponse({
-    requestId,
-    usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
-    status: 'ok',
-    stopReason,
-    model: resolvedModel,
-  });
-}
-// ---------------------------------------------------------------------------
-// POST /v1/messages — Anthropic-native surface (non-streaming + streaming)
-// ---------------------------------------------------------------------------
-// The dual-surface architecture: Hermes uses /v1/chat/completions
-// (OpenAI shape), OpenClaw uses /v1/messages (Anthropic shape). Both
-// translate to the SAME underlying SDK query() — the surfaces are pure
-// translators over a single inference engine.
-//
-// Tool calling: reuses Phase 1's native MCP path from lib/tool-bridge.js.
-// No prompt-injected tool definitions, no <tool_call> text parsing.
-// Inbound tool_results still spliced as text on resume (see anthropic.js
-// docstring for why — Phase 1 limitation, not lifted here).
-async function handleAnthropicNonStreaming(res, body, requestId, sessionKey) { // runs one /v1/messages request to completion and replies with a single JSON body
-  const existing = getSession(sessionKey);
-  const resuming = !!existing?.sdkSessionId; // resume only when a prior SDK session id is recorded for this key
-  const toolsEnabled = hasAnthropicTools(body);
-  const { promptText, error: promptError } = anthropicMessagesToPrompt(body, { resuming });
-  if (promptError) {
-    return res.status(400).json({
-      type: 'error',
-      error: { type: 'invalid_request_error', message: promptError },
-    });
-  }
-  const images = collectAnthropicImages(body.messages || []);
-  // See note in handleStreaming — `prompt` is built lazily inside runQuery
-  // because the multimodal path returns a single-use async iterator that
-  // a 401-retry would exhaust on the first attempt.
-  const model = resolveModel(body.model);
-  // Translate Anthropic tool defs → OpenAI shape that buildClientToolsServer
-  // expects. Both go through the same JSON-Schema → Zod path on the way to
-  // MCP; the wrapper shape difference is just `function:{name, parameters}`
-  // vs `{name, input_schema}`.
-  const toolsForBridge = toolsEnabled
-    ? body.tools.map((t) => ({
-        type: 'function',
-        function: { name: t.name, description: t.description || '', parameters: t.input_schema || {} },
-      }))
-    : null;
-  const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
-  const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
-  if (images.length) console.log(`  [multimodal] ${images.length} image block(s)`);
-  if (toolsEnabled) console.log(`  [tools] ${body.tools.length} client tool(s) registered as MCP`);
-  let resultText = ''; // accumulators below are written by runQuery and read after it settles
-  let collectedToolCalls = [];
-  let resolvedModel = model;
-  let inputTokens = 0;
-  let outputTokens = 0;
-  let cacheReadTokens = 0;
-  let cacheCreateTokens = 0;
-  let capturedSessionId = existing?.sdkSessionId || null;
-  let stopReason = 'end_turn';
-  const abortController = new AbortController();
-  if (resuming) {
-    console.log(`  [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
-  }
-  const runQuery = async () => { // one SDK attempt; invoked (possibly twice) by runWithAuthRetry below
-    resultText = '';
-    collectedToolCalls = [];
-    resolvedModel = model;
-    inputTokens = 0;
-    outputTokens = 0; // NOTE(review): cacheReadTokens / cacheCreateTokens are not reset per attempt — confirm intentional
-    capturedSessionId = existing?.sdkSessionId || null;
-    stopReason = 'end_turn';
-    // Build the prompt lazily on each attempt — multimodal returns a
-    // single-use async iterator. Keeps 401 auth-retries safe.
-    const prompt = buildQueryPrompt(promptText, images);
-    for await (const message of query({
-      prompt,
-      options: {
-        model,
-        maxTurns: toolsEnabled ? 5 : 200, // tool requests get a far lower turn cap than plain-text requests
-        permissionMode: 'bypassPermissions',
-        allowDangerouslySkipPermissions: true,
-        abortController,
-        ...(clientToolsServer
-          ? {
-              mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
-              allowedTools: [`${MCP_TOOL_PREFIX}*`],
-              systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
-            }
-          : toolsEnabled
-            ? { allowedTools: [] } // tools were requested but no MCP server was built: allow none
-            : {}),
-        ...(resuming ? { resume: existing.sdkSessionId } : {}),
-        ...(sessionKey && !resuming ? { persistSession: true } : {}), // only brand-new keyed sessions ask for persistence
-      },
-    })) {
-      if (message.type === 'system' && message.subtype === 'init' && message.model) {
-        resolvedModel = message.model;
-      }
-      if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
-        capturedSessionId = message.session_id;
-        console.log(`  [session] captured sdk session: ${capturedSessionId}`);
-      }
-      if (message.type === 'assistant' && message.message?.content) {
-        const content = message.message.content;
-        if (Array.isArray(content)) {
-          for (const block of content) {
-            if (block.type === 'text') resultText += block.text || '';
-          }
-        } else if (typeof content === 'string') {
-          resultText += content;
-        }
-        if (isAuthFailureText(resultText)) { // checks all text accumulated so far, not just this message
-          abortController.abort();
-          throw new AuthFailureInResultText(resultText);
-        }
-        if (toolsEnabled && hasToolUse(message)) {
-          const calls = extractToolUses(message);
-          if (calls.length) {
-            collectedToolCalls.push(...calls);
-            stopReason = 'tool_use';
-            console.log(`  [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
-            abortController.abort();
-            break; // stop consuming: the tool calls are handed back to the client
-          }
-        }
-      }
-      if (message.type === 'result') {
-        if (message.result && !resultText) resultText = message.result; // result text is only a fallback when no assistant text was collected
-        if (isAuthFailureText(resultText)) {
-          throw new AuthFailureInResultText(resultText);
-        }
-        const usage = extractSdkUsage(message);
-        inputTokens = usage.input_tokens;
-        outputTokens = usage.output_tokens;
-        cacheReadTokens = usage.cache_read_input_tokens;
-        cacheCreateTokens = usage.cache_creation_input_tokens;
-        console.log(`  [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
-        stopReason = mapStopReason(message);
-        break;
-      }
-    }
-  };
-  try {
-    await runWithAuthRetry({
-      attempt: runQuery,
-      bailIfStarted: () => false, // nothing is written to res inside runQuery, so a retry never fragments output
-      onRefreshing: (err) => console.warn(`[auth] 401 on /v1/messages — refreshing (${err.message?.slice(0, 80)})`),
-      onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying /v1/messages`),
-    });
-  } catch (err) {
-    const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
-    if (!(toolsEnabled && isAbort)) { // an abort is expected when tools are enabled; everything else is a real failure
-      console.error('[/v1/messages] SDK error:', err.message);
-      return res.status(500).json({
-        type: 'error',
-        error: { type: 'api_error', message: err.message },
-      });
-    }
-  }
-  if (sessionKey && capturedSessionId) {
-    upsertSession(sessionKey, capturedSessionId, resolvedModel);
-  }
-  if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
-  res.json(buildAnthropicResponse({
-    rawText: resultText.trim(),
-    toolUses: collectedToolCalls,
-    model: resolvedModel,
-    usage: { input_tokens: inputTokens, output_tokens: outputTokens }, // cache token counts are not passed to the client-facing response
-    requestId,
-    stopReason,
-  }));
-  captureResponse({ // recorded after the reply is sent; the 400/500 paths above return before reaching this
-    requestId,
-    usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
-    status: 'ok',
-    stopReason,
-    model: resolvedModel,
-  });
-}
-async function handleAnthropicStreaming(req, res, body, requestId, sessionKey) { // streams one /v1/messages request as SSE through the translator `tx`
-  const existing = getSession(sessionKey);
-  const resuming = !!existing?.sdkSessionId; // resume only when a prior SDK session id is recorded for this key
-  const toolsEnabled = hasAnthropicTools(body);
-  const { promptText, error: promptError } = anthropicMessagesToPrompt(body, { resuming });
-  if (promptError) {
-    return res.status(400).json({
-      type: 'error',
-      error: { type: 'invalid_request_error', message: promptError },
-    });
-  }
-  const images = collectAnthropicImages(body.messages || []);
-  // See note in handleStreaming — `prompt` is built lazily inside runQuery
-  // because the multimodal path returns a single-use async iterator that
-  // a 401-retry would exhaust on the first attempt.
-  const model = resolveModel(body.model);
-  const toolsForBridge = toolsEnabled
-    ? body.tools.map((t) => ({
-        type: 'function',
-        function: { name: t.name, description: t.description || '', parameters: t.input_schema || {} },
-      }))
-    : null;
-  const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
-  const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
-  if (images.length) console.log(`  [multimodal] ${images.length} image block(s)`);
-  if (toolsEnabled) console.log(`  [tools] ${body.tools.length} client tool(s) registered as MCP`);
-  res.setHeader('Content-Type', 'text/event-stream');
-  res.setHeader('Cache-Control', 'no-cache');
-  res.setHeader('Connection', 'keep-alive');
-  res.setHeader('X-Request-Id', requestId);
-  if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
-  res.flushHeaders(); // headers go out before the SDK runs, so later failures are reported in-stream via tx.error
-  const tx = makeStreamTranslator({ res, requestId, model });
-  const abortController = new AbortController();
-  let resolvedModel = model;
-  let capturedSessionId = existing?.sdkSessionId || null;
-  let inputTokens = 0;
-  let outputTokens = 0;
-  let cacheReadTokens = 0;
-  let cacheCreateTokens = 0;
-  let stopReason = 'end_turn';
-  let clientDisconnected = false;
-  let textEmittedSoFar = ''; // dedup against same-message reflow from SDK
-  let toolUseEmitted = false;
-  res.on('close', () => { // on connection close: flag it and abort the in-flight SDK query
-    clientDisconnected = true;
-    abortController.abort();
-  });
-  if (resuming) {
-    console.log(`  [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
-  }
-  const runQuery = async () => {
-    // Reset per-attempt state in case of 401-retry. Note: tx is reused
-    // across retries, so a successful retry that comes after we already
-    // emitted message_start would surface as a confused stream. We bail
-    // out of retry once the translator has started (see bailIfStarted).
-    resolvedModel = model;
-    capturedSessionId = existing?.sdkSessionId || null;
-    inputTokens = 0;
-    outputTokens = 0; // NOTE(review): cacheReadTokens / cacheCreateTokens are not reset per attempt — confirm intentional
-    stopReason = 'end_turn';
-    textEmittedSoFar = '';
-    toolUseEmitted = false;
-    // Build the prompt lazily on each attempt — multimodal returns a
-    // single-use async iterator. Keeps 401 auth-retries safe.
-    const prompt = buildQueryPrompt(promptText, images);
-    for await (const message of query({
-      prompt,
-      options: {
-        model,
-        maxTurns: toolsEnabled ? 5 : 200, // tool requests get a far lower turn cap than plain-text requests
-        permissionMode: 'bypassPermissions',
-        allowDangerouslySkipPermissions: true,
-        abortController,
-        ...(clientToolsServer
-          ? {
-              mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
-              allowedTools: [`${MCP_TOOL_PREFIX}*`],
-              systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
-            }
-          : toolsEnabled
-            ? { allowedTools: [] } // tools were requested but no MCP server was built: allow none
-            : {}),
-        ...(resuming ? { resume: existing.sdkSessionId } : {}),
-        ...(sessionKey && !resuming ? { persistSession: true } : {}), // only brand-new keyed sessions ask for persistence
-      },
-    })) {
-      if (clientDisconnected) break;
-      if (message.type === 'system' && message.subtype === 'init' && message.model) {
-        resolvedModel = message.model;
-        tx.start(resolvedModel, 0); // translator is started as soon as the SDK reports its init message
-      }
-      if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
-        capturedSessionId = message.session_id;
-        console.log(`  [session] captured sdk session: ${capturedSessionId}`);
-      }
-      if (message.type === 'assistant' && message.message?.content) {
-        const content = message.message.content;
-        // Auth-failure short-circuit: throw so runWithAuthRetry handles it.
-        // Only safe before any text has been streamed (otherwise we've
-        // already corrupted the SSE stream and can't undo).
-        if (Array.isArray(content)) {
-          let combined = '';
-          for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
-          if (combined && isAuthFailureText(combined) && !tx.hasStarted) {
-            abortController.abort();
-            throw new AuthFailureInResultText(combined);
-          }
-        }
-        // Tool_use detection: emit tool_use blocks structurally and abort.
-        // We do this BEFORE streaming text deltas from this message so the
-        // tool_use block is properly framed (after any pending text block
-        // closes). The translator handles the close-text → open-tool-use
-        // sequencing internally.
-        if (toolsEnabled && hasToolUse(message)) {
-          const calls = extractToolUses(message);
-          if (calls.length) {
-            // Emit any text from this same message *before* the tool_use
-            // (Anthropic streams sometimes have text + tool_use in one
-            // assistant message — preserve that ordering).
-            if (Array.isArray(content)) {
-              for (const b of content) {
-                if (b?.type === 'text' && b.text) {
-                  // Compute delta vs what we've emitted to avoid duplication
-                  // on aggregator-style assistant messages that resend the
-                  // whole accumulated text.
-                  const delta = b.text.startsWith(textEmittedSoFar)
-                    ? b.text.slice(textEmittedSoFar.length)
-                    : b.text;
-                  if (delta) {
-                    tx.pushTextDelta(delta);
-                    textEmittedSoFar += delta;
-                  }
-                }
-              }
-            }
-            for (const tu of calls) tx.pushToolUse(tu);
-            toolUseEmitted = true;
-            stopReason = 'tool_use';
-            console.log(`  [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
-            abortController.abort();
-            break; // stop consuming: the tool calls are handed back to the client
-          }
-        }
-        // Plain text-only assistant message: stream the delta.
-        if (Array.isArray(content)) {
-          let combined = '';
-          for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
-          if (combined) {
-            const delta = combined.startsWith(textEmittedSoFar)
-              ? combined.slice(textEmittedSoFar.length)
-              : combined;
-            if (delta) {
-              tx.pushTextDelta(delta);
-              textEmittedSoFar += delta;
-            }
-          }
-        } else if (typeof content === 'string' && content) {
-          const delta = content.startsWith(textEmittedSoFar)
-            ? content.slice(textEmittedSoFar.length)
-            : content;
-          if (delta) {
-            tx.pushTextDelta(delta);
-            textEmittedSoFar += delta;
-          }
-        }
-      }
-      if (message.type === 'result') {
-        if (message.result && !textEmittedSoFar && !toolUseEmitted) {
-          // Some SDK paths only deliver text via the final result message
-          // (no streaming assistant messages). Emit it here as a single
-          // delta — clients see this as "model started + finished in one
-          // chunk", which is valid SSE.
-          tx.pushTextDelta(message.result); // NOTE(review): pushed before the auth-failure check below — confirm auth-error text cannot reach the client here
-        }
-        if (isAuthFailureText(message.result || '') && !tx.hasStarted) {
-          throw new AuthFailureInResultText(message.result);
-        }
-        const usage = extractSdkUsage(message);
-        inputTokens = usage.input_tokens;
-        outputTokens = usage.output_tokens;
-        cacheReadTokens = usage.cache_read_input_tokens;
-        cacheCreateTokens = usage.cache_creation_input_tokens;
-        console.log(`  [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
-        if (!toolUseEmitted) stopReason = mapStopReason(message); // keep 'tool_use' when a tool call was already streamed
-        break;
-      }
-    }
-  };
-  try {
-    await runWithAuthRetry({
-      attempt: runQuery,
-      // Once we've emitted message_start or any deltas, the SSE stream is
-      // committed — a retry would fragment it. Same logic as the OpenAI
-      // surface (bail once anything has been written).
-      bailIfStarted: () => tx.hasStarted,
-      onRefreshing: (err) => console.warn(`[auth] 401 on /v1/messages stream — refreshing (${err.message?.slice(0, 80)})`),
-      onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying /v1/messages stream`),
-    });
-  } catch (err) {
-    const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
-    if (!clientDisconnected && !(toolsEnabled && isAbort)) { // errors after a disconnect, or aborts with tools enabled, are not reported
-      console.error('[/v1/messages stream] SDK error:', err.message);
-      tx.error(err);
-      return;
-    }
-  }
-  if (sessionKey && capturedSessionId) {
-    upsertSession(sessionKey, capturedSessionId, resolvedModel);
-  }
-  tx.finish({ stopReason, usage: { output_tokens: outputTokens } }); // only output_tokens is handed to the translator's final event
-  captureResponse({ // recorded after the stream is finished; the error path above returns before reaching this
-    requestId,
-    usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
-    status: 'ok',
-    stopReason,
-    model: resolvedModel,
-  });
-}
 // ---------------------------------------------------------------------------
 // Express app
 // ---------------------------------------------------------------------------
@@ -1415,6 +343,19 @@ app.get('/inspector', async (_req, res) => {
   }
 });
+// GET on the POST-only inference endpoints — per RFC 9110, reply 405 with an
+// Allow header so probes (e.g. Hermes onboarding) can detect that the endpoint
+// exists. Returning 404 on GET makes them think the endpoint is missing entirely.
+const methodNotAllowed = (allow) => (_req, res) => {
+  res.set('Allow', allow);
+  res.status(405).json({
+    error: { message: `Method Not Allowed. Use ${allow}.`, type: 'invalid_request_error', code: 'method_not_allowed' },
+  });
+};
+app.get('/v1/chat/completions', methodNotAllowed('POST'));
+app.get('/v1/messages', methodNotAllowed('POST'));
+app.get('/quiet/v1/messages', methodNotAllowed('POST'));
 // POST /v1/chat/completions
 app.post('/v1/chat/completions', async (req, res) => {
   const requestId = uuidv4().replace(/-/g, '').slice(0, 24);
@@ -1483,11 +424,14 @@ app.post('/v1/chat/completions', async (req, res) => {
   res.on('finish', () => emitEnd());
   res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
-  if (body.stream) {
-    await handleStreaming(req, res, body, requestId, sessionKey);
-  } else {
-    await handleNonStreaming(res, body, requestId, sessionKey);
-  }
+  await runInference(
+    { req, res, body, requestId, sessionKey },
+    openaiSurface,
+    {
+      mode: body.stream ? 'stream' : 'json',
+      deps: { getSession, upsertSession, resolveModel },
+    },
+  );
 });
 // POST /v1/messages — Anthropic-native surface (for OpenClaw etc.).
@@ -1556,11 +500,104 @@ app.post('/v1/messages', async (req, res) => {
   res.on('finish', () => emitEnd());
   res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
-  if (body.stream) {
-    await handleAnthropicStreaming(req, res, body, requestId, sessionKey);
+  await runInference(
+    { req, res, body, requestId, sessionKey },
+    anthropicSurface,
+    {
+      mode: body.stream ? 'stream' : 'json',
+      deps: { getSession, upsertSession, resolveModel },
+    },
+  );
+});
+// POST /quiet/v1/messages — Anthropic-shape, but with two changes vs /v1/messages:
+//   1. Body is scrubbed for known third-party agent identifiers
+//      (openclaw, hermes, mobius, etc.) before the SDK forwards it.
+//   2. SDK receives an explicit string systemPrompt — disables the
+//      claude_code preset that otherwise injects "I am Claude Code…" framing.
+//
+// Use case: clients that don't want their identity to leak into Anthropic's
+// detection heuristics (e.g. "found 'openclaw' in package.json → flag account
+// for extra-usage billing"). Configurable scrub list at ~/.mobygate/quiet-words.txt.
+app.post('/quiet/v1/messages', async (req, res) => { // Anthropic-shaped endpoint: validate, scrub the body, log, then hand off to runInference
+  const requestId = uuidv4().replace(/-/g, '').slice(0, 24);
+  const body = req.body;
+  if (!body?.messages || !Array.isArray(body.messages) || body.messages.length === 0) {
+    return res.status(400).json({
+      type: 'error',
+      error: { type: 'invalid_request_error', message: 'messages is required and must be a non-empty array' },
+    });
+  }
+  // Scrub the body in place BEFORE anything else reads it — capture, session
+  // derivation, prompt building all see the scrubbed content from here on.
+  // Diagnose first so we can log what we stripped (without leaking the values).
+  const diag = quietDiagnose(body); // must run before the scrub so the counts describe the original body
+  scrubAnthropicBody(body); // return value unused — relies on in-place mutation of req.body
+  const { key: sessionKey, source: sessionKeySource } = resolveSessionKey({
+    headerKey: req.headers['x-session-id'],
+    bodyKey: body.session_id,
+    body,
+  });
+  const existing = getSession(sessionKey);
+  const sessionTag = sessionKey
+    ? ` | session=${sessionKey}${sessionKeySource === 'auto' ? ' (auto)' : ''}${existing ? ' (resume)' : ' (new)'}`
+    : '';
+  console.log(`[${new Date().toISOString()}] anthropic-quiet ${body.stream ? 'stream' : 'sync'} | model=${body.model} → ${resolveModel(body.model)} | msgs=${body.messages.length}${sessionTag}`);
+  if (diag.matches > 0) {
+    const breakdown = diag.words.map(w => `${w.word}×${w.count}`).join(' '); // NOTE(review): this writes the matched words themselves to the log — confirm that is acceptable
+    console.log(`  [quiet] scrubbed ${diag.matches} occurrence(s): ${breakdown}`);
   } else {
-    await handleAnthropicNonStreaming(res, body, requestId, sessionKey);
+    console.log(`  [quiet] payload was already clean (no matches)`);
   }
+  captureRequest({ path: '/quiet/v1/messages', body, requestId, sessionKey, sessionKeySource }); // captures the already-scrubbed body
+  const startedAt = Date.now();
+  const imageBlocks = collectAnthropicImages(body.messages || []).length;
+  dashboardBus.emitEvent({
+    type: 'request.start',
+    id: requestId,
+    method: 'POST',
+    path: '/quiet/v1/messages',
+    model: body.model,
+    resolvedModel: resolveModel(body.model),
+    session: sessionKey,
+    stream: !!body.stream,
+    tools: hasAnthropicTools(body),
+    images: imageBlocks,
+    messages: body.messages.length,
+    resuming: !!existing,
+    quietScrubs: diag.matches,
+  });
+  let endEmitted = false; // guards against emitting request.end twice ('finish' followed by 'close')
+  const emitEnd = (overrides = {}) => {
+    if (endEmitted) return;
+    endEmitted = true;
+    dashboardBus.emitEvent({
+      type: 'request.end',
+      id: requestId,
+      durationMs: Date.now() - startedAt,
+      status: res.statusCode < 400 ? 'ok' : 'error',
+      httpStatus: res.statusCode,
+      ...overrides,
+    });
+  };
+  res.on('finish', () => emitEnd());
+  res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); }); // 'close' with no prior 'finish' is reported as a client disconnect
+  await runInference(
+    { req, res, body, requestId, sessionKey },
+    anthropicSurface, // NOTE(review): same surface and options as /v1/messages; no quiet-specific systemPrompt is visible at this call site
+    {
+      mode: body.stream ? 'stream' : 'json',
+      deps: { getSession, upsertSession, resolveModel },
+    },
+  );
 });
 // GET /v1/models
@@ -1925,6 +962,176 @@ app.post('/dashboard/captures-toggle', requireLocalOrigin, async (req, res) => {
   }
 });
+// GET /dashboard/session-costs — per-session cost breakdown (v0.8.5)
+//
+// Aggregates the [model-billed] log lines emitted by each handler's SDK
+// result step. Grouped by session_key. Surfaces:
+//   - cost_usd         total $ across all turns of this session
+//   - turns            number of completed (non-tool-use-aborted) turns
+//   - per_turn_usd     average cost per turn (low = cache working)
+//   - models           per-model breakdown (opus vs sonnet vs haiku)
+//   - first_user       first user message (for human-readable identification)
+//
+// This view exists because today's audit found 38.9% of total spend
+// going to "singleton" sessions — channels that fire once, idle past
+// the wire-cache TTL, then pay cache_creation tax on the next turn.
+// The dashboard tab built off this endpoint lets users spot bleeding
+// channels in real time and decide which to keep warm via cron pings.
+app.get('/dashboard/session-costs', requireLocalOrigin, async (_req, res) => {
+  try {
+    const { readFile, readdir } = await import('fs/promises');
+    const { existsSync } = await import('fs');
+    const path = await import('path');
+    const { homedir } = await import('os');
+    const logPath = join(LOGS_DIR, 'server.log');
+    const captureDir = process.env.MOBYGATE_CAPTURE_DIR
+      || join(process.env.MOBYGATE_HOME || join(homedir(), '.mobygate'), 'captures');
+    // Step 1: parse [model-billed] lines from server.log, associating
+    // each with the most recently observed session= line above it.
+    const sessions = {}; // sk -> { turns, cost_usd, models: {model -> {turns, cost_usd, in_uncached, cache_read, cache_create, out}} }
+    let lastSession = null;
+    if (existsSync(logPath)) {
+      const raw = await readFile(logPath, 'utf8');
+      for (const line of raw.split(/\r?\n/)) {
+        const sessMatch = line.match(/session=(auto_\w+)/);
+        if (sessMatch) lastSession = sessMatch[1];
+        const billed = line.match(/\[model-billed\] requested=\S+ modelUsage=(\{.+\})/);
+        if (billed && lastSession) {
+          let mu;
+          try { mu = JSON.parse(billed[1]); } catch { continue; }
+          if (!sessions[lastSession]) {
+            sessions[lastSession] = { turns: 0, cost_usd: 0, models: {} };
+          }
+          const rec = sessions[lastSession];
+          rec.turns += 1;
+          for (const [model, data] of Object.entries(mu)) {
+            const cost = data.costUSD || 0;
+            rec.cost_usd += cost;
+            if (!rec.models[model]) rec.models[model] = { turns: 0, cost_usd: 0, in_uncached: 0, cache_read: 0, cache_create: 0, out: 0 };
+            const m = rec.models[model];
+            m.turns += 1;
+            m.cost_usd += cost;
+            m.in_uncached  += data.inputTokens || 0;
+            m.cache_read   += data.cacheReadInputTokens || 0;
+            m.cache_create += data.cacheCreationInputTokens || 0;
+            m.out          += data.outputTokens || 0;
+          }
+        }
+      }
+    }
+    // Step 2: enrich with capture metadata (first user message, model,
+    // path, msg count) for each session_key. Only need to read enough
+    // captures to find one per session.
+    const sessionMeta = {};
+    if (existsSync(captureDir)) {
+      const files = (await readdir(captureDir))
+        .filter(n => n.endsWith('.json'))
+        .sort()
+        .reverse(); // newest first
+      for (const f of files) {
+        const summaryFile = f.replace(/\.json$/, '.summary.txt');
+        if (!existsSync(join(captureDir, summaryFile))) continue;
+        const summary = await readFile(join(captureDir, summaryFile), 'utf8').catch(() => '');
+        const skMatch = summary.match(/^session_key:\s+(auto_\w+)/m);
+        if (!skMatch) continue;
+        const sk = skMatch[1];
+        if (sessionMeta[sk]) continue; // already have meta
+        const modelMatch  = summary.match(/^model:\s+(\S+)/m);
+        const pathMatch   = summary.match(/^path:\s+(\S+)/m);
+        const msgsMatch   = summary.match(/^messages:\s+(\d+)/m);
+        const lastSeen    = (await readFile(join(captureDir, summaryFile)).then(b => b.length).catch(()=>0)) ? f.slice(0, 19) : null;
+        let firstUser = null;
+        try {
+          const body = JSON.parse(await readFile(join(captureDir, f), 'utf8'));
+          for (const m of (body.messages || []).slice(0, 5)) {
+            if (m.role !== 'user') continue;
+            const c = m.content;
+            let txt = '';
+            if (Array.isArray(c)) {
+              for (const blk of c) {
+                if (blk?.type === 'text' && blk.text) { txt = blk.text; break; }
+              }
+            } else if (typeof c === 'string') {
+              txt = c;
+            }
+            // Skip "OpenClaw runtime context" boilerplate
+            if (txt && !txt.startsWith('OpenClaw runtime context')) {
+              firstUser = txt.slice(0, 80).replace(/\s+/g, ' ');
+              break;
+            }
+          }
+        } catch {}
+        sessionMeta[sk] = {
+          model: modelMatch ? modelMatch[1] : null,
+          path: pathMatch ? pathMatch[1] : null,
+          msgs: msgsMatch ? parseInt(msgsMatch[1], 10) : null,
+          lastSeenIso: lastSeen,
+          firstUser,
+        };
+      }
+    }
+    // Step 3: combine and sort
+    const out = [];
+    for (const [sk, rec] of Object.entries(sessions)) {
+      const meta = sessionMeta[sk] || {};
+      out.push({
+        session_key: sk,
+        turns: rec.turns,
+        cost_usd: Math.round(rec.cost_usd * 10000) / 10000,
+        per_turn_usd: Math.round((rec.cost_usd / Math.max(rec.turns, 1)) * 10000) / 10000,
+        bucket: rec.turns === 1 ? 'singleton' : rec.turns <= 3 ? 'short' : rec.turns <= 10 ? 'medium' : 'warm',
+        model: meta.model || null,
+        path: meta.path || null,
+        msgs: meta.msgs || null,
+        last_seen: meta.lastSeenIso || null,
+        first_user: meta.firstUser || null,
+        models: Object.fromEntries(
+          Object.entries(rec.models).map(([m, d]) => [m, {
+            turns: d.turns,
+            cost_usd: Math.round(d.cost_usd * 10000) / 10000,
+            in_uncached: d.in_uncached,
+            cache_read: d.cache_read,
+            cache_create: d.cache_create,
+            out: d.out,
+          }]),
+        ),
+      });
+    }
+    out.sort((a, b) => b.cost_usd - a.cost_usd);
+    // Step 4: aggregate stats
+    const totalCost = out.reduce((s, r) => s + r.cost_usd, 0);
+    const totalTurns = out.reduce((s, r) => s + r.turns, 0);
+    const buckets = { singleton: { sessions: 0, cost: 0 }, short: { sessions: 0, cost: 0 }, medium: { sessions: 0, cost: 0 }, warm: { sessions: 0, cost: 0 } };
+    for (const r of out) {
+      buckets[r.bucket].sessions += 1;
+      buckets[r.bucket].cost += r.cost_usd;
+    }
+    for (const k of Object.keys(buckets)) {
+      buckets[k].cost = Math.round(buckets[k].cost * 100) / 100;
+      buckets[k].pct_of_total = totalCost > 0 ? Math.round((buckets[k].cost / totalCost) * 1000) / 10 : 0;
+    }
+    res.json({
+      generatedAt: new Date().toISOString(),
+      total_cost_usd: Math.round(totalCost * 100) / 100,
+      total_turns: totalTurns,
+      session_count: out.length,
+      buckets,
+      sessions: out,
+    });
+  } catch (e) {
+    res.status(500).json({ error: e.message });
+  }
+});
 // ---------------------------------------------------------------------------
 // Updater — dashboard-driven "update available → update now" flow
 // ---------------------------------------------------------------------------
@@ -1987,11 +1194,12 @@ app.get('/update/status', (req, res) => {
 app.listen(PORT, BIND, async () => {
   const ttlMin = Math.round(SESSION_TTL_MS / 60000);
+  const ttlHours = (SESSION_TTL_MS / 3600000).toFixed(1);
   const meta = await loadBuildMeta();
   console.log(banner({ version: meta.version }));
   console.log(`    bind         ${BIND}:${PORT}${BIND === '127.0.0.1' ? ' (loopback only)' : ' (⚠ network-reachable — add auth)'}`);
   console.log(`    model        ${DEFAULT_MODEL}`);
-  console.log(`    session TTL  ${ttlMin} min`);
+  console.log(`    session TTL  ${ttlMin} min (${ttlHours}h)`);
   console.log(`    dashboard    http://localhost:${PORT}`);
   if (isCaptureEnabled()) {
     console.log(`    capture      ON → ${CAPTURE_DIR_PATH.replace(process.env.HOME || '', '~')}`);