npm - mobygate - Versions diffs - 0.8.4 → 0.9.2 - Mend

mobygate 0.8.4 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/CHANGELOG.md +472 -0
package/bin/mobygate.js +214 -0
package/inspector.html +200 -3
package/lib/anthropic.js +6 -1
package/lib/captures-index.js +524 -0
package/lib/inference-runner.js +753 -0
package/lib/openai-translation.js +146 -0
package/lib/quiet.js +249 -0
package/lib/request-capture.js +24 -0
package/package.json +3 -1
package/server.js +318 -1110

package/lib/inference-runner.js ADDED Viewed

@@ -0,0 +1,753 @@
+/**
+ * Single-runner inference engine.
+ *
+ * Replaces the four near-duplicate handlers (handleStreaming,
+ * handleNonStreaming, handleAnthropicNonStreaming, handleAnthropicStreaming)
+ * with one consolidated `runInference` driven by two surface adapters
+ * (OpenAI + Anthropic) and a `mode` flag (stream | json).
+ *
+ * Why: the four handlers were ~80% identical — same SDK iteration, same
+ * tool_use detection, same auth-failure-text bail, same per-turn usage
+ * tracking, same post-disconnect grace window. Bug fixes had to land in
+ * 2-4 places, which is exactly how regressions slip in. One runner means
+ * one place to fix, four places to test.
+ *
+ * Surface adapter contract — each surface (openai, anthropic) provides:
+ *   parsePrompt(body, { resuming })  → { promptText, error? }
+ *   extractImages(body)              → image content blocks[]
+ *   hasTools(body)                   → bool
+ *   toolsForBridge(body)             → OpenAI-shape tools for tool-bridge
+ *   mapStopReason(resultMessage)     → surface-native stop reason string
+ *   sendInvalidRequest(res, msg)     → 400 in surface-native error envelope
+ *   sendApiError(res, err)           → 500 in surface-native error envelope
+ *   logTag                            → label for [auth] log lines
+ *   createSink({ res, requestId, model, sessionKey, toolsEnabled, mode })
+ *     → { start, pushTextDelta, pushToolUse, finish, error,
+ *         onModelResolved, hasStarted }
+ *
+ * Sink methods:
+ *   start()                          — emit headers / open stream / no-op
+ *   onModelResolved(resolvedModel)   — sink stores final model id; for
+ *                                       Anthropic streaming this also fires
+ *                                       message_start
+ *   pushTextDelta(text)              — surface-specific (live SSE chunk,
+ *                                       buffered chunk, or accumulated json)
+ *   pushToolUse(tu)                  — emit tool_use block (Anthropic stream)
+ *                                       or collect for batched emit (OpenAI)
+ *   finish({ stopReason, usage })    — close stream / send JSON
+ *   error(err)                       — emit error event / send error JSON
+ *   hasStarted                       — true once anything wire-irreversible
+ *                                       has been written (used by both
+ *                                       runWithAuthRetry's bailIfStarted and
+ *                                       the auth-failure-text bail-out)
+ */
+import { v4 as uuidv4 } from 'uuid';
+import { query } from '@anthropic-ai/claude-agent-sdk';
+import { runWithAuthRetry, isAuthFailureText, AuthFailureInResultText } from '../scripts/auth-helper.js';
+import {
+  buildClientToolsServer,
+  buildToolUsageGuidance,
+  extractToolUses,
+  hasToolUse,
+  MCP_SERVER_NAME,
+  MCP_TOOL_PREFIX,
+} from './tool-bridge.js';
+import {
+  anthropicMessagesToPrompt,
+  collectAnthropicImages,
+  buildAnthropicResponse,
+  makeStreamTranslator,
+  hasAnthropicTools,
+  mapStopReason as mapAnthropicStopReason,
+  extractSdkUsage,
+} from './anthropic.js';
+import {
+  messagesToPrompt,
+  collectImages,
+  hasTools as hasOpenAITools,
+  normalizeModelName,
+} from './openai-translation.js';
+import { captureResponse } from './request-capture.js';
+// ---------------------------------------------------------------------------
+// Generic helper: wrap promptText + image blocks into the SDK query() shape
+// ---------------------------------------------------------------------------
+// Returns a string for the fast path (text-only, no images), or an
+// async iterable yielding one SDKUserMessage with multi-part content
+// when there are images. Built lazily inside runQuery so a 401-retry
+// rebuilds the iterator (single-use async iterators die after first run).
+function buildQueryPrompt(promptText, imageBlocks) {
+  if (!imageBlocks.length) return promptText;
+  const content = [
+    { type: 'text', text: promptText || '' },
+    ...imageBlocks,
+  ];
+  async function* gen() {
+    yield {
+      type: 'user',
+      message: { role: 'user', content },
+      parent_tool_use_id: null,
+    };
+  }
+  return gen();
+}
+// ---------------------------------------------------------------------------
+// OpenAI SSE helpers (only used by the OpenAI streaming sink)
+// ---------------------------------------------------------------------------
+function makeOpenAIChunk(requestId, model, content, role, finishReason) {
+  return {
+    id: `chatcmpl-${requestId}`,
+    object: 'chat.completion.chunk',
+    created: Math.floor(Date.now() / 1000),
+    model: normalizeModelName(model),
+    choices: [{
+      index: 0,
+      delta: {
+        ...(role ? { role } : {}),
+        ...(content !== undefined ? { content } : {}),
+      },
+      finish_reason: finishReason || null,
+    }],
+  };
+}
+function sendSSE(res, data) {
+  if (!res.writableEnded) {
+    res.write(`data: ${JSON.stringify(data)}\n\n`);
+  }
+}
+// ---------------------------------------------------------------------------
+// OpenAI surface adapter
+// ---------------------------------------------------------------------------
+export const openaiSurface = {
+  logTag: '/v1/chat/completions',
+  parsePrompt(body, { resuming }) {
+    return messagesToPrompt(body.messages, { resuming });
+  },
+  extractImages(body) {
+    return collectImages(body.messages);
+  },
+  hasTools(body) {
+    return hasOpenAITools(body);
+  },
+  toolsForBridge(body) {
+    // Already OpenAI shape — pass through.
+    return body.tools;
+  },
+  mapStopReason(message) {
+    if (message?.subtype) return message.subtype;
+    return 'stop';
+  },
+  sendInvalidRequest(res, message, code = 'invalid_request') {
+    return res.status(400).json({
+      error: { message, type: 'invalid_request_error', code },
+    });
+  },
+  sendApiError(res, err) {
+    return res.status(500).json({
+      error: { message: err.message, type: 'server_error', code: null },
+    });
+  },
+  createSink({ res, requestId, model, sessionKey, toolsEnabled, mode }) {
+    if (mode === 'stream') return makeOpenAIStreamSink({ res, requestId, model, sessionKey, toolsEnabled });
+    return makeOpenAIJsonSink({ res, requestId, model, sessionKey, toolsEnabled });
+  },
+};
+function makeOpenAIStreamSink({ res, requestId, model, sessionKey, toolsEnabled }) {
+  let resolvedModel = model;
+  let isFirstChunk = true;
+  let bufferedText = '';
+  const collectedToolCalls = [];
+  const start = () => {
+    res.setHeader('Content-Type', 'text/event-stream');
+    res.setHeader('Cache-Control', 'no-cache');
+    res.setHeader('Connection', 'keep-alive');
+    res.setHeader('X-Request-Id', requestId);
+    if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
+    res.flushHeaders();
+    res.write(':ok\n\n');
+  };
+  const onModelResolved = (m) => { if (m) resolvedModel = m; };
+  const pushTextDelta = (text) => {
+    if (!text) return;
+    if (toolsEnabled) {
+      // Buffer text until finish — it might precede a tool_use, or be the
+      // final response when the model decides not to call any tools.
+      bufferedText += text;
+    } else {
+      sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, text, isFirstChunk ? 'assistant' : undefined, null));
+      isFirstChunk = false;
+    }
+  };
+  const pushToolUse = (tu) => {
+    collectedToolCalls.push(tu);
+  };
+  const finish = ({ stopReason, usage } = {}) => {
+    if (res.writableEnded) return;
+    if (toolsEnabled) {
+      if (collectedToolCalls.length > 0) {
+        console.log(`  [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
+        sendSSE(res, {
+          id: `chatcmpl-${requestId}`,
+          object: 'chat.completion.chunk',
+          created: Math.floor(Date.now() / 1000),
+          model: normalizeModelName(resolvedModel),
+          choices: [{
+            index: 0,
+            delta: {
+              role: 'assistant',
+              content: bufferedText.trim() || null,
+              tool_calls: collectedToolCalls.map((tc, i) => ({
+                index: i,
+                id: tc.id,
+                type: 'function',
+                function: { name: tc.name, arguments: tc.arguments },
+              })),
+            },
+            finish_reason: 'tool_calls',
+          }],
+        });
+      } else {
+        sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, bufferedText, 'assistant', null));
+        sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
+      }
+    } else {
+      sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
+    }
+    res.write('data: [DONE]\n\n');
+    res.end();
+  };
+  const error = (err) => {
+    if (res.writableEnded) return;
+    sendSSE(res, { error: { message: err.message, type: 'server_error', code: null } });
+  };
+  return {
+    start,
+    onModelResolved,
+    pushTextDelta,
+    pushToolUse,
+    finish,
+    error,
+    // hasStarted = irreversible writes happened. Tools-mode buffers
+    // everything in memory and only writes at finish() — retry safe.
+    // Non-tools mode commits to the SSE stream on the first chunk.
+    get hasStarted() { return !toolsEnabled && !isFirstChunk; },
+  };
+}
+function makeOpenAIJsonSink({ res, requestId, model, sessionKey, toolsEnabled }) {
+  let resolvedModel = model;
+  let resultText = '';
+  const collectedToolCalls = [];
+  const start = () => { /* JSON: nothing emitted until finish */ };
+  const onModelResolved = (m) => { if (m) resolvedModel = m; };
+  const pushTextDelta = (text) => {
+    if (text) resultText += text;
+  };
+  const pushToolUse = (tu) => {
+    collectedToolCalls.push(tu);
+  };
+  const finish = ({ stopReason, usage } = {}) => {
+    const responseHeaders = { 'X-Request-Id': requestId };
+    if (sessionKey) responseHeaders['X-Session-Id'] = sessionKey;
+    if (toolsEnabled && collectedToolCalls.length > 0) {
+      console.log(`  [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
+      return res.set(responseHeaders).json({
+        id: `chatcmpl-${requestId}`,
+        object: 'chat.completion',
+        created: Math.floor(Date.now() / 1000),
+        model: normalizeModelName(resolvedModel),
+        choices: [{
+          index: 0,
+          message: {
+            role: 'assistant',
+            content: resultText.trim() || null,
+            tool_calls: collectedToolCalls.map((tc) => ({
+              id: tc.id,
+              type: 'function',
+              function: { name: tc.name, arguments: tc.arguments },
+            })),
+          },
+          finish_reason: 'tool_calls',
+        }],
+        usage: {
+          prompt_tokens: usage?.input_tokens || 0,
+          completion_tokens: usage?.output_tokens || 0,
+          total_tokens: (usage?.input_tokens || 0) + (usage?.output_tokens || 0),
+        },
+      });
+    }
+    res.set(responseHeaders).json({
+      id: `chatcmpl-${requestId}`,
+      object: 'chat.completion',
+      created: Math.floor(Date.now() / 1000),
+      model: normalizeModelName(resolvedModel),
+      choices: [{
+        index: 0,
+        message: { role: 'assistant', content: resultText },
+        finish_reason: stopReason === 'tool_use' ? 'tool_calls' : 'stop',
+      }],
+      usage: {
+        prompt_tokens: usage?.input_tokens || 0,
+        completion_tokens: usage?.output_tokens || 0,
+        total_tokens: (usage?.input_tokens || 0) + (usage?.output_tokens || 0),
+      },
+    });
+  };
+  const error = (err) => {
+    return res.status(500).json({ error: { message: err.message, type: 'server_error', code: null } });
+  };
+  return { start, onModelResolved, pushTextDelta, pushToolUse, finish, error, get hasStarted() { return false; } };
+}
+// ---------------------------------------------------------------------------
+// Anthropic surface adapter
+// ---------------------------------------------------------------------------
+export const anthropicSurface = {
+  logTag: '/v1/messages',
+  parsePrompt(body, { resuming }) {
+    return anthropicMessagesToPrompt(body, { resuming });
+  },
+  extractImages(body) {
+    return collectAnthropicImages(body.messages || []);
+  },
+  hasTools(body) {
+    return hasAnthropicTools(body);
+  },
+  toolsForBridge(body) {
+    // Convert Anthropic tool defs → OpenAI shape that buildClientToolsServer
+    // expects. Both go through the same JSON-Schema → Zod path on the way
+    // to MCP; the wrapper shape difference is just `function:{name, parameters}`
+    // vs `{name, input_schema}`.
+    return body.tools.map((t) => ({
+      type: 'function',
+      function: { name: t.name, description: t.description || '', parameters: t.input_schema || {} },
+    }));
+  },
+  mapStopReason(message) {
+    return mapAnthropicStopReason(message);
+  },
+  sendInvalidRequest(res, message) {
+    return res.status(400).json({
+      type: 'error',
+      error: { type: 'invalid_request_error', message },
+    });
+  },
+  sendApiError(res, err) {
+    return res.status(500).json({
+      type: 'error',
+      error: { type: 'api_error', message: err.message },
+    });
+  },
+  createSink({ res, requestId, model, sessionKey, toolsEnabled, mode }) {
+    if (mode === 'stream') return makeAnthropicStreamSink({ res, requestId, model, sessionKey });
+    return makeAnthropicJsonSink({ res, requestId, model, sessionKey });
+  },
+};
+function makeAnthropicStreamSink({ res, requestId, model, sessionKey }) {
+  let resolvedModel = model;
+  const tx = makeStreamTranslator({ res, requestId, model });
+  const start = () => {
+    res.setHeader('Content-Type', 'text/event-stream');
+    res.setHeader('Cache-Control', 'no-cache');
+    res.setHeader('Connection', 'keep-alive');
+    res.setHeader('X-Request-Id', requestId);
+    if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
+    res.flushHeaders();
+    // No message_start yet — that fires from onModelResolved when the SDK
+    // delivers the system init message with the resolved model id.
+  };
+  const onModelResolved = (m) => {
+    if (m) resolvedModel = m;
+    // tx.start() is idempotent — guarded by an internal `started` flag.
+    tx.start(resolvedModel, 0);
+  };
+  const pushTextDelta = (text) => {
+    if (text) tx.pushTextDelta(text);
+  };
+  const pushToolUse = (tu) => {
+    tx.pushToolUse(tu);
+  };
+  const finish = ({ stopReason, usage } = {}) => {
+    tx.finish({ stopReason, usage });
+  };
+  const error = (err) => {
+    tx.error(err);
+  };
+  return {
+    start,
+    onModelResolved,
+    pushTextDelta,
+    pushToolUse,
+    finish,
+    error,
+    get hasStarted() { return tx.hasStarted; },
+  };
+}
+function makeAnthropicJsonSink({ res, requestId, model, sessionKey }) {
+  let resolvedModel = model;
+  let resultText = '';
+  const collectedToolCalls = [];
+  const start = () => {};
+  const onModelResolved = (m) => { if (m) resolvedModel = m; };
+  const pushTextDelta = (text) => {
+    if (text) resultText += text;
+  };
+  const pushToolUse = (tu) => {
+    collectedToolCalls.push(tu);
+  };
+  const finish = ({ stopReason, usage } = {}) => {
+    res.setHeader('X-Request-Id', requestId);
+    if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
+    res.json(buildAnthropicResponse({
+      rawText: resultText.trim(),
+      toolUses: collectedToolCalls,
+      model: resolvedModel,
+      usage: { input_tokens: usage?.input_tokens || 0, output_tokens: usage?.output_tokens || 0 },
+      requestId,
+      stopReason,
+    }));
+  };
+  const error = (err) => {
+    return res.status(500).json({
+      type: 'error',
+      error: { type: 'api_error', message: err.message },
+    });
+  };
+  return { start, onModelResolved, pushTextDelta, pushToolUse, finish, error, get hasStarted() { return false; } };
+}
+// ---------------------------------------------------------------------------
+// Consolidated runner
+// ---------------------------------------------------------------------------
+/**
+ * Run a single inference request. Drives the SDK query() loop, surfaces
+ * all output via `surface.createSink`, and writes a capture record at the
+ * end. This is the only inference entry point — all four routes
+ * (/v1/chat/completions, /v1/messages, /quiet/v1/messages, future
+ * surfaces) terminate here.
+ *
+ * @param {Object} ctx
+ * @param {express.Request} ctx.req
+ * @param {express.Response} ctx.res
+ * @param {Object} ctx.body — request body (already scrubbed for /quiet/*)
+ * @param {string} ctx.requestId
+ * @param {string} ctx.sessionKey
+ * @param {Object} surface — openaiSurface | anthropicSurface
+ * @param {Object} opts
+ * @param {'stream'|'json'} opts.mode
+ * @param {Object} opts.deps — { getSession, upsertSession, resolveModel }
+ */
+export async function runInference({ req, res, body, requestId, sessionKey }, surface, { mode, deps }) {
+  const { getSession, upsertSession, resolveModel } = deps;
+  const existing = getSession(sessionKey);
+  const resuming = !!existing?.sdkSessionId;
+  const toolsEnabled = surface.hasTools(body);
+  const { promptText, error: promptError } = surface.parsePrompt(body, { resuming });
+  if (promptError) {
+    return surface.sendInvalidRequest(res, promptError);
+  }
+  const images = surface.extractImages(body);
+  const model = resolveModel(body.model);
+  // Tool-bridge setup: convert client tool defs to OpenAI shape (the bridge
+  // expects that), build the in-process MCP server, build the system-prompt
+  // append guidance. All three are null when tools are disabled.
+  const toolsForBridge = toolsEnabled ? surface.toolsForBridge(body) : null;
+  const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
+  const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
+  // Always use the claude_code preset when tools are present — it's not
+  // just identity framing, it's also Anthropic's "this is an approved
+  // client, bill flat Max" signal. v0.8.6 tried to drop the preset for
+  // /quiet/v1/messages and discovered (the hard way) that doing so flips
+  // requests into extra-usage billing. Quiet mode now layers scrubbing on
+  // TOP of the preset, not instead of it.
+  const sdkSystemPrompt = clientToolsServer ? { type: 'preset', preset: 'claude_code', append: toolsGuidance } : undefined;
+  if (images.length) console.log(`  [multimodal] ${images.length} image block(s)`);
+  if (toolsEnabled) console.log(`  [tools] ${body.tools.length} client tool(s) registered as MCP`);
+  // State accumulated across the SDK loop
+  let resolvedModel = model;
+  let capturedSessionId = existing?.sdkSessionId || null;
+  let inputTokens = 0;
+  let outputTokens = 0;
+  let cacheReadTokens = 0;
+  let cacheCreateTokens = 0;
+  let stopReason = 'end_turn';
+  let textEmittedSoFar = '';
+  let toolUseEmitted = false;
+  let clientDisconnected = false;
+  let postDisconnectTimer = null;
+  const abortController = new AbortController();
+  const startedAt = Date.now();
+  const sink = surface.createSink({ res, requestId, model, sessionKey, toolsEnabled, mode });
+  sink.start();
+  if (mode === 'stream') {
+    // On client disconnect, keep the SDK alive so the in-flight generation
+    // (already being billed) finishes and lands in the capture file. Cap
+    // at 60s so a flapping client can't burn unbounded tokens.
+    res.on('close', () => {
+      if (clientDisconnected) return;
+      clientDisconnected = true;
+      if (postDisconnectTimer) return;
+      console.log('  [stream] client disconnected — keeping SDK alive to preserve capture (60s cap)');
+      postDisconnectTimer = setTimeout(() => {
+        console.log('  [stream] post-disconnect 60s cap — aborting SDK');
+        abortController.abort();
+      }, 60_000);
+      postDisconnectTimer.unref?.();
+    });
+  }
+  if (resuming) {
+    console.log(`  [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
+  }
+  const runQuery = async () => {
+    // Reset per-attempt state so a 401 retry starts clean. Sinks that
+    // already hasStarted are caught by bailIfStarted below.
+    resolvedModel = model;
+    capturedSessionId = existing?.sdkSessionId || null;
+    inputTokens = 0;
+    outputTokens = 0;
+    stopReason = 'end_turn';
+    textEmittedSoFar = '';
+    toolUseEmitted = false;
+    // Build the prompt lazily on each attempt — multimodal returns a
+    // single-use async iterator. Keeps 401 auth-retries safe.
+    const prompt = buildQueryPrompt(promptText, images);
+    for await (const message of query({
+      prompt,
+      options: {
+        model,
+        maxTurns: toolsEnabled ? 5 : 200,
+        permissionMode: 'bypassPermissions',
+        allowDangerouslySkipPermissions: true,
+        abortController,
+        ...(clientToolsServer
+          ? {
+              mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
+              allowedTools: [`${MCP_TOOL_PREFIX}*`],
+              systemPrompt: sdkSystemPrompt,
+            }
+          : toolsEnabled
+            ? { allowedTools: [] }
+            : {}),
+        ...(resuming ? { resume: existing.sdkSessionId } : {}),
+        ...(sessionKey && !resuming ? { persistSession: true } : {}),
+      },
+    })) {
+      // Note: do NOT break on clientDisconnected — keep consuming so the
+      // final result/usage lands in the capture. Sink writes are guarded
+      // by res.writableEnded internally and silently no-op.
+      if (message.type === 'system' && message.subtype === 'init' && message.model) {
+        resolvedModel = message.model;
+        sink.onModelResolved(resolvedModel);
+      }
+      if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
+        capturedSessionId = message.session_id;
+        console.log(`  [session] captured sdk session: ${capturedSessionId}`);
+      }
+      // Per-turn usage: required because tool_use detection aborts the
+      // SDK before its `result` message arrives, leaving the trackers at
+      // zero on the abort path. Reading from each assistant turn keeps
+      // usage correct even when aborted.
+      if (message.type === 'assistant' && message.message?.usage) {
+        const turn = extractSdkUsage(message);
+        if (turn.input_tokens || turn.output_tokens || turn.cache_read_input_tokens || turn.cache_creation_input_tokens) {
+          inputTokens = turn.input_tokens;
+          outputTokens = turn.output_tokens;
+          cacheReadTokens = turn.cache_read_input_tokens;
+          cacheCreateTokens = turn.cache_creation_input_tokens;
+        }
+      }
+      if (message.type === 'assistant' && message.message?.content) {
+        const content = message.message.content;
+        // Flatten text from this assistant message
+        let combined = '';
+        if (Array.isArray(content)) {
+          for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
+        } else if (typeof content === 'string') {
+          combined = content;
+        }
+        // Auth-failure short-circuit: throw so runWithAuthRetry handles
+        // it. Only safe before any wire-irreversible writes (otherwise
+        // we've already corrupted the stream and can't undo).
+        if (combined && isAuthFailureText(combined) && !sink.hasStarted) {
+          abortController.abort();
+          throw new AuthFailureInResultText(combined);
+        }
+        // Tool_use detection. Emit any text from this same message
+        // *before* the tool_use blocks so block ordering is preserved
+        // (Anthropic streams sometimes have text + tool_use in one
+        // assistant message).
+        if (toolsEnabled && hasToolUse(message)) {
+          const calls = extractToolUses(message);
+          if (calls.length) {
+            if (combined) {
+              const delta = combined.startsWith(textEmittedSoFar)
+                ? combined.slice(textEmittedSoFar.length)
+                : combined;
+              if (delta) {
+                sink.pushTextDelta(delta);
+                textEmittedSoFar += delta;
+              }
+            }
+            for (const tu of calls) sink.pushToolUse(tu);
+            toolUseEmitted = true;
+            stopReason = 'tool_use';
+            console.log(`  [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
+            abortController.abort();
+            break;
+          }
+        }
+        // Plain text-only assistant message: stream the delta.
+        // Compute delta vs what we've emitted to avoid duplication on
+        // aggregator-style assistant messages that resend accumulated text.
+        if (combined) {
+          const delta = combined.startsWith(textEmittedSoFar)
+            ? combined.slice(textEmittedSoFar.length)
+            : combined;
+          if (delta) {
+            sink.pushTextDelta(delta);
+            textEmittedSoFar += delta;
+          }
+        }
+      }
+      if (message.type === 'result') {
+        if (message.result && !textEmittedSoFar && !toolUseEmitted) {
+          // Some SDK paths only deliver text via the final result message
+          // (no streaming assistant messages). Emit it as one delta.
+          sink.pushTextDelta(message.result);
+          textEmittedSoFar = message.result;
+        }
+        if (isAuthFailureText(message.result || '') && !sink.hasStarted) {
+          throw new AuthFailureInResultText(message.result);
+        }
+        const usage = extractSdkUsage(message);
+        inputTokens = usage.input_tokens;
+        outputTokens = usage.output_tokens;
+        cacheReadTokens = usage.cache_read_input_tokens;
+        cacheCreateTokens = usage.cache_creation_input_tokens;
+        console.log(`  [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
+        if (!toolUseEmitted) stopReason = surface.mapStopReason(message);
+        break;
+      }
+    }
+  };
+  try {
+    await runWithAuthRetry({
+      attempt: runQuery,
+      bailIfStarted: () => sink.hasStarted,
+      onRefreshing: (err) => console.warn(`[auth] 401 on ${surface.logTag} — refreshing (${err.message?.slice(0, 80)})`),
+      onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying ${surface.logTag}`),
+    });
+  } catch (err) {
+    const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
+    if (!clientDisconnected && !(toolsEnabled && isAbort)) {
+      console.error(`[${surface.logTag}] SDK error:`, err.message);
+      sink.error(err);
+      return;
+    }
+  }
+  if (sessionKey && capturedSessionId) {
+    upsertSession(sessionKey, capturedSessionId, resolvedModel);
+  }
+  sink.finish({
+    stopReason,
+    usage: {
+      input_tokens: inputTokens,
+      output_tokens: outputTokens,
+      cache_read_input_tokens: cacheReadTokens,
+      cache_creation_input_tokens: cacheCreateTokens,
+    },
+  });
+  captureResponse({
+    requestId,
+    usage: {
+      input_tokens: inputTokens,
+      output_tokens: outputTokens,
+      cache_read_input_tokens: cacheReadTokens,
+      cache_creation_input_tokens: cacheCreateTokens,
+    },
+    durationMs: Date.now() - startedAt,
+    status: clientDisconnected ? 'client_disconnect' : 'ok',
+    stopReason,
+    model: resolvedModel,
+  });
+}