mobygate 0.8.4 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,753 @@
1
+ /**
2
+ * Single-runner inference engine.
3
+ *
4
+ * Replaces the four near-duplicate handlers (handleStreaming,
5
+ * handleNonStreaming, handleAnthropicNonStreaming, handleAnthropicStreaming)
6
+ * with one consolidated `runInference` driven by two surface adapters
7
+ * (OpenAI + Anthropic) and a `mode` flag (stream | json).
8
+ *
9
+ * Why: the four handlers were ~80% identical — same SDK iteration, same
10
+ * tool_use detection, same auth-failure-text bail, same per-turn usage
11
+ * tracking, same post-disconnect grace window. Bug fixes had to land in
12
+ * 2-4 places, which is exactly how regressions slip in. One runner means
13
+ * one place to fix, four places to test.
14
+ *
15
+ * Surface adapter contract — each surface (openai, anthropic) provides:
16
+ * parsePrompt(body, { resuming }) → { promptText, error? }
17
+ * extractImages(body) → image content blocks[]
18
+ * hasTools(body) → bool
19
+ * toolsForBridge(body) → OpenAI-shape tools for tool-bridge
20
+ * mapStopReason(resultMessage) → surface-native stop reason string
21
+ * sendInvalidRequest(res, msg) → 400 in surface-native error envelope
22
+ * sendApiError(res, err) → 500 in surface-native error envelope
23
+ * logTag → label for [auth] log lines
24
+ * createSink({ res, requestId, model, sessionKey, toolsEnabled, mode })
25
+ * → { start, pushTextDelta, pushToolUse, finish, error,
26
+ * onModelResolved, hasStarted }
27
+ *
28
+ * Sink methods:
29
+ * start() — emit headers / open stream / no-op
30
+ * onModelResolved(resolvedModel) — sink stores final model id; for
31
+ * Anthropic streaming this also fires
32
+ * message_start
33
+ * pushTextDelta(text) — surface-specific (live SSE chunk,
34
+ * buffered chunk, or accumulated json)
35
+ * pushToolUse(tu) — emit tool_use block (Anthropic stream)
36
+ * or collect for batched emit (OpenAI)
37
+ * finish({ stopReason, usage }) — close stream / send JSON
38
+ * error(err) — emit error event / send error JSON
39
+ * hasStarted — true once anything wire-irreversible
40
+ * has been written (used by both
41
+ * runWithAuthRetry's bailIfStarted and
42
+ * the auth-failure-text bail-out)
43
+ */
44
+
45
+ import { v4 as uuidv4 } from 'uuid';
46
+ import { query } from '@anthropic-ai/claude-agent-sdk';
47
+ import { runWithAuthRetry, isAuthFailureText, AuthFailureInResultText } from '../scripts/auth-helper.js';
48
+ import {
49
+ buildClientToolsServer,
50
+ buildToolUsageGuidance,
51
+ extractToolUses,
52
+ hasToolUse,
53
+ MCP_SERVER_NAME,
54
+ MCP_TOOL_PREFIX,
55
+ } from './tool-bridge.js';
56
+ import {
57
+ anthropicMessagesToPrompt,
58
+ collectAnthropicImages,
59
+ buildAnthropicResponse,
60
+ makeStreamTranslator,
61
+ hasAnthropicTools,
62
+ mapStopReason as mapAnthropicStopReason,
63
+ extractSdkUsage,
64
+ } from './anthropic.js';
65
+ import {
66
+ messagesToPrompt,
67
+ collectImages,
68
+ hasTools as hasOpenAITools,
69
+ normalizeModelName,
70
+ } from './openai-translation.js';
71
+ import { captureResponse } from './request-capture.js';
72
+
73
+ // ---------------------------------------------------------------------------
74
+ // Generic helper: wrap promptText + image blocks into the SDK query() shape
75
+ // ---------------------------------------------------------------------------
76
+ // Returns a string for the fast path (text-only, no images), or an
77
+ // async iterable yielding one SDKUserMessage with multi-part content
78
+ // when there are images. Built lazily inside runQuery so a 401-retry
79
+ // rebuilds the iterator (single-use async iterators die after first run).
80
+
81
/**
 * Wrap prompt text and optional image blocks into a value usable as the
 * `prompt` of the SDK `query()` call.
 *
 * @param {string} promptText - Flattened prompt text. A falsy value becomes
 *   an empty string inside the multi-part message.
 * @param {Array<Object>} [imageBlocks] - Image content blocks. A missing,
 *   null, or empty list selects the text-only fast path.
 * @returns {string|AsyncGenerator<Object>} `promptText` unchanged when there
 *   are no images; otherwise a single-use async generator that yields exactly
 *   one user message whose content is the text block followed by the images.
 */
function buildQueryPrompt(promptText, imageBlocks) {
  // A missing image list is treated like an empty one instead of throwing.
  if (!imageBlocks?.length) return promptText;

  const content = [{ type: 'text', text: promptText || '' }, ...imageBlocks];

  // The generator is single-use, so callers must call this function again
  // for every attempt rather than caching the returned iterator.
  async function* gen() {
    yield {
      type: 'user',
      message: { role: 'user', content },
      parent_tool_use_id: null,
    };
  }
  return gen();
}
96
+
97
+ // ---------------------------------------------------------------------------
98
+ // OpenAI SSE helpers (only used by the OpenAI streaming sink)
99
+ // ---------------------------------------------------------------------------
100
+
101
/**
 * Build one OpenAI `chat.completion.chunk` object.
 *
 * @param {string} requestId - Used to form the `chatcmpl-<id>` chunk id.
 * @param {string} model - Model id; passed through `normalizeModelName`.
 * @param {string|undefined} content - Delta text; omitted from the delta
 *   only when strictly `undefined` (an empty string is kept).
 * @param {string|undefined} role - Delta role; omitted when falsy.
 * @param {string|null|undefined} finishReason - Falsy values become `null`.
 * @returns {Object} The chunk object with a single choice at index 0.
 */
function makeOpenAIChunk(requestId, model, content, role, finishReason) {
  const delta = {};
  if (role) delta.role = role;
  if (content !== undefined) delta.content = content;

  const choice = {
    index: 0,
    delta,
    finish_reason: finishReason || null,
  };

  return {
    id: `chatcmpl-${requestId}`,
    object: 'chat.completion.chunk',
    created: Math.floor(Date.now() / 1000),
    model: normalizeModelName(model),
    choices: [choice],
  };
}
117
+
118
/**
 * Write one Server-Sent-Events `data:` frame containing `data` as JSON.
 *
 * The write is skipped when the response has already been ended or when the
 * response has been destroyed (e.g. the client went away), so callers can
 * keep pushing events after a disconnect without serialising payloads that
 * can never be delivered.
 *
 * @param {Object} res - Response-like object exposing `writableEnded`,
 *   `destroyed`, and `write(string)`.
 * @param {*} data - JSON-serialisable payload.
 * @returns {void}
 */
function sendSSE(res, data) {
  if (res.writableEnded || res.destroyed) return;
  res.write(`data: ${JSON.stringify(data)}\n\n`);
}
123
+
124
+ // ---------------------------------------------------------------------------
125
+ // OpenAI surface adapter
126
+ // ---------------------------------------------------------------------------
127
+
128
/**
 * Surface adapter for the OpenAI-compatible chat-completions route.
 *
 * Each member delegates to the OpenAI translation helpers or writes an
 * OpenAI-shaped error envelope; `createSink` picks the streaming or JSON
 * sink based on `mode`.
 */
export const openaiSurface = {
  logTag: '/v1/chat/completions',

  parsePrompt: (body, { resuming }) => messagesToPrompt(body.messages, { resuming }),

  extractImages: (body) => collectImages(body.messages),

  hasTools: (body) => hasOpenAITools(body),

  // Client tools are already in OpenAI shape, so they pass straight through.
  toolsForBridge: (body) => body.tools,

  // NOTE(review): this returns the result message's `subtype` verbatim when
  // present, which looks like an SDK value rather than an OpenAI
  // finish_reason — confirm that downstream consumers expect that.
  mapStopReason: (message) => message?.subtype || 'stop',

  sendInvalidRequest: (res, message, code = 'invalid_request') => {
    const payload = { error: { message, type: 'invalid_request_error', code } };
    return res.status(400).json(payload);
  },

  sendApiError: (res, err) => {
    const payload = { error: { message: err.message, type: 'server_error', code: null } };
    return res.status(500).json(payload);
  },

  createSink({ res, requestId, model, sessionKey, toolsEnabled, mode }) {
    const makeSink = mode === 'stream' ? makeOpenAIStreamSink : makeOpenAIJsonSink;
    return makeSink({ res, requestId, model, sessionKey, toolsEnabled });
  },
};
170
+
171
/**
 * Build the sink that renders an inference run as an OpenAI SSE stream.
 *
 * Without tools, every text delta is written to the wire immediately. With
 * tools enabled, text and tool calls are buffered in memory and emitted as a
 * single chunk from `finish()`, because text may precede a tool call.
 *
 * @param {Object} args
 * @param {Object} args.res - Response object the stream is written to.
 * @param {string} args.requestId - Echoed in `X-Request-Id` and chunk ids.
 * @param {string} args.model - Requested model; replaced by `onModelResolved`.
 * @param {string} [args.sessionKey] - When truthy, echoed in `X-Session-Id`.
 * @param {boolean} args.toolsEnabled - Selects buffered (tools) vs live mode.
 * @returns {Object} Sink with `start`, `onModelResolved`, `pushTextDelta`,
 *   `pushToolUse`, `finish`, `error`, and a `hasStarted` getter.
 */
function makeOpenAIStreamSink({ res, requestId, model, sessionKey, toolsEnabled }) {
  let resolvedModel = model;
  let isFirstChunk = true;
  let bufferedText = '';
  const collectedToolCalls = [];

  // Open the SSE stream: headers plus an initial comment frame.
  const start = () => {
    res.setHeader('Content-Type', 'text/event-stream');
    res.setHeader('Cache-Control', 'no-cache');
    res.setHeader('Connection', 'keep-alive');
    res.setHeader('X-Request-Id', requestId);
    if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
    res.flushHeaders();
    res.write(':ok\n\n');
  };

  const onModelResolved = (m) => { if (m) resolvedModel = m; };

  const pushTextDelta = (text) => {
    if (!text) return;
    if (toolsEnabled) {
      // Buffer text until finish — it might precede a tool_use, or be the
      // final response when the model decides not to call any tools.
      bufferedText += text;
    } else {
      sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, text, isFirstChunk ? 'assistant' : undefined, null));
      isFirstChunk = false;
    }
  };

  const pushToolUse = (tu) => {
    collectedToolCalls.push(tu);
  };

  // Callers pass `{ stopReason, usage }`; this surface derives the finish
  // reason from what was collected and does not emit usage, so neither is read.
  const finish = () => {
    if (res.writableEnded) return;
    if (toolsEnabled) {
      if (collectedToolCalls.length > 0) {
        console.log(` [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
        sendSSE(res, {
          id: `chatcmpl-${requestId}`,
          object: 'chat.completion.chunk',
          created: Math.floor(Date.now() / 1000),
          model: normalizeModelName(resolvedModel),
          choices: [{
            index: 0,
            delta: {
              role: 'assistant',
              content: bufferedText.trim() || null,
              tool_calls: collectedToolCalls.map((tc, i) => ({
                index: i,
                id: tc.id,
                type: 'function',
                function: { name: tc.name, arguments: tc.arguments },
              })),
            },
            finish_reason: 'tool_calls',
          }],
        });
      } else {
        sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, bufferedText, 'assistant', null));
        sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
      }
    } else {
      sendSSE(res, makeOpenAIChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
    }
    res.write('data: [DONE]\n\n');
    res.end();
  };

  const error = (err) => {
    if (res.writableEnded) return;
    sendSSE(res, {
      error: { message: err?.message ?? String(err), type: 'server_error', code: null },
    });
    // Close the stream after reporting the failure. Nothing else ends the
    // response on the error path, so without this the client would be left
    // waiting on an open connection.
    res.end();
  };

  return {
    start,
    onModelResolved,
    pushTextDelta,
    pushToolUse,
    finish,
    error,
    // hasStarted = irreversible writes happened. Tools-mode buffers
    // everything in memory and only writes at finish() — retry safe.
    // Non-tools mode commits to the SSE stream on the first chunk.
    get hasStarted() { return !toolsEnabled && !isFirstChunk; },
  };
}
259
+
260
/**
 * Build the sink that renders an inference run as one OpenAI
 * `chat.completion` JSON body.
 *
 * Nothing is written until `finish()` (or `error()`), so `hasStarted` is
 * always false.
 *
 * @param {Object} args
 * @param {Object} args.res - Response object exposing `set`, `status`, `json`.
 * @param {string} args.requestId - Echoed in `X-Request-Id` and the body id.
 * @param {string} args.model - Requested model; replaced by `onModelResolved`.
 * @param {string} [args.sessionKey] - When truthy, echoed in `X-Session-Id`.
 * @param {boolean} args.toolsEnabled - Tool calls are only emitted when true.
 * @returns {Object} Sink with `start`, `onModelResolved`, `pushTextDelta`,
 *   `pushToolUse`, `finish`, `error`, and a `hasStarted` getter.
 */
function makeOpenAIJsonSink({ res, requestId, model, sessionKey, toolsEnabled }) {
  let finalModel = model;
  let accumulated = '';
  const toolCalls = [];

  // Shared envelope: both the tool-call and plain-text responses differ only
  // in the `message` and `finish_reason` of their single choice.
  const respond = (choiceBody, usage) => {
    const headers = { 'X-Request-Id': requestId };
    if (sessionKey) headers['X-Session-Id'] = sessionKey;

    const promptTokens = usage?.input_tokens || 0;
    const completionTokens = usage?.output_tokens || 0;

    return res.set(headers).json({
      id: `chatcmpl-${requestId}`,
      object: 'chat.completion',
      created: Math.floor(Date.now() / 1000),
      model: normalizeModelName(finalModel),
      choices: [{ index: 0, ...choiceBody }],
      usage: {
        prompt_tokens: promptTokens,
        completion_tokens: completionTokens,
        total_tokens: promptTokens + completionTokens,
      },
    });
  };

  return {
    /* JSON: nothing emitted until finish */
    start() {},

    onModelResolved(m) {
      if (m) finalModel = m;
    },

    pushTextDelta(text) {
      if (text) accumulated += text;
    },

    pushToolUse(tu) {
      toolCalls.push(tu);
    },

    finish({ stopReason, usage } = {}) {
      const emitTools = toolsEnabled && toolCalls.length > 0;
      if (!emitTools) {
        respond(
          {
            message: { role: 'assistant', content: accumulated },
            finish_reason: stopReason === 'tool_use' ? 'tool_calls' : 'stop',
          },
          usage,
        );
        return undefined;
      }

      console.log(` [tools] emitting ${toolCalls.length} tool_call(s)`);
      const message = {
        role: 'assistant',
        content: accumulated.trim() || null,
        tool_calls: toolCalls.map((tc) => ({
          id: tc.id,
          type: 'function',
          function: { name: tc.name, arguments: tc.arguments },
        })),
      };
      return respond({ message, finish_reason: 'tool_calls' }, usage);
    },

    error(err) {
      const payload = { error: { message: err.message, type: 'server_error', code: null } };
      return res.status(500).json(payload);
    },

    get hasStarted() { return false; },
  };
}
332
+
333
+ // ---------------------------------------------------------------------------
334
+ // Anthropic surface adapter
335
+ // ---------------------------------------------------------------------------
336
+
337
/**
 * Surface adapter for the Anthropic-compatible messages route.
 *
 * Each member delegates to the Anthropic translation helpers or writes an
 * Anthropic-shaped error envelope; `createSink` picks the streaming or JSON
 * sink based on `mode`.
 */
export const anthropicSurface = {
  logTag: '/v1/messages',

  parsePrompt: (body, { resuming }) => anthropicMessagesToPrompt(body, { resuming }),

  extractImages: (body) => collectAnthropicImages(body.messages || []),

  hasTools: (body) => hasAnthropicTools(body),

  // Re-wrap Anthropic tool definitions (`{name, input_schema}`) into the
  // OpenAI `function:{name, parameters}` wrapper that the tool bridge takes.
  toolsForBridge: (body) =>
    body.tools.map(({ name, description, input_schema: inputSchema }) => ({
      type: 'function',
      function: {
        name,
        description: description || '',
        parameters: inputSchema || {},
      },
    })),

  mapStopReason: (message) => mapAnthropicStopReason(message),

  sendInvalidRequest: (res, message) => {
    const payload = { type: 'error', error: { type: 'invalid_request_error', message } };
    return res.status(400).json(payload);
  },

  sendApiError: (res, err) => {
    const payload = { type: 'error', error: { type: 'api_error', message: err.message } };
    return res.status(500).json(payload);
  },

  // `toolsEnabled` is accepted for contract parity but not forwarded: neither
  // Anthropic sink takes it.
  createSink({ res, requestId, model, sessionKey, toolsEnabled, mode }) {
    const makeSink = mode === 'stream' ? makeAnthropicStreamSink : makeAnthropicJsonSink;
    return makeSink({ res, requestId, model, sessionKey });
  },
};
386
+
387
/**
 * Build the sink that renders an inference run as an Anthropic SSE stream.
 *
 * All event framing is delegated to the translator returned by
 * `makeStreamTranslator`; this wrapper only sets the response headers and
 * tracks the resolved model id.
 *
 * @param {Object} args
 * @param {Object} args.res - Response object the stream is written to.
 * @param {string} args.requestId - Echoed in `X-Request-Id`.
 * @param {string} args.model - Requested model; replaced by `onModelResolved`.
 * @param {string} [args.sessionKey] - When truthy, echoed in `X-Session-Id`.
 * @returns {Object} Sink with `start`, `onModelResolved`, `pushTextDelta`,
 *   `pushToolUse`, `finish`, `error`, and a `hasStarted` getter that mirrors
 *   the translator's.
 */
function makeAnthropicStreamSink({ res, requestId, model, sessionKey }) {
  const translator = makeStreamTranslator({ res, requestId, model });
  let currentModel = model;

  return {
    start() {
      const headers = [
        ['Content-Type', 'text/event-stream'],
        ['Cache-Control', 'no-cache'],
        ['Connection', 'keep-alive'],
        ['X-Request-Id', requestId],
      ];
      if (sessionKey) headers.push(['X-Session-Id', sessionKey]);
      for (const [name, value] of headers) res.setHeader(name, value);
      res.flushHeaders();
      // message_start is deliberately not sent here; it is triggered from
      // onModelResolved once the resolved model id is known.
    },

    onModelResolved(m) {
      if (m) currentModel = m;
      // Called on every resolution; per the original note the translator
      // guards against starting twice.
      translator.start(currentModel, 0);
    },

    pushTextDelta(text) {
      if (!text) return;
      translator.pushTextDelta(text);
    },

    pushToolUse(tu) {
      translator.pushToolUse(tu);
    },

    finish({ stopReason, usage } = {}) {
      translator.finish({ stopReason, usage });
    },

    error(err) {
      translator.error(err);
    },

    get hasStarted() { return translator.hasStarted; },
  };
}
434
+
435
/**
 * Build the sink that renders an inference run as one Anthropic messages
 * JSON body.
 *
 * Text and tool uses are accumulated in memory and handed to
 * `buildAnthropicResponse` from `finish()`; nothing is written earlier, so
 * `hasStarted` is always false.
 *
 * @param {Object} args
 * @param {Object} args.res - Response object exposing `setHeader`, `status`, `json`.
 * @param {string} args.requestId - Echoed in `X-Request-Id` and the body.
 * @param {string} args.model - Requested model; replaced by `onModelResolved`.
 * @param {string} [args.sessionKey] - When truthy, echoed in `X-Session-Id`.
 * @returns {Object} Sink with `start`, `onModelResolved`, `pushTextDelta`,
 *   `pushToolUse`, `finish`, `error`, and a `hasStarted` getter.
 */
function makeAnthropicJsonSink({ res, requestId, model, sessionKey }) {
  let finalModel = model;
  let accumulated = '';
  const toolUses = [];

  return {
    start() {},

    onModelResolved(m) {
      if (m) finalModel = m;
    },

    pushTextDelta(text) {
      if (text) accumulated += text;
    },

    pushToolUse(tu) {
      toolUses.push(tu);
    },

    finish({ stopReason, usage } = {}) {
      res.setHeader('X-Request-Id', requestId);
      if (sessionKey) res.setHeader('X-Session-Id', sessionKey);

      // Only the two basic counters are forwarded; missing values become 0.
      const tokenUsage = {
        input_tokens: usage?.input_tokens || 0,
        output_tokens: usage?.output_tokens || 0,
      };
      const payload = buildAnthropicResponse({
        rawText: accumulated.trim(),
        toolUses,
        model: finalModel,
        usage: tokenUsage,
        requestId,
        stopReason,
      });
      res.json(payload);
    },

    error(err) {
      const payload = { type: 'error', error: { type: 'api_error', message: err.message } };
      return res.status(500).json(payload);
    },

    get hasStarted() { return false; },
  };
}
473
+
474
+ // ---------------------------------------------------------------------------
475
+ // Consolidated runner
476
+ // ---------------------------------------------------------------------------
477
+
478
/**
 * Run a single inference request. Drives the SDK query() loop, surfaces
 * all output via `surface.createSink`, and writes a capture record at the
 * end of a completed run. This is the only inference entry point — every
 * route (/v1/chat/completions, /v1/messages, /quiet/v1/messages, and any
 * future surface) terminates here.
 *
 * Flow: validate the prompt → build the optional tool bridge → open the sink
 * → run the SDK loop under `runWithAuthRetry` → persist the session id →
 * `sink.finish` → `captureResponse`. On a non-recoverable SDK error the sink
 * reports the error and the function returns without finish/capture.
 *
 * @param {Object} ctx
 * @param {express.Request} ctx.req — accepted for contract parity; not read here
 * @param {express.Response} ctx.res
 * @param {Object} ctx.body — request body (already scrubbed for /quiet/*)
 * @param {string} ctx.requestId
 * @param {string} ctx.sessionKey
 * @param {Object} surface — openaiSurface | anthropicSurface
 * @param {Object} opts
 * @param {'stream'|'json'} opts.mode
 * @param {Object} opts.deps — { getSession, upsertSession, resolveModel }
 * @returns {Promise<*>} the result of `surface.sendInvalidRequest` when the
 *   prompt is rejected, otherwise undefined
 */
export async function runInference({ req, res, body, requestId, sessionKey }, surface, { mode, deps }) {
  const { getSession, upsertSession, resolveModel } = deps;
  const existing = getSession(sessionKey);
  const resuming = !!existing?.sdkSessionId;
  const toolsEnabled = surface.hasTools(body);

  const { promptText, error: promptError } = surface.parsePrompt(body, { resuming });
  if (promptError) {
    return surface.sendInvalidRequest(res, promptError);
  }

  // A surface that reports "no images" as null/undefined is treated as [].
  const images = surface.extractImages(body) ?? [];
  const model = resolveModel(body.model);

  // Tool-bridge setup: convert client tool defs to OpenAI shape (the bridge
  // expects that), build the in-process MCP server, build the system-prompt
  // append guidance. All three are null when tools are disabled.
  const toolsForBridge = toolsEnabled ? surface.toolsForBridge(body) : null;
  const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
  const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
  // Always use the claude_code preset when tools are present — it's not
  // just identity framing, it's also Anthropic's "this is an approved
  // client, bill flat Max" signal. v0.8.6 tried to drop the preset for
  // /quiet/v1/messages and discovered (the hard way) that doing so flips
  // requests into extra-usage billing. Quiet mode now layers scrubbing on
  // TOP of the preset, not instead of it.
  const sdkSystemPrompt = clientToolsServer
    ? { type: 'preset', preset: 'claude_code', append: toolsGuidance }
    : undefined;

  if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
  if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);

  // State accumulated across the SDK loop
  let resolvedModel = model;
  let capturedSessionId = existing?.sdkSessionId || null;
  let inputTokens = 0;
  let outputTokens = 0;
  let cacheReadTokens = 0;
  let cacheCreateTokens = 0;
  let stopReason = 'end_turn';
  let textEmittedSoFar = '';
  let toolUseEmitted = false;
  let clientDisconnected = false;
  let postDisconnectTimer = null;
  // True once the post-disconnect cap fired; that abort must survive retries.
  let disconnectCapHit = false;
  // True once the SDK run has settled (success, handled abort, or error).
  let sdkDone = false;

  // `let`: an attempt that aborted itself gets a fresh controller on retry.
  let abortController = new AbortController();
  const startedAt = Date.now();

  const sink = surface.createSink({ res, requestId, model, sessionKey, toolsEnabled, mode });
  sink.start();

  if (mode === 'stream') {
    // On client disconnect, keep the SDK alive so the in-flight generation
    // (already being billed) finishes and lands in the capture file. Cap
    // at 60s so a flapping client can't burn unbounded tokens.
    res.on('close', () => {
      // 'close' is also emitted after a normal res.end(). Once the SDK run
      // has settled there is nothing left to keep alive or to cap, so a late
      // close must not be reported as a disconnect or arm the abort timer.
      if (sdkDone) return;
      if (clientDisconnected) return;
      clientDisconnected = true;
      if (postDisconnectTimer) return;
      console.log(' [stream] client disconnected — keeping SDK alive to preserve capture (60s cap)');
      postDisconnectTimer = setTimeout(() => {
        console.log(' [stream] post-disconnect 60s cap — aborting SDK');
        disconnectCapHit = true;
        abortController.abort();
      }, 60_000);
      postDisconnectTimer.unref?.();
    });
  }

  if (resuming) {
    console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
  }

  // Push only the part of `combined` that has not been emitted yet. Guards
  // against aggregator-style assistant messages that resend accumulated
  // text; text that does not extend what was emitted is sent whole.
  const emitNewText = (combined) => {
    if (!combined) return;
    const delta = combined.startsWith(textEmittedSoFar)
      ? combined.slice(textEmittedSoFar.length)
      : combined;
    if (!delta) return;
    sink.pushTextDelta(delta);
    textEmittedSoFar += delta;
  };

  const runQuery = async () => {
    // Reset per-attempt state so a 401 retry starts clean. Sinks that
    // already hasStarted are caught by bailIfStarted below.
    resolvedModel = model;
    capturedSessionId = existing?.sdkSessionId || null;
    inputTokens = 0;
    outputTokens = 0;
    cacheReadTokens = 0;
    cacheCreateTokens = 0;
    stopReason = 'end_turn';
    textEmittedSoFar = '';
    toolUseEmitted = false;

    // The auth-failure bail-out aborts the controller before throwing. An
    // already-aborted controller handed to the retry would presumably cancel
    // it straight away, so give the retry a fresh one. If the post-disconnect
    // cap fired instead, keep the aborted controller so the cap still holds.
    if (abortController.signal.aborted && !disconnectCapHit) {
      abortController = new AbortController();
    }

    // Build the prompt lazily on each attempt — multimodal returns a
    // single-use async iterator. Keeps 401 auth-retries safe.
    const prompt = buildQueryPrompt(promptText, images);

    for await (const message of query({
      prompt,
      options: {
        model,
        maxTurns: toolsEnabled ? 5 : 200,
        permissionMode: 'bypassPermissions',
        allowDangerouslySkipPermissions: true,
        abortController,
        ...(clientToolsServer
          ? {
            mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
            allowedTools: [`${MCP_TOOL_PREFIX}*`],
            systemPrompt: sdkSystemPrompt,
          }
          : toolsEnabled
            ? { allowedTools: [] }
            : {}),
        ...(resuming ? { resume: existing.sdkSessionId } : {}),
        ...(sessionKey && !resuming ? { persistSession: true } : {}),
      },
    })) {
      // Note: do NOT break on clientDisconnected — keep consuming so the
      // final result/usage lands in the capture. Sink writes after a
      // disconnect are expected to be dropped by the sink itself.

      if (message.type === 'system' && message.subtype === 'init' && message.model) {
        resolvedModel = message.model;
        sink.onModelResolved(resolvedModel);
      }

      if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
        capturedSessionId = message.session_id;
        console.log(` [session] captured sdk session: ${capturedSessionId}`);
      }

      // Per-turn usage: required because tool_use detection aborts the
      // SDK before its `result` message arrives, leaving the trackers at
      // zero on the abort path. Reading from each assistant turn keeps
      // usage correct even when aborted.
      if (message.type === 'assistant' && message.message?.usage) {
        const turn = extractSdkUsage(message);
        if (turn.input_tokens || turn.output_tokens || turn.cache_read_input_tokens || turn.cache_creation_input_tokens) {
          inputTokens = turn.input_tokens;
          outputTokens = turn.output_tokens;
          cacheReadTokens = turn.cache_read_input_tokens;
          cacheCreateTokens = turn.cache_creation_input_tokens;
        }
      }

      if (message.type === 'assistant' && message.message?.content) {
        const content = message.message.content;
        // Flatten text from this assistant message
        let combined = '';
        if (Array.isArray(content)) {
          for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
        } else if (typeof content === 'string') {
          combined = content;
        }

        // Auth-failure short-circuit: throw so runWithAuthRetry handles
        // it. Only safe before any wire-irreversible writes (otherwise
        // we've already corrupted the stream and can't undo).
        if (combined && isAuthFailureText(combined) && !sink.hasStarted) {
          abortController.abort();
          throw new AuthFailureInResultText(combined);
        }

        // Tool_use detection. Emit any text from this same message
        // *before* the tool_use blocks so block ordering is preserved
        // (Anthropic streams sometimes have text + tool_use in one
        // assistant message).
        if (toolsEnabled && hasToolUse(message)) {
          const calls = extractToolUses(message);
          if (calls.length) {
            emitNewText(combined);
            for (const tu of calls) sink.pushToolUse(tu);
            toolUseEmitted = true;
            stopReason = 'tool_use';
            console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
            abortController.abort();
            break;
          }
        }

        // Plain text-only assistant message: stream the delta.
        emitNewText(combined);
      }

      if (message.type === 'result') {
        if (message.result && !textEmittedSoFar && !toolUseEmitted) {
          // Some SDK paths only deliver text via the final result message
          // (no streaming assistant messages). Emit it as one delta.
          sink.pushTextDelta(message.result);
          textEmittedSoFar = message.result;
        }
        if (isAuthFailureText(message.result || '') && !sink.hasStarted) {
          throw new AuthFailureInResultText(message.result);
        }
        const usage = extractSdkUsage(message);
        inputTokens = usage.input_tokens;
        outputTokens = usage.output_tokens;
        cacheReadTokens = usage.cache_read_input_tokens;
        cacheCreateTokens = usage.cache_creation_input_tokens;
        console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
        if (!toolUseEmitted) stopReason = surface.mapStopReason(message);
        break;
      }
    }
  };

  try {
    await runWithAuthRetry({
      attempt: runQuery,
      bailIfStarted: () => sink.hasStarted,
      onRefreshing: (err) => console.warn(`[auth] 401 on ${surface.logTag} — refreshing (${err.message?.slice(0, 80)})`),
      onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying ${surface.logTag}`),
    });
  } catch (err) {
    // Errors after a client disconnect, and aborts we triggered ourselves on
    // tool_use, are expected: fall through and finish with what we have.
    const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
    if (!clientDisconnected && !(toolsEnabled && isAbort)) {
      console.error(`[${surface.logTag}] SDK error:`, err?.message);
      sink.error(err);
      return;
    }
  } finally {
    // The SDK run is over: the post-disconnect cap has nothing left to abort.
    sdkDone = true;
    if (postDisconnectTimer) clearTimeout(postDisconnectTimer);
  }

  if (sessionKey && capturedSessionId) {
    upsertSession(sessionKey, capturedSessionId, resolvedModel);
  }

  const usage = {
    input_tokens: inputTokens,
    output_tokens: outputTokens,
    cache_read_input_tokens: cacheReadTokens,
    cache_creation_input_tokens: cacheCreateTokens,
  };

  sink.finish({ stopReason, usage });

  captureResponse({
    requestId,
    // Separate copy so the sink and the capture record never share an object.
    usage: { ...usage },
    durationMs: Date.now() - startedAt,
    status: clientDisconnected ? 'client_disconnect' : 'ok',
    stopReason,
    model: resolvedModel,
  });
}