mobygate 0.8.4 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/server.js CHANGED
@@ -70,16 +70,14 @@ import {
70
70
  getCurrentVersion,
71
71
  } from './lib/updater.js';
72
72
  import {
73
- anthropicMessagesToPrompt,
74
73
  collectAnthropicImages,
75
- buildAnthropicResponse,
76
- makeStreamTranslator,
77
74
  hasAnthropicTools,
78
- mapStopReason,
79
- extractSdkUsage,
80
75
  } from './lib/anthropic.js';
76
+ import { hasTools, collectImages } from './lib/openai-translation.js';
77
+ import { runInference, openaiSurface, anthropicSurface } from './lib/inference-runner.js';
81
78
  import { resolveSessionKey } from './lib/session-derive.js';
82
79
  import { captureRequest, captureResponse, isCaptureEnabled, CAPTURE_DIR_PATH } from './lib/request-capture.js';
80
+ import { scrubAnthropicBody, quietDiagnose } from './lib/quiet.js';
83
81
 
84
82
  const __filename = fileURLToPath(import.meta.url);
85
83
  const __dirname = dirname(__filename);
@@ -90,7 +88,32 @@ const PORT = parseInt(process.env.PORT || '3456', 10);
90
88
  // interface) in ~/.mobygate/config.yaml, but should add auth in front of it.
91
89
  const BIND = process.env.BIND || '127.0.0.1';
92
90
  const DEFAULT_MODEL = process.env.DEFAULT_MODEL || 'claude-opus-4-7[1m]';
93
- const SESSION_TTL_MS = parseInt(process.env.SESSION_TTL_MS || String(60 * 60 * 1000), 10); // 1 hour default
91
+ // SESSION_TTL_MS: how long mobygate holds onto an idle SDK session before
92
+ // expiring it from its in-memory + on-disk session store. v0.8.5 raises
93
+ // the default from 1h → 4h based on real-world usage data: most multi-
94
+ // channel users (Discord agents serving 20+ channels) revisit channels
95
+ // every few hours, and a 1h TTL forced a fresh `query()` (full prompt
96
+ // re-send) every time. With 4h, mobygate retains the SDK session ID for
97
+ // up to four idle hours, so the next request resumes via session-id rather than
98
+ // reissuing the entire prompt.
99
+ //
100
+ // Caveat — this only solves SDK-side session continuity. Anthropic's
101
+ // wire-side prompt cache (5 min default, 1h with the
102
+ // `extended-cache-ttl-2025-04-11` beta) is unaffected; the SDK doesn't
103
+ // currently expose that beta to callers, so cache-creation tax on idle
104
+ // channels still applies. The TTL bump is a partial mitigation, not a
105
+ // fix.
106
+ //
107
+ // Override: SESSION_TTL_MS=14400000 (env, in milliseconds)
108
+ // or MOBY_SESSION_TTL_HOURS=4 (more readable, also accepted)
109
+ const SESSION_TTL_MS = (() => {
110
+ if (process.env.SESSION_TTL_MS) return parseInt(process.env.SESSION_TTL_MS, 10);
111
+ if (process.env.MOBY_SESSION_TTL_HOURS) {
112
+ const h = parseFloat(process.env.MOBY_SESSION_TTL_HOURS);
113
+ if (h > 0) return Math.round(h * 60 * 60 * 1000);
114
+ }
115
+ return 4 * 60 * 60 * 1000; // 4h default (was 1h pre-v0.8.5)
116
+ })();
94
117
 
95
118
  // ---------------------------------------------------------------------------
96
119
  // Session store — maps client keys → SDK session IDs (persisted to disk)
@@ -212,1101 +235,6 @@ function resolveModel(model) {
212
235
  return MODEL_MAP[stripped] || MODEL_MAP[model] || DEFAULT_MODEL;
213
236
  }
214
237
 
215
- // ---------------------------------------------------------------------------
216
- // OpenAI messages → single prompt string
217
- // ---------------------------------------------------------------------------
218
-
219
- function extractContent(content) {
220
- if (typeof content === 'string') return content;
221
- if (Array.isArray(content)) {
222
- return content
223
- .map((part) => {
224
- if (typeof part === 'string') return part;
225
- if (part.type === 'text') return part.text;
226
- if (part.type === 'image_url') return ''; // images carried separately; drop from text
227
- return JSON.stringify(part);
228
- })
229
- .filter(Boolean)
230
- .join('\n');
231
- }
232
- if (content && typeof content === 'object') return JSON.stringify(content);
233
- return String(content || '');
234
- }
235
-
236
- // Convert an OpenAI message.content array into Anthropic image content blocks.
237
- // Supports both data: URLs (base64) and remote https URLs.
238
- function extractImageBlocks(content) {
239
- if (!Array.isArray(content)) return [];
240
- const blocks = [];
241
- for (const part of content) {
242
- if (!part || part.type !== 'image_url') continue;
243
- const url = typeof part.image_url === 'string' ? part.image_url : part.image_url?.url;
244
- if (!url) continue;
245
- const dataMatch = /^data:([^;]+);base64,(.+)$/.exec(url);
246
- if (dataMatch) {
247
- blocks.push({ type: 'image', source: { type: 'base64', media_type: dataMatch[1], data: dataMatch[2] } });
248
- } else {
249
- blocks.push({ type: 'image', source: { type: 'url', url } });
250
- }
251
- }
252
- return blocks;
253
- }
254
-
255
- // Collect images from the LAST user message (OpenAI only attaches images to the latest turn).
256
- function collectImages(messages) {
257
- for (let i = messages.length - 1; i >= 0; i--) {
258
- if (messages[i].role === 'user') return extractImageBlocks(messages[i].content);
259
- }
260
- return [];
261
- }
262
-
263
- // ---------------------------------------------------------------------------
264
- // Tool calling (Phase 1: native MCP tools — no more <tool_call> text hack)
265
- // ---------------------------------------------------------------------------
266
- // Client-provided OpenAI tools are registered with the SDK as in-process MCP
267
- // tools (see lib/tool-bridge.js). The model emits **native** tool_use content
268
- // blocks in its assistant messages; we abort the SDK on the first one and
269
- // return OpenAI tool_calls to the client. When the client replies with tool
270
- // results, we splice them back into the next prompt as <tool_results> text
270
- // (see lib/tool-bridge.js#toolMessagesToText) rather than native tool_result blocks.
272
-
273
- function hasTools(body) {
274
- return Array.isArray(body?.tools) && body.tools.length > 0;
275
- }
276
-
277
- /**
278
- * Build the prompt text from the OpenAI messages array.
279
- *
280
- * Returns `{ promptText, error? }` — promptText is a single string ready for the SDK; `error` is set only when a resume request has no usable tail. Tool
281
- * results are spliced in as <tool_results> XML when present (see
282
- * lib/tool-bridge.js#toolMessagesToText for why we don't use native
283
- * tool_result content blocks yet).
284
- *
285
- * Resuming vs fresh:
286
- * - Resuming: SDK has full history. We only send the new tail —
287
- * trailing tool results plus the most recent user text, if any.
288
- * - Fresh: SDK starts cold. We serialize the visible history with
289
- * <system>/<previous_response>/<tool_results> tags. No tool-
290
- * instruction injection — the SDK MCP registration handles that.
291
- */
292
- function messagesToPrompt(messages, { resuming = false } = {}) {
293
- if (resuming) {
294
- // Walk backwards from the end, collecting trailing tool messages and
295
- // the most recent user text. Tool results are formatted as a text
296
- // block (see lib/tool-bridge.js#toolMessagesToText for the rationale).
297
- const trailingToolMessages = [];
298
- let userText = '';
299
- for (let i = messages.length - 1; i >= 0; i--) {
300
- const msg = messages[i];
301
- if (msg.role === 'tool') {
302
- trailingToolMessages.unshift(msg);
303
- } else if (msg.role === 'user') {
304
- userText = extractContent(msg.content);
305
- break;
306
- } else {
307
- break;
308
- }
309
- }
310
- const toolResultsText = toolMessagesToText(trailingToolMessages);
311
- if (!userText && !toolResultsText) {
312
- // Earlier code fell back to extracting whatever was at messages[-1],
313
- // which on an assistant-terminated history sent the assistant's own
314
- // previous reply back to the SDK as the new user prompt — and the
315
- // model would "respond to its own reply." Catch this clearly instead.
316
- return {
317
- promptText: '',
318
- error: 'Resume mode requires the request to end with a user message or tool result. Last message has role "' + (messages[messages.length - 1]?.role || 'unknown') + '".',
319
- };
320
- }
321
- const parts = [];
322
- if (toolResultsText) parts.push(toolResultsText);
323
- if (userText) parts.push(userText);
324
- return { promptText: parts.join('\n\n') };
325
- }
326
-
327
- // Fresh request: serialize visible history as XML-wrapped text. No
328
- // tool-instruction injection (the model learns about tools via the SDK
329
- // MCP registration, not the prompt).
330
- const parts = [];
331
- for (const msg of messages) {
332
- switch (msg.role) {
333
- case 'system':
334
- parts.push(`<system>\n${extractContent(msg.content)}\n</system>\n`);
335
- break;
336
- case 'user':
337
- parts.push(extractContent(msg.content));
338
- break;
339
- case 'assistant': {
340
- // Best-effort replay. tool_calls in non-resume history are dropped;
341
- // the model can usually infer continuity from the surrounding text.
342
- const text = extractContent(msg.content);
343
- if (text) parts.push(`<previous_response>\n${text}\n</previous_response>\n`);
344
- break;
345
- }
346
- case 'tool': {
347
- // Tool messages on a fresh turn (rare — clients normally use
348
- // session keys). Splice as text since there's no preceding
349
- // tool_use turn we can bind to natively.
350
- const text = toolMessagesToText([msg]);
351
- if (text) parts.push(text);
352
- break;
353
- }
354
- }
355
- }
356
- return {
357
- promptText: parts.join('\n').trim(),
358
- };
359
- }
360
-
361
- /**
362
- * Wrap promptText + optional image blocks into the form query() expects.
363
- * Returns a string for the fast path (text-only, no images), or an
364
- * async iterable yielding one SDKUserMessage with multi-part content
365
- * when there are images.
366
- */
367
- function buildQueryPrompt(promptText, imageBlocks) {
368
- if (!imageBlocks.length) return promptText;
369
- const content = [
370
- { type: 'text', text: promptText || '' },
371
- ...imageBlocks,
372
- ];
373
- async function* gen() {
374
- yield {
375
- type: 'user',
376
- message: { role: 'user', content },
377
- parent_tool_use_id: null,
378
- };
379
- }
380
- return gen();
381
- }
382
-
383
- // ---------------------------------------------------------------------------
384
- // Normalize model name for OpenAI response format
385
- // ---------------------------------------------------------------------------
386
-
387
- function normalizeModelName(model) {
388
- if (model?.includes('opus')) return 'claude-opus-4';
389
- if (model?.includes('sonnet')) return 'claude-sonnet-4';
390
- if (model?.includes('haiku')) return 'claude-haiku-4';
391
- return model || 'claude-sonnet-4';
392
- }
393
-
394
- // ---------------------------------------------------------------------------
395
- // SSE helpers
396
- // ---------------------------------------------------------------------------
397
-
398
- function makeChunk(requestId, model, content, role, finishReason) {
399
- return {
400
- id: `chatcmpl-${requestId}`,
401
- object: 'chat.completion.chunk',
402
- created: Math.floor(Date.now() / 1000),
403
- model: normalizeModelName(model),
404
- choices: [{
405
- index: 0,
406
- delta: {
407
- ...(role ? { role } : {}),
408
- ...(content !== undefined ? { content } : {}),
409
- },
410
- finish_reason: finishReason || null,
411
- }],
412
- };
413
- }
414
-
415
- function sendSSE(res, data) {
416
- if (!res.writableEnded) {
417
- res.write(`data: ${JSON.stringify(data)}\n\n`);
418
- }
419
- }
420
-
421
- // ---------------------------------------------------------------------------
422
- // POST /v1/chat/completions — streaming
423
- // ---------------------------------------------------------------------------
424
-
425
- async function handleStreaming(req, res, body, requestId, sessionKey) {
426
- const existing = getSession(sessionKey);
427
- const resuming = !!existing?.sdkSessionId;
428
- const toolsEnabled = hasTools(body);
429
- const { promptText, error: promptError } = messagesToPrompt(body.messages, { resuming });
430
- if (promptError) {
431
- return res.status(400).json({
432
- error: { message: promptError, type: 'invalid_request_error', code: 'invalid_resume_messages' },
433
- });
434
- }
435
- const images = collectImages(body.messages);
436
- // NOTE: `prompt` is built inside runQuery (not here) when images are
437
- // present, because buildQueryPrompt returns a single-use async iterator
438
- // for multimodal requests. If we built it here and the SDK call hit a
439
- // 401, runWithAuthRetry would invoke runQuery a second time with the
440
- // same exhausted iterator → SDK gets an empty user message → silent
441
- // empty response. Lazy construction inside runQuery rebuilds the
442
- // iterator per attempt.
443
- const model = resolveModel(body.model);
444
- // Build the in-process MCP server exposing client tools to the SDK.
445
- // null when toolsEnabled is false (or all tools are malformed).
446
- const clientToolsServer = toolsEnabled ? buildClientToolsServer(body.tools) : null;
447
- // System-prompt append: tells the model exactly which tools are
448
- // available and that Claude Code's built-ins (Bash, Grep, Read, etc.)
449
- // are NOT in this environment. Without this, the model trained-in
450
- // priors lead it to call Grep/Bash, get blocked by allowedTools, and
451
- // refuse the task instead of falling back to client tools. ~150 tokens.
452
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(body.tools) : null;
453
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
454
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
455
-
456
- res.setHeader('Content-Type', 'text/event-stream');
457
- res.setHeader('Cache-Control', 'no-cache');
458
- res.setHeader('Connection', 'keep-alive');
459
- res.setHeader('X-Request-Id', requestId);
460
- if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
461
- res.flushHeaders();
462
- res.write(':ok\n\n');
463
-
464
- const abortController = new AbortController();
465
- let isFirst = true;
466
- let resolvedModel = model;
467
- let capturedSessionId = existing?.sdkSessionId || null;
468
- let clientDisconnected = false;
469
- let inputTokens = 0;
470
- let outputTokens = 0;
471
- let cacheReadTokens = 0;
472
- let cacheCreateTokens = 0;
473
-
474
- res.on('close', () => {
475
- clientDisconnected = true;
476
- abortController.abort();
477
- });
478
-
479
- if (resuming) {
480
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
481
- }
482
-
483
- // Tools-mode buffers text and collects native tool_use blocks. If the
484
- // model emits text first then a tool_use, we want both: textBefore as
485
- // the assistant content, plus the tool_calls. (Most clients display the
486
- // text and then act on the tool_calls.)
487
- let bufferedText = '';
488
- let collectedToolCalls = []; // [{id, name, arguments}] from extractToolUses()
489
-
490
- const runQuery = async () => {
491
- // Reset per-attempt state so a 401 retry starts clean
492
- bufferedText = '';
493
- collectedToolCalls = [];
494
- isFirst = true;
495
- resolvedModel = model;
496
- capturedSessionId = existing?.sdkSessionId || null;
497
-
498
- // Build the prompt lazily on each attempt — multimodal returns a
499
- // single-use async iterator. Keeps 401 auth-retries safe.
500
- const prompt = buildQueryPrompt(promptText, images);
501
- for await (const message of query({
502
- prompt,
503
- options: {
504
- model,
505
- maxTurns: toolsEnabled ? 5 : 200,
506
- permissionMode: 'bypassPermissions',
507
- allowDangerouslySkipPermissions: true,
508
- abortController,
509
- // Tools-mode: register client tools as an in-process MCP server
510
- // and allow only those (no Bash/Read/etc. — the SDK's built-ins
511
- // would pollute the session and leak through to the model).
512
- ...(clientToolsServer
513
- ? {
514
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
515
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
516
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
517
- }
518
- : toolsEnabled
519
- // Tools were requested but none were valid — disable all tools.
520
- ? { allowedTools: [] }
521
- : {}),
522
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
523
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
524
- },
525
- })) {
526
- if (clientDisconnected) break;
527
-
528
- const msgPreview = message.type === 'assistant'
529
- ? `content_keys=${Object.keys(message).join(',')}`
530
- : message.type === 'result'
531
- ? `result=${(message.result || '').slice(0, 60)}`
532
- : message.subtype || '';
533
- console.log(` [msg] type=${message.type} ${msgPreview}`);
534
-
535
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
536
- resolvedModel = message.model;
537
- }
538
-
539
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
540
- capturedSessionId = message.session_id;
541
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
542
- }
543
-
544
- // Extract text from this assistant message
545
- let turnText = '';
546
- if (message.type === 'assistant' && message.message?.content) {
547
- const content = message.message.content;
548
- if (Array.isArray(content)) {
549
- for (const b of content) if (b.type === 'text' && b.text) turnText += b.text;
550
- } else if (typeof content === 'string') {
551
- turnText = content;
552
- }
553
- }
554
-
555
- // Detect auth failure surfaced inline (common on long-running proxies
556
- // where the SDK's cached creds expire). Throw so runWithAuthRetry
557
- // treats it like a real 401 exception.
558
- if (turnText && isAuthFailureText(turnText) && isFirst) {
559
- abortController.abort();
560
- throw new AuthFailureInResultText(turnText);
561
- }
562
-
563
- // Tools-mode: check for native tool_use content blocks. The moment
564
- // we see one, abort the SDK — we don't want our stub handler to
565
- // hang waiting on an execution that's actually happening client-side.
566
- if (toolsEnabled && message.type === 'assistant' && hasToolUse(message)) {
567
- const calls = extractToolUses(message);
568
- if (calls.length) {
569
- collectedToolCalls.push(...calls);
570
- if (turnText) bufferedText += turnText;
571
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
572
- abortController.abort();
573
- break;
574
- }
575
- }
576
-
577
- if (turnText) {
578
- if (toolsEnabled) {
579
- // Buffer text in case it precedes a tool_use, or ends up as the
580
- // final response when the model decides not to call any tools.
581
- bufferedText += turnText;
582
- } else {
583
- sendSSE(res, makeChunk(requestId, resolvedModel, turnText, isFirst ? 'assistant' : undefined, null));
584
- isFirst = false;
585
- }
586
- }
587
-
588
- if (message.type === 'result') {
589
- if (message.result && isAuthFailureText(message.result) && isFirst) {
590
- throw new AuthFailureInResultText(message.result);
591
- }
592
- if (!toolsEnabled && message.result && isFirst) {
593
- sendSSE(res, makeChunk(requestId, resolvedModel, message.result, 'assistant', null));
594
- isFirst = false;
595
- }
596
- if (toolsEnabled && !bufferedText && message.result) bufferedText = message.result;
597
- const usage = extractSdkUsage(message);
598
- inputTokens = usage.input_tokens;
599
- outputTokens = usage.output_tokens;
600
- cacheReadTokens = usage.cache_read_input_tokens;
601
- cacheCreateTokens = usage.cache_creation_input_tokens;
602
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
603
- break;
604
- }
605
- }
606
- };
607
-
608
- try {
609
- await runWithAuthRetry({
610
- attempt: runQuery,
611
- // Only retry if we haven't written a real chunk yet. In tools mode we
612
- // buffer internally so any retry is safe regardless.
613
- bailIfStarted: () => !toolsEnabled && !isFirst,
614
- onRefreshing: (err) => console.warn(`[auth] 401 on stream — refreshing (${err.message?.slice(0, 80)})`),
615
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying stream`),
616
- });
617
- } catch (err) {
618
- // Abort from tool-call detection surfaces as an abort error — not a real failure
619
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
620
- if (!clientDisconnected && !(toolsEnabled && isAbort)) {
621
- console.error('[stream] SDK error:', err.message);
622
- sendSSE(res, { error: { message: err.message, type: 'server_error', code: null } });
623
- }
624
- }
625
-
626
- if (sessionKey && capturedSessionId) {
627
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
628
- }
629
-
630
- // Tools mode: emit the buffered response as a single chunk with either
631
- // tool_calls (+ finish_reason: tool_calls) or plain text (+ stop).
632
- if (toolsEnabled && !res.writableEnded) {
633
- if (collectedToolCalls.length > 0) {
634
- console.log(` [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
635
- const chunk = {
636
- id: `chatcmpl-${requestId}`,
637
- object: 'chat.completion.chunk',
638
- created: Math.floor(Date.now() / 1000),
639
- model: normalizeModelName(resolvedModel),
640
- choices: [{
641
- index: 0,
642
- delta: {
643
- role: 'assistant',
644
- content: bufferedText.trim() || null,
645
- tool_calls: collectedToolCalls.map((tc, i) => ({
646
- index: i,
647
- id: tc.id,
648
- type: 'function',
649
- function: { name: tc.name, arguments: tc.arguments },
650
- })),
651
- },
652
- finish_reason: 'tool_calls',
653
- }],
654
- };
655
- sendSSE(res, chunk);
656
- } else {
657
- sendSSE(res, makeChunk(requestId, resolvedModel, bufferedText, 'assistant', null));
658
- sendSSE(res, makeChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
659
- }
660
- res.write('data: [DONE]\n\n');
661
- res.end();
662
- captureResponse({
663
- requestId,
664
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
665
- status: 'ok',
666
- stopReason: collectedToolCalls.length > 0 ? 'tool_use' : 'end_turn',
667
- model: resolvedModel,
668
- });
669
- return;
670
- }
671
-
672
- if (!res.writableEnded) {
673
- sendSSE(res, makeChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
674
- res.write('data: [DONE]\n\n');
675
- res.end();
676
- }
677
-
678
- captureResponse({
679
- requestId,
680
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
681
- status: clientDisconnected ? 'client_disconnect' : 'ok',
682
- stopReason: 'end_turn',
683
- model: resolvedModel,
684
- });
685
- }
686
-
687
- // ---------------------------------------------------------------------------
688
- // POST /v1/chat/completions — non-streaming
689
- // ---------------------------------------------------------------------------
690
-
691
- async function handleNonStreaming(res, body, requestId, sessionKey) {
692
- const existing = getSession(sessionKey);
693
- const resuming = !!existing?.sdkSessionId;
694
- const toolsEnabled = hasTools(body);
695
- const { promptText, error: promptError } = messagesToPrompt(body.messages, { resuming });
696
- if (promptError) {
697
- return res.status(400).json({
698
- error: { message: promptError, type: 'invalid_request_error', code: 'invalid_resume_messages' },
699
- });
700
- }
701
- const images = collectImages(body.messages);
702
- // NOTE: `prompt` is built inside runQuery (not here) when images are
703
- // present, because buildQueryPrompt returns a single-use async iterator
704
- // for multimodal requests. If we built it here and the SDK call hit a
705
- // 401, runWithAuthRetry would invoke runQuery a second time with the
706
- // same exhausted iterator → SDK gets an empty user message → silent
707
- // empty response. Lazy construction inside runQuery rebuilds the
708
- // iterator per attempt.
709
- const model = resolveModel(body.model);
710
- const clientToolsServer = toolsEnabled ? buildClientToolsServer(body.tools) : null;
711
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(body.tools) : null;
712
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
713
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
714
-
715
- let resultText = '';
716
- let collectedToolCalls = [];
717
- let resolvedModel = model;
718
- let inputTokens = 0;
719
- let outputTokens = 0;
720
- let cacheReadTokens = 0;
721
- let cacheCreateTokens = 0;
722
- let stopReason = 'end_turn';
723
- let capturedSessionId = existing?.sdkSessionId || null;
724
- const abortController = new AbortController();
725
-
726
- if (resuming) {
727
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
728
- }
729
-
730
- const runQuery = async () => {
731
- // Reset per-attempt state so a 401 retry starts clean
732
- resultText = '';
733
- collectedToolCalls = [];
734
- resolvedModel = model;
735
- inputTokens = 0;
736
- outputTokens = 0;
737
- capturedSessionId = existing?.sdkSessionId || null;
738
-
739
- // Build the prompt lazily on each attempt — multimodal returns a
740
- // single-use async iterator. Keeps 401 auth-retries safe.
741
- const prompt = buildQueryPrompt(promptText, images);
742
- for await (const message of query({
743
- prompt,
744
- options: {
745
- model,
746
- maxTurns: toolsEnabled ? 5 : 200,
747
- permissionMode: 'bypassPermissions',
748
- allowDangerouslySkipPermissions: true,
749
- abortController,
750
- ...(clientToolsServer
751
- ? {
752
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
753
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
754
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
755
- }
756
- : toolsEnabled
757
- ? { allowedTools: [] }
758
- : {}),
759
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
760
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
761
- },
762
- })) {
763
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
764
- resolvedModel = message.model;
765
- }
766
-
767
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
768
- capturedSessionId = message.session_id;
769
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
770
- }
771
-
772
- if (message.type === 'assistant' && message.message?.content) {
773
- const content = message.message.content;
774
- if (Array.isArray(content)) {
775
- for (const block of content) {
776
- if (block.type === 'text') resultText += block.text || '';
777
- }
778
- } else if (typeof content === 'string') {
779
- resultText += content;
780
- }
781
- // Detect auth failure surfaced inline (long-running proxy, cached creds)
782
- if (isAuthFailureText(resultText)) {
783
- abortController.abort();
784
- throw new AuthFailureInResultText(resultText);
785
- }
786
- // Native tool_use detection — abort the moment a tool_use lands.
787
- if (toolsEnabled && hasToolUse(message)) {
788
- const calls = extractToolUses(message);
789
- if (calls.length) {
790
- collectedToolCalls.push(...calls);
791
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
792
- abortController.abort();
793
- break;
794
- }
795
- }
796
- }
797
-
798
- if (message.type === 'result') {
799
- if (message.result && !resultText) resultText = message.result;
800
- if (isAuthFailureText(resultText)) {
801
- throw new AuthFailureInResultText(resultText);
802
- }
803
- const usage = extractSdkUsage(message);
804
- inputTokens = usage.input_tokens;
805
- outputTokens = usage.output_tokens;
806
- cacheReadTokens = usage.cache_read_input_tokens;
807
- cacheCreateTokens = usage.cache_creation_input_tokens;
808
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
809
- if (message.subtype) stopReason = message.subtype;
810
- break;
811
- }
812
- }
813
- };
814
-
815
- try {
816
- await runWithAuthRetry({
817
- attempt: runQuery,
818
- // Non-streaming never writes to res until the end — retry is always safe
819
- bailIfStarted: () => false,
820
- onRefreshing: (err) => console.warn(`[auth] 401 on sync call — refreshing (${err.message?.slice(0, 80)})`),
821
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying sync call`),
822
- });
823
- } catch (err) {
824
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
825
- if (!(toolsEnabled && isAbort)) {
826
- console.error('[non-stream] SDK error:', err.message);
827
- return res.status(500).json({ error: { message: err.message, type: 'server_error', code: null } });
828
- }
829
- }
830
-
831
- if (sessionKey && capturedSessionId) {
832
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
833
- }
834
-
835
- const responseHeaders = {};
836
- if (sessionKey) responseHeaders['X-Session-Id'] = sessionKey;
837
-
838
- // Tool-calling response shape
839
- if (toolsEnabled && collectedToolCalls.length > 0) {
840
- console.log(` [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
841
- return res.set(responseHeaders).json({
842
- id: `chatcmpl-${requestId}`,
843
- object: 'chat.completion',
844
- created: Math.floor(Date.now() / 1000),
845
- model: normalizeModelName(resolvedModel),
846
- choices: [{
847
- index: 0,
848
- message: {
849
- role: 'assistant',
850
- content: resultText.trim() || null,
851
- tool_calls: collectedToolCalls.map((tc) => ({
852
- id: tc.id,
853
- type: 'function',
854
- function: { name: tc.name, arguments: tc.arguments },
855
- })),
856
- },
857
- finish_reason: 'tool_calls',
858
- }],
859
- usage: { prompt_tokens: inputTokens, completion_tokens: outputTokens, total_tokens: inputTokens + outputTokens },
860
- });
861
- // No tool_use blocks → fall through to normal text response
862
- }
863
-
864
- res.set(responseHeaders).json({
865
- id: `chatcmpl-${requestId}`,
866
- object: 'chat.completion',
867
- created: Math.floor(Date.now() / 1000),
868
- model: normalizeModelName(resolvedModel),
869
- choices: [{
870
- index: 0,
871
- message: { role: 'assistant', content: resultText },
872
- finish_reason: 'stop',
873
- }],
874
- usage: { prompt_tokens: inputTokens, completion_tokens: outputTokens, total_tokens: inputTokens + outputTokens },
875
- });
876
-
877
- captureResponse({
878
- requestId,
879
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
880
- status: 'ok',
881
- stopReason,
882
- model: resolvedModel,
883
- });
884
- }
885
-
886
- // ---------------------------------------------------------------------------
887
- // POST /v1/messages — Anthropic-native surface (non-streaming + streaming)
888
- // ---------------------------------------------------------------------------
889
- // The dual-surface architecture: Hermes uses /v1/chat/completions
890
- // (OpenAI shape), OpenClaw uses /v1/messages (Anthropic shape). Both
891
- // translate to the SAME underlying SDK query() — the surfaces are pure
892
- // translators over a single inference engine.
893
- //
894
- // Tool calling: reuses Phase 1's native MCP path from lib/tool-bridge.js.
895
- // No prompt-injected tool definitions, no <tool_call> text parsing.
896
- // Inbound tool_results still spliced as text on resume (see anthropic.js
897
- // docstring for why — Phase 1 limitation, not lifted here).
898
-
899
- async function handleAnthropicNonStreaming(res, body, requestId, sessionKey) {
900
- const existing = getSession(sessionKey);
901
- const resuming = !!existing?.sdkSessionId;
902
- const toolsEnabled = hasAnthropicTools(body);
903
- const { promptText, error: promptError } = anthropicMessagesToPrompt(body, { resuming });
904
- if (promptError) {
905
- return res.status(400).json({
906
- type: 'error',
907
- error: { type: 'invalid_request_error', message: promptError },
908
- });
909
- }
910
- const images = collectAnthropicImages(body.messages || []);
911
- // See note in handleStreaming — `prompt` is built lazily inside runQuery
912
- // because the multimodal path returns a single-use async iterator that
913
- // a 401-retry would exhaust on the first attempt.
914
- const model = resolveModel(body.model);
915
- // Translate Anthropic tool defs → OpenAI shape that buildClientToolsServer
916
- // expects. Both go through the same JSON-Schema → Zod path on the way to
917
- // MCP; the wrapper shape difference is just `function:{name, parameters}`
918
- // vs `{name, input_schema}`.
919
- const toolsForBridge = toolsEnabled
920
- ? body.tools.map((t) => ({
921
- type: 'function',
922
- function: { name: t.name, description: t.description || '', parameters: t.input_schema || {} },
923
- }))
924
- : null;
925
- const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
926
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
927
-
928
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
929
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
930
-
931
- let resultText = '';
932
- let collectedToolCalls = [];
933
- let resolvedModel = model;
934
- let inputTokens = 0;
935
- let outputTokens = 0;
936
- let cacheReadTokens = 0;
937
- let cacheCreateTokens = 0;
938
- let capturedSessionId = existing?.sdkSessionId || null;
939
- let stopReason = 'end_turn';
940
- const abortController = new AbortController();
941
-
942
- if (resuming) {
943
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
944
- }
945
-
946
- const runQuery = async () => {
947
- resultText = '';
948
- collectedToolCalls = [];
949
- resolvedModel = model;
950
- inputTokens = 0;
951
- outputTokens = 0;
952
- capturedSessionId = existing?.sdkSessionId || null;
953
- stopReason = 'end_turn';
954
-
955
- // Build the prompt lazily on each attempt — multimodal returns a
956
- // single-use async iterator. Keeps 401 auth-retries safe.
957
- const prompt = buildQueryPrompt(promptText, images);
958
- for await (const message of query({
959
- prompt,
960
- options: {
961
- model,
962
- maxTurns: toolsEnabled ? 5 : 200,
963
- permissionMode: 'bypassPermissions',
964
- allowDangerouslySkipPermissions: true,
965
- abortController,
966
- ...(clientToolsServer
967
- ? {
968
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
969
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
970
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
971
- }
972
- : toolsEnabled
973
- ? { allowedTools: [] }
974
- : {}),
975
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
976
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
977
- },
978
- })) {
979
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
980
- resolvedModel = message.model;
981
- }
982
-
983
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
984
- capturedSessionId = message.session_id;
985
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
986
- }
987
-
988
- if (message.type === 'assistant' && message.message?.content) {
989
- const content = message.message.content;
990
- if (Array.isArray(content)) {
991
- for (const block of content) {
992
- if (block.type === 'text') resultText += block.text || '';
993
- }
994
- } else if (typeof content === 'string') {
995
- resultText += content;
996
- }
997
- if (isAuthFailureText(resultText)) {
998
- abortController.abort();
999
- throw new AuthFailureInResultText(resultText);
1000
- }
1001
- if (toolsEnabled && hasToolUse(message)) {
1002
- const calls = extractToolUses(message);
1003
- if (calls.length) {
1004
- collectedToolCalls.push(...calls);
1005
- stopReason = 'tool_use';
1006
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
1007
- abortController.abort();
1008
- break;
1009
- }
1010
- }
1011
- }
1012
-
1013
- if (message.type === 'result') {
1014
- if (message.result && !resultText) resultText = message.result;
1015
- if (isAuthFailureText(resultText)) {
1016
- throw new AuthFailureInResultText(resultText);
1017
- }
1018
- const usage = extractSdkUsage(message);
1019
- inputTokens = usage.input_tokens;
1020
- outputTokens = usage.output_tokens;
1021
- cacheReadTokens = usage.cache_read_input_tokens;
1022
- cacheCreateTokens = usage.cache_creation_input_tokens;
1023
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
1024
- stopReason = mapStopReason(message);
1025
- break;
1026
- }
1027
- }
1028
- };
1029
-
1030
- try {
1031
- await runWithAuthRetry({
1032
- attempt: runQuery,
1033
- bailIfStarted: () => false,
1034
- onRefreshing: (err) => console.warn(`[auth] 401 on /v1/messages — refreshing (${err.message?.slice(0, 80)})`),
1035
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying /v1/messages`),
1036
- });
1037
- } catch (err) {
1038
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
1039
- if (!(toolsEnabled && isAbort)) {
1040
- console.error('[/v1/messages] SDK error:', err.message);
1041
- return res.status(500).json({
1042
- type: 'error',
1043
- error: { type: 'api_error', message: err.message },
1044
- });
1045
- }
1046
- }
1047
-
1048
- if (sessionKey && capturedSessionId) {
1049
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
1050
- }
1051
-
1052
- if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
1053
-
1054
- res.json(buildAnthropicResponse({
1055
- rawText: resultText.trim(),
1056
- toolUses: collectedToolCalls,
1057
- model: resolvedModel,
1058
- usage: { input_tokens: inputTokens, output_tokens: outputTokens },
1059
- requestId,
1060
- stopReason,
1061
- }));
1062
-
1063
- captureResponse({
1064
- requestId,
1065
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
1066
- status: 'ok',
1067
- stopReason,
1068
- model: resolvedModel,
1069
- });
1070
- }
1071
-
1072
- async function handleAnthropicStreaming(req, res, body, requestId, sessionKey) {
1073
- const existing = getSession(sessionKey);
1074
- const resuming = !!existing?.sdkSessionId;
1075
- const toolsEnabled = hasAnthropicTools(body);
1076
- const { promptText, error: promptError } = anthropicMessagesToPrompt(body, { resuming });
1077
- if (promptError) {
1078
- return res.status(400).json({
1079
- type: 'error',
1080
- error: { type: 'invalid_request_error', message: promptError },
1081
- });
1082
- }
1083
- const images = collectAnthropicImages(body.messages || []);
1084
- // See note in handleStreaming — `prompt` is built lazily inside runQuery
1085
- // because the multimodal path returns a single-use async iterator that
1086
- // a 401-retry would exhaust on the first attempt.
1087
- const model = resolveModel(body.model);
1088
- const toolsForBridge = toolsEnabled
1089
- ? body.tools.map((t) => ({
1090
- type: 'function',
1091
- function: { name: t.name, description: t.description || '', parameters: t.input_schema || {} },
1092
- }))
1093
- : null;
1094
- const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
1095
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
1096
-
1097
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
1098
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
1099
-
1100
- res.setHeader('Content-Type', 'text/event-stream');
1101
- res.setHeader('Cache-Control', 'no-cache');
1102
- res.setHeader('Connection', 'keep-alive');
1103
- res.setHeader('X-Request-Id', requestId);
1104
- if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
1105
- res.flushHeaders();
1106
-
1107
- const tx = makeStreamTranslator({ res, requestId, model });
1108
- const abortController = new AbortController();
1109
- let resolvedModel = model;
1110
- let capturedSessionId = existing?.sdkSessionId || null;
1111
- let inputTokens = 0;
1112
- let outputTokens = 0;
1113
- let cacheReadTokens = 0;
1114
- let cacheCreateTokens = 0;
1115
- let stopReason = 'end_turn';
1116
- let clientDisconnected = false;
1117
- let textEmittedSoFar = ''; // dedup against same-message reflow from SDK
1118
- let toolUseEmitted = false;
1119
-
1120
- res.on('close', () => {
1121
- clientDisconnected = true;
1122
- abortController.abort();
1123
- });
1124
-
1125
- if (resuming) {
1126
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
1127
- }
1128
-
1129
- const runQuery = async () => {
1130
- // Reset per-attempt state in case of 401-retry. Note: tx is reused
1131
- // across retries, so a successful retry that comes after we already
1132
- // emitted message_start would surface as a confused stream. We bail
1133
- // out of retry once the translator has started (see bailIfStarted).
1134
- resolvedModel = model;
1135
- capturedSessionId = existing?.sdkSessionId || null;
1136
- inputTokens = 0;
1137
- outputTokens = 0;
1138
- stopReason = 'end_turn';
1139
- textEmittedSoFar = '';
1140
- toolUseEmitted = false;
1141
-
1142
- // Build the prompt lazily on each attempt — multimodal returns a
1143
- // single-use async iterator. Keeps 401 auth-retries safe.
1144
- const prompt = buildQueryPrompt(promptText, images);
1145
- for await (const message of query({
1146
- prompt,
1147
- options: {
1148
- model,
1149
- maxTurns: toolsEnabled ? 5 : 200,
1150
- permissionMode: 'bypassPermissions',
1151
- allowDangerouslySkipPermissions: true,
1152
- abortController,
1153
- ...(clientToolsServer
1154
- ? {
1155
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
1156
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
1157
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
1158
- }
1159
- : toolsEnabled
1160
- ? { allowedTools: [] }
1161
- : {}),
1162
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
1163
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
1164
- },
1165
- })) {
1166
- if (clientDisconnected) break;
1167
-
1168
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
1169
- resolvedModel = message.model;
1170
- tx.start(resolvedModel, 0);
1171
- }
1172
-
1173
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
1174
- capturedSessionId = message.session_id;
1175
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
1176
- }
1177
-
1178
- if (message.type === 'assistant' && message.message?.content) {
1179
- const content = message.message.content;
1180
-
1181
- // Auth-failure short-circuit: throw so runWithAuthRetry handles it.
1182
- // Only safe before any text has been streamed (otherwise we've
1183
- // already corrupted the SSE stream and can't undo).
1184
- if (Array.isArray(content)) {
1185
- let combined = '';
1186
- for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
1187
- if (combined && isAuthFailureText(combined) && !tx.hasStarted) {
1188
- abortController.abort();
1189
- throw new AuthFailureInResultText(combined);
1190
- }
1191
- }
1192
-
1193
- // Tool_use detection: emit tool_use blocks structurally and abort.
1194
- // We do this BEFORE streaming text deltas from this message so the
1195
- // tool_use block is properly framed (after any pending text block
1196
- // closes). The translator handles the close-text → open-tool-use
1197
- // sequencing internally.
1198
- if (toolsEnabled && hasToolUse(message)) {
1199
- const calls = extractToolUses(message);
1200
- if (calls.length) {
1201
- // Emit any text from this same message *before* the tool_use
1202
- // (Anthropic streams sometimes have text + tool_use in one
1203
- // assistant message — preserve that ordering).
1204
- if (Array.isArray(content)) {
1205
- for (const b of content) {
1206
- if (b?.type === 'text' && b.text) {
1207
- // Compute delta vs what we've emitted to avoid duplication
1208
- // on aggregator-style assistant messages that resend the
1209
- // whole accumulated text.
1210
- const delta = b.text.startsWith(textEmittedSoFar)
1211
- ? b.text.slice(textEmittedSoFar.length)
1212
- : b.text;
1213
- if (delta) {
1214
- tx.pushTextDelta(delta);
1215
- textEmittedSoFar += delta;
1216
- }
1217
- }
1218
- }
1219
- }
1220
- for (const tu of calls) tx.pushToolUse(tu);
1221
- toolUseEmitted = true;
1222
- stopReason = 'tool_use';
1223
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
1224
- abortController.abort();
1225
- break;
1226
- }
1227
- }
1228
-
1229
- // Plain text-only assistant message: stream the delta.
1230
- if (Array.isArray(content)) {
1231
- let combined = '';
1232
- for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
1233
- if (combined) {
1234
- const delta = combined.startsWith(textEmittedSoFar)
1235
- ? combined.slice(textEmittedSoFar.length)
1236
- : combined;
1237
- if (delta) {
1238
- tx.pushTextDelta(delta);
1239
- textEmittedSoFar += delta;
1240
- }
1241
- }
1242
- } else if (typeof content === 'string' && content) {
1243
- const delta = content.startsWith(textEmittedSoFar)
1244
- ? content.slice(textEmittedSoFar.length)
1245
- : content;
1246
- if (delta) {
1247
- tx.pushTextDelta(delta);
1248
- textEmittedSoFar += delta;
1249
- }
1250
- }
1251
- }
1252
-
1253
- if (message.type === 'result') {
1254
- if (message.result && !textEmittedSoFar && !toolUseEmitted) {
1255
- // Some SDK paths only deliver text via the final result message
1256
- // (no streaming assistant messages). Emit it here as a single
1257
- // delta — clients see this as "model started + finished in one
1258
- // chunk", which is valid SSE.
1259
- tx.pushTextDelta(message.result);
1260
- }
1261
- if (isAuthFailureText(message.result || '') && !tx.hasStarted) {
1262
- throw new AuthFailureInResultText(message.result);
1263
- }
1264
- const usage = extractSdkUsage(message);
1265
- inputTokens = usage.input_tokens;
1266
- outputTokens = usage.output_tokens;
1267
- cacheReadTokens = usage.cache_read_input_tokens;
1268
- cacheCreateTokens = usage.cache_creation_input_tokens;
1269
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
1270
- if (!toolUseEmitted) stopReason = mapStopReason(message);
1271
- break;
1272
- }
1273
- }
1274
- };
1275
-
1276
- try {
1277
- await runWithAuthRetry({
1278
- attempt: runQuery,
1279
- // Once we've emitted message_start or any deltas, the SSE stream is
1280
- // committed — a retry would fragment it. Same logic as the OpenAI
1281
- // surface (bail once anything has been written).
1282
- bailIfStarted: () => tx.hasStarted,
1283
- onRefreshing: (err) => console.warn(`[auth] 401 on /v1/messages stream — refreshing (${err.message?.slice(0, 80)})`),
1284
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying /v1/messages stream`),
1285
- });
1286
- } catch (err) {
1287
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
1288
- if (!clientDisconnected && !(toolsEnabled && isAbort)) {
1289
- console.error('[/v1/messages stream] SDK error:', err.message);
1290
- tx.error(err);
1291
- return;
1292
- }
1293
- }
1294
-
1295
- if (sessionKey && capturedSessionId) {
1296
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
1297
- }
1298
-
1299
- tx.finish({ stopReason, usage: { output_tokens: outputTokens } });
1300
-
1301
- captureResponse({
1302
- requestId,
1303
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
1304
- status: 'ok',
1305
- stopReason,
1306
- model: resolvedModel,
1307
- });
1308
- }
1309
-
1310
238
  // ---------------------------------------------------------------------------
1311
239
  // Express app
1312
240
  // ---------------------------------------------------------------------------
@@ -1415,6 +343,19 @@ app.get('/inspector', async (_req, res) => {
1415
343
  }
1416
344
  });
1417
345
 
346
+ // GET on the POST-only inference routes (/v1/chat/completions, /v1/messages,
347
+ // /quiet/v1/messages) — per RFC 9110, answer 405 with an Allow header so probes
348
+ // (e.g. Hermes onboarding) can tell the endpoint exists; a 404 on GET makes them think it is missing entirely.
349
+ const methodNotAllowed = (allow) => (_req, res) => {
350
+ res.set('Allow', allow);
351
+ res.status(405).json({
352
+ error: { message: `Method Not Allowed. Use ${allow}.`, type: 'invalid_request_error', code: 'method_not_allowed' },
353
+ });
354
+ };
355
+ app.get('/v1/chat/completions', methodNotAllowed('POST'));
356
+ app.get('/v1/messages', methodNotAllowed('POST'));
357
+ app.get('/quiet/v1/messages', methodNotAllowed('POST'));
358
+
1418
359
  // POST /v1/chat/completions
1419
360
  app.post('/v1/chat/completions', async (req, res) => {
1420
361
  const requestId = uuidv4().replace(/-/g, '').slice(0, 24);
@@ -1483,11 +424,14 @@ app.post('/v1/chat/completions', async (req, res) => {
1483
424
  res.on('finish', () => emitEnd());
1484
425
  res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
1485
426
 
1486
- if (body.stream) {
1487
- await handleStreaming(req, res, body, requestId, sessionKey);
1488
- } else {
1489
- await handleNonStreaming(res, body, requestId, sessionKey);
1490
- }
427
+ await runInference(
428
+ { req, res, body, requestId, sessionKey },
429
+ openaiSurface,
430
+ {
431
+ mode: body.stream ? 'stream' : 'json',
432
+ deps: { getSession, upsertSession, resolveModel },
433
+ },
434
+ );
1491
435
  });
1492
436
 
1493
437
  // POST /v1/messages — Anthropic-native surface (for OpenClaw etc.).
@@ -1556,11 +500,104 @@ app.post('/v1/messages', async (req, res) => {
1556
500
  res.on('finish', () => emitEnd());
1557
501
  res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
1558
502
 
1559
- if (body.stream) {
1560
- await handleAnthropicStreaming(req, res, body, requestId, sessionKey);
503
+ await runInference(
504
+ { req, res, body, requestId, sessionKey },
505
+ anthropicSurface,
506
+ {
507
+ mode: body.stream ? 'stream' : 'json',
508
+ deps: { getSession, upsertSession, resolveModel },
509
+ },
510
+ );
511
+ });
512
+
513
+ // POST /quiet/v1/messages — Anthropic-shape, but with two changes vs /v1/messages:
514
+ // 1. Body is scrubbed for known third-party agent identifiers
515
+ // (openclaw, hermes, mobius, etc.) before the SDK forwards it.
516
+ // 2. SDK receives an explicit string systemPrompt — disables the
517
+ // claude_code preset that otherwise injects "I am Claude Code…" framing.
518
+ //
519
+ // Use case: clients that don't want their identity to leak into Anthropic's
520
+ // detection heuristics (e.g. "found 'openclaw' in package.json → flag account
521
+ // for extra-usage billing"). Configurable scrub list at ~/.mobygate/quiet-words.txt.
522
+ app.post('/quiet/v1/messages', async (req, res) => {
523
+ const requestId = uuidv4().replace(/-/g, '').slice(0, 24);
524
+ const body = req.body;
525
+
526
+ if (!body?.messages || !Array.isArray(body.messages) || body.messages.length === 0) {
527
+ return res.status(400).json({
528
+ type: 'error',
529
+ error: { type: 'invalid_request_error', message: 'messages is required and must be a non-empty array' },
530
+ });
531
+ }
532
+
533
+ // Scrub the body in place BEFORE anything else reads it — capture, session
534
+ // derivation, prompt building all see the scrubbed content from here on.
535
+ // Diagnose first (before the scrub runs) so we can log what was stripped — the log line below prints only each matched word and its count, never the surrounding payload text.
536
+ const diag = quietDiagnose(body);
537
+ scrubAnthropicBody(body);
538
+
539
+ const { key: sessionKey, source: sessionKeySource } = resolveSessionKey({
540
+ headerKey: req.headers['x-session-id'],
541
+ bodyKey: body.session_id,
542
+ body,
543
+ });
544
+ const existing = getSession(sessionKey);
545
+ const sessionTag = sessionKey
546
+ ? ` | session=${sessionKey}${sessionKeySource === 'auto' ? ' (auto)' : ''}${existing ? ' (resume)' : ' (new)'}`
547
+ : '';
548
+
549
+ console.log(`[${new Date().toISOString()}] anthropic-quiet ${body.stream ? 'stream' : 'sync'} | model=${body.model} → ${resolveModel(body.model)} | msgs=${body.messages.length}${sessionTag}`);
550
+ if (diag.matches > 0) {
551
+ const breakdown = diag.words.map(w => `${w.word}×${w.count}`).join(' ');
552
+ console.log(` [quiet] scrubbed ${diag.matches} occurrence(s): ${breakdown}`);
1561
553
  } else {
1562
- await handleAnthropicNonStreaming(res, body, requestId, sessionKey);
554
+ console.log(` [quiet] payload was already clean (no matches)`);
1563
555
  }
556
+
557
+ captureRequest({ path: '/quiet/v1/messages', body, requestId, sessionKey, sessionKeySource });
558
+
559
+ const startedAt = Date.now();
560
+ const imageBlocks = collectAnthropicImages(body.messages || []).length;
561
+ dashboardBus.emitEvent({
562
+ type: 'request.start',
563
+ id: requestId,
564
+ method: 'POST',
565
+ path: '/quiet/v1/messages',
566
+ model: body.model,
567
+ resolvedModel: resolveModel(body.model),
568
+ session: sessionKey,
569
+ stream: !!body.stream,
570
+ tools: hasAnthropicTools(body),
571
+ images: imageBlocks,
572
+ messages: body.messages.length,
573
+ resuming: !!existing,
574
+ quietScrubs: diag.matches,
575
+ });
576
+
577
+ let endEmitted = false;
578
+ const emitEnd = (overrides = {}) => {
579
+ if (endEmitted) return;
580
+ endEmitted = true;
581
+ dashboardBus.emitEvent({
582
+ type: 'request.end',
583
+ id: requestId,
584
+ durationMs: Date.now() - startedAt,
585
+ status: res.statusCode < 400 ? 'ok' : 'error',
586
+ httpStatus: res.statusCode,
587
+ ...overrides,
588
+ });
589
+ };
590
+ res.on('finish', () => emitEnd());
591
+ res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
592
+
593
+ await runInference(
594
+ { req, res, body, requestId, sessionKey },
595
+ anthropicSurface,
596
+ {
597
+ mode: body.stream ? 'stream' : 'json',
598
+ deps: { getSession, upsertSession, resolveModel },
599
+ },
600
+ );
1564
601
  });
1565
602
 
1566
603
  // GET /v1/models
@@ -1925,6 +962,176 @@ app.post('/dashboard/captures-toggle', requireLocalOrigin, async (req, res) => {
1925
962
  }
1926
963
  });
1927
964
 
965
+ // GET /dashboard/session-costs — per-session cost breakdown (v0.8.5)
966
+ //
967
+ // Aggregates the [model-billed] log lines emitted by each handler's SDK
968
+ // result step. Grouped by session_key. Surfaces:
969
+ // - cost_usd total $ across all turns of this session
970
+ // - turns number of completed (non-tool-use-aborted) turns
971
+ // - per_turn_usd average cost per turn (low = cache working)
972
+ // - models per-model breakdown (opus vs sonnet vs haiku)
973
+ // - first_user first user message (for human-readable identification)
974
+ //
975
+ // This view exists because a usage audit (at the time of v0.8.5) found 38.9% of total spend
976
+ // going to "singleton" sessions — channels that fire once, idle past
977
+ // the wire-cache TTL, then pay cache_creation tax on the next turn.
978
+ // The dashboard tab built off this endpoint lets users spot bleeding
979
+ // channels in real time and decide which to keep warm via cron pings.
980
+ app.get('/dashboard/session-costs', requireLocalOrigin, async (_req, res) => {
981
+ try {
982
+ const { readFile, readdir } = await import('fs/promises');
983
+ const { existsSync } = await import('fs');
984
+ const path = await import('path');
985
+ const { homedir } = await import('os');
986
+
987
+ const logPath = join(LOGS_DIR, 'server.log');
988
+ const captureDir = process.env.MOBYGATE_CAPTURE_DIR
989
+ || join(process.env.MOBYGATE_HOME || join(homedir(), '.mobygate'), 'captures');
990
+
991
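+ // Step 1: parse [model-billed] lines from server.log, associating each
992
+ // with the most recent `session=auto_…` line above it. Attribution is purely positional and only auto_-prefixed keys are matched, so a billed line logged after a non-auto_ session line is counted against the last auto_ session seen.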
993
+ const sessions = {}; // sk -> { turns, cost_usd, models: {model -> {turns, cost_usd, in_uncached, cache_read, cache_create, out}} }
994
+ let lastSession = null;
995
+
996
+ if (existsSync(logPath)) {
997
+ const raw = await readFile(logPath, 'utf8');
998
+ for (const line of raw.split(/\r?\n/)) {
999
+ const sessMatch = line.match(/session=(auto_\w+)/);
1000
+ if (sessMatch) lastSession = sessMatch[1];
1001
+ const billed = line.match(/\[model-billed\] requested=\S+ modelUsage=(\{.+\})/);
1002
+ if (billed && lastSession) {
1003
+ let mu;
1004
+ try { mu = JSON.parse(billed[1]); } catch { continue; }
1005
+ if (!sessions[lastSession]) {
1006
+ sessions[lastSession] = { turns: 0, cost_usd: 0, models: {} };
1007
+ }
1008
+ const rec = sessions[lastSession];
1009
+ rec.turns += 1;
1010
+ for (const [model, data] of Object.entries(mu)) {
1011
+ const cost = data.costUSD || 0;
1012
+ rec.cost_usd += cost;
1013
+ if (!rec.models[model]) rec.models[model] = { turns: 0, cost_usd: 0, in_uncached: 0, cache_read: 0, cache_create: 0, out: 0 };
1014
+ const m = rec.models[model];
1015
+ m.turns += 1;
1016
+ m.cost_usd += cost;
1017
+ m.in_uncached += data.inputTokens || 0;
1018
+ m.cache_read += data.cacheReadInputTokens || 0;
1019
+ m.cache_create += data.cacheCreationInputTokens || 0;
1020
+ m.out += data.outputTokens || 0;
1021
+ }
1022
+ }
1023
+ }
1024
+ }
1025
+
1026
+ // Step 2: enrich with capture metadata (first user message, model,
1027
+ // path, msg count) for each session_key. Captures are walked newest-first
1028
+ // and the first one seen per session supplies its metadata; older captures for that session are skipped.
1029
+ const sessionMeta = {};
1030
+ if (existsSync(captureDir)) {
1031
+ const files = (await readdir(captureDir))
1032
+ .filter(n => n.endsWith('.json'))
1033
+ .sort()
1034
+ .reverse(); // newest first
1035
+ for (const f of files) {
1036
+ const summaryFile = f.replace(/\.json$/, '.summary.txt');
1037
+ if (!existsSync(join(captureDir, summaryFile))) continue;
1038
+ const summary = await readFile(join(captureDir, summaryFile), 'utf8').catch(() => '');
1039
+ const skMatch = summary.match(/^session_key:\s+(auto_\w+)/m);
1040
+ if (!skMatch) continue;
1041
+ const sk = skMatch[1];
1042
+ if (sessionMeta[sk]) continue; // already have meta
1043
+ const modelMatch = summary.match(/^model:\s+(\S+)/m);
1044
+ const pathMatch = summary.match(/^path:\s+(\S+)/m);
1045
+ const msgsMatch = summary.match(/^messages:\s+(\d+)/m);
1046
+ const lastSeen = (await readFile(join(captureDir, summaryFile)).then(b => b.length).catch(()=>0)) ? f.slice(0, 19) : null;
1047
+
1048
+ let firstUser = null;
1049
+ try {
1050
+ const body = JSON.parse(await readFile(join(captureDir, f), 'utf8'));
1051
+ for (const m of (body.messages || []).slice(0, 5)) {
1052
+ if (m.role !== 'user') continue;
1053
+ const c = m.content;
1054
+ let txt = '';
1055
+ if (Array.isArray(c)) {
1056
+ for (const blk of c) {
1057
+ if (blk?.type === 'text' && blk.text) { txt = blk.text; break; }
1058
+ }
1059
+ } else if (typeof c === 'string') {
1060
+ txt = c;
1061
+ }
1062
+ // Skip "OpenClaw runtime context" boilerplate
1063
+ if (txt && !txt.startsWith('OpenClaw runtime context')) {
1064
+ firstUser = txt.slice(0, 80).replace(/\s+/g, ' ');
1065
+ break;
1066
+ }
1067
+ }
1068
+ } catch {}
1069
+
1070
+ sessionMeta[sk] = {
1071
+ model: modelMatch ? modelMatch[1] : null,
1072
+ path: pathMatch ? pathMatch[1] : null,
1073
+ msgs: msgsMatch ? parseInt(msgsMatch[1], 10) : null,
1074
+ lastSeenIso: lastSeen,
1075
+ firstUser,
1076
+ };
1077
+ }
1078
+ }
1079
+
1080
+ // Step 3: combine and sort
1081
+ const out = [];
1082
+ for (const [sk, rec] of Object.entries(sessions)) {
1083
+ const meta = sessionMeta[sk] || {};
1084
+ out.push({
1085
+ session_key: sk,
1086
+ turns: rec.turns,
1087
+ cost_usd: Math.round(rec.cost_usd * 10000) / 10000,
1088
+ per_turn_usd: Math.round((rec.cost_usd / Math.max(rec.turns, 1)) * 10000) / 10000,
1089
+ bucket: rec.turns === 1 ? 'singleton' : rec.turns <= 3 ? 'short' : rec.turns <= 10 ? 'medium' : 'warm',
1090
+ model: meta.model || null,
1091
+ path: meta.path || null,
1092
+ msgs: meta.msgs || null,
1093
+ last_seen: meta.lastSeenIso || null,
1094
+ first_user: meta.firstUser || null,
1095
+ models: Object.fromEntries(
1096
+ Object.entries(rec.models).map(([m, d]) => [m, {
1097
+ turns: d.turns,
1098
+ cost_usd: Math.round(d.cost_usd * 10000) / 10000,
1099
+ in_uncached: d.in_uncached,
1100
+ cache_read: d.cache_read,
1101
+ cache_create: d.cache_create,
1102
+ out: d.out,
1103
+ }]),
1104
+ ),
1105
+ });
1106
+ }
1107
+ out.sort((a, b) => b.cost_usd - a.cost_usd);
1108
+
1109
+ // Step 4: aggregate stats
1110
+ const totalCost = out.reduce((s, r) => s + r.cost_usd, 0);
1111
+ const totalTurns = out.reduce((s, r) => s + r.turns, 0);
1112
+ const buckets = { singleton: { sessions: 0, cost: 0 }, short: { sessions: 0, cost: 0 }, medium: { sessions: 0, cost: 0 }, warm: { sessions: 0, cost: 0 } };
1113
+ for (const r of out) {
1114
+ buckets[r.bucket].sessions += 1;
1115
+ buckets[r.bucket].cost += r.cost_usd;
1116
+ }
1117
+ for (const k of Object.keys(buckets)) {
1118
+ buckets[k].cost = Math.round(buckets[k].cost * 100) / 100;
1119
+ buckets[k].pct_of_total = totalCost > 0 ? Math.round((buckets[k].cost / totalCost) * 1000) / 10 : 0;
1120
+ }
1121
+
1122
+ res.json({
1123
+ generatedAt: new Date().toISOString(),
1124
+ total_cost_usd: Math.round(totalCost * 100) / 100,
1125
+ total_turns: totalTurns,
1126
+ session_count: out.length,
1127
+ buckets,
1128
+ sessions: out,
1129
+ });
1130
+ } catch (e) {
1131
+ res.status(500).json({ error: e.message });
1132
+ }
1133
+ });
1134
+
1928
1135
  // ---------------------------------------------------------------------------
1929
1136
  // Updater — dashboard-driven "update available → update now" flow
1930
1137
  // ---------------------------------------------------------------------------
@@ -1987,11 +1194,12 @@ app.get('/update/status', (req, res) => {
1987
1194
 
1988
1195
  app.listen(PORT, BIND, async () => {
1989
1196
  const ttlMin = Math.round(SESSION_TTL_MS / 60000);
1197
+ const ttlHours = (SESSION_TTL_MS / 3600000).toFixed(1);
1990
1198
  const meta = await loadBuildMeta();
1991
1199
  console.log(banner({ version: meta.version }));
1992
1200
  console.log(` bind ${BIND}:${PORT}${BIND === '127.0.0.1' ? ' (loopback only)' : ' (⚠ network-reachable — add auth)'}`);
1993
1201
  console.log(` model ${DEFAULT_MODEL}`);
1994
- console.log(` session TTL ${ttlMin} min`);
1202
+ console.log(` session TTL ${ttlMin} min (${ttlHours}h)`);
1995
1203
  console.log(` dashboard http://localhost:${PORT}`);
1996
1204
  if (isCaptureEnabled()) {
1997
1205
  console.log(` capture ON → ${CAPTURE_DIR_PATH.replace(process.env.HOME || '', '~')}`);