mobygate 0.8.4 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/server.js CHANGED
@@ -70,16 +70,14 @@ import {
70
70
  getCurrentVersion,
71
71
  } from './lib/updater.js';
72
72
  import {
73
- anthropicMessagesToPrompt,
74
73
  collectAnthropicImages,
75
- buildAnthropicResponse,
76
- makeStreamTranslator,
77
74
  hasAnthropicTools,
78
- mapStopReason,
79
- extractSdkUsage,
80
75
  } from './lib/anthropic.js';
76
+ import { hasTools, collectImages } from './lib/openai-translation.js';
77
+ import { runInference, openaiSurface, anthropicSurface } from './lib/inference-runner.js';
81
78
  import { resolveSessionKey } from './lib/session-derive.js';
82
79
  import { captureRequest, captureResponse, isCaptureEnabled, CAPTURE_DIR_PATH } from './lib/request-capture.js';
80
+ import { scrubAnthropicBody, quietDiagnose } from './lib/quiet.js';
83
81
 
84
82
  const __filename = fileURLToPath(import.meta.url);
85
83
  const __dirname = dirname(__filename);
@@ -89,8 +87,33 @@ const PORT = parseInt(process.env.PORT || '3456', 10);
89
87
  // want to share the proxy on a network can set bind: 0.0.0.0 (or a specific
90
88
  // interface) in ~/.mobygate/config.yaml, but should add auth in front of it.
91
89
  const BIND = process.env.BIND || '127.0.0.1';
92
- const DEFAULT_MODEL = process.env.DEFAULT_MODEL || 'claude-opus-4-7[1m]';
93
- const SESSION_TTL_MS = parseInt(process.env.SESSION_TTL_MS || String(60 * 60 * 1000), 10); // 1 hour default
90
+ const DEFAULT_MODEL = process.env.DEFAULT_MODEL || 'claude-opus-4-8[1m]';
91
+ // SESSION_TTL_MS: how long mobygate holds onto an idle SDK session before
92
+ // expiring it from its in-memory + on-disk session store. v0.8.5 raises
93
+ // the default from 1h → 4h based on real-world usage data: most multi-
94
+ // channel users (Discord agents serving 20+ channels) revisit channels
95
+ // every few hours, and a 1h TTL forced a fresh `query()` (full prompt
96
+ // re-send) every time. With 4h, mobygate retains the SDK session ID for
97
+ // up to four idle hours, so the next request resumes via session-id rather than
98
+ // reissuing the entire prompt.
99
+ //
100
+ // Caveat — this only solves SDK-side session continuity. Anthropic's
101
+ // wire-side prompt cache (5 min default, 1h with the
102
+ // `extended-cache-ttl-2025-04-11` beta) is unaffected; the SDK doesn't
103
+ // currently expose that beta to callers, so cache-creation tax on idle
104
+ // channels still applies. The TTL bump is a partial mitigation, not a
105
+ // fix.
106
+ //
107
+ // Override: SESSION_TTL_MS=14400000 (env, in milliseconds)
108
+ // or MOBY_SESSION_TTL_HOURS=4 (more readable, also accepted)
109
+ const SESSION_TTL_MS = (() => {
110
+ if (process.env.SESSION_TTL_MS) return parseInt(process.env.SESSION_TTL_MS, 10);
111
+ if (process.env.MOBY_SESSION_TTL_HOURS) {
112
+ const h = parseFloat(process.env.MOBY_SESSION_TTL_HOURS);
113
+ if (h > 0) return Math.round(h * 60 * 60 * 1000);
114
+ }
115
+ return 4 * 60 * 60 * 1000; // 4h default (was 1h pre-v0.8.5)
116
+ })();
94
117
 
95
118
  // ---------------------------------------------------------------------------
96
119
  // Session store — maps client keys → SDK session IDs (persisted to disk)
@@ -180,7 +203,13 @@ for (const sig of ['SIGTERM', 'SIGINT', 'SIGHUP']) {
180
203
  // falling back to opus or returning a zero-billed response. Fixed in
181
204
  // v0.8.2 by routing 4-6 through directly.
182
205
  const MODEL_MAP = {
183
- 'claude-opus-4': 'claude-opus-4-7[1m]',
206
+ // Latest opus → 4-8 (1M, Max-included — verified live 2026-05-29).
207
+ // 4-7 entries kept so explicit 4-7 requests still resolve.
208
+ 'claude-opus-4': 'claude-opus-4-8[1m]',
209
+ 'claude-opus-4-8': 'claude-opus-4-8[1m]',
210
+ 'claude-opus-4-8[1m]': 'claude-opus-4-8[1m]',
211
+ 'claude-opus-4-8-1m': 'claude-opus-4-8[1m]',
212
+ 'claude-opus-4-8-200k': 'claude-opus-4-8',
184
213
  'claude-opus-4-6': 'claude-opus-4-6',
185
214
  'claude-opus-4-7': 'claude-opus-4-7[1m]',
186
215
  'claude-opus-4-7[1m]': 'claude-opus-4-7[1m]',
@@ -199,7 +228,16 @@ const MODEL_MAP = {
199
228
  'claude-sonnet-4-6-200k': 'claude-sonnet-4-6', // explicit 200k alias (redundant, kept for clarity)
200
229
  'claude-haiku-4': 'claude-haiku-4-5-20251001',
201
230
  'claude-haiku-4-5': 'claude-haiku-4-5-20251001',
202
- 'opus': 'claude-opus-4-7[1m]', // Opus 1M is Max-included
231
+ // Fable 5: a distinct, recent model family (parallel to Opus 4.x).
232
+ // 1M variant Max-included (verified live 2026-05-29). Additive: opus
233
+ // stays the default; fable resolves only when explicitly requested.
234
+ 'claude-fable-5': 'claude-fable-5[1m]',
235
+ 'claude-fable-5[1m]': 'claude-fable-5[1m]',
236
+ 'claude-fable-5-1m': 'claude-fable-5[1m]',
237
+ 'claude-fable-5-200k': 'claude-fable-5',
238
+ 'fable': 'claude-fable-5[1m]',
239
+ 'fable-200k': 'claude-fable-5',
240
+ 'opus': 'claude-opus-4-8[1m]', // latest opus, 1M Max-included
203
241
  'sonnet': 'claude-sonnet-4-6', // 200k default; use 'sonnet-1m' for explicit 1M
204
242
  'sonnet-1m': 'claude-sonnet-4-6[1m]', // alias for 'sonnet' + explicit 1M opt-in
205
243
  'haiku': 'claude-haiku-4-5-20251001',
@@ -212,1101 +250,6 @@ function resolveModel(model) {
212
250
  return MODEL_MAP[stripped] || MODEL_MAP[model] || DEFAULT_MODEL;
213
251
  }
214
252
 
215
- // ---------------------------------------------------------------------------
216
- // OpenAI messages → single prompt string
217
- // ---------------------------------------------------------------------------
218
-
219
- function extractContent(content) {
220
- if (typeof content === 'string') return content;
221
- if (Array.isArray(content)) {
222
- return content
223
- .map((part) => {
224
- if (typeof part === 'string') return part;
225
- if (part.type === 'text') return part.text;
226
- if (part.type === 'image_url') return ''; // images carried separately; drop from text
227
- return JSON.stringify(part);
228
- })
229
- .filter(Boolean)
230
- .join('\n');
231
- }
232
- if (content && typeof content === 'object') return JSON.stringify(content);
233
- return String(content || '');
234
- }
235
-
236
- // Convert an OpenAI message.content array into Anthropic image content blocks.
237
- // Supports both data: URLs (base64) and remote https URLs.
238
- function extractImageBlocks(content) {
239
- if (!Array.isArray(content)) return [];
240
- const blocks = [];
241
- for (const part of content) {
242
- if (!part || part.type !== 'image_url') continue;
243
- const url = typeof part.image_url === 'string' ? part.image_url : part.image_url?.url;
244
- if (!url) continue;
245
- const dataMatch = /^data:([^;]+);base64,(.+)$/.exec(url);
246
- if (dataMatch) {
247
- blocks.push({ type: 'image', source: { type: 'base64', media_type: dataMatch[1], data: dataMatch[2] } });
248
- } else {
249
- blocks.push({ type: 'image', source: { type: 'url', url } });
250
- }
251
- }
252
- return blocks;
253
- }
254
-
255
- // Collect images from the LAST user message (OpenAI only attaches images to the latest turn).
256
- function collectImages(messages) {
257
- for (let i = messages.length - 1; i >= 0; i--) {
258
- if (messages[i].role === 'user') return extractImageBlocks(messages[i].content);
259
- }
260
- return [];
261
- }
262
-
263
- // ---------------------------------------------------------------------------
264
- // Tool calling (Phase 1: native MCP tools — no more <tool_call> text hack)
265
- // ---------------------------------------------------------------------------
266
- // Client-provided OpenAI tools are registered with the SDK as in-process MCP
267
- // tools (see lib/tool-bridge.js). The model emits **native** tool_use content
268
- // blocks in its assistant messages; we abort the SDK on the first one and
269
- // return OpenAI tool_calls to the client. When the client replies with tool
270
- // results, we send them back as Anthropic tool_result content blocks inside
271
- // a single SDKUserMessage — round-tripping cleanly through the SDK session.
272
-
273
- function hasTools(body) {
274
- return Array.isArray(body?.tools) && body.tools.length > 0;
275
- }
276
-
277
- /**
278
- * Build the prompt text from the OpenAI messages array.
279
- *
280
- * Returns `{ promptText }` — a single string ready for the SDK. Tool
281
- * results are spliced in as <tool_results> XML when present (see
282
- * lib/tool-bridge.js#toolMessagesToText for why we don't use native
283
- * tool_result content blocks yet).
284
- *
285
- * Resuming vs fresh:
286
- * - Resuming: SDK has full history. We only send the new tail —
287
- * trailing tool results plus the most recent user text, if any.
288
- * - Fresh: SDK starts cold. We serialize the visible history with
289
- * <system>/<previous_response>/<tool_results> tags. No tool-
290
- * instruction injection — the SDK MCP registration handles that.
291
- */
292
- function messagesToPrompt(messages, { resuming = false } = {}) {
293
- if (resuming) {
294
- // Walk backwards from the end, collecting trailing tool messages and
295
- // the most recent user text. Tool results are formatted as a text
296
- // block (see lib/tool-bridge.js#toolMessagesToText for the rationale).
297
- const trailingToolMessages = [];
298
- let userText = '';
299
- for (let i = messages.length - 1; i >= 0; i--) {
300
- const msg = messages[i];
301
- if (msg.role === 'tool') {
302
- trailingToolMessages.unshift(msg);
303
- } else if (msg.role === 'user') {
304
- userText = extractContent(msg.content);
305
- break;
306
- } else {
307
- break;
308
- }
309
- }
310
- const toolResultsText = toolMessagesToText(trailingToolMessages);
311
- if (!userText && !toolResultsText) {
312
- // Earlier code fell back to extracting whatever was at messages[-1],
313
- // which on an assistant-terminated history sent the assistant's own
314
- // previous reply back to the SDK as the new user prompt — and the
315
- // model would "respond to its own reply." Catch this clearly instead.
316
- return {
317
- promptText: '',
318
- error: 'Resume mode requires the request to end with a user message or tool result. Last message has role "' + (messages[messages.length - 1]?.role || 'unknown') + '".',
319
- };
320
- }
321
- const parts = [];
322
- if (toolResultsText) parts.push(toolResultsText);
323
- if (userText) parts.push(userText);
324
- return { promptText: parts.join('\n\n') };
325
- }
326
-
327
- // Fresh request: serialize visible history as XML-wrapped text. No
328
- // tool-instruction injection (the model learns about tools via the SDK
329
- // MCP registration, not the prompt).
330
- const parts = [];
331
- for (const msg of messages) {
332
- switch (msg.role) {
333
- case 'system':
334
- parts.push(`<system>\n${extractContent(msg.content)}\n</system>\n`);
335
- break;
336
- case 'user':
337
- parts.push(extractContent(msg.content));
338
- break;
339
- case 'assistant': {
340
- // Best-effort replay. tool_calls in non-resume history are dropped;
341
- // the model can usually infer continuity from the surrounding text.
342
- const text = extractContent(msg.content);
343
- if (text) parts.push(`<previous_response>\n${text}\n</previous_response>\n`);
344
- break;
345
- }
346
- case 'tool': {
347
- // Tool messages on a fresh turn (rare — clients normally use
348
- // session keys). Splice as text since there's no preceding
349
- // tool_use turn we can bind to natively.
350
- const text = toolMessagesToText([msg]);
351
- if (text) parts.push(text);
352
- break;
353
- }
354
- }
355
- }
356
- return {
357
- promptText: parts.join('\n').trim(),
358
- };
359
- }
360
-
361
- /**
362
- * Wrap promptText + optional image blocks into the form query() expects.
363
- * Returns a string for the fast path (text-only, no images), or an
364
- * async iterable yielding one SDKUserMessage with multi-part content
365
- * when there are images.
366
- */
367
- function buildQueryPrompt(promptText, imageBlocks) {
368
- if (!imageBlocks.length) return promptText;
369
- const content = [
370
- { type: 'text', text: promptText || '' },
371
- ...imageBlocks,
372
- ];
373
- async function* gen() {
374
- yield {
375
- type: 'user',
376
- message: { role: 'user', content },
377
- parent_tool_use_id: null,
378
- };
379
- }
380
- return gen();
381
- }
382
-
383
- // ---------------------------------------------------------------------------
384
- // Normalize model name for OpenAI response format
385
- // ---------------------------------------------------------------------------
386
-
387
- function normalizeModelName(model) {
388
- if (model?.includes('opus')) return 'claude-opus-4';
389
- if (model?.includes('sonnet')) return 'claude-sonnet-4';
390
- if (model?.includes('haiku')) return 'claude-haiku-4';
391
- return model || 'claude-sonnet-4';
392
- }
393
-
394
- // ---------------------------------------------------------------------------
395
- // SSE helpers
396
- // ---------------------------------------------------------------------------
397
-
398
- function makeChunk(requestId, model, content, role, finishReason) {
399
- return {
400
- id: `chatcmpl-${requestId}`,
401
- object: 'chat.completion.chunk',
402
- created: Math.floor(Date.now() / 1000),
403
- model: normalizeModelName(model),
404
- choices: [{
405
- index: 0,
406
- delta: {
407
- ...(role ? { role } : {}),
408
- ...(content !== undefined ? { content } : {}),
409
- },
410
- finish_reason: finishReason || null,
411
- }],
412
- };
413
- }
414
-
415
- function sendSSE(res, data) {
416
- if (!res.writableEnded) {
417
- res.write(`data: ${JSON.stringify(data)}\n\n`);
418
- }
419
- }
420
-
421
- // ---------------------------------------------------------------------------
422
- // POST /v1/chat/completions — streaming
423
- // ---------------------------------------------------------------------------
424
-
425
- async function handleStreaming(req, res, body, requestId, sessionKey) {
426
- const existing = getSession(sessionKey);
427
- const resuming = !!existing?.sdkSessionId;
428
- const toolsEnabled = hasTools(body);
429
- const { promptText, error: promptError } = messagesToPrompt(body.messages, { resuming });
430
- if (promptError) {
431
- return res.status(400).json({
432
- error: { message: promptError, type: 'invalid_request_error', code: 'invalid_resume_messages' },
433
- });
434
- }
435
- const images = collectImages(body.messages);
436
- // NOTE: `prompt` is built inside runQuery (not here) when images are
437
- // present, because buildQueryPrompt returns a single-use async iterator
438
- // for multimodal requests. If we built it here and the SDK call hit a
439
- // 401, runWithAuthRetry would invoke runQuery a second time with the
440
- // same exhausted iterator → SDK gets an empty user message → silent
441
- // empty response. Lazy construction inside runQuery rebuilds the
442
- // iterator per attempt.
443
- const model = resolveModel(body.model);
444
- // Build the in-process MCP server exposing client tools to the SDK.
445
- // null when toolsEnabled is false (or all tools are malformed).
446
- const clientToolsServer = toolsEnabled ? buildClientToolsServer(body.tools) : null;
447
- // System-prompt append: tells the model exactly which tools are
448
- // available and that Claude Code's built-ins (Bash, Grep, Read, etc.)
449
- // are NOT in this environment. Without this, the model trained-in
450
- // priors lead it to call Grep/Bash, get blocked by allowedTools, and
451
- // refuse the task instead of falling back to client tools. ~150 tokens.
452
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(body.tools) : null;
453
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
454
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
455
-
456
- res.setHeader('Content-Type', 'text/event-stream');
457
- res.setHeader('Cache-Control', 'no-cache');
458
- res.setHeader('Connection', 'keep-alive');
459
- res.setHeader('X-Request-Id', requestId);
460
- if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
461
- res.flushHeaders();
462
- res.write(':ok\n\n');
463
-
464
- const abortController = new AbortController();
465
- let isFirst = true;
466
- let resolvedModel = model;
467
- let capturedSessionId = existing?.sdkSessionId || null;
468
- let clientDisconnected = false;
469
- let inputTokens = 0;
470
- let outputTokens = 0;
471
- let cacheReadTokens = 0;
472
- let cacheCreateTokens = 0;
473
-
474
- res.on('close', () => {
475
- clientDisconnected = true;
476
- abortController.abort();
477
- });
478
-
479
- if (resuming) {
480
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
481
- }
482
-
483
- // Tools-mode buffers text and collects native tool_use blocks. If the
484
- // model emits text first then a tool_use, we want both: textBefore as
485
- // the assistant content, plus the tool_calls. (Most clients display the
486
- // text and then act on the tool_calls.)
487
- let bufferedText = '';
488
- let collectedToolCalls = []; // [{id, name, arguments}] from extractToolUses()
489
-
490
- const runQuery = async () => {
491
- // Reset per-attempt state so a 401 retry starts clean
492
- bufferedText = '';
493
- collectedToolCalls = [];
494
- isFirst = true;
495
- resolvedModel = model;
496
- capturedSessionId = existing?.sdkSessionId || null;
497
-
498
- // Build the prompt lazily on each attempt — multimodal returns a
499
- // single-use async iterator. Keeps 401 auth-retries safe.
500
- const prompt = buildQueryPrompt(promptText, images);
501
- for await (const message of query({
502
- prompt,
503
- options: {
504
- model,
505
- maxTurns: toolsEnabled ? 5 : 200,
506
- permissionMode: 'bypassPermissions',
507
- allowDangerouslySkipPermissions: true,
508
- abortController,
509
- // Tools-mode: register client tools as an in-process MCP server
510
- // and allow only those (no Bash/Read/etc. — the SDK's built-ins
511
- // would pollute the session and leak through to the model).
512
- ...(clientToolsServer
513
- ? {
514
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
515
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
516
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
517
- }
518
- : toolsEnabled
519
- // Tools were requested but none were valid — disable all tools.
520
- ? { allowedTools: [] }
521
- : {}),
522
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
523
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
524
- },
525
- })) {
526
- if (clientDisconnected) break;
527
-
528
- const msgPreview = message.type === 'assistant'
529
- ? `content_keys=${Object.keys(message).join(',')}`
530
- : message.type === 'result'
531
- ? `result=${(message.result || '').slice(0, 60)}`
532
- : message.subtype || '';
533
- console.log(` [msg] type=${message.type} ${msgPreview}`);
534
-
535
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
536
- resolvedModel = message.model;
537
- }
538
-
539
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
540
- capturedSessionId = message.session_id;
541
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
542
- }
543
-
544
- // Extract text from this assistant message
545
- let turnText = '';
546
- if (message.type === 'assistant' && message.message?.content) {
547
- const content = message.message.content;
548
- if (Array.isArray(content)) {
549
- for (const b of content) if (b.type === 'text' && b.text) turnText += b.text;
550
- } else if (typeof content === 'string') {
551
- turnText = content;
552
- }
553
- }
554
-
555
- // Detect auth failure surfaced inline (common on long-running proxies
556
- // where the SDK's cached creds expire). Throw so runWithAuthRetry
557
- // treats it like a real 401 exception.
558
- if (turnText && isAuthFailureText(turnText) && isFirst) {
559
- abortController.abort();
560
- throw new AuthFailureInResultText(turnText);
561
- }
562
-
563
- // Tools-mode: check for native tool_use content blocks. The moment
564
- // we see one, abort the SDK — we don't want our stub handler to
565
- // hang waiting on an execution that's actually happening client-side.
566
- if (toolsEnabled && message.type === 'assistant' && hasToolUse(message)) {
567
- const calls = extractToolUses(message);
568
- if (calls.length) {
569
- collectedToolCalls.push(...calls);
570
- if (turnText) bufferedText += turnText;
571
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
572
- abortController.abort();
573
- break;
574
- }
575
- }
576
-
577
- if (turnText) {
578
- if (toolsEnabled) {
579
- // Buffer text in case it precedes a tool_use, or ends up as the
580
- // final response when the model decides not to call any tools.
581
- bufferedText += turnText;
582
- } else {
583
- sendSSE(res, makeChunk(requestId, resolvedModel, turnText, isFirst ? 'assistant' : undefined, null));
584
- isFirst = false;
585
- }
586
- }
587
-
588
- if (message.type === 'result') {
589
- if (message.result && isAuthFailureText(message.result) && isFirst) {
590
- throw new AuthFailureInResultText(message.result);
591
- }
592
- if (!toolsEnabled && message.result && isFirst) {
593
- sendSSE(res, makeChunk(requestId, resolvedModel, message.result, 'assistant', null));
594
- isFirst = false;
595
- }
596
- if (toolsEnabled && !bufferedText && message.result) bufferedText = message.result;
597
- const usage = extractSdkUsage(message);
598
- inputTokens = usage.input_tokens;
599
- outputTokens = usage.output_tokens;
600
- cacheReadTokens = usage.cache_read_input_tokens;
601
- cacheCreateTokens = usage.cache_creation_input_tokens;
602
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
603
- break;
604
- }
605
- }
606
- };
607
-
608
- try {
609
- await runWithAuthRetry({
610
- attempt: runQuery,
611
- // Only retry if we haven't written a real chunk yet. In tools mode we
612
- // buffer internally so any retry is safe regardless.
613
- bailIfStarted: () => !toolsEnabled && !isFirst,
614
- onRefreshing: (err) => console.warn(`[auth] 401 on stream — refreshing (${err.message?.slice(0, 80)})`),
615
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying stream`),
616
- });
617
- } catch (err) {
618
- // Abort from tool-call detection surfaces as an abort error — not a real failure
619
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
620
- if (!clientDisconnected && !(toolsEnabled && isAbort)) {
621
- console.error('[stream] SDK error:', err.message);
622
- sendSSE(res, { error: { message: err.message, type: 'server_error', code: null } });
623
- }
624
- }
625
-
626
- if (sessionKey && capturedSessionId) {
627
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
628
- }
629
-
630
- // Tools mode: emit the buffered response as a single chunk with either
631
- // tool_calls (+ finish_reason: tool_calls) or plain text (+ stop).
632
- if (toolsEnabled && !res.writableEnded) {
633
- if (collectedToolCalls.length > 0) {
634
- console.log(` [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
635
- const chunk = {
636
- id: `chatcmpl-${requestId}`,
637
- object: 'chat.completion.chunk',
638
- created: Math.floor(Date.now() / 1000),
639
- model: normalizeModelName(resolvedModel),
640
- choices: [{
641
- index: 0,
642
- delta: {
643
- role: 'assistant',
644
- content: bufferedText.trim() || null,
645
- tool_calls: collectedToolCalls.map((tc, i) => ({
646
- index: i,
647
- id: tc.id,
648
- type: 'function',
649
- function: { name: tc.name, arguments: tc.arguments },
650
- })),
651
- },
652
- finish_reason: 'tool_calls',
653
- }],
654
- };
655
- sendSSE(res, chunk);
656
- } else {
657
- sendSSE(res, makeChunk(requestId, resolvedModel, bufferedText, 'assistant', null));
658
- sendSSE(res, makeChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
659
- }
660
- res.write('data: [DONE]\n\n');
661
- res.end();
662
- captureResponse({
663
- requestId,
664
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
665
- status: 'ok',
666
- stopReason: collectedToolCalls.length > 0 ? 'tool_use' : 'end_turn',
667
- model: resolvedModel,
668
- });
669
- return;
670
- }
671
-
672
- if (!res.writableEnded) {
673
- sendSSE(res, makeChunk(requestId, resolvedModel, undefined, undefined, 'stop'));
674
- res.write('data: [DONE]\n\n');
675
- res.end();
676
- }
677
-
678
- captureResponse({
679
- requestId,
680
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
681
- status: clientDisconnected ? 'client_disconnect' : 'ok',
682
- stopReason: 'end_turn',
683
- model: resolvedModel,
684
- });
685
- }
686
-
687
- // ---------------------------------------------------------------------------
688
- // POST /v1/chat/completions — non-streaming
689
- // ---------------------------------------------------------------------------
690
-
691
- async function handleNonStreaming(res, body, requestId, sessionKey) {
692
- const existing = getSession(sessionKey);
693
- const resuming = !!existing?.sdkSessionId;
694
- const toolsEnabled = hasTools(body);
695
- const { promptText, error: promptError } = messagesToPrompt(body.messages, { resuming });
696
- if (promptError) {
697
- return res.status(400).json({
698
- error: { message: promptError, type: 'invalid_request_error', code: 'invalid_resume_messages' },
699
- });
700
- }
701
- const images = collectImages(body.messages);
702
- // NOTE: `prompt` is built inside runQuery (not here) when images are
703
- // present, because buildQueryPrompt returns a single-use async iterator
704
- // for multimodal requests. If we built it here and the SDK call hit a
705
- // 401, runWithAuthRetry would invoke runQuery a second time with the
706
- // same exhausted iterator → SDK gets an empty user message → silent
707
- // empty response. Lazy construction inside runQuery rebuilds the
708
- // iterator per attempt.
709
- const model = resolveModel(body.model);
710
- const clientToolsServer = toolsEnabled ? buildClientToolsServer(body.tools) : null;
711
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(body.tools) : null;
712
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
713
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
714
-
715
- let resultText = '';
716
- let collectedToolCalls = [];
717
- let resolvedModel = model;
718
- let inputTokens = 0;
719
- let outputTokens = 0;
720
- let cacheReadTokens = 0;
721
- let cacheCreateTokens = 0;
722
- let stopReason = 'end_turn';
723
- let capturedSessionId = existing?.sdkSessionId || null;
724
- const abortController = new AbortController();
725
-
726
- if (resuming) {
727
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
728
- }
729
-
730
- const runQuery = async () => {
731
- // Reset per-attempt state so a 401 retry starts clean
732
- resultText = '';
733
- collectedToolCalls = [];
734
- resolvedModel = model;
735
- inputTokens = 0;
736
- outputTokens = 0;
737
- capturedSessionId = existing?.sdkSessionId || null;
738
-
739
- // Build the prompt lazily on each attempt — multimodal returns a
740
- // single-use async iterator. Keeps 401 auth-retries safe.
741
- const prompt = buildQueryPrompt(promptText, images);
742
- for await (const message of query({
743
- prompt,
744
- options: {
745
- model,
746
- maxTurns: toolsEnabled ? 5 : 200,
747
- permissionMode: 'bypassPermissions',
748
- allowDangerouslySkipPermissions: true,
749
- abortController,
750
- ...(clientToolsServer
751
- ? {
752
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
753
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
754
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
755
- }
756
- : toolsEnabled
757
- ? { allowedTools: [] }
758
- : {}),
759
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
760
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
761
- },
762
- })) {
763
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
764
- resolvedModel = message.model;
765
- }
766
-
767
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
768
- capturedSessionId = message.session_id;
769
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
770
- }
771
-
772
- if (message.type === 'assistant' && message.message?.content) {
773
- const content = message.message.content;
774
- if (Array.isArray(content)) {
775
- for (const block of content) {
776
- if (block.type === 'text') resultText += block.text || '';
777
- }
778
- } else if (typeof content === 'string') {
779
- resultText += content;
780
- }
781
- // Detect auth failure surfaced inline (long-running proxy, cached creds)
782
- if (isAuthFailureText(resultText)) {
783
- abortController.abort();
784
- throw new AuthFailureInResultText(resultText);
785
- }
786
- // Native tool_use detection — abort the moment a tool_use lands.
787
- if (toolsEnabled && hasToolUse(message)) {
788
- const calls = extractToolUses(message);
789
- if (calls.length) {
790
- collectedToolCalls.push(...calls);
791
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
792
- abortController.abort();
793
- break;
794
- }
795
- }
796
- }
797
-
798
- if (message.type === 'result') {
799
- if (message.result && !resultText) resultText = message.result;
800
- if (isAuthFailureText(resultText)) {
801
- throw new AuthFailureInResultText(resultText);
802
- }
803
- const usage = extractSdkUsage(message);
804
- inputTokens = usage.input_tokens;
805
- outputTokens = usage.output_tokens;
806
- cacheReadTokens = usage.cache_read_input_tokens;
807
- cacheCreateTokens = usage.cache_creation_input_tokens;
808
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
809
- if (message.subtype) stopReason = message.subtype;
810
- break;
811
- }
812
- }
813
- };
814
-
815
- try {
816
- await runWithAuthRetry({
817
- attempt: runQuery,
818
- // Non-streaming never writes to res until the end — retry is always safe
819
- bailIfStarted: () => false,
820
- onRefreshing: (err) => console.warn(`[auth] 401 on sync call — refreshing (${err.message?.slice(0, 80)})`),
821
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying sync call`),
822
- });
823
- } catch (err) {
824
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
825
- if (!(toolsEnabled && isAbort)) {
826
- console.error('[non-stream] SDK error:', err.message);
827
- return res.status(500).json({ error: { message: err.message, type: 'server_error', code: null } });
828
- }
829
- }
830
-
831
- if (sessionKey && capturedSessionId) {
832
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
833
- }
834
-
835
- const responseHeaders = {};
836
- if (sessionKey) responseHeaders['X-Session-Id'] = sessionKey;
837
-
838
- // Tool-calling response shape
839
- if (toolsEnabled && collectedToolCalls.length > 0) {
840
- console.log(` [tools] emitting ${collectedToolCalls.length} tool_call(s)`);
841
- return res.set(responseHeaders).json({
842
- id: `chatcmpl-${requestId}`,
843
- object: 'chat.completion',
844
- created: Math.floor(Date.now() / 1000),
845
- model: normalizeModelName(resolvedModel),
846
- choices: [{
847
- index: 0,
848
- message: {
849
- role: 'assistant',
850
- content: resultText.trim() || null,
851
- tool_calls: collectedToolCalls.map((tc) => ({
852
- id: tc.id,
853
- type: 'function',
854
- function: { name: tc.name, arguments: tc.arguments },
855
- })),
856
- },
857
- finish_reason: 'tool_calls',
858
- }],
859
- usage: { prompt_tokens: inputTokens, completion_tokens: outputTokens, total_tokens: inputTokens + outputTokens },
860
- });
861
- // No tool_use blocks → fall through to normal text response
862
- }
863
-
864
- res.set(responseHeaders).json({
865
- id: `chatcmpl-${requestId}`,
866
- object: 'chat.completion',
867
- created: Math.floor(Date.now() / 1000),
868
- model: normalizeModelName(resolvedModel),
869
- choices: [{
870
- index: 0,
871
- message: { role: 'assistant', content: resultText },
872
- finish_reason: 'stop',
873
- }],
874
- usage: { prompt_tokens: inputTokens, completion_tokens: outputTokens, total_tokens: inputTokens + outputTokens },
875
- });
876
-
877
- captureResponse({
878
- requestId,
879
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
880
- status: 'ok',
881
- stopReason,
882
- model: resolvedModel,
883
- });
884
- }
885
-
886
- // ---------------------------------------------------------------------------
887
- // POST /v1/messages — Anthropic-native surface (non-streaming + streaming)
888
- // ---------------------------------------------------------------------------
889
- // The dual-surface architecture: Hermes uses /v1/chat/completions
890
- // (OpenAI shape), OpenClaw uses /v1/messages (Anthropic shape). Both
891
- // translate to the SAME underlying SDK query() — the surfaces are pure
892
- // translators over a single inference engine.
893
- //
894
- // Tool calling: reuses Phase 1's native MCP path from lib/tool-bridge.js.
895
- // No prompt-injected tool definitions, no <tool_call> text parsing.
896
- // Inbound tool_results still spliced as text on resume (see anthropic.js
897
- // docstring for why — Phase 1 limitation, not lifted here).
898
-
899
- async function handleAnthropicNonStreaming(res, body, requestId, sessionKey) {
900
- const existing = getSession(sessionKey);
901
- const resuming = !!existing?.sdkSessionId;
902
- const toolsEnabled = hasAnthropicTools(body);
903
- const { promptText, error: promptError } = anthropicMessagesToPrompt(body, { resuming });
904
- if (promptError) {
905
- return res.status(400).json({
906
- type: 'error',
907
- error: { type: 'invalid_request_error', message: promptError },
908
- });
909
- }
910
- const images = collectAnthropicImages(body.messages || []);
911
- // See note in handleStreaming — `prompt` is built lazily inside runQuery
912
- // because the multimodal path returns a single-use async iterator that
913
- // a 401-retry would exhaust on the first attempt.
914
- const model = resolveModel(body.model);
915
- // Translate Anthropic tool defs → OpenAI shape that buildClientToolsServer
916
- // expects. Both go through the same JSON-Schema → Zod path on the way to
917
- // MCP; the wrapper shape difference is just `function:{name, parameters}`
918
- // vs `{name, input_schema}`.
919
- const toolsForBridge = toolsEnabled
920
- ? body.tools.map((t) => ({
921
- type: 'function',
922
- function: { name: t.name, description: t.description || '', parameters: t.input_schema || {} },
923
- }))
924
- : null;
925
- const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
926
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
927
-
928
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
929
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
930
-
931
- let resultText = '';
932
- let collectedToolCalls = [];
933
- let resolvedModel = model;
934
- let inputTokens = 0;
935
- let outputTokens = 0;
936
- let cacheReadTokens = 0;
937
- let cacheCreateTokens = 0;
938
- let capturedSessionId = existing?.sdkSessionId || null;
939
- let stopReason = 'end_turn';
940
- const abortController = new AbortController();
941
-
942
- if (resuming) {
943
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
944
- }
945
-
946
- const runQuery = async () => {
947
- resultText = '';
948
- collectedToolCalls = [];
949
- resolvedModel = model;
950
- inputTokens = 0;
951
- outputTokens = 0;
952
- capturedSessionId = existing?.sdkSessionId || null;
953
- stopReason = 'end_turn';
954
-
955
- // Build the prompt lazily on each attempt — multimodal returns a
956
- // single-use async iterator. Keeps 401 auth-retries safe.
957
- const prompt = buildQueryPrompt(promptText, images);
958
- for await (const message of query({
959
- prompt,
960
- options: {
961
- model,
962
- maxTurns: toolsEnabled ? 5 : 200,
963
- permissionMode: 'bypassPermissions',
964
- allowDangerouslySkipPermissions: true,
965
- abortController,
966
- ...(clientToolsServer
967
- ? {
968
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
969
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
970
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
971
- }
972
- : toolsEnabled
973
- ? { allowedTools: [] }
974
- : {}),
975
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
976
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
977
- },
978
- })) {
979
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
980
- resolvedModel = message.model;
981
- }
982
-
983
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
984
- capturedSessionId = message.session_id;
985
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
986
- }
987
-
988
- if (message.type === 'assistant' && message.message?.content) {
989
- const content = message.message.content;
990
- if (Array.isArray(content)) {
991
- for (const block of content) {
992
- if (block.type === 'text') resultText += block.text || '';
993
- }
994
- } else if (typeof content === 'string') {
995
- resultText += content;
996
- }
997
- if (isAuthFailureText(resultText)) {
998
- abortController.abort();
999
- throw new AuthFailureInResultText(resultText);
1000
- }
1001
- if (toolsEnabled && hasToolUse(message)) {
1002
- const calls = extractToolUses(message);
1003
- if (calls.length) {
1004
- collectedToolCalls.push(...calls);
1005
- stopReason = 'tool_use';
1006
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
1007
- abortController.abort();
1008
- break;
1009
- }
1010
- }
1011
- }
1012
-
1013
- if (message.type === 'result') {
1014
- if (message.result && !resultText) resultText = message.result;
1015
- if (isAuthFailureText(resultText)) {
1016
- throw new AuthFailureInResultText(resultText);
1017
- }
1018
- const usage = extractSdkUsage(message);
1019
- inputTokens = usage.input_tokens;
1020
- outputTokens = usage.output_tokens;
1021
- cacheReadTokens = usage.cache_read_input_tokens;
1022
- cacheCreateTokens = usage.cache_creation_input_tokens;
1023
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
1024
- stopReason = mapStopReason(message);
1025
- break;
1026
- }
1027
- }
1028
- };
1029
-
1030
- try {
1031
- await runWithAuthRetry({
1032
- attempt: runQuery,
1033
- bailIfStarted: () => false,
1034
- onRefreshing: (err) => console.warn(`[auth] 401 on /v1/messages — refreshing (${err.message?.slice(0, 80)})`),
1035
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying /v1/messages`),
1036
- });
1037
- } catch (err) {
1038
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
1039
- if (!(toolsEnabled && isAbort)) {
1040
- console.error('[/v1/messages] SDK error:', err.message);
1041
- return res.status(500).json({
1042
- type: 'error',
1043
- error: { type: 'api_error', message: err.message },
1044
- });
1045
- }
1046
- }
1047
-
1048
- if (sessionKey && capturedSessionId) {
1049
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
1050
- }
1051
-
1052
- if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
1053
-
1054
- res.json(buildAnthropicResponse({
1055
- rawText: resultText.trim(),
1056
- toolUses: collectedToolCalls,
1057
- model: resolvedModel,
1058
- usage: { input_tokens: inputTokens, output_tokens: outputTokens },
1059
- requestId,
1060
- stopReason,
1061
- }));
1062
-
1063
- captureResponse({
1064
- requestId,
1065
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
1066
- status: 'ok',
1067
- stopReason,
1068
- model: resolvedModel,
1069
- });
1070
- }
1071
-
1072
- async function handleAnthropicStreaming(req, res, body, requestId, sessionKey) {
1073
- const existing = getSession(sessionKey);
1074
- const resuming = !!existing?.sdkSessionId;
1075
- const toolsEnabled = hasAnthropicTools(body);
1076
- const { promptText, error: promptError } = anthropicMessagesToPrompt(body, { resuming });
1077
- if (promptError) {
1078
- return res.status(400).json({
1079
- type: 'error',
1080
- error: { type: 'invalid_request_error', message: promptError },
1081
- });
1082
- }
1083
- const images = collectAnthropicImages(body.messages || []);
1084
- // See note in handleStreaming — `prompt` is built lazily inside runQuery
1085
- // because the multimodal path returns a single-use async iterator that
1086
- // a 401-retry would exhaust on the first attempt.
1087
- const model = resolveModel(body.model);
1088
- const toolsForBridge = toolsEnabled
1089
- ? body.tools.map((t) => ({
1090
- type: 'function',
1091
- function: { name: t.name, description: t.description || '', parameters: t.input_schema || {} },
1092
- }))
1093
- : null;
1094
- const clientToolsServer = toolsForBridge ? buildClientToolsServer(toolsForBridge) : null;
1095
- const toolsGuidance = clientToolsServer ? buildToolUsageGuidance(toolsForBridge) : null;
1096
-
1097
- if (images.length) console.log(` [multimodal] ${images.length} image block(s)`);
1098
- if (toolsEnabled) console.log(` [tools] ${body.tools.length} client tool(s) registered as MCP`);
1099
-
1100
- res.setHeader('Content-Type', 'text/event-stream');
1101
- res.setHeader('Cache-Control', 'no-cache');
1102
- res.setHeader('Connection', 'keep-alive');
1103
- res.setHeader('X-Request-Id', requestId);
1104
- if (sessionKey) res.setHeader('X-Session-Id', sessionKey);
1105
- res.flushHeaders();
1106
-
1107
- const tx = makeStreamTranslator({ res, requestId, model });
1108
- const abortController = new AbortController();
1109
- let resolvedModel = model;
1110
- let capturedSessionId = existing?.sdkSessionId || null;
1111
- let inputTokens = 0;
1112
- let outputTokens = 0;
1113
- let cacheReadTokens = 0;
1114
- let cacheCreateTokens = 0;
1115
- let stopReason = 'end_turn';
1116
- let clientDisconnected = false;
1117
- let textEmittedSoFar = ''; // dedup against same-message reflow from SDK
1118
- let toolUseEmitted = false;
1119
-
1120
- res.on('close', () => {
1121
- clientDisconnected = true;
1122
- abortController.abort();
1123
- });
1124
-
1125
- if (resuming) {
1126
- console.log(` [session] resuming: ${sessionKey} → sdk=${existing.sdkSessionId} (msgs=${existing.messageCount})`);
1127
- }
1128
-
1129
- const runQuery = async () => {
1130
- // Reset per-attempt state in case of 401-retry. Note: tx is reused
1131
- // across retries, so a successful retry that comes after we already
1132
- // emitted message_start would surface as a confused stream. We bail
1133
- // out of retry once the translator has started (see bailIfStarted).
1134
- resolvedModel = model;
1135
- capturedSessionId = existing?.sdkSessionId || null;
1136
- inputTokens = 0;
1137
- outputTokens = 0;
1138
- stopReason = 'end_turn';
1139
- textEmittedSoFar = '';
1140
- toolUseEmitted = false;
1141
-
1142
- // Build the prompt lazily on each attempt — multimodal returns a
1143
- // single-use async iterator. Keeps 401 auth-retries safe.
1144
- const prompt = buildQueryPrompt(promptText, images);
1145
- for await (const message of query({
1146
- prompt,
1147
- options: {
1148
- model,
1149
- maxTurns: toolsEnabled ? 5 : 200,
1150
- permissionMode: 'bypassPermissions',
1151
- allowDangerouslySkipPermissions: true,
1152
- abortController,
1153
- ...(clientToolsServer
1154
- ? {
1155
- mcpServers: { [MCP_SERVER_NAME]: clientToolsServer },
1156
- allowedTools: [`${MCP_TOOL_PREFIX}*`],
1157
- systemPrompt: { type: 'preset', preset: 'claude_code', append: toolsGuidance },
1158
- }
1159
- : toolsEnabled
1160
- ? { allowedTools: [] }
1161
- : {}),
1162
- ...(resuming ? { resume: existing.sdkSessionId } : {}),
1163
- ...(sessionKey && !resuming ? { persistSession: true } : {}),
1164
- },
1165
- })) {
1166
- if (clientDisconnected) break;
1167
-
1168
- if (message.type === 'system' && message.subtype === 'init' && message.model) {
1169
- resolvedModel = message.model;
1170
- tx.start(resolvedModel, 0);
1171
- }
1172
-
1173
- if (message.type === 'assistant' && message.session_id && !capturedSessionId) {
1174
- capturedSessionId = message.session_id;
1175
- console.log(` [session] captured sdk session: ${capturedSessionId}`);
1176
- }
1177
-
1178
- if (message.type === 'assistant' && message.message?.content) {
1179
- const content = message.message.content;
1180
-
1181
- // Auth-failure short-circuit: throw so runWithAuthRetry handles it.
1182
- // Only safe before any text has been streamed (otherwise we've
1183
- // already corrupted the SSE stream and can't undo).
1184
- if (Array.isArray(content)) {
1185
- let combined = '';
1186
- for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
1187
- if (combined && isAuthFailureText(combined) && !tx.hasStarted) {
1188
- abortController.abort();
1189
- throw new AuthFailureInResultText(combined);
1190
- }
1191
- }
1192
-
1193
- // Tool_use detection: emit tool_use blocks structurally and abort.
1194
- // We do this BEFORE streaming text deltas from this message so the
1195
- // tool_use block is properly framed (after any pending text block
1196
- // closes). The translator handles the close-text → open-tool-use
1197
- // sequencing internally.
1198
- if (toolsEnabled && hasToolUse(message)) {
1199
- const calls = extractToolUses(message);
1200
- if (calls.length) {
1201
- // Emit any text from this same message *before* the tool_use
1202
- // (Anthropic streams sometimes have text + tool_use in one
1203
- // assistant message — preserve that ordering).
1204
- if (Array.isArray(content)) {
1205
- for (const b of content) {
1206
- if (b?.type === 'text' && b.text) {
1207
- // Compute delta vs what we've emitted to avoid duplication
1208
- // on aggregator-style assistant messages that resend the
1209
- // whole accumulated text.
1210
- const delta = b.text.startsWith(textEmittedSoFar)
1211
- ? b.text.slice(textEmittedSoFar.length)
1212
- : b.text;
1213
- if (delta) {
1214
- tx.pushTextDelta(delta);
1215
- textEmittedSoFar += delta;
1216
- }
1217
- }
1218
- }
1219
- }
1220
- for (const tu of calls) tx.pushToolUse(tu);
1221
- toolUseEmitted = true;
1222
- stopReason = 'tool_use';
1223
- console.log(` [tools] ${calls.length} native tool_use block(s) — aborting SDK`);
1224
- abortController.abort();
1225
- break;
1226
- }
1227
- }
1228
-
1229
- // Plain text-only assistant message: stream the delta.
1230
- if (Array.isArray(content)) {
1231
- let combined = '';
1232
- for (const b of content) if (b?.type === 'text' && b.text) combined += b.text;
1233
- if (combined) {
1234
- const delta = combined.startsWith(textEmittedSoFar)
1235
- ? combined.slice(textEmittedSoFar.length)
1236
- : combined;
1237
- if (delta) {
1238
- tx.pushTextDelta(delta);
1239
- textEmittedSoFar += delta;
1240
- }
1241
- }
1242
- } else if (typeof content === 'string' && content) {
1243
- const delta = content.startsWith(textEmittedSoFar)
1244
- ? content.slice(textEmittedSoFar.length)
1245
- : content;
1246
- if (delta) {
1247
- tx.pushTextDelta(delta);
1248
- textEmittedSoFar += delta;
1249
- }
1250
- }
1251
- }
1252
-
1253
- if (message.type === 'result') {
1254
- if (message.result && !textEmittedSoFar && !toolUseEmitted) {
1255
- // Some SDK paths only deliver text via the final result message
1256
- // (no streaming assistant messages). Emit it here as a single
1257
- // delta — clients see this as "model started + finished in one
1258
- // chunk", which is valid SSE.
1259
- tx.pushTextDelta(message.result);
1260
- }
1261
- if (isAuthFailureText(message.result || '') && !tx.hasStarted) {
1262
- throw new AuthFailureInResultText(message.result);
1263
- }
1264
- const usage = extractSdkUsage(message);
1265
- inputTokens = usage.input_tokens;
1266
- outputTokens = usage.output_tokens;
1267
- cacheReadTokens = usage.cache_read_input_tokens;
1268
- cacheCreateTokens = usage.cache_creation_input_tokens;
1269
- console.log(` [model-billed] requested=${resolvedModel} modelUsage=${JSON.stringify(usage.modelUsage || '(none)')}`);
1270
- if (!toolUseEmitted) stopReason = mapStopReason(message);
1271
- break;
1272
- }
1273
- }
1274
- };
1275
-
1276
- try {
1277
- await runWithAuthRetry({
1278
- attempt: runQuery,
1279
- // Once we've emitted message_start or any deltas, the SSE stream is
1280
- // committed — a retry would fragment it. Same logic as the OpenAI
1281
- // surface (bail once anything has been written).
1282
- bailIfStarted: () => tx.hasStarted,
1283
- onRefreshing: (err) => console.warn(`[auth] 401 on /v1/messages stream — refreshing (${err.message?.slice(0, 80)})`),
1284
- onRetry: (r) => console.log(`[auth] refreshed in ${r.durationMs}ms — retrying /v1/messages stream`),
1285
- });
1286
- } catch (err) {
1287
- const isAbort = err?.name === 'AbortError' || /aborted/i.test(err?.message || '');
1288
- if (!clientDisconnected && !(toolsEnabled && isAbort)) {
1289
- console.error('[/v1/messages stream] SDK error:', err.message);
1290
- tx.error(err);
1291
- return;
1292
- }
1293
- }
1294
-
1295
- if (sessionKey && capturedSessionId) {
1296
- upsertSession(sessionKey, capturedSessionId, resolvedModel);
1297
- }
1298
-
1299
- tx.finish({ stopReason, usage: { output_tokens: outputTokens } });
1300
-
1301
- captureResponse({
1302
- requestId,
1303
- usage: { input_tokens: inputTokens, output_tokens: outputTokens, cache_read_input_tokens: cacheReadTokens, cache_creation_input_tokens: cacheCreateTokens },
1304
- status: 'ok',
1305
- stopReason,
1306
- model: resolvedModel,
1307
- });
1308
- }
1309
-
1310
253
  // ---------------------------------------------------------------------------
1311
254
  // Express app
1312
255
  // ---------------------------------------------------------------------------
@@ -1370,6 +313,36 @@ function requireLocalOrigin(req, res, next) {
1370
313
  next();
1371
314
  }
1372
315
 
316
/**
 * Convert one session-store entry into the JSON shape returned by the
 * session-listing endpoints.
 *
 * Two wire shapes are produced, differing only in a few field names and in
 * how seconds are rounded:
 *   - default (GET /sessions, GET /sessions/:key): `sessionKey`, `lastUsed`,
 *     `idleSeconds`, `ttlRemainingSeconds` — seconds use Math.round.
 *   - dashboard (GET /dashboard/sessions): `key`, `lastUsedAt`, `idleSec`,
 *     `ttlRemainingSec` — seconds use Math.floor.
 *
 * @param {string} key - Session key the entry is stored under.
 * @param {object} entry - Store entry; reads `sdkSessionId`, `model`,
 *   `messageCount`, `createdAt` and `lastUsed` (epoch milliseconds).
 * @param {{dashboard?: boolean}} [options] - `dashboard: true` selects the
 *   dashboard shape.
 * @returns {object} Plain JSON-serializable description of the session.
 */
function serializeSession(key, entry, { dashboard = false } = {}) {
  const idleMs = Date.now() - entry.lastUsed;
  // Clamp at zero so an entry that is already past its TTL never reports a
  // negative remaining time.
  const ttlRemainingMs = Math.max(0, SESSION_TTL_MS - idleMs);

  // Date#toISOString throws RangeError on an invalid date. A single entry
  // with a missing/garbled timestamp would otherwise abort serialization of
  // the whole response, so report such timestamps as null instead.
  const toIso = (epochMs) => {
    const date = new Date(epochMs);
    return Number.isNaN(date.getTime()) ? null : date.toISOString();
  };

  // Fields that are identical in both shapes. They are spread right after
  // the key field so the property order of each shape is unchanged.
  const shared = {
    sdkSessionId: entry.sdkSessionId,
    model: entry.model,
    messageCount: entry.messageCount,
    createdAt: toIso(entry.createdAt),
  };

  if (dashboard) {
    return {
      key,
      ...shared,
      lastUsedAt: toIso(entry.lastUsed),
      idleSec: Math.floor(idleMs / 1000),
      ttlRemainingSec: Math.floor(ttlRemainingMs / 1000),
    };
  }

  return {
    sessionKey: key,
    ...shared,
    lastUsed: toIso(entry.lastUsed),
    idleSeconds: Math.round(idleMs / 1000),
    ttlRemainingSeconds: Math.round(ttlRemainingMs / 1000),
  };
}
345
+
1373
346
  // GET / — serve dashboard. No-cache headers so browsers always re-fetch
1374
347
  // after a mobygate upgrade; otherwise they keep serving the old index.html
1375
348
  // from cache and users see a stale dashboard long after the service updated.
@@ -1402,6 +375,19 @@ app.get('/', async (_req, res) => {
1402
375
  }
1403
376
  });
1404
377
 
378
// GET /dashboard.css — serve the dashboard stylesheet that sits next to this
// file. Sent with no-cache headers so the browser re-fetches it on every load.
app.get('/dashboard.css', async (_req, res) => {
  res.set({
    'Cache-Control': 'no-cache, no-store, must-revalidate',
    Pragma: 'no-cache',
    Expires: '0',
  });

  const cssPath = join(__dirname, 'dashboard.css');
  try {
    const { readFile } = await import('fs/promises');
    const css = await readFile(cssPath, 'utf8');
    res.type('css').send(css);
  } catch (err) {
    // This route is not behind requireLocalOrigin, so keep the absolute
    // install path out of the HTTP response; it goes to the server log only.
    console.error(`[dashboard] could not serve ${cssPath}: ${err?.message}`);
    // Only a genuinely missing file is a 404; any other read failure
    // (permissions, I/O error) is a server-side problem.
    if (err?.code === 'ENOENT') {
      res.status(404).type('text').send('dashboard.css not found');
    } else {
      res.status(500).type('text').send('failed to read dashboard.css');
    }
  }
});
390
+
1405
391
  // /inspector — session inspector UI for browsing captures.
1406
392
  // Backed by /dashboard/captures and /dashboard/captures/:filename.
1407
393
  app.get('/inspector', async (_req, res) => {
@@ -1415,6 +401,19 @@ app.get('/inspector', async (_req, res) => {
1415
401
  }
1416
402
  });
1417
403
 
404
// GET on the POST-only inference endpoints (/v1/chat/completions,
// /v1/messages, /quiet/v1/messages) — RFC 9110: answer 405 with an Allow
// header so probes (e.g. Hermes onboarding) can detect the endpoint exists.
// Returning 404 on GET makes them think the endpoint is missing entirely.
//
// The body keeps the OpenAI-style `error` object and also carries a top-level
// `type: 'error'`, the envelope the Anthropic-shape handlers in this file use
// for their error responses, so one body is readable by clients of either
// surface.
//
// @param {string} allow - Value for the Allow header (e.g. 'POST').
// @returns {(req, res) => void} Express handler that always responds 405.
const methodNotAllowed = (allow) => (_req, res) => {
  res.set('Allow', allow);
  res.status(405).json({
    type: 'error',
    error: { message: `Method Not Allowed. Use ${allow}.`, type: 'invalid_request_error', code: 'method_not_allowed' },
  });
};
413
// Register the 405 responder for GET on each POST-only inference route.
for (const postOnlyPath of ['/v1/chat/completions', '/v1/messages', '/quiet/v1/messages']) {
  app.get(postOnlyPath, methodNotAllowed('POST'));
}
416
+
1418
417
  // POST /v1/chat/completions
1419
418
  app.post('/v1/chat/completions', async (req, res) => {
1420
419
  const requestId = uuidv4().replace(/-/g, '').slice(0, 24);
@@ -1483,11 +482,14 @@ app.post('/v1/chat/completions', async (req, res) => {
1483
482
  res.on('finish', () => emitEnd());
1484
483
  res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
1485
484
 
1486
- if (body.stream) {
1487
- await handleStreaming(req, res, body, requestId, sessionKey);
1488
- } else {
1489
- await handleNonStreaming(res, body, requestId, sessionKey);
1490
- }
485
+ await runInference(
486
+ { req, res, body, requestId, sessionKey },
487
+ openaiSurface,
488
+ {
489
+ mode: body.stream ? 'stream' : 'json',
490
+ deps: { getSession, upsertSession, resolveModel },
491
+ },
492
+ );
1491
493
  });
1492
494
 
1493
495
  // POST /v1/messages — Anthropic-native surface (for OpenClaw etc.).
@@ -1556,11 +558,104 @@ app.post('/v1/messages', async (req, res) => {
1556
558
  res.on('finish', () => emitEnd());
1557
559
  res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
1558
560
 
1559
- if (body.stream) {
1560
- await handleAnthropicStreaming(req, res, body, requestId, sessionKey);
561
+ await runInference(
562
+ { req, res, body, requestId, sessionKey },
563
+ anthropicSurface,
564
+ {
565
+ mode: body.stream ? 'stream' : 'json',
566
+ deps: { getSession, upsertSession, resolveModel },
567
+ },
568
+ );
569
+ });
570
+
571
+ // POST /quiet/v1/messages — Anthropic-shape, but with two changes vs /v1/messages:
572
+ // 1. Body is scrubbed for known third-party agent identifiers
573
+ // (openclaw, hermes, mobius, etc.) before the SDK forwards it.
574
+ // 2. SDK receives an explicit string systemPrompt — disables the
575
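+ // claude_code preset that otherwise injects "I am Claude Code…" framing. NOTE(review): the handler below passes runInference the same surface and options as /v1/messages — confirm where the systemPrompt override is applied.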
576
+ //
577
+ // Use case: clients that don't want their identity to leak into Anthropic's
578
+ // detection heuristics (e.g. "found 'openclaw' in package.json → flag account
579
+ // for extra-usage billing"). Configurable scrub list at ~/.mobygate/quiet-words.txt.
580
+ app.post('/quiet/v1/messages', async (req, res) => {
581
+ const requestId = uuidv4().replace(/-/g, '').slice(0, 24);
582
+ const body = req.body;
583
+
584
+ if (!body?.messages || !Array.isArray(body.messages) || body.messages.length === 0) {
585
+ return res.status(400).json({
586
+ type: 'error',
587
+ error: { type: 'invalid_request_error', message: 'messages is required and must be a non-empty array' },
588
+ });
589
+ }
590
+
591
+ // Scrub the body in place BEFORE anything else reads it — capture, session
592
+ // derivation, prompt building all see the scrubbed content from here on.
593
+ // Diagnose first so we can log what we stripped (without leaking the values).
594
+ const diag = quietDiagnose(body);
595
+ scrubAnthropicBody(body);
596
+
597
+ const { key: sessionKey, source: sessionKeySource } = resolveSessionKey({
598
+ headerKey: req.headers['x-session-id'],
599
+ bodyKey: body.session_id,
600
+ body,
601
+ });
602
+ const existing = getSession(sessionKey);
603
+ const sessionTag = sessionKey
604
+ ? ` | session=${sessionKey}${sessionKeySource === 'auto' ? ' (auto)' : ''}${existing ? ' (resume)' : ' (new)'}`
605
+ : '';
606
+
607
+ console.log(`[${new Date().toISOString()}] anthropic-quiet ${body.stream ? 'stream' : 'sync'} | model=${body.model} → ${resolveModel(body.model)} | msgs=${body.messages.length}${sessionTag}`);
608
+ if (diag.matches > 0) {
609
+ const breakdown = diag.words.map(w => `${w.word}×${w.count}`).join(' ');
610
+ console.log(` [quiet] scrubbed ${diag.matches} occurrence(s): ${breakdown}`);
1561
611
  } else {
1562
- await handleAnthropicNonStreaming(res, body, requestId, sessionKey);
612
+ console.log(` [quiet] payload was already clean (no matches)`);
1563
613
  }
614
+
615
+ captureRequest({ path: '/quiet/v1/messages', body, requestId, sessionKey, sessionKeySource });
616
+
617
+ const startedAt = Date.now();
618
+ const imageBlocks = collectAnthropicImages(body.messages || []).length;
619
+ dashboardBus.emitEvent({
620
+ type: 'request.start',
621
+ id: requestId,
622
+ method: 'POST',
623
+ path: '/quiet/v1/messages',
624
+ model: body.model,
625
+ resolvedModel: resolveModel(body.model),
626
+ session: sessionKey,
627
+ stream: !!body.stream,
628
+ tools: hasAnthropicTools(body),
629
+ images: imageBlocks,
630
+ messages: body.messages.length,
631
+ resuming: !!existing,
632
+ quietScrubs: diag.matches,
633
+ });
634
+
635
+ let endEmitted = false;
636
+ const emitEnd = (overrides = {}) => {
637
+ if (endEmitted) return;
638
+ endEmitted = true;
639
+ dashboardBus.emitEvent({
640
+ type: 'request.end',
641
+ id: requestId,
642
+ durationMs: Date.now() - startedAt,
643
+ status: res.statusCode < 400 ? 'ok' : 'error',
644
+ httpStatus: res.statusCode,
645
+ ...overrides,
646
+ });
647
+ };
648
+ res.on('finish', () => emitEnd());
649
+ res.on('close', () => { if (!endEmitted) emitEnd({ status: 'error', error: 'client_disconnect' }); });
650
+
651
+ await runInference(
652
+ { req, res, body, requestId, sessionKey },
653
+ anthropicSurface,
654
+ {
655
+ mode: body.stream ? 'stream' : 'json',
656
+ deps: { getSession, upsertSession, resolveModel },
657
+ },
658
+ );
1564
659
  });
1565
660
 
1566
661
  // GET /v1/models
@@ -1582,37 +677,19 @@ app.get('/v1/models', (_req, res) => {
1582
677
  });
1583
678
 
1584
679
  // GET /sessions — list active sessions
1585
- app.get('/sessions', (_req, res) => {
680
+ app.get('/sessions', requireLocalOrigin, (_req, res) => {
1586
681
  const list = [];
1587
682
  for (const [key, entry] of sessions) {
1588
- list.push({
1589
- sessionKey: key,
1590
- sdkSessionId: entry.sdkSessionId,
1591
- model: entry.model,
1592
- messageCount: entry.messageCount,
1593
- createdAt: new Date(entry.createdAt).toISOString(),
1594
- lastUsed: new Date(entry.lastUsed).toISOString(),
1595
- idleSeconds: Math.round((Date.now() - entry.lastUsed) / 1000),
1596
- ttlRemainingSeconds: Math.max(0, Math.round((SESSION_TTL_MS - (Date.now() - entry.lastUsed)) / 1000)),
1597
- });
683
+ list.push(serializeSession(key, entry));
1598
684
  }
1599
685
  res.json({ active: list.length, sessions: list });
1600
686
  });
1601
687
 
1602
688
  // GET /sessions/:key — get specific session
1603
- app.get('/sessions/:key', (req, res) => {
689
+ app.get('/sessions/:key', requireLocalOrigin, (req, res) => {
1604
690
  const entry = sessions.get(req.params.key);
1605
691
  if (!entry) return res.status(404).json({ error: 'Session not found' });
1606
- res.json({
1607
- sessionKey: req.params.key,
1608
- sdkSessionId: entry.sdkSessionId,
1609
- model: entry.model,
1610
- messageCount: entry.messageCount,
1611
- createdAt: new Date(entry.createdAt).toISOString(),
1612
- lastUsed: new Date(entry.lastUsed).toISOString(),
1613
- idleSeconds: Math.round((Date.now() - entry.lastUsed) / 1000),
1614
- ttlRemainingSeconds: Math.max(0, Math.round((SESSION_TTL_MS - (Date.now() - entry.lastUsed)) / 1000)),
1615
- });
692
+ res.json(serializeSession(req.params.key, entry));
1616
693
  });
1617
694
 
1618
695
  // DELETE /sessions/:key — clear a session
@@ -1649,7 +726,7 @@ app.get('/health', (_req, res) => {
1649
726
  // GET /auth/status
1650
727
  // Reports CLI-side auth state plus (optionally) a real probe against Anthropic.
1651
728
  // Pass ?quick=1 to skip the probe (reads keychain only — cheap).
1652
- app.get('/auth/status', async (req, res) => {
729
+ app.get('/auth/status', requireLocalOrigin, async (req, res) => {
1653
730
  const quick = req.query.quick === '1' || req.query.quick === 'true';
1654
731
  const status = await getAuthStatus();
1655
732
  if (!quick && status.ok && status.loggedIn) {
@@ -1722,7 +799,7 @@ async function loadBuildMeta() {
1722
799
  }
1723
800
 
1724
801
  // GET /dashboard/recent — ring-buffer snapshot for initial page load
1725
- app.get('/dashboard/recent', async (req, res) => {
802
+ app.get('/dashboard/recent', requireLocalOrigin, async (req, res) => {
1726
803
  const limit = Math.min(500, parseInt(req.query.limit || '100', 10));
1727
804
  res.json({
1728
805
  recent: dashboardBus.getRecent({ limit }),
@@ -1735,20 +812,10 @@ app.get('/dashboard/recent', async (req, res) => {
1735
812
  });
1736
813
 
1737
814
  // GET /dashboard/sessions — active session detail for the dashboard
1738
- app.get('/dashboard/sessions', (_req, res) => {
1739
- const now = Date.now();
815
+ app.get('/dashboard/sessions', requireLocalOrigin, (_req, res) => {
1740
816
  const list = [];
1741
817
  for (const [key, entry] of sessions) {
1742
- list.push({
1743
- key,
1744
- sdkSessionId: entry.sdkSessionId,
1745
- model: entry.model,
1746
- messageCount: entry.messageCount,
1747
- createdAt: new Date(entry.createdAt).toISOString(),
1748
- lastUsedAt: new Date(entry.lastUsed).toISOString(),
1749
- idleSec: Math.floor((now - entry.lastUsed) / 1000),
1750
- ttlRemainingSec: Math.max(0, Math.floor((SESSION_TTL_MS - (now - entry.lastUsed)) / 1000)),
1751
- });
818
+ list.push(serializeSession(key, entry, { dashboard: true }));
1752
819
  }
1753
820
  // Most recently used first
1754
821
  list.sort((a, b) => a.idleSec - b.idleSec);
@@ -1925,6 +992,176 @@ app.post('/dashboard/captures-toggle', requireLocalOrigin, async (req, res) => {
1925
992
  }
1926
993
  });
1927
994
 
995
// GET /dashboard/session-costs — per-session cost breakdown (v0.8.5)
//
// Aggregates the [model-billed] log lines emitted by each handler's SDK
// result step, grouped by session key. Each entry of the response's
// `sessions` array carries:
//   - cost_usd      total $ across all billed lines attributed to the session
//   - turns         number of [model-billed] lines attributed to the session
//                   (described by the original author as completed,
//                   non-tool-use-aborted turns)
//   - per_turn_usd  average cost per turn (low = cache working)
//   - bucket        singleton (1 turn) / short (<=3) / medium (<=10) / warm
//   - models        per-model breakdown (opus vs sonnet vs haiku)
//   - first_user    first user message (for human-readable identification)
//   - model, path, msgs, last_seen   taken from the newest capture summary
//
// This view exists because an audit at the time of v0.8.5 found 38.9% of
// total spend going to "singleton" sessions — channels that fire once, idle
// past the wire-cache TTL, then pay cache_creation tax on the next turn.
// The dashboard tab built off this endpoint lets users spot bleeding
// channels in real time and decide which to keep warm via cron pings.
//
// NOTE(review): a billed line is attributed to the most recent `session=`
// line above it in server.log. If requests from different sessions
// interleave in the log, costs may be attributed to the wrong session —
// confirm whether the log format can carry the session key on the billed
// line itself.

/**
 * Coerce a value taken from parsed JSON to a finite number.
 * @param {unknown} value
 * @returns {number} `value` when it is a finite number, otherwise 0.
 */
function sessionCostNumber(value) {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}

/**
 * Round to a fixed number of decimal places.
 * @param {number} value
 * @param {number} places
 * @returns {number}
 */
function sessionCostRound(value, places) {
  const scale = 10 ** places;
  return Math.round(value * scale) / scale;
}

/**
 * Parse the text of server.log and total up [model-billed] usage per session.
 *
 * Maps are used instead of plain objects because the keys come from log
 * content; a key such as `__proto__` must not touch Object.prototype.
 *
 * @param {string} raw  Full log text.
 * @returns {Map<string, {turns: number, cost_usd: number, models: Map<string, object>}>}
 */
function aggregateBilledLog(raw) {
  const bySession = new Map();
  let lastSession = null;

  for (const line of raw.split(/\r?\n/)) {
    const sessMatch = line.match(/session=(auto_\w+)/);
    if (sessMatch) lastSession = sessMatch[1];

    const billed = line.match(/\[model-billed\] requested=\S+ modelUsage=(\{.+\})/);
    if (!billed || !lastSession) continue;

    let usage;
    try {
      usage = JSON.parse(billed[1]);
    } catch {
      continue; // truncated or malformed log line — skip it, keep the rest
    }

    let rec = bySession.get(lastSession);
    if (!rec) {
      rec = { turns: 0, cost_usd: 0, models: new Map() };
      bySession.set(lastSession, rec);
    }
    rec.turns += 1;

    for (const [model, data] of Object.entries(usage)) {
      // A null / scalar entry would otherwise throw and 500 the whole endpoint.
      if (data === null || typeof data !== 'object') continue;

      const cost = sessionCostNumber(data.costUSD);
      rec.cost_usd += cost;

      let perModel = rec.models.get(model);
      if (!perModel) {
        perModel = { turns: 0, cost_usd: 0, in_uncached: 0, cache_read: 0, cache_create: 0, out: 0 };
        rec.models.set(model, perModel);
      }
      perModel.turns += 1;
      perModel.cost_usd += cost;
      perModel.in_uncached += sessionCostNumber(data.inputTokens);
      perModel.cache_read += sessionCostNumber(data.cacheReadInputTokens);
      perModel.cache_create += sessionCostNumber(data.cacheCreationInputTokens);
      perModel.out += sessionCostNumber(data.outputTokens);
    }
  }

  return bySession;
}

/**
 * Pull a short, single-line preview of the first real user message out of a
 * captured request body. Only the first five messages are inspected.
 *
 * @param {unknown} body  Parsed capture JSON.
 * @returns {string|null} Up to 80 characters of text, or null if none found.
 */
function firstUserSnippet(body) {
  const messages = Array.isArray(body?.messages) ? body.messages : [];

  for (const msg of messages.slice(0, 5)) {
    if (msg?.role !== 'user') continue;

    const { content } = msg;
    let text = '';
    if (Array.isArray(content)) {
      const block = content.find(
        (blk) => blk?.type === 'text' && typeof blk.text === 'string' && blk.text,
      );
      text = block ? block.text : '';
    } else if (typeof content === 'string') {
      text = content;
    }

    // Skip "OpenClaw runtime context" boilerplate
    if (text && !text.startsWith('OpenClaw runtime context')) {
      return text.slice(0, 80).replace(/\s+/g, ' ');
    }
  }

  return null;
}

app.get('/dashboard/session-costs', requireLocalOrigin, async (_req, res) => {
  try {
    const { readFile, readdir } = await import('fs/promises');
    const { homedir } = await import('os');

    const logPath = join(LOGS_DIR, 'server.log');
    const captureDir = process.env.MOBYGATE_CAPTURE_DIR
      || join(process.env.MOBYGATE_HOME || join(homedir(), '.mobygate'), 'captures');

    // A missing log file / capture directory just means "no data yet";
    // any other I/O error still surfaces as a 500 below.
    const orIfMissing = (fallback) => (err) => {
      if (err?.code === 'ENOENT') return fallback;
      throw err;
    };

    // Step 1: parse [model-billed] lines from server.log, associating
    // each with the most recently observed session= line above it.
    // (Named costBySession so it does not shadow the module-level
    // `sessions` store used by the other dashboard routes.)
    const rawLog = await readFile(logPath, 'utf8').catch(orIfMissing(''));
    const costBySession = aggregateBilledLog(rawLog);

    // Step 2: enrich with capture metadata (first user message, model,
    // path, msg count). Only sessions that appear in the log are reported,
    // so only those are looked up, and the scan stops as soon as each of
    // them has been matched to its newest capture.
    const sessionMeta = new Map();
    const pending = new Set(costBySession.keys());
    if (pending.size > 0) {
      const names = await readdir(captureDir).catch(orIfMissing([]));
      const captureFiles = names
        .filter((n) => n.endsWith('.json'))
        .sort()
        .reverse(); // newest first

      for (const f of captureFiles) {
        if (pending.size === 0) break;

        const summaryPath = join(captureDir, f.replace(/\.json$/, '.summary.txt'));
        // Best effort: a capture without a readable summary is skipped.
        const summary = await readFile(summaryPath, 'utf8').catch(() => '');
        const skMatch = summary.match(/^session_key:\s+(auto_\w+)/m);
        if (!skMatch || !pending.has(skMatch[1])) continue;

        const sk = skMatch[1];
        pending.delete(sk);

        const modelMatch = summary.match(/^model:\s+(\S+)/m);
        const pathMatch = summary.match(/^path:\s+(\S+)/m);
        const msgsMatch = summary.match(/^messages:\s+(\d+)/m);

        let firstUser = null;
        try {
          firstUser = firstUserSnippet(JSON.parse(await readFile(join(captureDir, f), 'utf8')));
        } catch {
          // Unreadable or malformed capture body: keep the summary-derived
          // metadata and simply report no preview text.
        }

        sessionMeta.set(sk, {
          model: modelMatch ? modelMatch[1] : null,
          path: pathMatch ? pathMatch[1] : null,
          msgs: msgsMatch ? parseInt(msgsMatch[1], 10) : null,
          // First 19 characters of the capture filename — presumably its
          // timestamp prefix; confirm against lib/request-capture.js.
          lastSeenIso: f.slice(0, 19),
          firstUser,
        });
      }
    }

    // Step 3: combine and sort
    const bucketFor = (turns) => {
      if (turns === 1) return 'singleton';
      if (turns <= 3) return 'short';
      if (turns <= 10) return 'medium';
      return 'warm';
    };

    const out = [];
    for (const [sk, rec] of costBySession) {
      const meta = sessionMeta.get(sk) || {};
      out.push({
        session_key: sk,
        turns: rec.turns,
        cost_usd: sessionCostRound(rec.cost_usd, 4),
        per_turn_usd: sessionCostRound(rec.cost_usd / Math.max(rec.turns, 1), 4),
        bucket: bucketFor(rec.turns),
        model: meta.model || null,
        path: meta.path || null,
        msgs: meta.msgs || null,
        last_seen: meta.lastSeenIso || null,
        first_user: meta.firstUser || null,
        models: Object.fromEntries(
          [...rec.models].map(([model, d]) => [model, {
            turns: d.turns,
            cost_usd: sessionCostRound(d.cost_usd, 4),
            in_uncached: d.in_uncached,
            cache_read: d.cache_read,
            cache_create: d.cache_create,
            out: d.out,
          }]),
        ),
      });
    }
    out.sort((a, b) => b.cost_usd - a.cost_usd); // most expensive first

    // Step 4: aggregate stats
    const totalCost = out.reduce((sum, r) => sum + r.cost_usd, 0);
    const totalTurns = out.reduce((sum, r) => sum + r.turns, 0);
    const buckets = {
      singleton: { sessions: 0, cost: 0 },
      short: { sessions: 0, cost: 0 },
      medium: { sessions: 0, cost: 0 },
      warm: { sessions: 0, cost: 0 },
    };
    for (const r of out) {
      buckets[r.bucket].sessions += 1;
      buckets[r.bucket].cost += r.cost_usd;
    }
    for (const bucket of Object.values(buckets)) {
      bucket.cost = sessionCostRound(bucket.cost, 2);
      bucket.pct_of_total = totalCost > 0 ? sessionCostRound((bucket.cost / totalCost) * 100, 1) : 0;
    }

    res.json({
      generatedAt: new Date().toISOString(),
      total_cost_usd: sessionCostRound(totalCost, 2),
      total_turns: totalTurns,
      session_count: out.length,
      buckets,
      sessions: out,
    });
  } catch (e) {
    res.status(500).json({ error: e.message });
  }
});
1164
+
1928
1165
  // ---------------------------------------------------------------------------
1929
1166
  // Updater — dashboard-driven "update available → update now" flow
1930
1167
  // ---------------------------------------------------------------------------
@@ -1932,7 +1169,7 @@ app.post('/dashboard/captures-toggle', requireLocalOrigin, async (req, res) => {
1932
1169
  // GET /update/check — is there a newer mobygate on npm?
1933
1170
  // Response: { current, latest, updateAvailable, installMode, canApply, cached, error }
1934
1171
  // Safe to poll: the npm registry call is cached for 15 min in-process.
1935
- app.get('/update/check', async (req, res) => {
1172
+ app.get('/update/check', requireLocalOrigin, async (req, res) => {
1936
1173
  try {
1937
1174
  const force = req.query.force === '1' || req.query.force === 'true';
1938
1175
  const info = await getUpdateCheck({ force });
@@ -1964,7 +1201,7 @@ app.post('/update/apply', requireLocalOrigin, (_req, res) => {
1964
1201
  // The dashboard polls this during apply. `running` is determined by
1965
1202
  // PID liveness, so even if our process is the one getting restarted,
1966
1203
  // the new one answers correctly.
1967
- app.get('/update/status', (req, res) => {
1204
+ app.get('/update/status', requireLocalOrigin, (req, res) => {
1968
1205
  const state = readUpdateState();
1969
1206
  let running = false;
1970
1207
  if (state.pid) {
@@ -1987,11 +1224,12 @@ app.get('/update/status', (req, res) => {
1987
1224
 
1988
1225
  app.listen(PORT, BIND, async () => {
1989
1226
  const ttlMin = Math.round(SESSION_TTL_MS / 60000);
1227
+ const ttlHours = (SESSION_TTL_MS / 3600000).toFixed(1);
1990
1228
  const meta = await loadBuildMeta();
1991
1229
  console.log(banner({ version: meta.version }));
1992
1230
  console.log(` bind ${BIND}:${PORT}${BIND === '127.0.0.1' ? ' (loopback only)' : ' (⚠ network-reachable — add auth)'}`);
1993
1231
  console.log(` model ${DEFAULT_MODEL}`);
1994
- console.log(` session TTL ${ttlMin} min`);
1232
+ console.log(` session TTL ${ttlMin} min (${ttlHours}h)`);
1995
1233
  console.log(` dashboard http://localhost:${PORT}`);
1996
1234
  if (isCaptureEnabled()) {
1997
1235
  console.log(` capture ON → ${CAPTURE_DIR_PATH.replace(process.env.HOME || '', '~')}`);