@hsupu/copilot-api 0.7.18-beta → 0.7.18-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/main.mjs CHANGED
@@ -1,8 +1,10 @@
1
1
  #!/usr/bin/env node
2
2
  import { defineCommand, runMain } from "citty";
3
3
  import consola, { consola as consola$1 } from "consola";
4
+ import * as fs$1 from "node:fs/promises";
4
5
  import fs, { access, constants, readFile } from "node:fs/promises";
5
6
  import os, { homedir } from "node:os";
7
+ import * as path$1 from "node:path";
6
8
  import path, { dirname, join, resolve } from "node:path";
7
9
  import { randomBytes, randomUUID } from "node:crypto";
8
10
  import pc from "picocolors";
@@ -24,6 +26,7 @@ const PATHS = {
24
26
  APP_DIR,
25
27
  GITHUB_TOKEN_PATH,
26
28
  CONFIG_YAML: path.join(APP_DIR, "config.yaml"),
29
+ LEARNED_LIMITS: path.join(APP_DIR, "learned-limits.json"),
27
30
  ERROR_DIR: path.join(APP_DIR, "errmsgs")
28
31
  };
29
32
  async function ensurePaths() {
@@ -42,7 +45,15 @@ async function ensureFile(filePath) {
42
45
 
43
46
  //#endregion
44
47
  //#region src/lib/state.ts
45
- /** Default model overrides: short aliases → top-preference model per family */
48
+ /**
49
+ * Rebuild model lookup indexes from state.models.
50
+ * Called by cacheModels() in production; call directly in tests after setting state.models.
51
+ */
52
+ function rebuildModelIndex() {
53
+ const data = state.models?.data ?? [];
54
+ state.modelIndex = new Map(data.map((m) => [m.id, m]));
55
+ state.modelIds = new Set(data.map((m) => m.id));
56
+ }
46
57
  const DEFAULT_MODEL_OVERRIDES = {
47
58
  opus: "claude-opus-4.6",
48
59
  sonnet: "claude-sonnet-4.6",
@@ -50,6 +61,8 @@ const DEFAULT_MODEL_OVERRIDES = {
50
61
  };
51
62
  const state = {
52
63
  accountType: "individual",
64
+ modelIndex: /* @__PURE__ */ new Map(),
65
+ modelIds: /* @__PURE__ */ new Set(),
53
66
  showGitHubToken: false,
54
67
  verbose: false,
55
68
  autoTruncate: true,
@@ -198,7 +211,11 @@ function extractTrailingSystemReminderTags(text) {
198
211
  while (true) {
199
212
  const currentTagEnd = scanEnd;
200
213
  let end = scanEnd;
201
- while (end > 0 && "\n \r".includes(text[end - 1])) end--;
214
+ while (end > 0) {
215
+ const c = text.charCodeAt(end - 1);
216
+ if (c !== 10 && c !== 32 && c !== 9 && c !== 13) break;
217
+ end--;
218
+ }
202
219
  if (end < 18) break;
203
220
  if (text.slice(end - 18, end) !== CLOSE_TAG) break;
204
221
  const closeTagStart = end - 18;
@@ -240,7 +257,11 @@ function extractLeadingSystemReminderTags(text) {
240
257
  while (true) {
241
258
  const currentTagStart = scanStart;
242
259
  let start = scanStart;
243
- while (start < text.length && " \r".includes(text[start])) start++;
260
+ while (start < text.length) {
261
+ const c = text.charCodeAt(start);
262
+ if (c !== 32 && c !== 9 && c !== 13) break;
263
+ start++;
264
+ }
244
265
  if (start + 17 > text.length) break;
245
266
  if (text.slice(start, start + 17) !== OPEN_TAG) break;
246
267
  const afterOpen = start + 17;
@@ -365,21 +386,6 @@ function removeSystemReminderTags(text) {
365
386
  return end < result.length ? result.slice(0, end) : result;
366
387
  }
367
388
 
368
- //#endregion
369
- //#region src/lib/utils.ts
370
- const sleep = (ms) => new Promise((resolve) => {
371
- setTimeout(resolve, ms);
372
- });
373
- const isNullish = (value) => value === null || value === void 0;
374
- /** Convert bytes to KB with rounding */
375
- function bytesToKB(bytes) {
376
- return Math.round(bytes / 1024);
377
- }
378
- /** Generate unique ID (timestamp + random) */
379
- function generateId(randomLength = 7) {
380
- return Date.now().toString(36) + Math.random().toString(36).slice(2, 2 + randomLength);
381
- }
382
-
383
389
  //#endregion
384
390
  //#region src/lib/auto-truncate/index.ts
385
391
  /**
@@ -392,64 +398,126 @@ const MAX_AUTO_TRUNCATE_RETRIES = 5;
392
398
  const AUTO_TRUNCATE_RETRY_FACTOR = .9;
393
399
  const DEFAULT_AUTO_TRUNCATE_CONFIG = {
394
400
  safetyMarginPercent: 2,
395
- maxRequestBodyBytes: 510 * 1024,
396
401
  preserveRecentPercent: .7,
397
- checkTokenLimit: true,
398
- checkByteLimit: false
402
+ checkTokenLimit: true
399
403
  };
400
- /** Dynamic byte limit that adjusts based on 413 errors */
401
- let dynamicByteLimit = null;
404
+ const learnedLimits = /* @__PURE__ */ new Map();
405
+ /** Get learned limits for a model (including calibration data) */
406
+ function getLearnedLimits(modelId) {
407
+ return learnedLimits.get(modelId);
408
+ }
402
409
  /**
403
- * Called when a 413 error occurs. Adjusts the byte limit to 90% of the failing size.
410
+ * Check whether a model has known limits from previous failures.
411
+ * Used to decide whether to pre-check requests before sending.
404
412
  */
405
- function onRequestTooLarge(failingBytes) {
406
- const newLimit = Math.max(Math.floor(failingBytes * .9), 100 * 1024);
407
- dynamicByteLimit = newLimit;
408
- consola.info(`[AutoTruncate] Adjusted byte limit: ${bytesToKB(failingBytes)}KB failed → ${bytesToKB(newLimit)}KB`);
409
- }
410
- /** Get the current effective byte limit */
411
- function getEffectiveByteLimitBytes() {
412
- return dynamicByteLimit ?? DEFAULT_AUTO_TRUNCATE_CONFIG.maxRequestBodyBytes;
413
+ function hasKnownLimits(modelId) {
414
+ return learnedLimits.has(modelId);
413
415
  }
414
- /** Dynamic token limits per model, adjusted based on token limit errors */
415
- const dynamicTokenLimits = /* @__PURE__ */ new Map();
416
416
  /**
417
417
  * Called when a token limit error (400) occurs.
418
- * Adjusts the token limit for the specific model to 95% of the reported limit.
419
- */
420
- function onTokenLimitExceeded(modelId, reportedLimit) {
421
- const newLimit = Math.floor(reportedLimit * .95);
422
- const previous = dynamicTokenLimits.get(modelId);
423
- if (!previous || newLimit < previous) {
424
- dynamicTokenLimits.set(modelId, newLimit);
425
- consola.info(`[AutoTruncate] Adjusted token limit for ${modelId}: ${reportedLimit} reported → ${newLimit} effective`);
418
+ * Records the learned limit and optionally updates calibration.
419
+ */
420
+ function onTokenLimitExceeded(modelId, reportedLimit, reportedCurrent, estimatedTokens) {
421
+ const existing = learnedLimits.get(modelId);
422
+ if (!existing || reportedLimit < existing.tokenLimit) {
423
+ learnedLimits.set(modelId, {
424
+ tokenLimit: reportedLimit,
425
+ calibrationFactor: existing?.calibrationFactor ?? 1,
426
+ sampleCount: existing?.sampleCount ?? 0,
427
+ updatedAt: Date.now()
428
+ });
429
+ consola.info(`[AutoTruncate] Learned token limit for ${modelId}: ${reportedLimit}`);
430
+ }
431
+ if (reportedCurrent !== void 0 && estimatedTokens !== void 0 && estimatedTokens > 0) {
432
+ updateCalibration(modelId, reportedCurrent, estimatedTokens);
433
+ const lim = learnedLimits.get(modelId);
434
+ consola.info(`[AutoTruncate] Calibration for ${modelId}: actual=${reportedCurrent} vs estimated=${estimatedTokens} → factor=${lim.calibrationFactor.toFixed(3)} (${lim.sampleCount} samples)`);
426
435
  }
436
+ schedulePersist();
427
437
  }
438
+ const CALIBRATION_ALPHA = .3;
439
+ const CALIBRATION_MIN = .5;
440
+ const CALIBRATION_MAX = 3;
428
441
  /**
429
- * Get the effective token limit for a model.
430
- * Returns the dynamic limit if set, otherwise null to use model capabilities.
431
- */
432
- function getEffectiveTokenLimit(modelId) {
433
- return dynamicTokenLimits.get(modelId) ?? null;
442
+ * Update the per-model calibration factor using EWMA.
443
+ *
444
+ * Called after a token limit error when we know both the GPT tokenizer estimate
445
+ * and the actual token count (from the error response). The ratio between them
446
+ * tells us how much the GPT tokenizer over/under-estimates for this model.
447
+ */
448
+ function updateCalibration(modelId, actualTokens, estimatedTokens) {
449
+ if (estimatedTokens <= 0) return;
450
+ const limits = learnedLimits.get(modelId);
451
+ if (!limits) return;
452
+ const rawFactor = actualTokens / estimatedTokens;
453
+ const clamped = Math.max(CALIBRATION_MIN, Math.min(CALIBRATION_MAX, rawFactor));
454
+ if (limits.sampleCount === 0) limits.calibrationFactor = clamped;
455
+ else limits.calibrationFactor = CALIBRATION_ALPHA * clamped + (1 - CALIBRATION_ALPHA) * limits.calibrationFactor;
456
+ limits.sampleCount++;
457
+ limits.updatedAt = Date.now();
458
+ }
459
+ /** Apply calibration factor to a GPT tokenizer estimate */
460
+ function calibrate(modelId, gptEstimate) {
461
+ const limits = learnedLimits.get(modelId);
462
+ if (!limits || limits.sampleCount === 0) return gptEstimate;
463
+ return Math.ceil(gptEstimate * limits.calibrationFactor);
464
+ }
465
+ const BASE_MARGIN = .03;
466
+ const BONUS_MARGIN_PER_SAMPLE = .07;
467
+ /**
468
+ * Compute dynamic safety margin based on calibration confidence.
469
+ * Fewer samples → wider margin (conservative). More samples → narrower margin.
470
+ *
471
+ * - 0 samples: 10% (0.03 + 0.07)
472
+ * - 1 sample: 10%
473
+ * - 10 samples: ~3.7%
474
+ * - ∞ samples: 3%
475
+ */
476
+ function computeSafetyMargin(sampleCount) {
477
+ if (sampleCount <= 0) return BASE_MARGIN + BONUS_MARGIN_PER_SAMPLE;
478
+ return BASE_MARGIN + BONUS_MARGIN_PER_SAMPLE / sampleCount;
479
+ }
480
+ let persistTimer = null;
481
+ const PERSIST_DEBOUNCE_MS = 5e3;
482
+ /** Schedule an async write of learned limits (debounced) */
483
+ function schedulePersist() {
484
+ if (persistTimer) return;
485
+ persistTimer = setTimeout(() => {
486
+ persistTimer = null;
487
+ persistLimits();
488
+ }, PERSIST_DEBOUNCE_MS);
489
+ }
490
+ /** Write learned limits to disk */
491
+ async function persistLimits() {
492
+ if (learnedLimits.size === 0) return;
493
+ const data = {
494
+ version: 1,
495
+ limits: Object.fromEntries(learnedLimits)
496
+ };
497
+ try {
498
+ await fs.writeFile(PATHS.LEARNED_LIMITS, JSON.stringify(data, null, 2), "utf8");
499
+ } catch {}
434
500
  }
435
- /**
436
- * Check whether a model has known limits from previous failures.
437
- * Used to decide whether to pre-check requests before sending.
438
- */
439
- function hasKnownLimits(modelId) {
440
- return dynamicTokenLimits.has(modelId) || dynamicByteLimit !== null;
501
+ /** Load previously persisted limits from disk (called at startup) */
502
+ async function loadPersistedLimits() {
503
+ try {
504
+ const raw = await fs.readFile(PATHS.LEARNED_LIMITS, "utf8");
505
+ const data = JSON.parse(raw);
506
+ if (data.version !== 1) return;
507
+ for (const [modelId, lim] of Object.entries(data.limits)) if (lim.tokenLimit > 0 && lim.calibrationFactor >= CALIBRATION_MIN && lim.calibrationFactor <= CALIBRATION_MAX) learnedLimits.set(modelId, lim);
508
+ if (learnedLimits.size > 0) consola.info(`[AutoTruncate] Loaded learned limits for ${learnedLimits.size} model(s)`);
509
+ } catch {}
441
510
  }
442
511
  /**
443
- * Parse an HTTPError to detect token limit or body size errors,
512
+ * Parse an HTTPError to detect token limit errors,
444
513
  * and record the learned limit for future pre-checks.
445
514
  *
446
- * Returns error info if the error is a retryable limit error, null otherwise.
515
+ * When `estimatedTokens` is provided (the GPT tokenizer estimate at the time
516
+ * of the error), also updates the per-model calibration factor.
517
+ *
518
+ * Returns error info if the error is a retryable token limit error, null otherwise.
447
519
  */
448
- function tryParseAndLearnLimit(error, modelId, payloadBytes, learn = true) {
449
- if (error.status === 413) {
450
- if (payloadBytes && learn) onRequestTooLarge(payloadBytes);
451
- return { type: "body_too_large" };
452
- }
520
+ function tryParseAndLearnLimit(error, modelId, learn = true, estimatedTokens) {
453
521
  if (error.status === 400) {
454
522
  let errorJson;
455
523
  try {
@@ -461,7 +529,7 @@ function tryParseAndLearnLimit(error, modelId, payloadBytes, learn = true) {
461
529
  if (!(errorJson.error.code === "model_max_prompt_tokens_exceeded" || errorJson.error.type === "invalid_request_error")) return null;
462
530
  const tokenInfo = parseTokenLimitError(errorJson.error.message);
463
531
  if (!tokenInfo) return null;
464
- if (learn) onTokenLimitExceeded(modelId, tokenInfo.limit);
532
+ if (learn) onTokenLimitExceeded(modelId, tokenInfo.limit, tokenInfo.current, estimatedTokens);
465
533
  return {
466
534
  type: "token_limit",
467
535
  limit: tokenInfo.limit,
@@ -594,64 +662,9 @@ function formatRateLimitError(copilotMessage) {
594
662
  }
595
663
  };
596
664
  }
597
- /** Format timestamp as YYMMDD_HHmmss for error directory names */
598
- function formatErrorTimestamp() {
599
- const now = /* @__PURE__ */ new Date();
600
- return `${String(now.getFullYear()).slice(2)}${String(now.getMonth() + 1).padStart(2, "0")}${String(now.getDate()).padStart(2, "0")}_${String(now.getHours()).padStart(2, "0")}${String(now.getMinutes()).padStart(2, "0")}${String(now.getSeconds()).padStart(2, "0")}`;
601
- }
602
- /** Extract request headers as a plain object (excluding potentially large/binary headers) */
603
- function extractHeaders(c) {
604
- const headers = {};
605
- for (const [key, value] of c.req.raw.headers.entries()) headers[key] = key.toLowerCase() === "authorization" ? "[REDACTED]" : value;
606
- return headers;
607
- }
608
- /**
609
- * Persist error details to disk for post-mortem debugging.
610
- * Each error gets a subdirectory under errmsgs/ containing:
611
- * - meta.json: structured metadata (timestamp, status, headers, error info)
612
- * - request.json: raw request body
613
- * - response.txt: raw upstream response body
614
- *
615
- * Fire-and-forget — never blocks or throws.
616
- */
617
- async function writeErrorToFile(c, error) {
618
- const id = randomBytes(4).toString("hex");
619
- const dirName = `${formatErrorTimestamp()}_${id}`;
620
- const dirPath = path.join(PATHS.ERROR_DIR, dirName);
621
- await fs.mkdir(dirPath, { recursive: true });
622
- const meta = {
623
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
624
- request: {
625
- method: c.req.method,
626
- path: c.req.path,
627
- url: c.req.url,
628
- headers: extractHeaders(c)
629
- }
630
- };
631
- if (error instanceof HTTPError) {
632
- meta.response = {
633
- status: error.status,
634
- modelId: error.modelId
635
- };
636
- meta.error = { message: error.message };
637
- } else if (error instanceof Error) meta.error = {
638
- message: formatErrorWithCause(error),
639
- name: error.name,
640
- stack: error.stack
641
- };
642
- else meta.error = { message: String(error) };
643
- const writes = [fs.writeFile(path.join(dirPath, "meta.json"), JSON.stringify(meta, null, 2))];
644
- try {
645
- const body = await c.req.json();
646
- writes.push(fs.writeFile(path.join(dirPath, "request.json"), JSON.stringify(body, null, 2)));
647
- } catch {}
648
- if (error instanceof HTTPError && error.responseText) writes.push(fs.writeFile(path.join(dirPath, "response.txt"), error.responseText));
649
- await Promise.all(writes);
650
- }
651
665
  function forwardError(c, error) {
652
- writeErrorToFile(c, error).catch(() => {});
653
666
  if (error instanceof HTTPError) {
654
- const limitInfo = tryParseAndLearnLimit(error, error.modelId ?? "unknown", void 0, state.autoTruncate);
667
+ const limitInfo = tryParseAndLearnLimit(error, error.modelId ?? "unknown", state.autoTruncate);
655
668
  if (error.status === 413) {
656
669
  const formattedError = formatRequestTooLargeError();
657
670
  consola.warn(`HTTP 413: Request too large`);
@@ -827,7 +840,6 @@ const NETWORK_ERROR_PATTERNS = [
827
840
  function isNetworkError(error) {
828
841
  const msg = error.message.toLowerCase();
829
842
  if (NETWORK_ERROR_PATTERNS.some((p) => msg.includes(p.toLowerCase()))) return true;
830
- if (error instanceof TypeError) return true;
831
843
  if (error.cause instanceof Error) return isNetworkError(error.cause);
832
844
  return false;
833
845
  }
@@ -1048,6 +1060,21 @@ var CopilotTokenManager = class {
1048
1060
  }
1049
1061
  };
1050
1062
 
1063
+ //#endregion
1064
+ //#region src/lib/utils.ts
1065
+ const sleep = (ms) => new Promise((resolve) => {
1066
+ setTimeout(resolve, ms);
1067
+ });
1068
+ const isNullish = (value) => value === null || value === void 0;
1069
+ /** Convert bytes to KB with rounding */
1070
+ function bytesToKB(bytes) {
1071
+ return Math.round(bytes / 1024);
1072
+ }
1073
+ /** Generate unique ID (timestamp + random) */
1074
+ function generateId(randomLength = 7) {
1075
+ return Date.now().toString(36) + Math.random().toString(36).slice(2, 2 + randomLength);
1076
+ }
1077
+
1051
1078
  //#endregion
1052
1079
  //#region src/lib/token/github-client.ts
1053
1080
  /** GitHub OAuth API client — device code flow and user info */
@@ -1589,6 +1616,7 @@ const checkUsage = defineCommand({
1589
1616
  /** Fetch models from Copilot API and cache in global state */
1590
1617
  async function cacheModels() {
1591
1618
  state.models = await getModels();
1619
+ rebuildModelIndex();
1592
1620
  }
1593
1621
  const getModels = async () => {
1594
1622
  const response = await fetch(`${copilotBaseUrl(state)}/models`, { headers: copilotHeaders(state) });
@@ -1766,6 +1794,8 @@ var AdaptiveRateLimiter = class {
1766
1794
  lastRequestTime = 0;
1767
1795
  /** Current step in gradual recovery (index into gradualRecoverySteps) */
1768
1796
  recoveryStepIndex = 0;
1797
+ /** Abort controller for cancelling pending sleeps during shutdown */
1798
+ sleepAbortController = new AbortController();
1769
1799
  constructor(config = {}) {
1770
1800
  this.config = {
1771
1801
  ...DEFAULT_CONFIG,
@@ -1999,10 +2029,20 @@ var AdaptiveRateLimiter = class {
1999
2029
  request.reject(/* @__PURE__ */ new Error("Server shutting down"));
2000
2030
  }
2001
2031
  this.processing = false;
2032
+ this.sleepAbortController.abort();
2033
+ this.sleepAbortController = new AbortController();
2002
2034
  return count;
2003
2035
  }
2004
2036
  sleep(ms) {
2005
- return new Promise((resolve) => setTimeout(resolve, ms));
2037
+ const signal = this.sleepAbortController.signal;
2038
+ if (signal.aborted) return Promise.resolve();
2039
+ return new Promise((resolve) => {
2040
+ const timer = setTimeout(resolve, ms);
2041
+ signal.addEventListener("abort", () => {
2042
+ clearTimeout(timer);
2043
+ resolve();
2044
+ }, { once: true });
2045
+ });
2006
2046
  }
2007
2047
  /**
2008
2048
  * Get current status for debugging/monitoring
@@ -2073,6 +2113,10 @@ const MODEL_PREFERENCE = {
2073
2113
  ],
2074
2114
  haiku: ["claude-haiku-4.5"]
2075
2115
  };
2116
+ /** Pre-compiled regex: claude-{family}-{major}-{minor}[-YYYYMMDD] */
2117
+ const VERSIONED_RE = /^(claude-(?:opus|sonnet|haiku))-(\d+)-(\d{1,2})(?:-\d{8,})?$/;
2118
+ /** Pre-compiled regex: claude-{family}-{major}-YYYYMMDD (date-only suffix) */
2119
+ const DATE_ONLY_RE = /^(claude-(opus|sonnet|haiku)-\d+)-\d{8,}$/;
2076
2120
  /**
2077
2121
  * Normalize model ID for matching: lowercase and replace dots with dashes.
2078
2122
  * e.g. "claude-sonnet-4.5" → "claude-sonnet-4-5"
@@ -2093,7 +2137,7 @@ function normalizeForMatching(modelId) {
2093
2137
  */
2094
2138
  function normalizeModelId(modelId) {
2095
2139
  const { base, suffix } = extractModifierSuffix(modelId);
2096
- const versionedMatch = base.match(/^(claude-(?:opus|sonnet|haiku))-(\d+)-(\d{1,2})(?:-\d{8,})?$/);
2140
+ const versionedMatch = base.match(VERSIONED_RE);
2097
2141
  if (versionedMatch) return `${versionedMatch[1]}-${versionedMatch[2]}.${versionedMatch[3]}${suffix}`;
2098
2142
  return modelId;
2099
2143
  }
@@ -2112,9 +2156,8 @@ function getModelFamily(modelId) {
2112
2156
  function findPreferredModel(family) {
2113
2157
  const preference = MODEL_PREFERENCE[family];
2114
2158
  if (!preference) return family;
2115
- const availableIds = state.models?.data.map((m) => m.id);
2116
- if (!availableIds || availableIds.length === 0) return preference[0];
2117
- for (const candidate of preference) if (availableIds.includes(candidate)) return candidate;
2159
+ if (state.modelIds.size === 0) return preference[0];
2160
+ for (const candidate of preference) if (state.modelIds.has(candidate)) return candidate;
2118
2161
  return preference[0];
2119
2162
  }
2120
2163
  /** Known model modifier suffixes (e.g., "-fast" for fast output mode, "-1m" for 1M context). */
@@ -2182,8 +2225,7 @@ function resolveModelName(model) {
2182
2225
  * Uses `seen` set to prevent circular override chains.
2183
2226
  */
2184
2227
  function resolveOverrideTarget(source, target, seen) {
2185
- const availableIds = state.models?.data.map((m) => m.id);
2186
- if (!availableIds || availableIds.length === 0 || availableIds.includes(target)) return target;
2228
+ if (state.modelIds.size === 0 || state.modelIds.has(target)) return target;
2187
2229
  const visited = seen ?? new Set([source]);
2188
2230
  const targetOverride = state.modelOverrides[target];
2189
2231
  if (targetOverride && !visited.has(target)) {
@@ -2213,8 +2255,7 @@ function resolveModelNameCore(model) {
2213
2255
  const resolvedBase = resolveBase(base);
2214
2256
  if (suffix) {
2215
2257
  const withSuffix = resolvedBase + suffix;
2216
- const availableIds = state.models?.data.map((m) => m.id);
2217
- if (!availableIds || availableIds.length === 0 || availableIds.includes(withSuffix)) return withSuffix;
2258
+ if (state.modelIds.size === 0 || state.modelIds.has(withSuffix)) return withSuffix;
2218
2259
  return resolvedBase;
2219
2260
  }
2220
2261
  return resolvedBase;
@@ -2222,17 +2263,16 @@ function resolveModelNameCore(model) {
2222
2263
  /** Resolve a base model name (without modifier suffix) to its canonical form. */
2223
2264
  function resolveBase(model) {
2224
2265
  if (model in MODEL_PREFERENCE) return findPreferredModel(model);
2225
- const versionedMatch = model.match(/^(claude-(?:opus|sonnet|haiku))-(\d+)-(\d{1,2})(?:-\d{8,})?$/);
2266
+ const versionedMatch = model.match(VERSIONED_RE);
2226
2267
  if (versionedMatch) {
2227
2268
  const dotModel = `${versionedMatch[1]}-${versionedMatch[2]}.${versionedMatch[3]}`;
2228
- const availableIds = state.models?.data.map((m) => m.id);
2229
- if (!availableIds || availableIds.length === 0 || availableIds.includes(dotModel)) return dotModel;
2269
+ if (state.modelIds.size === 0 || state.modelIds.has(dotModel)) return dotModel;
2230
2270
  }
2231
- const dateOnlyMatch = model.match(/^(claude-(opus|sonnet|haiku)-\d+)-\d{8,}$/);
2271
+ const dateOnlyMatch = model.match(DATE_ONLY_RE);
2232
2272
  if (dateOnlyMatch) {
2233
2273
  const baseModel = dateOnlyMatch[1];
2234
2274
  const family = dateOnlyMatch[2];
2235
- if ((state.models?.data.map((m) => m.id))?.includes(baseModel)) return baseModel;
2275
+ if (state.modelIds.has(baseModel)) return baseModel;
2236
2276
  return findPreferredModel(family);
2237
2277
  }
2238
2278
  return model;
@@ -2272,6 +2312,9 @@ function createRequestContext(opts) {
2272
2312
  get durationMs() {
2273
2313
  return Date.now() - startTime;
2274
2314
  },
2315
+ get settled() {
2316
+ return settled;
2317
+ },
2275
2318
  get originalRequest() {
2276
2319
  return _originalRequest;
2277
2320
  },
@@ -2410,7 +2453,7 @@ function createRequestContext(opts) {
2410
2453
  fail(model, error) {
2411
2454
  if (settled) return;
2412
2455
  settled = true;
2413
- const errorMessage = getErrorMessage(error);
2456
+ const errorMsg = getErrorMessage(error);
2414
2457
  _response = {
2415
2458
  success: false,
2416
2459
  model: normalizeModelId(model),
@@ -2418,28 +2461,14 @@ function createRequestContext(opts) {
2418
2461
  input_tokens: 0,
2419
2462
  output_tokens: 0
2420
2463
  },
2421
- error: errorMessage,
2464
+ error: errorMsg,
2422
2465
  content: null
2423
2466
  };
2424
2467
  if (error instanceof Error && "responseText" in error && typeof error.responseText === "string") {
2425
2468
  const responseText = error.responseText;
2426
- const status = "status" in error ? error.status : void 0;
2427
- if (responseText) {
2428
- let formattedBody;
2429
- try {
2430
- formattedBody = JSON.stringify(JSON.parse(responseText), null, 2);
2431
- } catch {
2432
- formattedBody = responseText;
2433
- }
2434
- _response.content = {
2435
- role: "assistant",
2436
- content: [{
2437
- type: "text",
2438
- text: `[API Error Response${status ? ` - HTTP ${status}` : ""}]\n\n${formattedBody}`
2439
- }]
2440
- };
2441
- }
2469
+ if (responseText) _response.responseText = responseText;
2442
2470
  }
2471
+ if (error instanceof Error && "status" in error && typeof error.status === "number") _response.status = error.status;
2443
2472
  _state = "failed";
2444
2473
  emit({
2445
2474
  type: "failed",
@@ -2462,7 +2491,7 @@ function createRequestContext(opts) {
2462
2491
  }
2463
2492
  };
2464
2493
  if (_response) entry.response = _response;
2465
- const lastTruncation = [..._attempts].reverse().find((a) => a.truncation)?.truncation;
2494
+ const lastTruncation = _attempts.findLast((a) => a.truncation)?.truncation;
2466
2495
  if (lastTruncation) entry.truncation = lastTruncation;
2467
2496
  if (_rewrites) entry.rewrites = _rewrites;
2468
2497
  if (_sseEvents) entry.sseEvents = _sseEvents;
@@ -2500,7 +2529,7 @@ function createRequestContextManager() {
2500
2529
  const maxAgeMs = state.staleRequestMaxAge * 1e3;
2501
2530
  if (maxAgeMs <= 0) return;
2502
2531
  for (const [id, ctx] of activeContexts) if (ctx.durationMs > maxAgeMs) {
2503
- consola$1.warn(`[context] Force-failing stale request ${id} (age: ${Math.round(ctx.durationMs / 1e3)}s, max: ${state.staleRequestMaxAge}s, model: ${ctx.originalRequest?.model ?? "unknown"})`);
2532
+ consola$1.warn(`[context] Force-failing stale request ${id} (endpoint: ${ctx.endpoint}, model: ${ctx.originalRequest?.model ?? "unknown"}, stream: ${ctx.originalRequest?.stream ?? "?"}, state: ${ctx.state}, age: ${Math.round(ctx.durationMs / 1e3)}s, max: ${state.staleRequestMaxAge}s)`);
2504
2533
  ctx.fail(ctx.originalRequest?.model ?? "unknown", /* @__PURE__ */ new Error(`Request exceeded maximum age of ${state.staleRequestMaxAge}s (stale context reaper)`));
2505
2534
  }
2506
2535
  }
@@ -2707,10 +2736,18 @@ function buildSearchText(entry) {
2707
2736
  for (const block of msg.content) if (block.type === "text" && block.text) parts.push(block.text.slice(0, 200));
2708
2737
  else if (block.type === "tool_use") {
2709
2738
  if (block.name) parts.push(block.name);
2739
+ if (block.input) {
2740
+ const inputStr = typeof block.input === "string" ? block.input : JSON.stringify(block.input);
2741
+ parts.push(inputStr.slice(0, 500));
2742
+ }
2743
+ } else if (block.type === "tool_result" && block.content) {
2744
+ const contentStr = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
2745
+ parts.push(contentStr.slice(0, 500));
2710
2746
  } else if (block.type === "thinking" && block.thinking) parts.push(block.thinking.slice(0, 200));
2711
2747
  }
2712
- if (msg.tool_calls) {
2713
- for (const tc of msg.tool_calls) if (tc.function.name) parts.push(tc.function.name);
2748
+ if (msg.tool_calls) for (const tc of msg.tool_calls) {
2749
+ if (tc.function.name) parts.push(tc.function.name);
2750
+ if (tc.function.arguments) parts.push(tc.function.arguments.slice(0, 500));
2714
2751
  }
2715
2752
  }
2716
2753
  if (entry.response?.content) {
@@ -2723,7 +2760,7 @@ function buildSearchText(entry) {
2723
2760
  }
2724
2761
  return parts.join(" ").toLowerCase();
2725
2762
  }
2726
- /** Build a summary from a full HistoryEntry */
2763
+ /** Build a summary from a full HistoryEntry (searchText is computed lazily) */
2727
2764
  function toSummary(entry) {
2728
2765
  return {
2729
2766
  id: entry.id,
@@ -2739,7 +2776,7 @@ function toSummary(entry) {
2739
2776
  usage: entry.response?.usage,
2740
2777
  durationMs: entry.durationMs,
2741
2778
  previewText: extractPreviewText(entry),
2742
- searchText: buildSearchText(entry)
2779
+ searchText: ""
2743
2780
  };
2744
2781
  }
2745
2782
  /** Global history state */
@@ -2756,6 +2793,14 @@ const entryIndex = /* @__PURE__ */ new Map();
2756
2793
  const summaryIndex = /* @__PURE__ */ new Map();
2757
2794
  /** Track entry count per session to avoid O(n) filter during FIFO eviction */
2758
2795
  const sessionEntryCount = /* @__PURE__ */ new Map();
2796
+ /** O(1) uniqueness tracking for session.models (avoids Array.includes in hot path) */
2797
+ const sessionModelsSet = /* @__PURE__ */ new Map();
2798
+ /** O(1) uniqueness tracking for session.toolsUsed (avoids Array.includes in hot path) */
2799
+ const sessionToolsSet = /* @__PURE__ */ new Map();
2800
+ /** Dirty flag for stats cache — set true when entries are inserted/updated */
2801
+ let statsDirty = true;
2802
+ /** Cached stats result — recomputed only when statsDirty is true */
2803
+ let cachedStats = null;
2759
2804
  function initHistory(enabled, maxEntries) {
2760
2805
  historyState.enabled = enabled;
2761
2806
  historyState.maxEntries = maxEntries;
@@ -2765,6 +2810,10 @@ function initHistory(enabled, maxEntries) {
2765
2810
  entryIndex.clear();
2766
2811
  summaryIndex.clear();
2767
2812
  sessionEntryCount.clear();
2813
+ sessionModelsSet.clear();
2814
+ sessionToolsSet.clear();
2815
+ statsDirty = true;
2816
+ cachedStats = null;
2768
2817
  }
2769
2818
  /** Update the maximum number of history entries (for config hot-reload) */
2770
2819
  function setHistoryMaxEntries(limit) {
@@ -2790,6 +2839,8 @@ function getCurrentSession(endpoint) {
2790
2839
  const now = Date.now();
2791
2840
  const sessionId = generateId();
2792
2841
  historyState.currentSessionId = sessionId;
2842
+ sessionModelsSet.set(sessionId, /* @__PURE__ */ new Set());
2843
+ sessionToolsSet.set(sessionId, /* @__PURE__ */ new Set());
2793
2844
  historyState.sessions.set(sessionId, {
2794
2845
  id: sessionId,
2795
2846
  startTime: now,
@@ -2815,25 +2866,43 @@ function insertEntry(entry) {
2815
2866
  session.requestCount++;
2816
2867
  sessionEntryCount.set(entry.sessionId, (sessionEntryCount.get(entry.sessionId) ?? 0) + 1);
2817
2868
  const model = entry.request.model;
2818
- if (model && !session.models.includes(model)) session.models.push(model);
2869
+ if (model) {
2870
+ const modelsSet = sessionModelsSet.get(entry.sessionId);
2871
+ if (modelsSet && !modelsSet.has(model)) {
2872
+ modelsSet.add(model);
2873
+ session.models.push(model);
2874
+ }
2875
+ }
2819
2876
  if (entry.request.tools && entry.request.tools.length > 0) {
2820
2877
  if (!session.toolsUsed) session.toolsUsed = [];
2821
- for (const tool of entry.request.tools) if (!session.toolsUsed.includes(tool.name)) session.toolsUsed.push(tool.name);
2878
+ let toolsSet = sessionToolsSet.get(entry.sessionId);
2879
+ if (!toolsSet) {
2880
+ toolsSet = new Set(session.toolsUsed);
2881
+ sessionToolsSet.set(entry.sessionId, toolsSet);
2882
+ }
2883
+ for (const tool of entry.request.tools) if (!toolsSet.has(tool.name)) {
2884
+ toolsSet.add(tool.name);
2885
+ session.toolsUsed.push(tool.name);
2886
+ }
2822
2887
  }
2823
2888
  const summary = toSummary(entry);
2824
2889
  summaryIndex.set(entry.id, summary);
2825
- while (historyState.maxEntries > 0 && historyState.entries.length > historyState.maxEntries) {
2826
- const removed = historyState.entries.shift();
2827
- if (removed) {
2828
- entryIndex.delete(removed.id);
2829
- summaryIndex.delete(removed.id);
2830
- const count = (sessionEntryCount.get(removed.sessionId) ?? 1) - 1;
2890
+ if (historyState.maxEntries > 0 && historyState.entries.length > historyState.maxEntries) {
2891
+ const excess = historyState.entries.length - historyState.maxEntries;
2892
+ const removed = historyState.entries.splice(0, excess);
2893
+ for (const r of removed) {
2894
+ entryIndex.delete(r.id);
2895
+ summaryIndex.delete(r.id);
2896
+ const count = (sessionEntryCount.get(r.sessionId) ?? 1) - 1;
2831
2897
  if (count <= 0) {
2832
- sessionEntryCount.delete(removed.sessionId);
2833
- historyState.sessions.delete(removed.sessionId);
2834
- } else sessionEntryCount.set(removed.sessionId, count);
2898
+ sessionEntryCount.delete(r.sessionId);
2899
+ sessionModelsSet.delete(r.sessionId);
2900
+ sessionToolsSet.delete(r.sessionId);
2901
+ historyState.sessions.delete(r.sessionId);
2902
+ } else sessionEntryCount.set(r.sessionId, count);
2835
2903
  }
2836
2904
  }
2905
+ statsDirty = true;
2837
2906
  notifyEntryAdded(summary);
2838
2907
  }
2839
2908
  /**
@@ -2849,10 +2918,24 @@ function updateEntry(id, update) {
2849
2918
  const session = historyState.sessions.get(entry.sessionId);
2850
2919
  if (session) {
2851
2920
  const model = update.request.model;
2852
- if (model && !session.models.includes(model)) session.models.push(model);
2921
+ if (model) {
2922
+ const modelsSet = sessionModelsSet.get(entry.sessionId);
2923
+ if (modelsSet && !modelsSet.has(model)) {
2924
+ modelsSet.add(model);
2925
+ session.models.push(model);
2926
+ }
2927
+ }
2853
2928
  if (update.request.tools && update.request.tools.length > 0) {
2854
2929
  if (!session.toolsUsed) session.toolsUsed = [];
2855
- for (const tool of update.request.tools) if (!session.toolsUsed.includes(tool.name)) session.toolsUsed.push(tool.name);
2930
+ let toolsSet = sessionToolsSet.get(entry.sessionId);
2931
+ if (!toolsSet) {
2932
+ toolsSet = new Set(session.toolsUsed);
2933
+ sessionToolsSet.set(entry.sessionId, toolsSet);
2934
+ }
2935
+ for (const tool of update.request.tools) if (!toolsSet.has(tool.name)) {
2936
+ toolsSet.add(tool.name);
2937
+ session.toolsUsed.push(tool.name);
2938
+ }
2856
2939
  }
2857
2940
  }
2858
2941
  }
@@ -2867,6 +2950,7 @@ function updateEntry(id, update) {
2867
2950
  session.lastActivity = Date.now();
2868
2951
  }
2869
2952
  }
2953
+ statsDirty = true;
2870
2954
  const summary = toSummary(entry);
2871
2955
  summaryIndex.set(entry.id, summary);
2872
2956
  notifyEntryUpdated(summary);
@@ -2894,7 +2978,13 @@ function getHistorySummaries(options = {}) {
2894
2978
  if (to) summaries = summaries.filter((s) => s.timestamp <= to);
2895
2979
  if (search) {
2896
2980
  const needle = search.toLowerCase();
2897
- summaries = summaries.filter((s) => s.searchText.includes(needle));
2981
+ summaries = summaries.filter((s) => {
2982
+ if (s.searchText === "") {
2983
+ const entry = entryIndex.get(s.id);
2984
+ if (entry) s.searchText = buildSearchText(entry);
2985
+ }
2986
+ return s.searchText.includes(needle);
2987
+ });
2898
2988
  }
2899
2989
  summaries.sort((a, b) => b.timestamp - a.timestamp);
2900
2990
  const total = summaries.length;
@@ -2928,6 +3018,10 @@ function clearHistory() {
2928
3018
  entryIndex.clear();
2929
3019
  summaryIndex.clear();
2930
3020
  sessionEntryCount.clear();
3021
+ sessionModelsSet.clear();
3022
+ sessionToolsSet.clear();
3023
+ statsDirty = true;
3024
+ cachedStats = null;
2931
3025
  }
2932
3026
  function deleteSession(sessionId) {
2933
3027
  if (!historyState.sessions.has(sessionId)) return false;
@@ -2939,10 +3033,15 @@ function deleteSession(sessionId) {
2939
3033
  historyState.entries = remaining;
2940
3034
  historyState.sessions.delete(sessionId);
2941
3035
  sessionEntryCount.delete(sessionId);
3036
+ sessionModelsSet.delete(sessionId);
3037
+ sessionToolsSet.delete(sessionId);
3038
+ statsDirty = true;
3039
+ cachedStats = null;
2942
3040
  if (historyState.currentSessionId === sessionId) historyState.currentSessionId = generateId();
2943
3041
  return true;
2944
3042
  }
2945
3043
  function getStats() {
3044
+ if (!statsDirty && cachedStats) return cachedStats;
2946
3045
  const entries = historyState.entries;
2947
3046
  const modelDist = {};
2948
3047
  const endpointDist = {};
@@ -2975,7 +3074,7 @@ function getStats() {
2975
3074
  hour,
2976
3075
  count
2977
3076
  }));
2978
- return {
3077
+ const stats = {
2979
3078
  totalRequests: entries.length,
2980
3079
  successfulRequests: successCount,
2981
3080
  failedRequests: failCount,
@@ -2987,6 +3086,9 @@ function getStats() {
2987
3086
  recentActivity,
2988
3087
  activeSessions: historyState.sessions.size
2989
3088
  };
3089
+ statsDirty = false;
3090
+ cachedStats = stats;
3091
+ return stats;
2990
3092
  }
2991
3093
  /** Escape a value for CSV: wrap in quotes if it contains comma, quote, or newline; convert nullish to empty string */
2992
3094
  function escapeCsvValue(value) {
@@ -3112,7 +3214,6 @@ async function gracefulShutdown(signal, deps) {
3112
3214
  const tracker = deps?.tracker ?? tuiLogger;
3113
3215
  const server = deps?.server ?? serverInstance;
3114
3216
  const rateLimiter = deps?.rateLimiter !== void 0 ? deps.rateLimiter : getAdaptiveRateLimiter();
3115
- const contextManager = deps?.contextManager ?? getRequestContextManager();
3116
3217
  const stopRefresh = deps?.stopTokenRefreshFn ?? stopTokenRefresh;
3117
3218
  const closeWsClients = deps?.closeAllClientsFn ?? closeAllClients;
3118
3219
  const getWsClientCount = deps?.getClientCountFn ?? getClientCount;
@@ -3125,7 +3226,9 @@ async function gracefulShutdown(signal, deps) {
3125
3226
  _isShuttingDown = true;
3126
3227
  shutdownAbortController = new AbortController();
3127
3228
  consola.info(`Received ${signal}, shutting down gracefully...`);
3128
- contextManager?.stopReaper();
3229
+ try {
3230
+ (deps?.contextManager ?? getRequestContextManager()).stopReaper();
3231
+ } catch {}
3129
3232
  stopRefresh();
3130
3233
  const wsClients = getWsClientCount();
3131
3234
  if (wsClients > 0) {
@@ -3244,7 +3347,7 @@ var TuiLogger = class {
3244
3347
  if (!entry) return;
3245
3348
  if (update.model !== void 0) {
3246
3349
  entry.model = update.model;
3247
- const multiplier = state.models?.data.find((m) => m.id === update.model)?.billing?.multiplier;
3350
+ const multiplier = state.modelIndex.get(update.model)?.billing?.multiplier;
3248
3351
  if (multiplier !== void 0) entry.multiplier = multiplier;
3249
3352
  }
3250
3353
  if (update.clientModel !== void 0) entry.clientModel = update.clientModel;
@@ -3957,7 +4060,7 @@ const setupClaudeCode = defineCommand({
3957
4060
 
3958
4061
  //#endregion
3959
4062
  //#region package.json
3960
- var version = "0.7.18-beta";
4063
+ var version = "0.7.18-beta.2";
3961
4064
 
3962
4065
  //#endregion
3963
4066
  //#region src/lib/config/config.ts
@@ -3999,9 +4102,15 @@ function compileRewriteRules(raws) {
3999
4102
  }
4000
4103
  let cachedConfig = null;
4001
4104
  let configLastMtimeMs = 0;
4105
+ /** Time-based debounce: skip stat() if checked recently */
4106
+ let lastStatTimeMs = 0;
4107
+ const STAT_DEBOUNCE_MS = 2e3;
4002
4108
  async function loadConfig() {
4003
4109
  try {
4110
+ const now = Date.now();
4111
+ if (cachedConfig && now - lastStatTimeMs < STAT_DEBOUNCE_MS) return cachedConfig;
4004
4112
  const stat = await fs.stat(PATHS.CONFIG_YAML);
4113
+ lastStatTimeMs = now;
4005
4114
  if (cachedConfig && stat.mtimeMs === configLastMtimeMs) return cachedConfig;
4006
4115
  const content = await fs.readFile(PATHS.CONFIG_YAML, "utf8");
4007
4116
  const { parse } = await import("yaml");
@@ -4048,7 +4157,7 @@ async function applyConfigToState() {
4048
4157
  else if (Array.isArray(a.rewrite_system_reminders)) state.rewriteSystemReminders = compileRewriteRules(a.rewrite_system_reminders);
4049
4158
  }
4050
4159
  }
4051
- if (config.system_prompt_overrides !== void 0) state.systemPromptOverrides = config.system_prompt_overrides.length > 0 ? compileRewriteRules(config.system_prompt_overrides) : [];
4160
+ if (Array.isArray(config.system_prompt_overrides)) state.systemPromptOverrides = config.system_prompt_overrides.length > 0 ? compileRewriteRules(config.system_prompt_overrides) : [];
4052
4161
  if (config.model_overrides) state.modelOverrides = {
4053
4162
  ...DEFAULT_MODEL_OVERRIDES,
4054
4163
  ...config.model_overrides
@@ -4072,6 +4181,78 @@ async function applyConfigToState() {
4072
4181
  return config;
4073
4182
  }
4074
4183
 
4184
+ //#endregion
4185
+ //#region src/lib/context/error-persistence.ts
4186
+ /**
4187
+ * Error persistence consumer.
4188
+ *
4189
+ * Subscribes to "failed" events on RequestContext and writes structured
4190
+ * error files to disk for post-mortem debugging. All data comes from
4191
+ * RequestContext (via HistoryEntryData on the event), not from Hono
4192
+ * Context — ensuring reliability regardless of whether the HTTP body
4193
+ * has been consumed.
4194
+ *
4195
+ * Output directory: PATHS.ERROR_DIR/{timestamp}_{id}/
4196
+ * Files:
4197
+ * - meta.json: structured metadata (timestamp, endpoint, model, error, attempts)
4198
+ * - request.json: full request payload (messages capped at 50 for size)
4199
+ * - response.txt: raw upstream response body (if available)
4200
+ * - sse-events.json: recorded SSE events (if streaming request failed mid-stream)
4201
+ */
4202
+ /** Handle context events — only acts on "failed" */
4203
+ function handleErrorPersistence(event) {
4204
+ if (event.type !== "failed") return;
4205
+ writeErrorEntry(event.entry).catch((err) => {
4206
+ consola.debug(`[ErrorPersistence] Failed to write error file: ${err}`);
4207
+ });
4208
+ }
4209
+ /** Max number of messages to include in request.json (to avoid huge files) */
4210
+ const MAX_MESSAGES_IN_DUMP = 50;
4211
+ async function writeErrorEntry(entry) {
4212
+ const meta = {
4213
+ timestamp: new Date(entry.timestamp).toISOString(),
4214
+ id: entry.id,
4215
+ endpoint: entry.endpoint,
4216
+ durationMs: entry.durationMs,
4217
+ request: {
4218
+ model: entry.request.model,
4219
+ stream: entry.request.stream,
4220
+ messageCount: entry.request.messages?.length,
4221
+ toolCount: entry.request.tools?.length
4222
+ },
4223
+ response: entry.response ? {
4224
+ success: entry.response.success,
4225
+ model: entry.response.model,
4226
+ error: entry.response.error,
4227
+ status: entry.response.status
4228
+ } : void 0,
4229
+ truncation: entry.truncation,
4230
+ attempts: entry.attempts
4231
+ };
4232
+ const files = [["meta.json", JSON.stringify(meta, null, 2)]];
4233
+ if (entry.request) {
4234
+ const { messages, ...requestWithoutMessages } = entry.request;
4235
+ const requestData = {
4236
+ ...requestWithoutMessages,
4237
+ messageCount: messages?.length,
4238
+ ...messages && messages.length <= MAX_MESSAGES_IN_DUMP && { messages }
4239
+ };
4240
+ files.push(["request.json", JSON.stringify(requestData, null, 2)]);
4241
+ }
4242
+ if (entry.response?.responseText) files.push(["response.txt", entry.response.responseText]);
4243
+ if (entry.sseEvents?.length) files.push(["sse-events.json", JSON.stringify(entry.sseEvents, null, 2)]);
4244
+ const id = randomBytes(4).toString("hex");
4245
+ const dirPath = path$1.join(PATHS.ERROR_DIR, `${formatTimestamp()}_${id}`);
4246
+ await fs$1.mkdir(dirPath, { recursive: true });
4247
+ await Promise.all(files.map(([name, content]) => fs$1.writeFile(path$1.join(dirPath, name), content)));
4248
+ }
4249
+ /** Format timestamp as YYMMDD_HHmmss for error directory names */
4250
+ function formatTimestamp() {
4251
+ const now = /* @__PURE__ */ new Date();
4252
+ const pad = (n) => String(n).padStart(2, "0");
4253
+ return `${String(now.getFullYear()).slice(2)}${pad(now.getMonth() + 1)}${pad(now.getDate())}_${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`;
4254
+ }
4255
+
4075
4256
  //#endregion
4076
4257
  //#region src/lib/context/consumers.ts
4077
4258
  function handleHistoryEvent(event) {
@@ -4190,6 +4371,7 @@ function toHistoryResponse(entryData) {
4190
4371
  function registerContextConsumers(manager) {
4191
4372
  manager.on("change", handleHistoryEvent);
4192
4373
  manager.on("change", handleTuiEvent);
4374
+ manager.on("change", handleErrorPersistence);
4193
4375
  }
4194
4376
 
4195
4377
  //#endregion
@@ -4709,13 +4891,13 @@ const getTokenCount = async (payload, model) => {
4709
4891
  */
4710
4892
  /**
4711
4893
  * Log helpful debugging information when a 413 error occurs.
4712
- * Also adjusts the dynamic byte limit for future requests.
4894
+ *
4895
+ * @param precomputedBytes - Optional pre-computed payload byte size to avoid redundant JSON.stringify
4713
4896
  */
4714
- async function logPayloadSizeInfo(payload, model) {
4897
+ async function logPayloadSizeInfo(payload, model, precomputedBytes) {
4715
4898
  const messageCount = payload.messages.length;
4716
- const bodySize = JSON.stringify(payload).length;
4899
+ const bodySize = precomputedBytes ?? JSON.stringify(payload).length;
4717
4900
  const bodySizeKB = bytesToKB(bodySize);
4718
- onRequestTooLarge(bodySize);
4719
4901
  let imageCount = 0;
4720
4902
  let largeMessages = 0;
4721
4903
  let totalImageSize = 0;
@@ -4797,7 +4979,7 @@ async function executeRequestPipeline(opts) {
4797
4979
  try {
4798
4980
  const { result: response, queueWaitMs } = await adapter.execute(effectivePayload);
4799
4981
  totalQueueWaitMs += queueWaitMs;
4800
- requestContext?.addQueueWaitMs(totalQueueWaitMs);
4982
+ requestContext?.addQueueWaitMs(queueWaitMs);
4801
4983
  return {
4802
4984
  response,
4803
4985
  effectivePayload,
@@ -5105,8 +5287,7 @@ function buildResponsesResponseData(acc, fallbackModel) {
5105
5287
  /**
5106
5288
  * Auto-truncate retry strategy.
5107
5289
  *
5108
- * Handles 413 (body too large) and token limit errors by truncating the
5109
- * message payload and retrying.
5290
+ * Handles token limit errors by truncating the message payload and retrying.
5110
5291
  */
5111
5292
  /**
5112
5293
  * Create an auto-truncate retry strategy.
@@ -5134,26 +5315,44 @@ function createAutoTruncateStrategy(opts) {
5134
5315
  action: "abort",
5135
5316
  error
5136
5317
  };
5137
- const payloadBytes = JSON.stringify(currentPayload).length;
5138
- const parsed = tryParseAndLearnLimit(rawError, model.id, payloadBytes);
5139
- if (!parsed) return {
5140
- action: "abort",
5141
- error
5142
- };
5318
+ const payloadJson = JSON.stringify(currentPayload);
5319
+ const estimatedTokens = Math.ceil(payloadJson.length / 4);
5320
+ const parsed = tryParseAndLearnLimit(rawError, model.id, true, estimatedTokens);
5321
+ if (!parsed) {
5322
+ if (rawError.status === 413) {
5323
+ consola.info(`[${label}] Attempt ${attempt + 1}/${maxRetries + 1}: 413 Body too large, retrying with truncation...`);
5324
+ const truncateResult = await truncate(originalPayload, model, { checkTokenLimit: true });
5325
+ if (!truncateResult.wasTruncated) return {
5326
+ action: "abort",
5327
+ error
5328
+ };
5329
+ const sanitizeResult = resanitize(truncateResult.payload);
5330
+ return {
5331
+ action: "retry",
5332
+ payload: sanitizeResult.payload,
5333
+ meta: {
5334
+ truncateResult,
5335
+ sanitization: sanitizeResult.stats ?? {
5336
+ totalBlocksRemoved: sanitizeResult.removedCount,
5337
+ systemReminderRemovals: sanitizeResult.systemReminderRemovals
5338
+ },
5339
+ attempt: attempt + 1
5340
+ }
5341
+ };
5342
+ }
5343
+ return {
5344
+ action: "abort",
5345
+ error
5346
+ };
5347
+ }
5143
5348
  let targetTokenLimit;
5144
- let targetByteLimitBytes;
5145
- if (parsed.type === "token_limit" && parsed.limit) {
5349
+ if (parsed.limit) {
5146
5350
  targetTokenLimit = Math.floor(parsed.limit * AUTO_TRUNCATE_RETRY_FACTOR);
5147
5351
  consola.info(`[${label}] Attempt ${attempt + 1}/${maxRetries + 1}: Token limit error (${parsed.current}>${parsed.limit}), retrying with limit ${targetTokenLimit}...`);
5148
- } else if (parsed.type === "body_too_large") {
5149
- targetByteLimitBytes = Math.floor(payloadBytes * AUTO_TRUNCATE_RETRY_FACTOR);
5150
- consola.info(`[${label}] Attempt ${attempt + 1}/${maxRetries + 1}: Body too large (${bytesToKB(payloadBytes)}KB), retrying with limit ${bytesToKB(targetByteLimitBytes)}KB...`);
5151
5352
  }
5152
5353
  const truncateResult = await truncate(originalPayload, model, {
5153
5354
  checkTokenLimit: true,
5154
- checkByteLimit: true,
5155
- targetTokenLimit,
5156
- targetByteLimitBytes
5355
+ targetTokenLimit
5157
5356
  });
5158
5357
  if (!truncateResult.wasTruncated) return {
5159
5358
  action: "abort",
@@ -5576,12 +5775,13 @@ function sanitizeMessageParamContent(msg) {
5576
5775
  */
5577
5776
  function removeAnthropicSystemReminders(messages) {
5578
5777
  let modifiedCount = 0;
5778
+ const result = messages.map((msg) => {
5779
+ const sanitized = sanitizeMessageParamContent(msg);
5780
+ if (sanitized !== msg) modifiedCount++;
5781
+ return sanitized;
5782
+ });
5579
5783
  return {
5580
- messages: messages.map((msg) => {
5581
- const sanitized = sanitizeMessageParamContent(msg);
5582
- if (sanitized !== msg) modifiedCount++;
5583
- return sanitized;
5584
- }),
5784
+ messages: modifiedCount === 0 ? messages : result,
5585
5785
  modifiedCount
5586
5786
  };
5587
5787
  }
@@ -6180,11 +6380,11 @@ function convertServerToolsToCustom(tools) {
6180
6380
  * Auto-truncate module for Anthropic-style messages.
6181
6381
  *
6182
6382
  * This module handles automatic truncation of Anthropic message format
6183
- * when it exceeds token or byte limits.
6383
+ * when it exceeds token limits.
6184
6384
  *
6185
6385
  * Key features:
6186
6386
  * - Binary search for optimal truncation point
6187
- * - Considers both token and byte limits
6387
+ * - Token limit enforcement with learned calibration
6188
6388
  * - Preserves system messages
6189
6389
  * - Filters orphaned tool_result and tool_use messages
6190
6390
  * - Smart compression of old tool_result content (e.g., Read tool results)
@@ -6315,15 +6515,6 @@ async function countTotalInputTokens(payload, model) {
6315
6515
  }
6316
6516
  return total;
6317
6517
  }
6318
- /** Get byte size of a message (memoized to avoid redundant JSON.stringify) */
6319
- const messageBytesCache$1 = /* @__PURE__ */ new WeakMap();
6320
- function getMessageBytes$1(msg) {
6321
- let cached = messageBytesCache$1.get(msg);
6322
- if (cached !== void 0) return cached;
6323
- cached = JSON.stringify(msg).length;
6324
- messageBytesCache$1.set(msg, cached);
6325
- return cached;
6326
- }
6327
6518
  /**
6328
6519
  * Strip thinking/redacted_thinking blocks from old assistant messages.
6329
6520
  *
@@ -6377,26 +6568,20 @@ function compressToolResultBlock(block) {
6377
6568
  }
6378
6569
  /**
6379
6570
  * Smart compression strategy:
6380
- * 1. Calculate tokens/bytes from the end until reaching preservePercent of limit
6571
+ * 1. Calculate tokens from the end until reaching preservePercent of limit
6381
6572
  * 2. Messages before that threshold get their tool_results compressed
6382
6573
  * 3. Returns compressed messages and stats
6383
6574
  *
6384
6575
  * @param preservePercent - Percentage of context to preserve uncompressed (0.0-1.0)
6385
6576
  */
6386
- function smartCompressToolResults$1(messages, tokenLimit, byteLimit, preservePercent) {
6577
+ function smartCompressToolResults$1(messages, tokenLimit, preservePercent) {
6387
6578
  const n = messages.length;
6388
6579
  const cumTokens = Array.from({ length: n + 1 }, () => 0);
6389
- const cumBytes = Array.from({ length: n + 1 }, () => 0);
6390
- for (let i = n - 1; i >= 0; i--) {
6391
- const msg = messages[i];
6392
- cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(msg);
6393
- cumBytes[i] = cumBytes[i + 1] + getMessageBytes$1(msg) + 1;
6394
- }
6580
+ for (let i = n - 1; i >= 0; i--) cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(messages[i]);
6395
6581
  const preserveTokenLimit = Math.floor(tokenLimit * preservePercent);
6396
- const preserveByteLimit = Math.floor(byteLimit * preservePercent);
6397
6582
  let thresholdIndex = n;
6398
6583
  for (let i = n - 1; i >= 0; i--) {
6399
- if (cumTokens[i] > preserveTokenLimit || cumBytes[i] > preserveByteLimit) {
6584
+ if (cumTokens[i] > preserveTokenLimit) {
6400
6585
  thresholdIndex = i + 1;
6401
6586
  break;
6402
6587
  }
@@ -6448,40 +6633,35 @@ function smartCompressToolResults$1(messages, tokenLimit, byteLimit, preservePer
6448
6633
  };
6449
6634
  }
6450
6635
  /** Default fallback for when model capabilities are not available */
6451
- const DEFAULT_CONTEXT_WINDOW = 2e5;
6452
- function calculateLimits$1(model, config) {
6453
- if (config.targetTokenLimit !== void 0 || config.targetByteLimitBytes !== void 0) return {
6454
- tokenLimit: config.targetTokenLimit ?? model.capabilities?.limits?.max_context_window_tokens ?? DEFAULT_CONTEXT_WINDOW,
6455
- byteLimit: config.targetByteLimitBytes ?? getEffectiveByteLimitBytes()
6456
- };
6457
- const rawTokenLimit = getEffectiveTokenLimit(model.id) ?? model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? DEFAULT_CONTEXT_WINDOW;
6458
- return {
6459
- tokenLimit: Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100)),
6460
- byteLimit: getEffectiveByteLimitBytes()
6461
- };
6636
+ const DEFAULT_CONTEXT_WINDOW$1 = 2e5;
6637
+ /**
6638
+ * Calculate the effective token limit for auto-truncate.
6639
+ * Uses explicit target if provided, otherwise learned limits with calibration,
6640
+ * otherwise model capabilities with safety margin.
6641
+ */
6642
+ function calculateTokenLimit$1(model, config) {
6643
+ if (config.targetTokenLimit !== void 0) return config.targetTokenLimit;
6644
+ const learned = getLearnedLimits(model.id);
6645
+ if (learned) {
6646
+ const margin = computeSafetyMargin(learned.sampleCount);
6647
+ return Math.floor(learned.tokenLimit * (1 - margin));
6648
+ }
6649
+ const rawTokenLimit = model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? DEFAULT_CONTEXT_WINDOW$1;
6650
+ return Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100));
6462
6651
  }
6463
6652
  function findOptimalPreserveIndex$1(params) {
6464
- const { messages, systemBytes, systemTokens, payloadOverhead, tokenLimit, byteLimit, checkTokenLimit, checkByteLimit } = params;
6653
+ const { messages, systemTokens, tokenLimit } = params;
6465
6654
  if (messages.length === 0) return 0;
6466
- const markerBytes = 200;
6467
6655
  const availableTokens = tokenLimit - systemTokens - 50;
6468
- const availableBytes = byteLimit - payloadOverhead - systemBytes - markerBytes;
6469
- if (checkTokenLimit && availableTokens <= 0 || checkByteLimit && availableBytes <= 0) return messages.length;
6656
+ if (availableTokens <= 0) return messages.length;
6470
6657
  const n = messages.length;
6471
6658
  const cumTokens = Array.from({ length: n + 1 }, () => 0);
6472
- const cumBytes = Array.from({ length: n + 1 }, () => 0);
6473
- for (let i = n - 1; i >= 0; i--) {
6474
- const msg = messages[i];
6475
- cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(msg);
6476
- cumBytes[i] = cumBytes[i + 1] + getMessageBytes$1(msg) + 1;
6477
- }
6659
+ for (let i = n - 1; i >= 0; i--) cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(messages[i]);
6478
6660
  let left = 0;
6479
6661
  let right = n;
6480
6662
  while (left < right) {
6481
6663
  const mid = left + right >>> 1;
6482
- const tokensFit = !checkTokenLimit || cumTokens[mid] <= availableTokens;
6483
- const bytesFit = !checkByteLimit || cumBytes[mid] <= availableBytes;
6484
- if (tokensFit && bytesFit) right = mid;
6664
+ if (cumTokens[mid] <= availableTokens) right = mid;
6485
6665
  else left = mid + 1;
6486
6666
  }
6487
6667
  return left;
@@ -6572,36 +6752,28 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
6572
6752
  ...DEFAULT_AUTO_TRUNCATE_CONFIG,
6573
6753
  ...config
6574
6754
  };
6575
- const { tokenLimit, byteLimit } = calculateLimits$1(model, cfg);
6755
+ const tokenLimit = calculateTokenLimit$1(model, cfg);
6576
6756
  const fixedTokens = await countFixedTokens(payload, model);
6577
- const originalBytes = JSON.stringify(payload).length;
6578
6757
  const originalTokens = fixedTokens + await countMessagesTokens(payload.messages, model);
6579
- if (originalTokens <= tokenLimit && originalBytes <= byteLimit) return buildResult({
6758
+ if (originalTokens <= tokenLimit) return buildResult({
6580
6759
  payload,
6581
6760
  wasTruncated: false,
6582
6761
  originalTokens,
6583
6762
  compactedTokens: originalTokens,
6584
6763
  removedMessageCount: 0
6585
6764
  });
6586
- const exceedsTokens = originalTokens > tokenLimit;
6587
- const exceedsBytes = originalBytes > byteLimit;
6588
6765
  const { messages: thinkingStripped, strippedCount: thinkingStrippedCount } = stripThinkingBlocks(payload.messages, 4);
6589
6766
  let workingMessages = thinkingStripped;
6590
6767
  if (thinkingStrippedCount > 0) {
6591
- const strippedPayload = {
6592
- ...payload,
6593
- messages: workingMessages
6594
- };
6595
- const strippedBytes = JSON.stringify(strippedPayload).length;
6596
6768
  const strippedTokens = fixedTokens + await countMessagesTokens(workingMessages, model);
6597
- if (strippedTokens <= tokenLimit && strippedBytes <= byteLimit) {
6598
- let reason = "tokens";
6599
- if (exceedsTokens && exceedsBytes) reason = "tokens+size";
6600
- else if (exceedsBytes) reason = "size";
6769
+ if (strippedTokens <= tokenLimit) {
6601
6770
  const elapsedMs = Math.round(performance.now() - startTime);
6602
- consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${strippedTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(strippedBytes)}KB (stripped ${thinkingStrippedCount} thinking blocks) [${elapsedMs}ms]`);
6771
+ consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${strippedTokens} (stripped ${thinkingStrippedCount} thinking blocks) [${elapsedMs}ms]`);
6603
6772
  return buildResult({
6604
- payload: strippedPayload,
6773
+ payload: {
6774
+ ...payload,
6775
+ messages: workingMessages
6776
+ },
6605
6777
  wasTruncated: true,
6606
6778
  originalTokens,
6607
6779
  compactedTokens: strippedTokens,
@@ -6611,47 +6783,37 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
6611
6783
  }
6612
6784
  let compressedCount = 0;
6613
6785
  if (state.compressToolResultsBeforeTruncate) {
6614
- const compressionResult = smartCompressToolResults$1(workingMessages, tokenLimit, byteLimit, cfg.preserveRecentPercent);
6786
+ const compressionResult = smartCompressToolResults$1(workingMessages, tokenLimit, cfg.preserveRecentPercent);
6615
6787
  workingMessages = compressionResult.messages;
6616
6788
  compressedCount = compressionResult.compressedCount;
6617
- const compressedPayload = {
6618
- ...payload,
6619
- messages: workingMessages
6620
- };
6621
- const compressedBytes = JSON.stringify(compressedPayload).length;
6622
6789
  const compressedTokens = fixedTokens + await countMessagesTokens(workingMessages, model);
6623
- if (compressedTokens <= tokenLimit && compressedBytes <= byteLimit) {
6624
- let reason = "tokens";
6625
- if (exceedsTokens && exceedsBytes) reason = "tokens+size";
6626
- else if (exceedsBytes) reason = "size";
6790
+ if (compressedTokens <= tokenLimit) {
6627
6791
  const elapsedMs = Math.round(performance.now() - startTime);
6628
- consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${compressedTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(compressedBytes)}KB (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
6792
+ consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${compressedTokens} (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
6629
6793
  return buildResult({
6630
- payload: addCompressionNotice$1(compressedPayload, compressedCount),
6794
+ payload: addCompressionNotice$1({
6795
+ ...payload,
6796
+ messages: workingMessages
6797
+ }, compressedCount),
6631
6798
  wasTruncated: true,
6632
6799
  originalTokens,
6633
6800
  compactedTokens: compressedTokens + (Math.ceil(150 / 4) + 4),
6634
6801
  removedMessageCount: 0
6635
6802
  });
6636
6803
  }
6637
- const allCompression = smartCompressToolResults$1(workingMessages, tokenLimit, byteLimit, 0);
6804
+ const allCompression = smartCompressToolResults$1(workingMessages, tokenLimit, 0);
6638
6805
  if (allCompression.compressedCount > 0) {
6639
6806
  workingMessages = allCompression.messages;
6640
6807
  compressedCount += allCompression.compressedCount;
6641
- const allCompressedPayload = {
6642
- ...payload,
6643
- messages: workingMessages
6644
- };
6645
- const allCompressedBytes = JSON.stringify(allCompressedPayload).length;
6646
6808
  const allCompressedTokens = fixedTokens + await countMessagesTokens(workingMessages, model);
6647
- if (allCompressedTokens <= tokenLimit && allCompressedBytes <= byteLimit) {
6648
- let reason = "tokens";
6649
- if (exceedsTokens && exceedsBytes) reason = "tokens+size";
6650
- else if (exceedsBytes) reason = "size";
6809
+ if (allCompressedTokens <= tokenLimit) {
6651
6810
  const elapsedMs = Math.round(performance.now() - startTime);
6652
- consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${allCompressedTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(allCompressedBytes)}KB (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
6811
+ consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${allCompressedTokens} (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
6653
6812
  return buildResult({
6654
- payload: addCompressionNotice$1(allCompressedPayload, compressedCount),
6813
+ payload: addCompressionNotice$1({
6814
+ ...payload,
6815
+ messages: workingMessages
6816
+ }, compressedCount),
6655
6817
  wasTruncated: true,
6656
6818
  originalTokens,
6657
6819
  compactedTokens: allCompressedTokens + (Math.ceil(150 / 4) + 4),
@@ -6660,23 +6822,11 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
6660
6822
  }
6661
6823
  }
6662
6824
  }
6663
- const systemBytes = payload.system ? JSON.stringify(payload.system).length : 0;
6664
6825
  const systemTokens = await countSystemTokens(payload.system, model);
6665
- const messagesBytes = workingMessages.reduce((sum, msg) => sum + getMessageBytes$1(msg) + 1, 0) + 2;
6666
- const payloadOverhead = JSON.stringify({
6667
- ...payload,
6668
- messages: workingMessages
6669
- }).length - messagesBytes - systemBytes;
6670
- consola.debug(`[AutoTruncate:Anthropic] overhead=${bytesToKB(payloadOverhead)}KB, system=${bytesToKB(systemBytes)}KB`);
6671
6826
  const preserveIndex = findOptimalPreserveIndex$1({
6672
6827
  messages: workingMessages,
6673
- systemBytes,
6674
6828
  systemTokens,
6675
- payloadOverhead,
6676
- tokenLimit,
6677
- byteLimit,
6678
- checkTokenLimit: cfg.checkTokenLimit,
6679
- checkByteLimit: cfg.checkByteLimit
6829
+ tokenLimit
6680
6830
  });
6681
6831
  if (preserveIndex >= workingMessages.length) {
6682
6832
  consola.warn("[AutoTruncate:Anthropic] Would need to remove all messages");
@@ -6724,17 +6874,14 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
6724
6874
  const newBytes = JSON.stringify(newPayload).length;
6725
6875
  const newMsgTokens = await countMessagesTokens(newMessages, model);
6726
6876
  const newTokens = (newSystem !== payload.system ? await countSystemTokens(newSystem, model) : systemTokens) + (fixedTokens - await countSystemTokens(payload.system, model)) + newMsgTokens;
6727
- let reason = "tokens";
6728
- if (exceedsTokens && exceedsBytes) reason = "tokens+size";
6729
- else if (exceedsBytes) reason = "size";
6730
6877
  const actions = [];
6731
6878
  if (removedCount > 0) actions.push(`removed ${removedCount} msgs`);
6732
6879
  if (thinkingStrippedCount > 0) actions.push(`stripped ${thinkingStrippedCount} thinking blocks`);
6733
6880
  if (compressedCount > 0) actions.push(`compressed ${compressedCount} tool_results`);
6734
6881
  const actionInfo = actions.length > 0 ? ` (${actions.join(", ")})` : "";
6735
6882
  const elapsedMs = Math.round(performance.now() - startTime);
6736
- consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${newTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
6737
- if (newBytes > byteLimit || newTokens > tokenLimit) consola.warn(`[AutoTruncate:Anthropic] Result still over limit (${newTokens} tokens, ${bytesToKB(newBytes)}KB)`);
6883
+ consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${newTokens}, ${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
6884
+ if (newTokens > tokenLimit) consola.warn(`[AutoTruncate:Anthropic] Result still over token limit (${newTokens} > ${tokenLimit})`);
6738
6885
  return buildResult({
6739
6886
  payload: newPayload,
6740
6887
  wasTruncated: true,
@@ -6744,32 +6891,43 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
6744
6891
  });
6745
6892
  }
6746
6893
  /**
6747
- * Check if payload needs compaction.
6894
+ * Check if payload needs compaction based on learned model limits.
6895
+ * Returns early with `needed: false` when no limits are known for the model.
6748
6896
  */
6749
6897
  async function checkNeedsCompactionAnthropic(payload, model, config = {}) {
6750
6898
  const cfg = {
6751
6899
  ...DEFAULT_AUTO_TRUNCATE_CONFIG,
6752
6900
  ...config
6753
6901
  };
6754
- const { tokenLimit, byteLimit } = calculateLimits$1(model, cfg);
6755
- const currentTokens = await countTotalTokens(payload, model);
6756
- const currentBytes = JSON.stringify(payload).length;
6902
+ const learned = getLearnedLimits(model.id);
6903
+ if (!learned && cfg.targetTokenLimit === void 0) return {
6904
+ needed: false,
6905
+ currentTokens: 0,
6906
+ tokenLimit: 0
6907
+ };
6908
+ const tokenLimit = calculateTokenLimit$1(model, cfg);
6909
+ const rawTokens = await countTotalTokens(payload, model);
6910
+ const currentTokens = learned && learned.sampleCount > 0 ? calibrate(model.id, rawTokens) : rawTokens;
6757
6911
  const exceedsTokens = cfg.checkTokenLimit && currentTokens > tokenLimit;
6758
- const exceedsBytes = cfg.checkByteLimit && currentBytes > byteLimit;
6759
- let reason;
6760
- if (exceedsTokens && exceedsBytes) reason = "both";
6761
- else if (exceedsTokens) reason = "tokens";
6762
- else if (exceedsBytes) reason = "bytes";
6763
6912
  return {
6764
- needed: exceedsTokens || exceedsBytes,
6913
+ needed: exceedsTokens,
6765
6914
  currentTokens,
6766
6915
  tokenLimit,
6767
- currentBytes,
6768
- byteLimit,
6769
- reason
6916
+ reason: exceedsTokens ? "tokens" : void 0
6770
6917
  };
6771
6918
  }
6772
6919
 
6920
+ //#endregion
6921
+ //#region src/lib/fetch-utils.ts
6922
+ /**
6923
+ * Create an AbortSignal for fetch timeout if configured.
6924
+ * Controls the time from request start to receiving response headers.
6925
+ * Returns undefined if fetchTimeout is 0 (disabled).
6926
+ */
6927
+ function createFetchSignal() {
6928
+ return state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
6929
+ }
6930
+
6773
6931
  //#endregion
6774
6932
  //#region src/lib/anthropic/features.ts
6775
6933
  /**
@@ -7126,7 +7284,7 @@ async function createAnthropicMessages(payload) {
7126
7284
  }
7127
7285
  }
7128
7286
  consola.debug("Sending direct Anthropic request to Copilot /v1/messages");
7129
- const fetchSignal = state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
7287
+ const fetchSignal = createFetchSignal();
7130
7288
  const response = await fetch(`${copilotBaseUrl(state)}/v1/messages`, {
7131
7289
  method: "POST",
7132
7290
  headers,
@@ -7365,7 +7523,7 @@ function raceIteratorNext(promise, opts) {
7365
7523
  * Returns a decision with reason so callers can log/display the routing rationale.
7366
7524
  */
7367
7525
  function supportsDirectAnthropicApi(modelId) {
7368
- const model = state.models?.data.find((m) => m.id === modelId);
7526
+ const model = state.modelIndex.get(modelId);
7369
7527
  if (model?.vendor !== "Anthropic") return {
7370
7528
  supported: false,
7371
7529
  reason: `vendor is "${model?.vendor ?? "unknown"}", not Anthropic`
@@ -7387,6 +7545,12 @@ function supportsDirectAnthropicApi(modelId) {
7387
7545
  async function handleAnthropicMessagesCompletion(c, anthropicPayload, options) {
7388
7546
  if (anthropicPayload.system) anthropicPayload.system = await processAnthropicSystem(anthropicPayload.system);
7389
7547
  const tuiLogId = c.get("tuiLogId");
7548
+ const routingDecision = supportsDirectAnthropicApi(anthropicPayload.model);
7549
+ if (!routingDecision.supported) {
7550
+ const msg = `Model "${anthropicPayload.model}" does not support /v1/messages: ${routingDecision.reason}`;
7551
+ throw new HTTPError(msg, 400, msg);
7552
+ }
7553
+ consola.debug(`[AnthropicRouting] ${anthropicPayload.model}: ${routingDecision.reason}`);
7390
7554
  const reqCtx = getRequestContextManager().create({
7391
7555
  endpoint: "anthropic",
7392
7556
  tuiLogId
@@ -7409,17 +7573,11 @@ async function handleAnthropicMessagesCompletion(c, anthropicPayload, options) {
7409
7573
  strippedReadTagCount: preprocessed.strippedReadTagCount,
7410
7574
  dedupedToolCallCount: preprocessed.dedupedToolCallCount
7411
7575
  });
7412
- const routingDecision = supportsDirectAnthropicApi(anthropicPayload.model);
7413
- if (!routingDecision.supported) {
7414
- const msg = `Model "${anthropicPayload.model}" does not support /v1/messages: ${routingDecision.reason}`;
7415
- throw new HTTPError(msg, 400, msg);
7416
- }
7417
- consola.debug(`[AnthropicRouting] ${anthropicPayload.model}: ${routingDecision.reason}`);
7418
7576
  return handleDirectAnthropicCompletion(c, anthropicPayload, reqCtx);
7419
7577
  }
7420
7578
  async function handleDirectAnthropicCompletion(c, anthropicPayload, reqCtx) {
7421
7579
  consola.debug("Using direct Anthropic API path for model:", anthropicPayload.model);
7422
- const selectedModel = state.models?.data.find((m) => m.id === anthropicPayload.model);
7580
+ const selectedModel = state.modelIndex.get(anthropicPayload.model);
7423
7581
  const { payload: initialSanitized, stats: sanitizationStats } = sanitizeAnthropicMessages(anthropicPayload);
7424
7582
  reqCtx.addSanitizationInfo(toSanitizationInfo(sanitizationStats));
7425
7583
  const hasPreprocessing = reqCtx.preprocessInfo ? reqCtx.preprocessInfo.dedupedToolCallCount > 0 || reqCtx.preprocessInfo.strippedReadTagCount > 0 : false;
@@ -7527,8 +7685,8 @@ function combineAbortSignals(...signals) {
7527
7685
  async function* processAnthropicStream(response, acc, clientAbortSignal) {
7528
7686
  const idleTimeoutMs = state.streamIdleTimeout * 1e3;
7529
7687
  const iterator = response[Symbol.asyncIterator]();
7688
+ const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
7530
7689
  for (;;) {
7531
- const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
7532
7690
  const result = await raceIteratorNext(iterator.next(), {
7533
7691
  idleTimeoutMs,
7534
7692
  abortSignal
@@ -7597,7 +7755,7 @@ async function handleDirectAnthropicStreamingResponse(opts) {
7597
7755
  await stream.writeSSE({
7598
7756
  data: rawEvent.data ?? "",
7599
7757
  event: rawEvent.event,
7600
- id: String(rawEvent.id),
7758
+ id: rawEvent.id != null ? String(rawEvent.id) : void 0,
7601
7759
  retry: rawEvent.retry
7602
7760
  });
7603
7761
  }
@@ -7761,26 +7919,31 @@ function extractOpenAISystemMessages(messages) {
7761
7919
  //#region src/lib/openai/auto-truncate.ts
7762
7920
  /**
7763
7921
  * Auto-truncate module: Automatically truncates conversation history
7764
- * when it exceeds token or byte limits (OpenAI format).
7922
+ * when it exceeds token limits (OpenAI format).
7765
7923
  *
7766
7924
  * Key features:
7767
7925
  * - Binary search for optimal truncation point
7768
- * - Considers both token and byte limits
7926
+ * - Token limit enforcement with learned calibration
7769
7927
  * - Preserves system messages
7770
7928
  * - Filters orphaned tool_result and tool_use messages
7771
- * - Dynamic byte limit adjustment on 413 errors
7772
7929
  * - Optional smart compression of old tool_result content
7773
7930
  */
7774
- function calculateLimits(model, config) {
7775
- if (config.targetTokenLimit !== void 0 || config.targetByteLimitBytes !== void 0) return {
7776
- tokenLimit: config.targetTokenLimit ?? model.capabilities?.limits?.max_context_window_tokens ?? 128e3,
7777
- byteLimit: config.targetByteLimitBytes ?? getEffectiveByteLimitBytes()
7778
- };
7779
- const rawTokenLimit = getEffectiveTokenLimit(model.id) ?? model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? 128e3;
7780
- return {
7781
- tokenLimit: Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100)),
7782
- byteLimit: getEffectiveByteLimitBytes()
7783
- };
7931
+ /** Default fallback for when model capabilities are not available */
7932
+ const DEFAULT_CONTEXT_WINDOW = 128e3;
7933
+ /**
7934
+ * Calculate the effective token limit for auto-truncate.
7935
+ * Uses explicit target if provided, otherwise learned limits with calibration,
7936
+ * otherwise model capabilities with safety margin.
7937
+ */
7938
+ function calculateTokenLimit(model, config) {
7939
+ if (config.targetTokenLimit !== void 0) return config.targetTokenLimit;
7940
+ const learned = getLearnedLimits(model.id);
7941
+ if (learned) {
7942
+ const margin = computeSafetyMargin(learned.sampleCount);
7943
+ return Math.floor(learned.tokenLimit * (1 - margin));
7944
+ }
7945
+ const rawTokenLimit = model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? DEFAULT_CONTEXT_WINDOW;
7946
+ return Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100));
7784
7947
  }
7785
7948
  /** Estimate tokens for a single message (fast approximation) */
7786
7949
  function estimateMessageTokens(msg) {
@@ -7793,28 +7956,12 @@ function estimateMessageTokens(msg) {
7793
7956
  if (msg.tool_calls) charCount += JSON.stringify(msg.tool_calls).length;
7794
7957
  return Math.ceil(charCount / 4) + 10;
7795
7958
  }
7796
- /** Get byte size of a message (memoized to avoid redundant JSON.stringify) */
7797
- const messageBytesCache = /* @__PURE__ */ new WeakMap();
7798
- function getMessageBytes(msg) {
7799
- let cached = messageBytesCache.get(msg);
7800
- if (cached !== void 0) return cached;
7801
- cached = JSON.stringify(msg).length;
7802
- messageBytesCache.set(msg, cached);
7803
- return cached;
7804
- }
7805
- /** Calculate cumulative token and byte sums from the end of the message array */
7959
+ /** Calculate cumulative token sums from the end of the message array */
7806
7960
  function calculateCumulativeSums(messages) {
7807
7961
  const n = messages.length;
7808
7962
  const cumTokens = Array.from({ length: n + 1 }).fill(0);
7809
- const cumBytes = Array.from({ length: n + 1 }).fill(0);
7810
- for (let i = n - 1; i >= 0; i--) {
7811
- cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens(messages[i]);
7812
- cumBytes[i] = cumBytes[i + 1] + getMessageBytes(messages[i]) + 1;
7813
- }
7814
- return {
7815
- cumTokens,
7816
- cumBytes
7817
- };
7963
+ for (let i = n - 1; i >= 0; i--) cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens(messages[i]);
7964
+ return { cumTokens };
7818
7965
  }
7819
7966
  /**
7820
7967
  * Clean up orphaned tool messages and ensure valid conversation start.
@@ -7833,20 +7980,19 @@ function cleanupMessages(messages) {
7833
7980
  }
7834
7981
  /**
7835
7982
  * Smart compression strategy for OpenAI format:
7836
- * 1. Calculate tokens/bytes from the end until reaching preservePercent of limit
7983
+ * 1. Calculate tokens from the end until reaching preservePercent of limit
7837
7984
  * 2. Messages before that threshold get their tool content compressed
7838
7985
  * 3. Returns compressed messages and stats
7839
7986
  *
7840
7987
  * @param preservePercent - Percentage of context to preserve uncompressed (0.0-1.0)
7841
7988
  */
7842
- function smartCompressToolResults(messages, tokenLimit, byteLimit, preservePercent) {
7989
+ function smartCompressToolResults(messages, tokenLimit, preservePercent) {
7843
7990
  const n = messages.length;
7844
- const { cumTokens, cumBytes } = calculateCumulativeSums(messages);
7991
+ const { cumTokens } = calculateCumulativeSums(messages);
7845
7992
  const preserveTokenLimit = Math.floor(tokenLimit * preservePercent);
7846
- const preserveByteLimit = Math.floor(byteLimit * preservePercent);
7847
7993
  let thresholdIndex = n;
7848
7994
  for (let i = n - 1; i >= 0; i--) {
7849
- if (cumTokens[i] > preserveTokenLimit || cumBytes[i] > preserveByteLimit) {
7995
+ if (cumTokens[i] > preserveTokenLimit) {
7850
7996
  thresholdIndex = i + 1;
7851
7997
  break;
7852
7998
  }
@@ -7882,21 +8028,17 @@ function smartCompressToolResults(messages, tokenLimit, byteLimit, preservePerce
7882
8028
  * Returns the smallest index where the preserved portion fits within limits.
7883
8029
  */
7884
8030
  function findOptimalPreserveIndex(params) {
7885
- const { messages, systemBytes, systemTokens, payloadOverhead, tokenLimit, byteLimit, checkTokenLimit, checkByteLimit } = params;
8031
+ const { messages, systemTokens, tokenLimit } = params;
7886
8032
  if (messages.length === 0) return 0;
7887
- const markerBytes = 200;
7888
8033
  const availableTokens = tokenLimit - systemTokens - 50;
7889
- const availableBytes = byteLimit - payloadOverhead - systemBytes - markerBytes;
7890
- if (checkTokenLimit && availableTokens <= 0 || checkByteLimit && availableBytes <= 0) return messages.length;
8034
+ if (availableTokens <= 0) return messages.length;
7891
8035
  const n = messages.length;
7892
- const { cumTokens, cumBytes } = calculateCumulativeSums(messages);
8036
+ const { cumTokens } = calculateCumulativeSums(messages);
7893
8037
  let left = 0;
7894
8038
  let right = n;
7895
8039
  while (left < right) {
7896
8040
  const mid = left + right >>> 1;
7897
- const tokensFit = !checkTokenLimit || cumTokens[mid] <= availableTokens;
7898
- const bytesFit = !checkByteLimit || cumBytes[mid] <= availableBytes;
7899
- if (tokensFit && bytesFit) right = mid;
8041
+ if (cumTokens[mid] <= availableTokens) right = mid;
7900
8042
  else left = mid + 1;
7901
8043
  }
7902
8044
  return left;
@@ -7981,11 +8123,6 @@ function buildTimedResult(ctx, result) {
7981
8123
  processingTimeMs: Math.round(performance.now() - ctx.startTime)
7982
8124
  };
7983
8125
  }
7984
- function getReasonLabel(exceedsTokens, exceedsBytes) {
7985
- if (exceedsTokens && exceedsBytes) return "tokens+size";
7986
- if (exceedsBytes) return "size";
7987
- return "tokens";
7988
- }
7989
8126
  /**
7990
8127
  * Step 1: Try compressing tool results to fit within limits.
7991
8128
  * First compresses old tool results, then all if needed.
@@ -7996,7 +8133,7 @@ async function tryCompressToolResults(ctx) {
7996
8133
  workingMessages: ctx.payload.messages,
7997
8134
  compressedCount: 0
7998
8135
  };
7999
- const compressionResult = smartCompressToolResults(ctx.payload.messages, ctx.tokenLimit, ctx.byteLimit, ctx.cfg.preserveRecentPercent);
8136
+ const compressionResult = smartCompressToolResults(ctx.payload.messages, ctx.tokenLimit, ctx.cfg.preserveRecentPercent);
8000
8137
  let workingMessages = compressionResult.messages;
8001
8138
  let compressedCount = compressionResult.compressedCount;
8002
8139
  const compressedPayload = {
@@ -8005,10 +8142,9 @@ async function tryCompressToolResults(ctx) {
8005
8142
  };
8006
8143
  const compressedBytes = JSON.stringify(compressedPayload).length;
8007
8144
  const compressedTokenCount = await getTokenCount(compressedPayload, ctx.model);
8008
- if (compressedTokenCount.input <= ctx.tokenLimit && compressedBytes <= ctx.byteLimit) {
8009
- const reason = getReasonLabel(ctx.exceedsTokens, ctx.exceedsBytes);
8145
+ if (compressedTokenCount.input <= ctx.tokenLimit) {
8010
8146
  const elapsedMs = Math.round(performance.now() - ctx.startTime);
8011
- consola.info(`[AutoTruncate:OpenAI] ${reason}: ${ctx.originalTokens}→${compressedTokenCount.input} tokens, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(compressedBytes)}KB (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
8147
+ consola.info(`[AutoTruncate:OpenAI] tokens: ${ctx.originalTokens}→${compressedTokenCount.input}, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(compressedBytes)}KB (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
8012
8148
  const noticePayload = addCompressionNotice(compressedPayload, compressedCount);
8013
8149
  const noticeTokenOverhead = Math.ceil(150 / 4) + 10;
8014
8150
  return {
@@ -8023,7 +8159,7 @@ async function tryCompressToolResults(ctx) {
8023
8159
  })
8024
8160
  };
8025
8161
  }
8026
- const allCompression = smartCompressToolResults(workingMessages, ctx.tokenLimit, ctx.byteLimit, 0);
8162
+ const allCompression = smartCompressToolResults(workingMessages, ctx.tokenLimit, 0);
8027
8163
  if (allCompression.compressedCount > 0) {
8028
8164
  workingMessages = allCompression.messages;
8029
8165
  compressedCount += allCompression.compressedCount;
@@ -8033,10 +8169,9 @@ async function tryCompressToolResults(ctx) {
8033
8169
  };
8034
8170
  const allCompressedBytes = JSON.stringify(allCompressedPayload).length;
8035
8171
  const allCompressedTokenCount = await getTokenCount(allCompressedPayload, ctx.model);
8036
- if (allCompressedTokenCount.input <= ctx.tokenLimit && allCompressedBytes <= ctx.byteLimit) {
8037
- const reason = getReasonLabel(ctx.exceedsTokens, ctx.exceedsBytes);
8172
+ if (allCompressedTokenCount.input <= ctx.tokenLimit) {
8038
8173
  const elapsedMs = Math.round(performance.now() - ctx.startTime);
8039
- consola.info(`[AutoTruncate:OpenAI] ${reason}: ${ctx.originalTokens}→${allCompressedTokenCount.input} tokens, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(allCompressedBytes)}KB (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
8174
+ consola.info(`[AutoTruncate:OpenAI] tokens: ${ctx.originalTokens}→${allCompressedTokenCount.input}, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(allCompressedBytes)}KB (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
8040
8175
  const noticePayload = addCompressionNotice(allCompressedPayload, compressedCount);
8041
8176
  const noticeTokenOverhead = Math.ceil(150 / 4) + 10;
8042
8177
  return {
@@ -8063,23 +8198,10 @@ async function tryCompressToolResults(ctx) {
8063
8198
  */
8064
8199
  async function truncateByMessageRemoval(ctx, workingMessages, compressedCount) {
8065
8200
  const { systemMessages, conversationMessages } = extractOpenAISystemMessages(workingMessages);
8066
- const messagesBytes = workingMessages.reduce((sum, m) => sum + getMessageBytes(m) + 1, 0) + 1;
8067
- const payloadOverhead = JSON.stringify({
8068
- ...ctx.payload,
8069
- messages: workingMessages
8070
- }).length - messagesBytes;
8071
- const systemBytes = systemMessages.reduce((sum, m) => sum + getMessageBytes(m) + 1, 0);
8072
- const systemTokens = systemMessages.reduce((sum, m) => sum + estimateMessageTokens(m), 0);
8073
- consola.debug(`[AutoTruncate:OpenAI] overhead=${bytesToKB(payloadOverhead)}KB, system=${systemMessages.length} msgs (${bytesToKB(systemBytes)}KB)`);
8074
8201
  const preserveIndex = findOptimalPreserveIndex({
8075
8202
  messages: conversationMessages,
8076
- systemBytes,
8077
- systemTokens,
8078
- payloadOverhead,
8079
- tokenLimit: ctx.tokenLimit,
8080
- byteLimit: ctx.byteLimit,
8081
- checkTokenLimit: ctx.cfg.checkTokenLimit,
8082
- checkByteLimit: ctx.cfg.checkByteLimit
8203
+ systemTokens: systemMessages.reduce((sum, m) => sum + estimateMessageTokens(m), 0),
8204
+ tokenLimit: ctx.tokenLimit
8083
8205
  });
8084
8206
  if (preserveIndex >= conversationMessages.length) {
8085
8207
  consola.warn("[AutoTruncate:OpenAI] Would need to remove all messages");
@@ -8124,14 +8246,13 @@ async function truncateByMessageRemoval(ctx, workingMessages, compressedCount) {
8124
8246
  };
8125
8247
  const newBytes = JSON.stringify(newPayload).length;
8126
8248
  const newTokenCount = await getTokenCount(newPayload, ctx.model);
8127
- const reason = getReasonLabel(ctx.exceedsTokens, ctx.exceedsBytes);
8128
8249
  const actions = [];
8129
8250
  if (removedCount > 0) actions.push(`removed ${removedCount} msgs`);
8130
8251
  if (compressedCount > 0) actions.push(`compressed ${compressedCount} tool_results`);
8131
8252
  const actionInfo = actions.length > 0 ? ` (${actions.join(", ")})` : "";
8132
8253
  const elapsedMs = Math.round(performance.now() - ctx.startTime);
8133
- consola.info(`[AutoTruncate:OpenAI] ${reason}: ${ctx.originalTokens}→${newTokenCount.input} tokens, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
8134
- if (newBytes > ctx.byteLimit) consola.warn(`[AutoTruncate:OpenAI] Result still over byte limit (${bytesToKB(newBytes)}KB > ${bytesToKB(ctx.byteLimit)}KB)`);
8254
+ consola.info(`[AutoTruncate:OpenAI] tokens: ${ctx.originalTokens}→${newTokenCount.input}, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
8255
+ if (newTokenCount.input > ctx.tokenLimit) consola.warn(`[AutoTruncate:OpenAI] Result still over token limit (${newTokenCount.input} > ${ctx.tokenLimit})`);
8135
8256
  return buildTimedResult(ctx, {
8136
8257
  payload: newPayload,
8137
8258
  wasTruncated: true,
@@ -8155,7 +8276,7 @@ async function autoTruncateOpenAI(payload, model, config = {}) {
8155
8276
  ...DEFAULT_AUTO_TRUNCATE_CONFIG,
8156
8277
  ...config
8157
8278
  };
8158
- const { tokenLimit, byteLimit } = calculateLimits(model, cfg);
8279
+ const tokenLimit = calculateTokenLimit(model, cfg);
8159
8280
  const originalBytes = JSON.stringify(payload).length;
8160
8281
  const originalTokens = (await getTokenCount(payload, model)).input;
8161
8282
  const ctx = {
@@ -8163,14 +8284,11 @@ async function autoTruncateOpenAI(payload, model, config = {}) {
8163
8284
  model,
8164
8285
  cfg,
8165
8286
  tokenLimit,
8166
- byteLimit,
8167
8287
  originalTokens,
8168
8288
  originalBytes,
8169
- exceedsTokens: originalTokens > tokenLimit,
8170
- exceedsBytes: originalBytes > byteLimit,
8171
8289
  startTime
8172
8290
  };
8173
- if (!ctx.exceedsTokens && !ctx.exceedsBytes) return buildTimedResult(ctx, {
8291
+ if (originalTokens <= tokenLimit) return buildTimedResult(ctx, {
8174
8292
  payload,
8175
8293
  wasTruncated: false,
8176
8294
  originalTokens,
@@ -8201,7 +8319,7 @@ const createChatCompletions = async (payload) => {
8201
8319
  ...copilotHeaders(state, enableVision),
8202
8320
  "X-Initiator": isAgentCall ? "agent" : "user"
8203
8321
  };
8204
- const fetchSignal = state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
8322
+ const fetchSignal = createFetchSignal();
8205
8323
  const response = await fetch(`${copilotBaseUrl(state)}/chat/completions`, {
8206
8324
  method: "POST",
8207
8325
  headers,
@@ -8372,7 +8490,7 @@ async function handleCompletion(c) {
8372
8490
  consola.debug(`Model name resolved: ${clientModel} → ${resolvedModel}`);
8373
8491
  originalPayload.model = resolvedModel;
8374
8492
  }
8375
- const selectedModel = state.models?.data.find((model) => model.id === originalPayload.model);
8493
+ const selectedModel = state.modelIndex.get(originalPayload.model);
8376
8494
  if (!isEndpointSupported(selectedModel, ENDPOINT.CHAT_COMPLETIONS)) {
8377
8495
  const msg = `Model "${originalPayload.model}" does not support the ${ENDPOINT.CHAT_COMPLETIONS} endpoint`;
8378
8496
  throw new HTTPError(msg, 400, msg);
@@ -8524,8 +8642,8 @@ async function handleStreamingResponse(opts) {
8524
8642
  acc.content += marker;
8525
8643
  }
8526
8644
  const iterator = response[Symbol.asyncIterator]();
8645
+ const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
8527
8646
  for (;;) {
8528
- const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
8529
8647
  const result = await raceIteratorNext(iterator.next(), {
8530
8648
  idleTimeoutMs,
8531
8649
  abortSignal
@@ -8545,7 +8663,7 @@ async function handleStreamingResponse(opts) {
8545
8663
  await stream.writeSSE({
8546
8664
  data: rawEvent.data ?? "",
8547
8665
  event: rawEvent.event,
8548
- id: String(rawEvent.id),
8666
+ id: rawEvent.id != null ? String(rawEvent.id) : void 0,
8549
8667
  retry: rawEvent.retry
8550
8668
  });
8551
8669
  }
@@ -8630,16 +8748,13 @@ async function handleCountTokens(c) {
8630
8748
  const anthropicPayload = await c.req.json();
8631
8749
  anthropicPayload.model = resolveModelName(anthropicPayload.model);
8632
8750
  if (tuiLogId) tuiLogger.updateRequest(tuiLogId, { model: anthropicPayload.model });
8633
- const selectedModel = state.models?.data.find((model) => model.id === anthropicPayload.model);
8751
+ const selectedModel = state.modelIndex.get(anthropicPayload.model);
8634
8752
  if (!selectedModel) {
8635
8753
  consola.warn(`[count_tokens] Model "${anthropicPayload.model}" not found, returning input_tokens=1`);
8636
8754
  return c.json({ input_tokens: 1 });
8637
8755
  }
8638
8756
  if (state.autoTruncate && hasKnownLimits(selectedModel.id)) {
8639
- const truncateCheck = await checkNeedsCompactionAnthropic(anthropicPayload, selectedModel, {
8640
- checkTokenLimit: true,
8641
- checkByteLimit: true
8642
- });
8757
+ const truncateCheck = await checkNeedsCompactionAnthropic(anthropicPayload, selectedModel, { checkTokenLimit: true });
8643
8758
  if (truncateCheck.needed) {
8644
8759
  const contextWindow = selectedModel.capabilities?.limits?.max_context_window_tokens ?? 2e5;
8645
8760
  const inflatedTokens = Math.floor(contextWindow * .95);
@@ -8716,7 +8831,7 @@ modelRoutes.get("/:model", async (c) => {
8716
8831
  try {
8717
8832
  if (!state.models) await cacheModels();
8718
8833
  const modelId = c.req.param("model");
8719
- const model = state.models?.data.find((m) => m.id === modelId);
8834
+ const model = state.modelIndex.get(modelId);
8720
8835
  if (!model) return c.json({ error: {
8721
8836
  message: `The model '${modelId}' does not exist`,
8722
8837
  type: "invalid_request_error",
@@ -8740,7 +8855,7 @@ const createResponses = async (payload) => {
8740
8855
  ...copilotHeaders(state, enableVision),
8741
8856
  "X-Initiator": isAgentCall ? "agent" : "user"
8742
8857
  };
8743
- const fetchSignal = state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
8858
+ const fetchSignal = createFetchSignal();
8744
8859
  const response = await fetch(`${copilotBaseUrl(state)}/responses`, {
8745
8860
  method: "POST",
8746
8861
  headers,
@@ -8880,8 +8995,7 @@ async function handleResponsesCompletion(c) {
8880
8995
  consola.debug(`Model name resolved: ${clientModel} → ${resolvedModel}`);
8881
8996
  payload.model = resolvedModel;
8882
8997
  }
8883
- const selectedModel = state.models?.data.find((model) => model.id === payload.model);
8884
- if (!isEndpointSupported(selectedModel, ENDPOINT.RESPONSES)) {
8998
+ if (!isEndpointSupported(state.modelIndex.get(payload.model), ENDPOINT.RESPONSES)) {
8885
8999
  const msg = `Model "${payload.model}" does not support the ${ENDPOINT.RESPONSES} endpoint`;
8886
9000
  throw new HTTPError(msg, 400, msg);
8887
9001
  }
@@ -8911,10 +9025,33 @@ async function handleResponsesCompletion(c) {
8911
9025
  /** Pass through to Copilot /responses endpoint directly */
8912
9026
  async function handleDirectResponses(opts) {
8913
9027
  const { c, payload, reqCtx } = opts;
8914
- const inputCount = typeof payload.input === "string" ? 1 : payload.input.length;
8915
- consola.debug(`Responses payload: ${inputCount} input item(s), model: ${payload.model}`);
9028
+ const adapter = {
9029
+ format: "openai-responses",
9030
+ sanitize: (p) => ({
9031
+ payload: p,
9032
+ removedCount: 0,
9033
+ systemReminderRemovals: 0
9034
+ }),
9035
+ execute: (p) => executeWithAdaptiveRateLimit(() => createResponses(p)),
9036
+ logPayloadSize: (p) => {
9037
+ const count = typeof p.input === "string" ? 1 : p.input.length;
9038
+ consola.debug(`Responses payload: ${count} input item(s), model: ${p.model}`);
9039
+ }
9040
+ };
9041
+ const strategies = [createTokenRefreshStrategy()];
9042
+ const selectedModel = state.modelIndex.get(payload.model);
8916
9043
  try {
8917
- const { result: response } = await executeWithAdaptiveRateLimit(() => createResponses(payload));
9044
+ const pipelineResult = await executeRequestPipeline({
9045
+ adapter,
9046
+ strategies,
9047
+ payload,
9048
+ originalPayload: payload,
9049
+ model: selectedModel,
9050
+ maxRetries: 1,
9051
+ requestContext: reqCtx
9052
+ });
9053
+ const response = pipelineResult.response;
9054
+ reqCtx.addQueueWaitMs(pipelineResult.queueWaitMs);
8918
9055
  if (!payload.stream) {
8919
9056
  const responsesResponse = response;
8920
9057
  const content = responsesOutputToContent(responsesResponse.output);
@@ -8942,8 +9079,8 @@ async function handleDirectResponses(opts) {
8942
9079
  let eventsIn = 0;
8943
9080
  try {
8944
9081
  const iterator = response[Symbol.asyncIterator]();
9082
+ const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbort.signal);
8945
9083
  for (;;) {
8946
- const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbort.signal);
8947
9084
  const result = await raceIteratorNext(iterator.next(), {
8948
9085
  idleTimeoutMs,
8949
9086
  abortSignal
@@ -8989,7 +9126,13 @@ async function handleDirectResponses(opts) {
8989
9126
  * Handles POST /responses and POST /v1/responses.
8990
9127
  */
8991
9128
  const responsesRoutes = new Hono();
8992
- responsesRoutes.post("/", handleResponsesCompletion);
9129
+ responsesRoutes.post("/", async (c) => {
9130
+ try {
9131
+ return await handleResponsesCompletion(c);
9132
+ } catch (error) {
9133
+ return forwardError(c, error);
9134
+ }
9135
+ });
8993
9136
 
8994
9137
  //#endregion
8995
9138
  //#region src/routes/token/route.ts
@@ -9078,20 +9221,39 @@ registerRoutes(server);
9078
9221
  function formatLimit(value) {
9079
9222
  return value ? `${Math.round(value / 1e3)}k` : "?";
9080
9223
  }
9224
+ /**
9225
+ * Format a model as 3 lines: main info, features, and supported endpoints.
9226
+ *
9227
+ * Example output:
9228
+ * - claude-opus-4.6-1m Anthropic ctx:1000k prp: 936k out: 64k
9229
+ * features: adaptive-thinking, thinking, streaming, vision, tool-calls
9230
+ * endpoints: messages, completions
9231
+ */
9081
9232
  function formatModelInfo(model) {
9082
9233
  const limits = model.capabilities?.limits;
9083
9234
  const supports = model.capabilities?.supports;
9084
9235
  const contextK = formatLimit(limits?.max_context_window_tokens);
9085
9236
  const promptK = formatLimit(limits?.max_prompt_tokens);
9086
9237
  const outputK = formatLimit(limits?.max_output_tokens);
9238
+ const mainLine = ` - ${model.id.length > 28 ? `${model.id.slice(0, 25)}...` : model.id.padEnd(28)} ${model.vendor.padEnd(13)} ctx:${contextK.padStart(5)} prp:${promptK.padStart(5)} out:${outputK.padStart(5)}`;
9087
9239
  const features = [
9088
9240
  ...Object.entries(supports ?? {}).filter(([, value]) => value === true).map(([key]) => key.replaceAll("_", "-")),
9089
9241
  supports?.max_thinking_budget && "thinking",
9090
9242
  model.capabilities?.type === "embeddings" && "embeddings",
9091
9243
  model.preview && "preview"
9092
9244
  ].filter(Boolean).join(", ");
9093
- const featureStr = features ? ` (${features})` : "";
9094
- return ` - ${model.id.length > 25 ? `${model.id.slice(0, 22)}...` : model.id.padEnd(25)} ctx:${contextK.padStart(5)} prp:${promptK.padStart(5)} out:${outputK.padStart(5)}` + featureStr;
9245
+ const featLine = features ? pc.dim(` features: ${features}`) : "";
9246
+ const endpoints = formatEndpoints(model.supported_endpoints);
9247
+ return [
9248
+ mainLine,
9249
+ featLine,
9250
+ pc.dim(` endpoints: ${endpoints}`)
9251
+ ].filter(Boolean).join("\n");
9252
+ }
9253
+ /** Format endpoint paths as short display names */
9254
+ function formatEndpoints(endpoints) {
9255
+ if (!endpoints || endpoints.length === 0) return "(legacy)";
9256
+ return endpoints.map((e) => e.replace(/^\/(v1\/|chat\/)?/, "")).join(", ");
9095
9257
  }
9096
9258
  /** Parse an integer from a string, returning a default if the result is NaN. */
9097
9259
  function parseIntOrDefault(value, defaultValue) {
@@ -9165,6 +9327,7 @@ async function runServer(options) {
9165
9327
  consola.warn("Failed to fetch models from Copilot API:", error instanceof Error ? error.message : error);
9166
9328
  }
9167
9329
  consola.info(`Available models:\n${state.models?.data.map((m) => formatModelInfo(m)).join("\n")}`);
9330
+ await loadPersistedLimits();
9168
9331
  const availableIds = state.models?.data.map((m) => m.id) ?? [];
9169
9332
  const overrideLines = Object.entries(state.modelOverrides).map(([from, to]) => {
9170
9333
  const resolved = resolveModelName(from);