npm - @hsupu/copilot-api - Versions diffs - 0.7.18-beta → 0.7.18-beta.2 - Mend

@hsupu/copilot-api 0.7.18-beta → 0.7.18-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/main.mjs — CHANGED

@@ -1,8 +1,10 @@
 #!/usr/bin/env node
 import { defineCommand, runMain } from "citty";
 import consola, { consola as consola$1 } from "consola";
+import * as fs$1 from "node:fs/promises";
 import fs, { access, constants, readFile } from "node:fs/promises";
 import os, { homedir } from "node:os";
+import * as path$1 from "node:path";
 import path, { dirname, join, resolve } from "node:path";
 import { randomBytes, randomUUID } from "node:crypto";
 import pc from "picocolors";
@@ -24,6 +26,7 @@ const PATHS = {
 	APP_DIR,
 	GITHUB_TOKEN_PATH,
 	CONFIG_YAML: path.join(APP_DIR, "config.yaml"),
+	LEARNED_LIMITS: path.join(APP_DIR, "learned-limits.json"),
 	ERROR_DIR: path.join(APP_DIR, "errmsgs")
 };
 async function ensurePaths() {
@@ -42,7 +45,15 @@ async function ensureFile(filePath) {
 //#endregion
 //#region src/lib/state.ts
-/** Default model overrides: short aliases → top-preference model per family */
+/**
+* Rebuild model lookup indexes from state.models.
+* Called by cacheModels() in production; call directly in tests after setting state.models.
+*/
+function rebuildModelIndex() {
+	const data = state.models?.data ?? [];
+	state.modelIndex = new Map(data.map((m) => [m.id, m]));
+	state.modelIds = new Set(data.map((m) => m.id));
+}
 const DEFAULT_MODEL_OVERRIDES = {
 	opus: "claude-opus-4.6",
 	sonnet: "claude-sonnet-4.6",
@@ -50,6 +61,8 @@ const DEFAULT_MODEL_OVERRIDES = {
 };
 const state = {
 	accountType: "individual",
+	modelIndex: /* @__PURE__ */ new Map(),
+	modelIds: /* @__PURE__ */ new Set(),
 	showGitHubToken: false,
 	verbose: false,
 	autoTruncate: true,
@@ -198,7 +211,11 @@ function extractTrailingSystemReminderTags(text) {
 	while (true) {
 		const currentTagEnd = scanEnd;
 		let end = scanEnd;
-		while (end > 0 && "\n 	\r".includes(text[end - 1])) end--;
+		while (end > 0) {
+			const c = text.charCodeAt(end - 1);
+			if (c !== 10 && c !== 32 && c !== 9 && c !== 13) break;
+			end--;
+		}
 		if (end < 18) break;
 		if (text.slice(end - 18, end) !== CLOSE_TAG) break;
 		const closeTagStart = end - 18;
@@ -240,7 +257,11 @@ function extractLeadingSystemReminderTags(text) {
 	while (true) {
 		const currentTagStart = scanStart;
 		let start = scanStart;
-		while (start < text.length && " 	\r".includes(text[start])) start++;
+		while (start < text.length) {
+			const c = text.charCodeAt(start);
+			if (c !== 32 && c !== 9 && c !== 13) break;
+			start++;
+		}
 		if (start + 17 > text.length) break;
 		if (text.slice(start, start + 17) !== OPEN_TAG) break;
 		const afterOpen = start + 17;
@@ -365,21 +386,6 @@ function removeSystemReminderTags(text) {
 	return end < result.length ? result.slice(0, end) : result;
 }
-//#endregion
-//#region src/lib/utils.ts
-const sleep = (ms) => new Promise((resolve) => {
-	setTimeout(resolve, ms);
-});
-const isNullish = (value) => value === null || value === void 0;
-/** Convert bytes to KB with rounding */
-function bytesToKB(bytes) {
-	return Math.round(bytes / 1024);
-}
-/** Generate unique ID (timestamp + random) */
-function generateId(randomLength = 7) {
-	return Date.now().toString(36) + Math.random().toString(36).slice(2, 2 + randomLength);
-}
 //#endregion
 //#region src/lib/auto-truncate/index.ts
 /**
@@ -392,64 +398,126 @@ const MAX_AUTO_TRUNCATE_RETRIES = 5;
 const AUTO_TRUNCATE_RETRY_FACTOR = .9;
 const DEFAULT_AUTO_TRUNCATE_CONFIG = {
 	safetyMarginPercent: 2,
-	maxRequestBodyBytes: 510 * 1024,
 	preserveRecentPercent: .7,
-	checkTokenLimit: true,
-	checkByteLimit: false
+	checkTokenLimit: true
 };
-/** Dynamic byte limit that adjusts based on 413 errors */
-let dynamicByteLimit = null;
+const learnedLimits = /* @__PURE__ */ new Map();
+/** Get learned limits for a model (including calibration data) */
+function getLearnedLimits(modelId) {
+	return learnedLimits.get(modelId);
+}
 /**
-* Called when a 413 error occurs. Adjusts the byte limit to 90% of the failing size.
+* Check whether a model has known limits from previous failures.
+* Used to decide whether to pre-check requests before sending.
 */
-function onRequestTooLarge(failingBytes) {
-	const newLimit = Math.max(Math.floor(failingBytes * .9), 100 * 1024);
-	dynamicByteLimit = newLimit;
-	consola.info(`[AutoTruncate] Adjusted byte limit: ${bytesToKB(failingBytes)}KB failed → ${bytesToKB(newLimit)}KB`);
-}
-/** Get the current effective byte limit */
-function getEffectiveByteLimitBytes() {
-	return dynamicByteLimit ?? DEFAULT_AUTO_TRUNCATE_CONFIG.maxRequestBodyBytes;
+function hasKnownLimits(modelId) {
+	return learnedLimits.has(modelId);
 }
-/** Dynamic token limits per model, adjusted based on token limit errors */
-const dynamicTokenLimits = /* @__PURE__ */ new Map();
 /**
 * Called when a token limit error (400) occurs.
-* Adjusts the token limit for the specific model to 95% of the reported limit.
-*/
-function onTokenLimitExceeded(modelId, reportedLimit) {
-	const newLimit = Math.floor(reportedLimit * .95);
-	const previous = dynamicTokenLimits.get(modelId);
-	if (!previous || newLimit < previous) {
-		dynamicTokenLimits.set(modelId, newLimit);
-		consola.info(`[AutoTruncate] Adjusted token limit for ${modelId}: ${reportedLimit} reported → ${newLimit} effective`);
+* Records the learned limit and optionally updates calibration.
+*/
+function onTokenLimitExceeded(modelId, reportedLimit, reportedCurrent, estimatedTokens) {
+	const existing = learnedLimits.get(modelId);
+	if (!existing || reportedLimit < existing.tokenLimit) {
+		learnedLimits.set(modelId, {
+			tokenLimit: reportedLimit,
+			calibrationFactor: existing?.calibrationFactor ?? 1,
+			sampleCount: existing?.sampleCount ?? 0,
+			updatedAt: Date.now()
+		});
+		consola.info(`[AutoTruncate] Learned token limit for ${modelId}: ${reportedLimit}`);
+	}
+	if (reportedCurrent !== void 0 && estimatedTokens !== void 0 && estimatedTokens > 0) {
+		updateCalibration(modelId, reportedCurrent, estimatedTokens);
+		const lim = learnedLimits.get(modelId);
+		consola.info(`[AutoTruncate] Calibration for ${modelId}: actual=${reportedCurrent} vs estimated=${estimatedTokens} → factor=${lim.calibrationFactor.toFixed(3)} (${lim.sampleCount} samples)`);
 	}
+	schedulePersist();
 }
+const CALIBRATION_ALPHA = .3;
+const CALIBRATION_MIN = .5;
+const CALIBRATION_MAX = 3;
 /**
-* Get the effective token limit for a model.
-* Returns the dynamic limit if set, otherwise null to use model capabilities.
-*/
-function getEffectiveTokenLimit(modelId) {
-	return dynamicTokenLimits.get(modelId) ?? null;
+* Update the per-model calibration factor using EWMA.
+*
+* Called after a token limit error when we know both the GPT tokenizer estimate
+* and the actual token count (from the error response). The ratio between them
+* tells us how much the GPT tokenizer over/under-estimates for this model.
+*/
+function updateCalibration(modelId, actualTokens, estimatedTokens) {
+	if (estimatedTokens <= 0) return;
+	const limits = learnedLimits.get(modelId);
+	if (!limits) return;
+	const rawFactor = actualTokens / estimatedTokens;
+	const clamped = Math.max(CALIBRATION_MIN, Math.min(CALIBRATION_MAX, rawFactor));
+	if (limits.sampleCount === 0) limits.calibrationFactor = clamped;
+	else limits.calibrationFactor = CALIBRATION_ALPHA * clamped + (1 - CALIBRATION_ALPHA) * limits.calibrationFactor;
+	limits.sampleCount++;
+	limits.updatedAt = Date.now();
+}
+/** Apply calibration factor to a GPT tokenizer estimate */
+function calibrate(modelId, gptEstimate) {
+	const limits = learnedLimits.get(modelId);
+	if (!limits || limits.sampleCount === 0) return gptEstimate;
+	return Math.ceil(gptEstimate * limits.calibrationFactor);
+}
+const BASE_MARGIN = .03;
+const BONUS_MARGIN_PER_SAMPLE = .07;
+/**
+* Compute dynamic safety margin based on calibration confidence.
+* Fewer samples → wider margin (conservative). More samples → narrower margin.
+*
+* - 0 samples: 10% (0.03 + 0.07)
+* - 1 sample:  10%
+* - 10 samples: ~3.7%
+* - ∞ samples:  3%
+*/
+function computeSafetyMargin(sampleCount) {
+	if (sampleCount <= 0) return BASE_MARGIN + BONUS_MARGIN_PER_SAMPLE;
+	return BASE_MARGIN + BONUS_MARGIN_PER_SAMPLE / sampleCount;
+}
+let persistTimer = null;
+const PERSIST_DEBOUNCE_MS = 5e3;
+/** Schedule an async write of learned limits (debounced) */
+function schedulePersist() {
+	if (persistTimer) return;
+	persistTimer = setTimeout(() => {
+		persistTimer = null;
+		persistLimits();
+	}, PERSIST_DEBOUNCE_MS);
+}
+/** Write learned limits to disk */
+async function persistLimits() {
+	if (learnedLimits.size === 0) return;
+	const data = {
+		version: 1,
+		limits: Object.fromEntries(learnedLimits)
+	};
+	try {
+		await fs.writeFile(PATHS.LEARNED_LIMITS, JSON.stringify(data, null, 2), "utf8");
+	} catch {}
 }
-/**
-* Check whether a model has known limits from previous failures.
-* Used to decide whether to pre-check requests before sending.
-*/
-function hasKnownLimits(modelId) {
-	return dynamicTokenLimits.has(modelId) || dynamicByteLimit !== null;
+/** Load previously persisted limits from disk (called at startup) */
+async function loadPersistedLimits() {
+	try {
+		const raw = await fs.readFile(PATHS.LEARNED_LIMITS, "utf8");
+		const data = JSON.parse(raw);
+		if (data.version !== 1) return;
+		for (const [modelId, lim] of Object.entries(data.limits)) if (lim.tokenLimit > 0 && lim.calibrationFactor >= CALIBRATION_MIN && lim.calibrationFactor <= CALIBRATION_MAX) learnedLimits.set(modelId, lim);
+		if (learnedLimits.size > 0) consola.info(`[AutoTruncate] Loaded learned limits for ${learnedLimits.size} model(s)`);
+	} catch {}
 }
 /**
-* Parse an HTTPError to detect token limit or body size errors,
+* Parse an HTTPError to detect token limit errors,
 * and record the learned limit for future pre-checks.
 *
-* Returns error info if the error is a retryable limit error, null otherwise.
+* When `estimatedTokens` is provided (the GPT tokenizer estimate at the time
+* of the error), also updates the per-model calibration factor.
+*
+* Returns error info if the error is a retryable token limit error, null otherwise.
 */
-function tryParseAndLearnLimit(error, modelId, payloadBytes, learn = true) {
-	if (error.status === 413) {
-		if (payloadBytes && learn) onRequestTooLarge(payloadBytes);
-		return { type: "body_too_large" };
-	}
+function tryParseAndLearnLimit(error, modelId, learn = true, estimatedTokens) {
 	if (error.status === 400) {
 		let errorJson;
 		try {
@@ -461,7 +529,7 @@ function tryParseAndLearnLimit(error, modelId, payloadBytes, learn = true) {
 		if (!(errorJson.error.code === "model_max_prompt_tokens_exceeded" || errorJson.error.type === "invalid_request_error")) return null;
 		const tokenInfo = parseTokenLimitError(errorJson.error.message);
 		if (!tokenInfo) return null;
-		if (learn) onTokenLimitExceeded(modelId, tokenInfo.limit);
+		if (learn) onTokenLimitExceeded(modelId, tokenInfo.limit, tokenInfo.current, estimatedTokens);
 		return {
 			type: "token_limit",
 			limit: tokenInfo.limit,
@@ -594,64 +662,9 @@ function formatRateLimitError(copilotMessage) {
 		}
 	};
 }
-/** Format timestamp as YYMMDD_HHmmss for error directory names */
-function formatErrorTimestamp() {
-	const now = /* @__PURE__ */ new Date();
-	return `${String(now.getFullYear()).slice(2)}${String(now.getMonth() + 1).padStart(2, "0")}${String(now.getDate()).padStart(2, "0")}_${String(now.getHours()).padStart(2, "0")}${String(now.getMinutes()).padStart(2, "0")}${String(now.getSeconds()).padStart(2, "0")}`;
-}
-/** Extract request headers as a plain object (excluding potentially large/binary headers) */
-function extractHeaders(c) {
-	const headers = {};
-	for (const [key, value] of c.req.raw.headers.entries()) headers[key] = key.toLowerCase() === "authorization" ? "[REDACTED]" : value;
-	return headers;
-}
-/**
-* Persist error details to disk for post-mortem debugging.
-* Each error gets a subdirectory under errmsgs/ containing:
-* - meta.json: structured metadata (timestamp, status, headers, error info)
-* - request.json: raw request body
-* - response.txt: raw upstream response body
-*
-* Fire-and-forget — never blocks or throws.
-*/
-async function writeErrorToFile(c, error) {
-	const id = randomBytes(4).toString("hex");
-	const dirName = `${formatErrorTimestamp()}_${id}`;
-	const dirPath = path.join(PATHS.ERROR_DIR, dirName);
-	await fs.mkdir(dirPath, { recursive: true });
-	const meta = {
-		timestamp: (/* @__PURE__ */ new Date()).toISOString(),
-		request: {
-			method: c.req.method,
-			path: c.req.path,
-			url: c.req.url,
-			headers: extractHeaders(c)
-		}
-	};
-	if (error instanceof HTTPError) {
-		meta.response = {
-			status: error.status,
-			modelId: error.modelId
-		};
-		meta.error = { message: error.message };
-	} else if (error instanceof Error) meta.error = {
-		message: formatErrorWithCause(error),
-		name: error.name,
-		stack: error.stack
-	};
-	else meta.error = { message: String(error) };
-	const writes = [fs.writeFile(path.join(dirPath, "meta.json"), JSON.stringify(meta, null, 2))];
-	try {
-		const body = await c.req.json();
-		writes.push(fs.writeFile(path.join(dirPath, "request.json"), JSON.stringify(body, null, 2)));
-	} catch {}
-	if (error instanceof HTTPError && error.responseText) writes.push(fs.writeFile(path.join(dirPath, "response.txt"), error.responseText));
-	await Promise.all(writes);
-}
 function forwardError(c, error) {
-	writeErrorToFile(c, error).catch(() => {});
 	if (error instanceof HTTPError) {
-		const limitInfo = tryParseAndLearnLimit(error, error.modelId ?? "unknown", void 0, state.autoTruncate);
+		const limitInfo = tryParseAndLearnLimit(error, error.modelId ?? "unknown", state.autoTruncate);
 		if (error.status === 413) {
 			const formattedError = formatRequestTooLargeError();
 			consola.warn(`HTTP 413: Request too large`);
@@ -827,7 +840,6 @@ const NETWORK_ERROR_PATTERNS = [
 function isNetworkError(error) {
 	const msg = error.message.toLowerCase();
 	if (NETWORK_ERROR_PATTERNS.some((p) => msg.includes(p.toLowerCase()))) return true;
-	if (error instanceof TypeError) return true;
 	if (error.cause instanceof Error) return isNetworkError(error.cause);
 	return false;
 }
@@ -1048,6 +1060,21 @@ var CopilotTokenManager = class {
 	}
 };
+//#endregion
+//#region src/lib/utils.ts
+const sleep = (ms) => new Promise((resolve) => {
+	setTimeout(resolve, ms);
+});
+const isNullish = (value) => value === null || value === void 0;
+/** Convert bytes to KB with rounding */
+function bytesToKB(bytes) {
+	return Math.round(bytes / 1024);
+}
+/** Generate unique ID (timestamp + random) */
+function generateId(randomLength = 7) {
+	return Date.now().toString(36) + Math.random().toString(36).slice(2, 2 + randomLength);
+}
 //#endregion
 //#region src/lib/token/github-client.ts
 /** GitHub OAuth API client — device code flow and user info */
@@ -1589,6 +1616,7 @@ const checkUsage = defineCommand({
 /** Fetch models from Copilot API and cache in global state */
 async function cacheModels() {
 	state.models = await getModels();
+	rebuildModelIndex();
 }
 const getModels = async () => {
 	const response = await fetch(`${copilotBaseUrl(state)}/models`, { headers: copilotHeaders(state) });
@@ -1766,6 +1794,8 @@ var AdaptiveRateLimiter = class {
 	lastRequestTime = 0;
 	/** Current step in gradual recovery (index into gradualRecoverySteps) */
 	recoveryStepIndex = 0;
+	/** Abort controller for cancelling pending sleeps during shutdown */
+	sleepAbortController = new AbortController();
 	constructor(config = {}) {
 		this.config = {
 			...DEFAULT_CONFIG,
@@ -1999,10 +2029,20 @@ var AdaptiveRateLimiter = class {
 			request.reject(/* @__PURE__ */ new Error("Server shutting down"));
 		}
 		this.processing = false;
+		this.sleepAbortController.abort();
+		this.sleepAbortController = new AbortController();
 		return count;
 	}
 	sleep(ms) {
-		return new Promise((resolve) => setTimeout(resolve, ms));
+		const signal = this.sleepAbortController.signal;
+		if (signal.aborted) return Promise.resolve();
+		return new Promise((resolve) => {
+			const timer = setTimeout(resolve, ms);
+			signal.addEventListener("abort", () => {
+				clearTimeout(timer);
+				resolve();
+			}, { once: true });
+		});
 	}
 	/**
 	* Get current status for debugging/monitoring
@@ -2073,6 +2113,10 @@ const MODEL_PREFERENCE = {
 	],
 	haiku: ["claude-haiku-4.5"]
 };
+/** Pre-compiled regex: claude-{family}-{major}-{minor}[-YYYYMMDD] */
+const VERSIONED_RE = /^(claude-(?:opus|sonnet|haiku))-(\d+)-(\d{1,2})(?:-\d{8,})?$/;
+/** Pre-compiled regex: claude-{family}-{major}-YYYYMMDD (date-only suffix) */
+const DATE_ONLY_RE = /^(claude-(opus|sonnet|haiku)-\d+)-\d{8,}$/;
 /**
 * Normalize model ID for matching: lowercase and replace dots with dashes.
 * e.g. "claude-sonnet-4.5" → "claude-sonnet-4-5"
@@ -2093,7 +2137,7 @@ function normalizeForMatching(modelId) {
 */
 function normalizeModelId(modelId) {
 	const { base, suffix } = extractModifierSuffix(modelId);
-	const versionedMatch = base.match(/^(claude-(?:opus|sonnet|haiku))-(\d+)-(\d{1,2})(?:-\d{8,})?$/);
+	const versionedMatch = base.match(VERSIONED_RE);
 	if (versionedMatch) return `${versionedMatch[1]}-${versionedMatch[2]}.${versionedMatch[3]}${suffix}`;
 	return modelId;
 }
@@ -2112,9 +2156,8 @@ function getModelFamily(modelId) {
 function findPreferredModel(family) {
 	const preference = MODEL_PREFERENCE[family];
 	if (!preference) return family;
-	const availableIds = state.models?.data.map((m) => m.id);
-	if (!availableIds || availableIds.length === 0) return preference[0];
-	for (const candidate of preference) if (availableIds.includes(candidate)) return candidate;
+	if (state.modelIds.size === 0) return preference[0];
+	for (const candidate of preference) if (state.modelIds.has(candidate)) return candidate;
 	return preference[0];
 }
 /** Known model modifier suffixes (e.g., "-fast" for fast output mode, "-1m" for 1M context). */
@@ -2182,8 +2225,7 @@ function resolveModelName(model) {
 * Uses `seen` set to prevent circular override chains.
 */
 function resolveOverrideTarget(source, target, seen) {
-	const availableIds = state.models?.data.map((m) => m.id);
-	if (!availableIds || availableIds.length === 0 || availableIds.includes(target)) return target;
+	if (state.modelIds.size === 0 || state.modelIds.has(target)) return target;
 	const visited = seen ?? new Set([source]);
 	const targetOverride = state.modelOverrides[target];
 	if (targetOverride && !visited.has(target)) {
@@ -2213,8 +2255,7 @@ function resolveModelNameCore(model) {
 	const resolvedBase = resolveBase(base);
 	if (suffix) {
 		const withSuffix = resolvedBase + suffix;
-		const availableIds = state.models?.data.map((m) => m.id);
-		if (!availableIds || availableIds.length === 0 || availableIds.includes(withSuffix)) return withSuffix;
+		if (state.modelIds.size === 0 || state.modelIds.has(withSuffix)) return withSuffix;
 		return resolvedBase;
 	}
 	return resolvedBase;
@@ -2222,17 +2263,16 @@ function resolveModelNameCore(model) {
 /** Resolve a base model name (without modifier suffix) to its canonical form. */
 function resolveBase(model) {
 	if (model in MODEL_PREFERENCE) return findPreferredModel(model);
-	const versionedMatch = model.match(/^(claude-(?:opus|sonnet|haiku))-(\d+)-(\d{1,2})(?:-\d{8,})?$/);
+	const versionedMatch = model.match(VERSIONED_RE);
 	if (versionedMatch) {
 		const dotModel = `${versionedMatch[1]}-${versionedMatch[2]}.${versionedMatch[3]}`;
-		const availableIds = state.models?.data.map((m) => m.id);
-		if (!availableIds || availableIds.length === 0 || availableIds.includes(dotModel)) return dotModel;
+		if (state.modelIds.size === 0 || state.modelIds.has(dotModel)) return dotModel;
 	}
-	const dateOnlyMatch = model.match(/^(claude-(opus|sonnet|haiku)-\d+)-\d{8,}$/);
+	const dateOnlyMatch = model.match(DATE_ONLY_RE);
 	if (dateOnlyMatch) {
 		const baseModel = dateOnlyMatch[1];
 		const family = dateOnlyMatch[2];
-		if ((state.models?.data.map((m) => m.id))?.includes(baseModel)) return baseModel;
+		if (state.modelIds.has(baseModel)) return baseModel;
 		return findPreferredModel(family);
 	}
 	return model;
@@ -2272,6 +2312,9 @@ function createRequestContext(opts) {
 		get durationMs() {
 			return Date.now() - startTime;
 		},
+		get settled() {
+			return settled;
+		},
 		get originalRequest() {
 			return _originalRequest;
 		},
@@ -2410,7 +2453,7 @@ function createRequestContext(opts) {
 		fail(model, error) {
 			if (settled) return;
 			settled = true;
-			const errorMessage = getErrorMessage(error);
+			const errorMsg = getErrorMessage(error);
 			_response = {
 				success: false,
 				model: normalizeModelId(model),
@@ -2418,28 +2461,14 @@ function createRequestContext(opts) {
 					input_tokens: 0,
 					output_tokens: 0
 				},
-				error: errorMessage,
+				error: errorMsg,
 				content: null
 			};
 			if (error instanceof Error && "responseText" in error && typeof error.responseText === "string") {
 				const responseText = error.responseText;
-				const status = "status" in error ? error.status : void 0;
-				if (responseText) {
-					let formattedBody;
-					try {
-						formattedBody = JSON.stringify(JSON.parse(responseText), null, 2);
-					} catch {
-						formattedBody = responseText;
-					}
-					_response.content = {
-						role: "assistant",
-						content: [{
-							type: "text",
-							text: `[API Error Response${status ? ` - HTTP ${status}` : ""}]\n\n${formattedBody}`
-						}]
-					};
-				}
+				if (responseText) _response.responseText = responseText;
 			}
+			if (error instanceof Error && "status" in error && typeof error.status === "number") _response.status = error.status;
 			_state = "failed";
 			emit({
 				type: "failed",
@@ -2462,7 +2491,7 @@ function createRequestContext(opts) {
 				}
 			};
 			if (_response) entry.response = _response;
-			const lastTruncation = [..._attempts].reverse().find((a) => a.truncation)?.truncation;
+			const lastTruncation = _attempts.findLast((a) => a.truncation)?.truncation;
 			if (lastTruncation) entry.truncation = lastTruncation;
 			if (_rewrites) entry.rewrites = _rewrites;
 			if (_sseEvents) entry.sseEvents = _sseEvents;
@@ -2500,7 +2529,7 @@ function createRequestContextManager() {
 		const maxAgeMs = state.staleRequestMaxAge * 1e3;
 		if (maxAgeMs <= 0) return;
 		for (const [id, ctx] of activeContexts) if (ctx.durationMs > maxAgeMs) {
-			consola$1.warn(`[context] Force-failing stale request ${id} (age: ${Math.round(ctx.durationMs / 1e3)}s, max: ${state.staleRequestMaxAge}s, model: ${ctx.originalRequest?.model ?? "unknown"})`);
+			consola$1.warn(`[context] Force-failing stale request ${id} (endpoint: ${ctx.endpoint}, model: ${ctx.originalRequest?.model ?? "unknown"}, stream: ${ctx.originalRequest?.stream ?? "?"}, state: ${ctx.state}, age: ${Math.round(ctx.durationMs / 1e3)}s, max: ${state.staleRequestMaxAge}s)`);
 			ctx.fail(ctx.originalRequest?.model ?? "unknown", /* @__PURE__ */ new Error(`Request exceeded maximum age of ${state.staleRequestMaxAge}s (stale context reaper)`));
 		}
 	}
@@ -2707,10 +2736,18 @@ function buildSearchText(entry) {
 			for (const block of msg.content) if (block.type === "text" && block.text) parts.push(block.text.slice(0, 200));
 			else if (block.type === "tool_use") {
 				if (block.name) parts.push(block.name);
+				if (block.input) {
+					const inputStr = typeof block.input === "string" ? block.input : JSON.stringify(block.input);
+					parts.push(inputStr.slice(0, 500));
+				}
+			} else if (block.type === "tool_result" && block.content) {
+				const contentStr = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
+				parts.push(contentStr.slice(0, 500));
 			} else if (block.type === "thinking" && block.thinking) parts.push(block.thinking.slice(0, 200));
 		}
-		if (msg.tool_calls) {
-			for (const tc of msg.tool_calls) if (tc.function.name) parts.push(tc.function.name);
+		if (msg.tool_calls) for (const tc of msg.tool_calls) {
+			if (tc.function.name) parts.push(tc.function.name);
+			if (tc.function.arguments) parts.push(tc.function.arguments.slice(0, 500));
 		}
 	}
 	if (entry.response?.content) {
@@ -2723,7 +2760,7 @@ function buildSearchText(entry) {
 	}
 	return parts.join(" ").toLowerCase();
 }
-/** Build a summary from a full HistoryEntry */
+/** Build a summary from a full HistoryEntry (searchText is computed lazily) */
 function toSummary(entry) {
 	return {
 		id: entry.id,
@@ -2739,7 +2776,7 @@ function toSummary(entry) {
 		usage: entry.response?.usage,
 		durationMs: entry.durationMs,
 		previewText: extractPreviewText(entry),
-		searchText: buildSearchText(entry)
+		searchText: ""
 	};
 }
 /** Global history state */
@@ -2756,6 +2793,14 @@ const entryIndex = /* @__PURE__ */ new Map();
 const summaryIndex = /* @__PURE__ */ new Map();
 /** Track entry count per session to avoid O(n) filter during FIFO eviction */
 const sessionEntryCount = /* @__PURE__ */ new Map();
+/** O(1) uniqueness tracking for session.models (avoids Array.includes in hot path) */
+const sessionModelsSet = /* @__PURE__ */ new Map();
+/** O(1) uniqueness tracking for session.toolsUsed (avoids Array.includes in hot path) */
+const sessionToolsSet = /* @__PURE__ */ new Map();
+/** Dirty flag for stats cache — set true when entries are inserted/updated */
+let statsDirty = true;
+/** Cached stats result — recomputed only when statsDirty is true */
+let cachedStats = null;
 function initHistory(enabled, maxEntries) {
 	historyState.enabled = enabled;
 	historyState.maxEntries = maxEntries;
@@ -2765,6 +2810,10 @@ function initHistory(enabled, maxEntries) {
 	entryIndex.clear();
 	summaryIndex.clear();
 	sessionEntryCount.clear();
+	sessionModelsSet.clear();
+	sessionToolsSet.clear();
+	statsDirty = true;
+	cachedStats = null;
 }
 /** Update the maximum number of history entries (for config hot-reload) */
 function setHistoryMaxEntries(limit) {
@@ -2790,6 +2839,8 @@ function getCurrentSession(endpoint) {
 	const now = Date.now();
 	const sessionId = generateId();
 	historyState.currentSessionId = sessionId;
+	sessionModelsSet.set(sessionId, /* @__PURE__ */ new Set());
+	sessionToolsSet.set(sessionId, /* @__PURE__ */ new Set());
 	historyState.sessions.set(sessionId, {
 		id: sessionId,
 		startTime: now,
@@ -2815,25 +2866,43 @@ function insertEntry(entry) {
 	session.requestCount++;
 	sessionEntryCount.set(entry.sessionId, (sessionEntryCount.get(entry.sessionId) ?? 0) + 1);
 	const model = entry.request.model;
-	if (model && !session.models.includes(model)) session.models.push(model);
+	if (model) {
+		const modelsSet = sessionModelsSet.get(entry.sessionId);
+		if (modelsSet && !modelsSet.has(model)) {
+			modelsSet.add(model);
+			session.models.push(model);
+		}
+	}
 	if (entry.request.tools && entry.request.tools.length > 0) {
 		if (!session.toolsUsed) session.toolsUsed = [];
-		for (const tool of entry.request.tools) if (!session.toolsUsed.includes(tool.name)) session.toolsUsed.push(tool.name);
+		let toolsSet = sessionToolsSet.get(entry.sessionId);
+		if (!toolsSet) {
+			toolsSet = new Set(session.toolsUsed);
+			sessionToolsSet.set(entry.sessionId, toolsSet);
+		}
+		for (const tool of entry.request.tools) if (!toolsSet.has(tool.name)) {
+			toolsSet.add(tool.name);
+			session.toolsUsed.push(tool.name);
+		}
 	}
 	const summary = toSummary(entry);
 	summaryIndex.set(entry.id, summary);
-	while (historyState.maxEntries > 0 && historyState.entries.length > historyState.maxEntries) {
-		const removed = historyState.entries.shift();
-		if (removed) {
-			entryIndex.delete(removed.id);
-			summaryIndex.delete(removed.id);
-			const count = (sessionEntryCount.get(removed.sessionId) ?? 1) - 1;
+	if (historyState.maxEntries > 0 && historyState.entries.length > historyState.maxEntries) {
+		const excess = historyState.entries.length - historyState.maxEntries;
+		const removed = historyState.entries.splice(0, excess);
+		for (const r of removed) {
+			entryIndex.delete(r.id);
+			summaryIndex.delete(r.id);
+			const count = (sessionEntryCount.get(r.sessionId) ?? 1) - 1;
 			if (count <= 0) {
-				sessionEntryCount.delete(removed.sessionId);
-				historyState.sessions.delete(removed.sessionId);
-			} else sessionEntryCount.set(removed.sessionId, count);
+				sessionEntryCount.delete(r.sessionId);
+				sessionModelsSet.delete(r.sessionId);
+				sessionToolsSet.delete(r.sessionId);
+				historyState.sessions.delete(r.sessionId);
+			} else sessionEntryCount.set(r.sessionId, count);
 		}
 	}
+	statsDirty = true;
 	notifyEntryAdded(summary);
 }
 /**
@@ -2849,10 +2918,24 @@ function updateEntry(id, update) {
 		const session = historyState.sessions.get(entry.sessionId);
 		if (session) {
 			const model = update.request.model;
-			if (model && !session.models.includes(model)) session.models.push(model);
+			if (model) {
+				const modelsSet = sessionModelsSet.get(entry.sessionId);
+				if (modelsSet && !modelsSet.has(model)) {
+					modelsSet.add(model);
+					session.models.push(model);
+				}
+			}
 			if (update.request.tools && update.request.tools.length > 0) {
 				if (!session.toolsUsed) session.toolsUsed = [];
-				for (const tool of update.request.tools) if (!session.toolsUsed.includes(tool.name)) session.toolsUsed.push(tool.name);
+				let toolsSet = sessionToolsSet.get(entry.sessionId);
+				if (!toolsSet) {
+					toolsSet = new Set(session.toolsUsed);
+					sessionToolsSet.set(entry.sessionId, toolsSet);
+				}
+				for (const tool of update.request.tools) if (!toolsSet.has(tool.name)) {
+					toolsSet.add(tool.name);
+					session.toolsUsed.push(tool.name);
+				}
 			}
 		}
 	}
@@ -2867,6 +2950,7 @@ function updateEntry(id, update) {
 			session.lastActivity = Date.now();
 		}
 	}
+	statsDirty = true;
 	const summary = toSummary(entry);
 	summaryIndex.set(entry.id, summary);
 	notifyEntryUpdated(summary);
@@ -2894,7 +2978,13 @@ function getHistorySummaries(options = {}) {
 	if (to) summaries = summaries.filter((s) => s.timestamp <= to);
 	if (search) {
 		const needle = search.toLowerCase();
-		summaries = summaries.filter((s) => s.searchText.includes(needle));
+		summaries = summaries.filter((s) => {
+			if (s.searchText === "") {
+				const entry = entryIndex.get(s.id);
+				if (entry) s.searchText = buildSearchText(entry);
+			}
+			return s.searchText.includes(needle);
+		});
 	}
 	summaries.sort((a, b) => b.timestamp - a.timestamp);
 	const total = summaries.length;
@@ -2928,6 +3018,10 @@ function clearHistory() {
 	entryIndex.clear();
 	summaryIndex.clear();
 	sessionEntryCount.clear();
+	sessionModelsSet.clear();
+	sessionToolsSet.clear();
+	statsDirty = true;
+	cachedStats = null;
 }
 function deleteSession(sessionId) {
 	if (!historyState.sessions.has(sessionId)) return false;
@@ -2939,10 +3033,15 @@ function deleteSession(sessionId) {
 	historyState.entries = remaining;
 	historyState.sessions.delete(sessionId);
 	sessionEntryCount.delete(sessionId);
+	sessionModelsSet.delete(sessionId);
+	sessionToolsSet.delete(sessionId);
+	statsDirty = true;
+	cachedStats = null;
 	if (historyState.currentSessionId === sessionId) historyState.currentSessionId = generateId();
 	return true;
 }
 function getStats() {
+	if (!statsDirty && cachedStats) return cachedStats;
 	const entries = historyState.entries;
 	const modelDist = {};
 	const endpointDist = {};
@@ -2975,7 +3074,7 @@ function getStats() {
 		hour,
 		count
 	}));
-	return {
+	const stats = {
 		totalRequests: entries.length,
 		successfulRequests: successCount,
 		failedRequests: failCount,
@@ -2987,6 +3086,9 @@ function getStats() {
 		recentActivity,
 		activeSessions: historyState.sessions.size
 	};
+	statsDirty = false;
+	cachedStats = stats;
+	return stats;
 }
 /** Escape a value for CSV: wrap in quotes if it contains comma, quote, or newline; convert nullish to empty string */
 function escapeCsvValue(value) {
@@ -3112,7 +3214,6 @@ async function gracefulShutdown(signal, deps) {
 	const tracker = deps?.tracker ?? tuiLogger;
 	const server = deps?.server ?? serverInstance;
 	const rateLimiter = deps?.rateLimiter !== void 0 ? deps.rateLimiter : getAdaptiveRateLimiter();
-	const contextManager = deps?.contextManager ?? getRequestContextManager();
 	const stopRefresh = deps?.stopTokenRefreshFn ?? stopTokenRefresh;
 	const closeWsClients = deps?.closeAllClientsFn ?? closeAllClients;
 	const getWsClientCount = deps?.getClientCountFn ?? getClientCount;
@@ -3125,7 +3226,9 @@ async function gracefulShutdown(signal, deps) {
 	_isShuttingDown = true;
 	shutdownAbortController = new AbortController();
 	consola.info(`Received ${signal}, shutting down gracefully...`);
-	contextManager?.stopReaper();
+	try {
+		(deps?.contextManager ?? getRequestContextManager()).stopReaper();
+	} catch {}
 	stopRefresh();
 	const wsClients = getWsClientCount();
 	if (wsClients > 0) {
@@ -3244,7 +3347,7 @@ var TuiLogger = class {
 		if (!entry) return;
 		if (update.model !== void 0) {
 			entry.model = update.model;
-			const multiplier = state.models?.data.find((m) => m.id === update.model)?.billing?.multiplier;
+			const multiplier = state.modelIndex.get(update.model)?.billing?.multiplier;
 			if (multiplier !== void 0) entry.multiplier = multiplier;
 		}
 		if (update.clientModel !== void 0) entry.clientModel = update.clientModel;
@@ -3957,7 +4060,7 @@ const setupClaudeCode = defineCommand({
 //#endregion
 //#region package.json
-var version = "0.7.18-beta";
+var version = "0.7.18-beta.2";
 //#endregion
 //#region src/lib/config/config.ts
@@ -3999,9 +4102,15 @@ function compileRewriteRules(raws) {
 }
 let cachedConfig = null;
 let configLastMtimeMs = 0;
+/** Time-based debounce: skip stat() if checked recently */
+let lastStatTimeMs = 0;
+const STAT_DEBOUNCE_MS = 2e3;
 async function loadConfig() {
 	try {
+		const now = Date.now();
+		if (cachedConfig && now - lastStatTimeMs < STAT_DEBOUNCE_MS) return cachedConfig;
 		const stat = await fs.stat(PATHS.CONFIG_YAML);
+		lastStatTimeMs = now;
 		if (cachedConfig && stat.mtimeMs === configLastMtimeMs) return cachedConfig;
 		const content = await fs.readFile(PATHS.CONFIG_YAML, "utf8");
 		const { parse } = await import("yaml");
@@ -4048,7 +4157,7 @@ async function applyConfigToState() {
 			else if (Array.isArray(a.rewrite_system_reminders)) state.rewriteSystemReminders = compileRewriteRules(a.rewrite_system_reminders);
 		}
 	}
-	if (config.system_prompt_overrides !== void 0) state.systemPromptOverrides = config.system_prompt_overrides.length > 0 ? compileRewriteRules(config.system_prompt_overrides) : [];
+	if (Array.isArray(config.system_prompt_overrides)) state.systemPromptOverrides = config.system_prompt_overrides.length > 0 ? compileRewriteRules(config.system_prompt_overrides) : [];
 	if (config.model_overrides) state.modelOverrides = {
 		...DEFAULT_MODEL_OVERRIDES,
 		...config.model_overrides
@@ -4072,6 +4181,78 @@ async function applyConfigToState() {
 	return config;
 }
+//#endregion
+//#region src/lib/context/error-persistence.ts
+/**
+* Error persistence consumer.
+*
+* Subscribes to "failed" events on RequestContext and writes structured
+* error files to disk for post-mortem debugging. All data comes from
+* RequestContext (via HistoryEntryData on the event), not from Hono
+* Context — ensuring reliability regardless of whether the HTTP body
+* has been consumed.
+*
+* Output directory: PATHS.ERROR_DIR/{YYMMDD_HHmmss}_{random 8-char hex id}/
+* Files:
+*   - meta.json:       structured metadata (timestamp, endpoint, model, error, attempts)
+*   - request.json:    request payload; the messages array is omitted (only messageCount is kept) when it has more than 50 entries
+*   - response.txt:    raw upstream response body (if available)
+*   - sse-events.json: recorded SSE events (if streaming request failed mid-stream)
+*/
+/**
+* Handle context events — only acts on "failed"; every other event type is ignored.
+*
+* The write is started but not awaited; if it rejects, the error is logged
+* at debug level and not rethrown.
+*
+* @param event - Context event; `event.type` is checked and `event.entry` is passed to writeErrorEntry
+*/
+function handleErrorPersistence(event) {
+	if (event.type !== "failed") return;
+	writeErrorEntry(event.entry).catch((err) => {
+		consola.debug(`[ErrorPersistence] Failed to write error file: ${err}`);
+	});
+}
+/**
+* Message-count threshold for request.json (to avoid huge files).
+* The messages array is written only when it has at most this many entries;
+* above that, only messageCount is recorded.
+*/
+const MAX_MESSAGES_IN_DUMP = 50;
+async function writeErrorEntry(entry) { // Collects the files in memory first, then creates the directory and writes them all
+	const meta = {
+		timestamp: new Date(entry.timestamp).toISOString(),
+		id: entry.id,
+		endpoint: entry.endpoint,
+		durationMs: entry.durationMs,
+		request: { // NOTE(review): entry.request is read unguarded here but guarded by `if (entry.request)` further down — confirm it is always set
+			model: entry.request.model,
+			stream: entry.request.stream,
+			messageCount: entry.request.messages?.length,
+			toolCount: entry.request.tools?.length
+		},
+		response: entry.response ? {
+			success: entry.response.success,
+			model: entry.response.model,
+			error: entry.response.error,
+			status: entry.response.status
+		} : void 0,
+		truncation: entry.truncation,
+		attempts: entry.attempts
+	};
+	const files = [["meta.json", JSON.stringify(meta, null, 2)]]; // [fileName, content] pairs; meta.json is always written
+	if (entry.request) {
+		const { messages, ...requestWithoutMessages } = entry.request;
+		const requestData = {
+			...requestWithoutMessages,
+			messageCount: messages?.length,
+			...messages && messages.length <= MAX_MESSAGES_IN_DUMP && { messages } // messages are kept only when within the threshold
+		};
+		files.push(["request.json", JSON.stringify(requestData, null, 2)]);
+	}
+	if (entry.response?.responseText) files.push(["response.txt", entry.response.responseText]);
+	if (entry.sseEvents?.length) files.push(["sse-events.json", JSON.stringify(entry.sseEvents, null, 2)]);
+	const id = randomBytes(4).toString("hex"); // random suffix (8 hex chars); not derived from entry.id
+	const dirPath = path$1.join(PATHS.ERROR_DIR, `${formatTimestamp()}_${id}`); // named after the current time, not entry.timestamp
+	await fs$1.mkdir(dirPath, { recursive: true });
+	await Promise.all(files.map(([name, content]) => fs$1.writeFile(path$1.join(dirPath, name), content))); // files are written concurrently
+}
+/**
+* Format the current time as YYMMDD_HHmmss for error directory names.
+* Uses the local-time Date getters, so the result follows the process time zone.
+*
+* @returns Timestamp string: year with its first two digits dropped, every other field zero-padded to two digits
+*/
+function formatTimestamp() {
+	const now = /* @__PURE__ */ new Date();
+	const pad = (n) => String(n).padStart(2, "0");
+	return `${String(now.getFullYear()).slice(2)}${pad(now.getMonth() + 1)}${pad(now.getDate())}_${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`;
+}
 //#endregion
 //#region src/lib/context/consumers.ts
 function handleHistoryEvent(event) {
@@ -4190,6 +4371,7 @@ function toHistoryResponse(entryData) {
 function registerContextConsumers(manager) {
 	manager.on("change", handleHistoryEvent);
 	manager.on("change", handleTuiEvent);
+	manager.on("change", handleErrorPersistence); // same "change" event as the other consumers; the handler itself ignores everything except "failed"
 }
 //#endregion
@@ -4709,13 +4891,13 @@ const getTokenCount = async (payload, model) => {
 */
 /**
 * Log helpful debugging information when a 413 error occurs.
-* Also adjusts the dynamic byte limit for future requests.
+*
+* @param precomputedBytes - Optional pre-computed payload byte size to avoid redundant JSON.stringify
 */
-async function logPayloadSizeInfo(payload, model) {
+async function logPayloadSizeInfo(payload, model, precomputedBytes) {
 	const messageCount = payload.messages.length;
-	const bodySize = JSON.stringify(payload).length;
+	const bodySize = precomputedBytes ?? JSON.stringify(payload).length;
 	const bodySizeKB = bytesToKB(bodySize);
-	onRequestTooLarge(bodySize);
 	let imageCount = 0;
 	let largeMessages = 0;
 	let totalImageSize = 0;
@@ -4797,7 +4979,7 @@ async function executeRequestPipeline(opts) {
 		try {
 			const { result: response, queueWaitMs } = await adapter.execute(effectivePayload);
 			totalQueueWaitMs += queueWaitMs;
-			requestContext?.addQueueWaitMs(totalQueueWaitMs);
+			requestContext?.addQueueWaitMs(queueWaitMs);
 			return {
 				response,
 				effectivePayload,
@@ -5105,8 +5287,7 @@ function buildResponsesResponseData(acc, fallbackModel) {
 /**
 * Auto-truncate retry strategy.
 *
-* Handles 413 (body too large) and token limit errors by truncating the
-* message payload and retrying.
+* Handles token limit errors by truncating the message payload and retrying.
 */
 /**
 * Create an auto-truncate retry strategy.
@@ -5134,26 +5315,44 @@ function createAutoTruncateStrategy(opts) {
 				action: "abort",
 				error
 			};
-			const payloadBytes = JSON.stringify(currentPayload).length;
-			const parsed = tryParseAndLearnLimit(rawError, model.id, payloadBytes);
-			if (!parsed) return {
-				action: "abort",
-				error
-			};
+			const payloadJson = JSON.stringify(currentPayload);
+			const estimatedTokens = Math.ceil(payloadJson.length / 4);
+			const parsed = tryParseAndLearnLimit(rawError, model.id, true, estimatedTokens);
+			if (!parsed) {
+				if (rawError.status === 413) {
+					consola.info(`[${label}] Attempt ${attempt + 1}/${maxRetries + 1}: 413 Body too large, retrying with truncation...`);
+					const truncateResult = await truncate(originalPayload, model, { checkTokenLimit: true });
+					if (!truncateResult.wasTruncated) return {
+						action: "abort",
+						error
+					};
+					const sanitizeResult = resanitize(truncateResult.payload);
+					return {
+						action: "retry",
+						payload: sanitizeResult.payload,
+						meta: {
+							truncateResult,
+							sanitization: sanitizeResult.stats ?? {
+								totalBlocksRemoved: sanitizeResult.removedCount,
+								systemReminderRemovals: sanitizeResult.systemReminderRemovals
+							},
+							attempt: attempt + 1
+						}
+					};
+				}
+				return {
+					action: "abort",
+					error
+				};
+			}
 			let targetTokenLimit;
-			let targetByteLimitBytes;
-			if (parsed.type === "token_limit" && parsed.limit) {
+			if (parsed.limit) {
 				targetTokenLimit = Math.floor(parsed.limit * AUTO_TRUNCATE_RETRY_FACTOR);
 				consola.info(`[${label}] Attempt ${attempt + 1}/${maxRetries + 1}: Token limit error (${parsed.current}>${parsed.limit}), retrying with limit ${targetTokenLimit}...`);
-			} else if (parsed.type === "body_too_large") {
-				targetByteLimitBytes = Math.floor(payloadBytes * AUTO_TRUNCATE_RETRY_FACTOR);
-				consola.info(`[${label}] Attempt ${attempt + 1}/${maxRetries + 1}: Body too large (${bytesToKB(payloadBytes)}KB), retrying with limit ${bytesToKB(targetByteLimitBytes)}KB...`);
 			}
 			const truncateResult = await truncate(originalPayload, model, {
 				checkTokenLimit: true,
-				checkByteLimit: true,
-				targetTokenLimit,
-				targetByteLimitBytes
+				targetTokenLimit
 			});
 			if (!truncateResult.wasTruncated) return {
 				action: "abort",
@@ -5576,12 +5775,13 @@ function sanitizeMessageParamContent(msg) {
 */
 function removeAnthropicSystemReminders(messages) {
 	let modifiedCount = 0;
+	const result = messages.map((msg) => {
+		const sanitized = sanitizeMessageParamContent(msg);
+		if (sanitized !== msg) modifiedCount++; // a message counts as modified when a different object reference comes back
+		return sanitized;
+	});
 	return {
-		messages: messages.map((msg) => {
-			const sanitized = sanitizeMessageParamContent(msg);
-			if (sanitized !== msg) modifiedCount++;
-			return sanitized;
-		}),
+		messages: modifiedCount === 0 ? messages : result, // when no message changed, return the input array itself rather than the mapped copy
 		modifiedCount
 	};
 }
@@ -6180,11 +6380,11 @@ function convertServerToolsToCustom(tools) {
 * Auto-truncate module for Anthropic-style messages.
 *
 * This module handles automatic truncation of Anthropic message format
-* when it exceeds token or byte limits.
+* when it exceeds token limits.
 *
 * Key features:
 * - Binary search for optimal truncation point
-* - Considers both token and byte limits
+* - Token limit enforcement with learned calibration
 * - Preserves system messages
 * - Filters orphaned tool_result and tool_use messages
 * - Smart compression of old tool_result content (e.g., Read tool results)
@@ -6315,15 +6515,6 @@ async function countTotalInputTokens(payload, model) {
 	}
 	return total;
 }
-/** Get byte size of a message (memoized to avoid redundant JSON.stringify) */
-const messageBytesCache$1 = /* @__PURE__ */ new WeakMap();
-function getMessageBytes$1(msg) {
-	let cached = messageBytesCache$1.get(msg);
-	if (cached !== void 0) return cached;
-	cached = JSON.stringify(msg).length;
-	messageBytesCache$1.set(msg, cached);
-	return cached;
-}
 /**
 * Strip thinking/redacted_thinking blocks from old assistant messages.
 *
@@ -6377,26 +6568,20 @@ function compressToolResultBlock(block) {
 }
 /**
 * Smart compression strategy:
-* 1. Calculate tokens/bytes from the end until reaching preservePercent of limit
+* 1. Calculate tokens from the end until reaching preservePercent of limit
 * 2. Messages before that threshold get their tool_results compressed
 * 3. Returns compressed messages and stats
 *
 * @param preservePercent - Percentage of context to preserve uncompressed (0.0-1.0)
 */
-function smartCompressToolResults$1(messages, tokenLimit, byteLimit, preservePercent) {
+function smartCompressToolResults$1(messages, tokenLimit, preservePercent) {
 	const n = messages.length;
 	const cumTokens = Array.from({ length: n + 1 }, () => 0);
-	const cumBytes = Array.from({ length: n + 1 }, () => 0);
-	for (let i = n - 1; i >= 0; i--) {
-		const msg = messages[i];
-		cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(msg);
-		cumBytes[i] = cumBytes[i + 1] + getMessageBytes$1(msg) + 1;
-	}
+	for (let i = n - 1; i >= 0; i--) cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(messages[i]);
 	const preserveTokenLimit = Math.floor(tokenLimit * preservePercent);
-	const preserveByteLimit = Math.floor(byteLimit * preservePercent);
 	let thresholdIndex = n;
 	for (let i = n - 1; i >= 0; i--) {
-		if (cumTokens[i] > preserveTokenLimit || cumBytes[i] > preserveByteLimit) {
+		if (cumTokens[i] > preserveTokenLimit) {
 			thresholdIndex = i + 1;
 			break;
 		}
@@ -6448,40 +6633,35 @@ function smartCompressToolResults$1(messages, tokenLimit, byteLimit, preservePer
 	};
 }
 /** Default fallback for when model capabilities are not available */
-const DEFAULT_CONTEXT_WINDOW = 2e5;
-function calculateLimits$1(model, config) {
-	if (config.targetTokenLimit !== void 0 || config.targetByteLimitBytes !== void 0) return {
-		tokenLimit: config.targetTokenLimit ?? model.capabilities?.limits?.max_context_window_tokens ?? DEFAULT_CONTEXT_WINDOW,
-		byteLimit: config.targetByteLimitBytes ?? getEffectiveByteLimitBytes()
-	};
-	const rawTokenLimit = getEffectiveTokenLimit(model.id) ?? model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? DEFAULT_CONTEXT_WINDOW;
-	return {
-		tokenLimit: Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100)),
-		byteLimit: getEffectiveByteLimitBytes()
-	};
+const DEFAULT_CONTEXT_WINDOW$1 = 2e5;
+/**
+* Calculate the effective token limit for auto-truncate.
+*
+* Precedence:
+* 1. config.targetTokenLimit, returned as-is (no margin applied)
+* 2. learned limit for model.id, reduced by computeSafetyMargin(sampleCount)
+* 3. max_context_window_tokens, then max_prompt_tokens, then
+*    DEFAULT_CONTEXT_WINDOW$1, reduced by config.safetyMarginPercent
+*
+* @param model - Model object; reads `id` and `capabilities.limits`
+* @param config - Reads `targetTokenLimit` and `safetyMarginPercent`
+* @returns Token limit; floored to an integer unless it is the explicit target
+*/
+function calculateTokenLimit$1(model, config) {
+	if (config.targetTokenLimit !== void 0) return config.targetTokenLimit;
+	const learned = getLearnedLimits(model.id);
+	if (learned) {
+		const margin = computeSafetyMargin(learned.sampleCount); // used directly as a fraction, unlike safetyMarginPercent which is divided by 100 below
+		return Math.floor(learned.tokenLimit * (1 - margin));
+	}
+	const rawTokenLimit = model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? DEFAULT_CONTEXT_WINDOW$1;
+	return Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100));
 }
 function findOptimalPreserveIndex$1(params) {
-	const { messages, systemBytes, systemTokens, payloadOverhead, tokenLimit, byteLimit, checkTokenLimit, checkByteLimit } = params;
+	const { messages, systemTokens, tokenLimit } = params;
 	if (messages.length === 0) return 0;
-	const markerBytes = 200;
 	const availableTokens = tokenLimit - systemTokens - 50;
-	const availableBytes = byteLimit - payloadOverhead - systemBytes - markerBytes;
-	if (checkTokenLimit && availableTokens <= 0 || checkByteLimit && availableBytes <= 0) return messages.length;
+	if (availableTokens <= 0) return messages.length;
 	const n = messages.length;
 	const cumTokens = Array.from({ length: n + 1 }, () => 0);
-	const cumBytes = Array.from({ length: n + 1 }, () => 0);
-	for (let i = n - 1; i >= 0; i--) {
-		const msg = messages[i];
-		cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(msg);
-		cumBytes[i] = cumBytes[i + 1] + getMessageBytes$1(msg) + 1;
-	}
+	for (let i = n - 1; i >= 0; i--) cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens$1(messages[i]);
 	let left = 0;
 	let right = n;
 	while (left < right) {
 		const mid = left + right >>> 1;
-		const tokensFit = !checkTokenLimit || cumTokens[mid] <= availableTokens;
-		const bytesFit = !checkByteLimit || cumBytes[mid] <= availableBytes;
-		if (tokensFit && bytesFit) right = mid;
+		if (cumTokens[mid] <= availableTokens) right = mid;
 		else left = mid + 1;
 	}
 	return left;
@@ -6572,36 +6752,28 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
 		...DEFAULT_AUTO_TRUNCATE_CONFIG,
 		...config
 	};
-	const { tokenLimit, byteLimit } = calculateLimits$1(model, cfg);
+	const tokenLimit = calculateTokenLimit$1(model, cfg);
 	const fixedTokens = await countFixedTokens(payload, model);
-	const originalBytes = JSON.stringify(payload).length;
 	const originalTokens = fixedTokens + await countMessagesTokens(payload.messages, model);
-	if (originalTokens <= tokenLimit && originalBytes <= byteLimit) return buildResult({
+	if (originalTokens <= tokenLimit) return buildResult({
 		payload,
 		wasTruncated: false,
 		originalTokens,
 		compactedTokens: originalTokens,
 		removedMessageCount: 0
 	});
-	const exceedsTokens = originalTokens > tokenLimit;
-	const exceedsBytes = originalBytes > byteLimit;
 	const { messages: thinkingStripped, strippedCount: thinkingStrippedCount } = stripThinkingBlocks(payload.messages, 4);
 	let workingMessages = thinkingStripped;
 	if (thinkingStrippedCount > 0) {
-		const strippedPayload = {
-			...payload,
-			messages: workingMessages
-		};
-		const strippedBytes = JSON.stringify(strippedPayload).length;
 		const strippedTokens = fixedTokens + await countMessagesTokens(workingMessages, model);
-		if (strippedTokens <= tokenLimit && strippedBytes <= byteLimit) {
-			let reason = "tokens";
-			if (exceedsTokens && exceedsBytes) reason = "tokens+size";
-			else if (exceedsBytes) reason = "size";
+		if (strippedTokens <= tokenLimit) {
 			const elapsedMs = Math.round(performance.now() - startTime);
-			consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${strippedTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(strippedBytes)}KB (stripped ${thinkingStrippedCount} thinking blocks) [${elapsedMs}ms]`);
+			consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${strippedTokens} (stripped ${thinkingStrippedCount} thinking blocks) [${elapsedMs}ms]`);
 			return buildResult({
-				payload: strippedPayload,
+				payload: {
+					...payload,
+					messages: workingMessages
+				},
 				wasTruncated: true,
 				originalTokens,
 				compactedTokens: strippedTokens,
@@ -6611,47 +6783,37 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
 	}
 	let compressedCount = 0;
 	if (state.compressToolResultsBeforeTruncate) {
-		const compressionResult = smartCompressToolResults$1(workingMessages, tokenLimit, byteLimit, cfg.preserveRecentPercent);
+		const compressionResult = smartCompressToolResults$1(workingMessages, tokenLimit, cfg.preserveRecentPercent);
 		workingMessages = compressionResult.messages;
 		compressedCount = compressionResult.compressedCount;
-		const compressedPayload = {
-			...payload,
-			messages: workingMessages
-		};
-		const compressedBytes = JSON.stringify(compressedPayload).length;
 		const compressedTokens = fixedTokens + await countMessagesTokens(workingMessages, model);
-		if (compressedTokens <= tokenLimit && compressedBytes <= byteLimit) {
-			let reason = "tokens";
-			if (exceedsTokens && exceedsBytes) reason = "tokens+size";
-			else if (exceedsBytes) reason = "size";
+		if (compressedTokens <= tokenLimit) {
 			const elapsedMs = Math.round(performance.now() - startTime);
-			consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${compressedTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(compressedBytes)}KB (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
+			consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${compressedTokens} (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
 			return buildResult({
-				payload: addCompressionNotice$1(compressedPayload, compressedCount),
+				payload: addCompressionNotice$1({
+					...payload,
+					messages: workingMessages
+				}, compressedCount),
 				wasTruncated: true,
 				originalTokens,
 				compactedTokens: compressedTokens + (Math.ceil(150 / 4) + 4),
 				removedMessageCount: 0
 			});
 		}
-		const allCompression = smartCompressToolResults$1(workingMessages, tokenLimit, byteLimit, 0);
+		const allCompression = smartCompressToolResults$1(workingMessages, tokenLimit, 0);
 		if (allCompression.compressedCount > 0) {
 			workingMessages = allCompression.messages;
 			compressedCount += allCompression.compressedCount;
-			const allCompressedPayload = {
-				...payload,
-				messages: workingMessages
-			};
-			const allCompressedBytes = JSON.stringify(allCompressedPayload).length;
 			const allCompressedTokens = fixedTokens + await countMessagesTokens(workingMessages, model);
-			if (allCompressedTokens <= tokenLimit && allCompressedBytes <= byteLimit) {
-				let reason = "tokens";
-				if (exceedsTokens && exceedsBytes) reason = "tokens+size";
-				else if (exceedsBytes) reason = "size";
+			if (allCompressedTokens <= tokenLimit) {
 				const elapsedMs = Math.round(performance.now() - startTime);
-				consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${allCompressedTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(allCompressedBytes)}KB (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
+				consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${allCompressedTokens} (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
 				return buildResult({
-					payload: addCompressionNotice$1(allCompressedPayload, compressedCount),
+					payload: addCompressionNotice$1({
+						...payload,
+						messages: workingMessages
+					}, compressedCount),
 					wasTruncated: true,
 					originalTokens,
 					compactedTokens: allCompressedTokens + (Math.ceil(150 / 4) + 4),
@@ -6660,23 +6822,11 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
 			}
 		}
 	}
-	const systemBytes = payload.system ? JSON.stringify(payload.system).length : 0;
 	const systemTokens = await countSystemTokens(payload.system, model);
-	const messagesBytes = workingMessages.reduce((sum, msg) => sum + getMessageBytes$1(msg) + 1, 0) + 2;
-	const payloadOverhead = JSON.stringify({
-		...payload,
-		messages: workingMessages
-	}).length - messagesBytes - systemBytes;
-	consola.debug(`[AutoTruncate:Anthropic] overhead=${bytesToKB(payloadOverhead)}KB, system=${bytesToKB(systemBytes)}KB`);
 	const preserveIndex = findOptimalPreserveIndex$1({
 		messages: workingMessages,
-		systemBytes,
 		systemTokens,
-		payloadOverhead,
-		tokenLimit,
-		byteLimit,
-		checkTokenLimit: cfg.checkTokenLimit,
-		checkByteLimit: cfg.checkByteLimit
+		tokenLimit
 	});
 	if (preserveIndex >= workingMessages.length) {
 		consola.warn("[AutoTruncate:Anthropic] Would need to remove all messages");
@@ -6724,17 +6874,14 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
 	const newBytes = JSON.stringify(newPayload).length;
 	const newMsgTokens = await countMessagesTokens(newMessages, model);
 	const newTokens = (newSystem !== payload.system ? await countSystemTokens(newSystem, model) : systemTokens) + (fixedTokens - await countSystemTokens(payload.system, model)) + newMsgTokens;
-	let reason = "tokens";
-	if (exceedsTokens && exceedsBytes) reason = "tokens+size";
-	else if (exceedsBytes) reason = "size";
 	const actions = [];
 	if (removedCount > 0) actions.push(`removed ${removedCount} msgs`);
 	if (thinkingStrippedCount > 0) actions.push(`stripped ${thinkingStrippedCount} thinking blocks`);
 	if (compressedCount > 0) actions.push(`compressed ${compressedCount} tool_results`);
 	const actionInfo = actions.length > 0 ? ` (${actions.join(", ")})` : "";
 	const elapsedMs = Math.round(performance.now() - startTime);
-	consola.info(`[AutoTruncate:Anthropic] ${reason}: ${originalTokens}→${newTokens} tokens, ${bytesToKB(originalBytes)}→${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
-	if (newBytes > byteLimit || newTokens > tokenLimit) consola.warn(`[AutoTruncate:Anthropic] Result still over limit (${newTokens} tokens, ${bytesToKB(newBytes)}KB)`);
+	consola.info(`[AutoTruncate:Anthropic] tokens: ${originalTokens}→${newTokens}, ${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
+	if (newTokens > tokenLimit) consola.warn(`[AutoTruncate:Anthropic] Result still over token limit (${newTokens} > ${tokenLimit})`);
 	return buildResult({
 		payload: newPayload,
 		wasTruncated: true,
@@ -6744,32 +6891,43 @@ async function autoTruncateAnthropic(payload, model, config = {}) {
 	});
 }
 /**
-* Check if payload needs compaction.
+* Check if payload needs compaction based on learned model limits.
+* Returns early with `needed: false` when no limits are known for the model.
+* An explicit config.targetTokenLimit also counts as a known limit and
+* bypasses the early return.
+*
+* @param payload - Request payload passed to countTotalTokens
+* @param model - Model object; `model.id` keys the learned-limit lookup
+* @param config - Overrides merged over DEFAULT_AUTO_TRUNCATE_CONFIG
+* @returns `{ needed, currentTokens, tokenLimit, reason }`; the early-return path reports zeros and has no `reason`
 */
 async function checkNeedsCompactionAnthropic(payload, model, config = {}) {
 	const cfg = {
 		...DEFAULT_AUTO_TRUNCATE_CONFIG,
 		...config
 	};
-	const { tokenLimit, byteLimit } = calculateLimits$1(model, cfg);
-	const currentTokens = await countTotalTokens(payload, model);
-	const currentBytes = JSON.stringify(payload).length;
+	const learned = getLearnedLimits(model.id);
+	if (!learned && cfg.targetTokenLimit === void 0) return { // no learned limit and no explicit target: report "not needed" without counting tokens
+		needed: false,
+		currentTokens: 0,
+		tokenLimit: 0
+	};
+	const tokenLimit = calculateTokenLimit$1(model, cfg);
+	const rawTokens = await countTotalTokens(payload, model);
+	const currentTokens = learned && learned.sampleCount > 0 ? calibrate(model.id, rawTokens) : rawTokens; // the raw count is calibrated only when the learned entry has at least one sample
 	const exceedsTokens = cfg.checkTokenLimit && currentTokens > tokenLimit;
-	const exceedsBytes = cfg.checkByteLimit && currentBytes > byteLimit;
-	let reason;
-	if (exceedsTokens && exceedsBytes) reason = "both";
-	else if (exceedsTokens) reason = "tokens";
-	else if (exceedsBytes) reason = "bytes";
 	return {
-		needed: exceedsTokens || exceedsBytes,
+		needed: exceedsTokens,
 		currentTokens,
 		tokenLimit,
-		currentBytes,
-		byteLimit,
-		reason
+		reason: exceedsTokens ? "tokens" : void 0
 	};
 }
+//#endregion
+//#region src/lib/fetch-utils.ts
+/**
+* Create an AbortSignal for fetch timeout if configured.
+* Controls the time from request start to receiving response headers.
+* state.fetchTimeout is multiplied by 1000 before being handed to
+* AbortSignal.timeout (which takes milliseconds), i.e. it is read as seconds.
+* The timer starts when this function is called, not when fetch is invoked.
+*
+* @returns A timeout AbortSignal, or undefined when fetchTimeout is not greater than 0 (disabled)
+*/
+function createFetchSignal() {
+	return state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
+}
 //#endregion
 //#region src/lib/anthropic/features.ts
 /**
@@ -7126,7 +7284,7 @@ async function createAnthropicMessages(payload) {
 		}
 	}
 	consola.debug("Sending direct Anthropic request to Copilot /v1/messages");
-	const fetchSignal = state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
+	const fetchSignal = createFetchSignal();
 	const response = await fetch(`${copilotBaseUrl(state)}/v1/messages`, {
 		method: "POST",
 		headers,
@@ -7365,7 +7523,7 @@ function raceIteratorNext(promise, opts) {
 * Returns a decision with reason so callers can log/display the routing rationale.
 */
 function supportsDirectAnthropicApi(modelId) {
-	const model = state.models?.data.find((m) => m.id === modelId);
+	const model = state.modelIndex.get(modelId);
 	if (model?.vendor !== "Anthropic") return {
 		supported: false,
 		reason: `vendor is "${model?.vendor ?? "unknown"}", not Anthropic`
@@ -7387,6 +7545,12 @@ function supportsDirectAnthropicApi(modelId) {
 async function handleAnthropicMessagesCompletion(c, anthropicPayload, options) {
 	if (anthropicPayload.system) anthropicPayload.system = await processAnthropicSystem(anthropicPayload.system);
 	const tuiLogId = c.get("tuiLogId");
+	const routingDecision = supportsDirectAnthropicApi(anthropicPayload.model);
+	if (!routingDecision.supported) {
+		const msg = `Model "${anthropicPayload.model}" does not support /v1/messages: ${routingDecision.reason}`;
+		throw new HTTPError(msg, 400, msg);
+	}
+	consola.debug(`[AnthropicRouting] ${anthropicPayload.model}: ${routingDecision.reason}`);
 	const reqCtx = getRequestContextManager().create({
 		endpoint: "anthropic",
 		tuiLogId
@@ -7409,17 +7573,11 @@ async function handleAnthropicMessagesCompletion(c, anthropicPayload, options) {
 		strippedReadTagCount: preprocessed.strippedReadTagCount,
 		dedupedToolCallCount: preprocessed.dedupedToolCallCount
 	});
-	const routingDecision = supportsDirectAnthropicApi(anthropicPayload.model);
-	if (!routingDecision.supported) {
-		const msg = `Model "${anthropicPayload.model}" does not support /v1/messages: ${routingDecision.reason}`;
-		throw new HTTPError(msg, 400, msg);
-	}
-	consola.debug(`[AnthropicRouting] ${anthropicPayload.model}: ${routingDecision.reason}`);
 	return handleDirectAnthropicCompletion(c, anthropicPayload, reqCtx);
 }
 async function handleDirectAnthropicCompletion(c, anthropicPayload, reqCtx) {
 	consola.debug("Using direct Anthropic API path for model:", anthropicPayload.model);
-	const selectedModel = state.models?.data.find((m) => m.id === anthropicPayload.model);
+	const selectedModel = state.modelIndex.get(anthropicPayload.model);
 	const { payload: initialSanitized, stats: sanitizationStats } = sanitizeAnthropicMessages(anthropicPayload);
 	reqCtx.addSanitizationInfo(toSanitizationInfo(sanitizationStats));
 	const hasPreprocessing = reqCtx.preprocessInfo ? reqCtx.preprocessInfo.dedupedToolCallCount > 0 || reqCtx.preprocessInfo.strippedReadTagCount > 0 : false;
@@ -7527,8 +7685,8 @@ function combineAbortSignals(...signals) {
 async function* processAnthropicStream(response, acc, clientAbortSignal) {
 	const idleTimeoutMs = state.streamIdleTimeout * 1e3;
 	const iterator = response[Symbol.asyncIterator]();
+	const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
 	for (;;) {
-		const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
 		const result = await raceIteratorNext(iterator.next(), {
 			idleTimeoutMs,
 			abortSignal
@@ -7597,7 +7755,7 @@ async function handleDirectAnthropicStreamingResponse(opts) {
 			await stream.writeSSE({
 				data: rawEvent.data ?? "",
 				event: rawEvent.event,
-				id: String(rawEvent.id),
+				id: rawEvent.id != null ? String(rawEvent.id) : void 0,
 				retry: rawEvent.retry
 			});
 		}
@@ -7761,26 +7919,31 @@ function extractOpenAISystemMessages(messages) {
 //#region src/lib/openai/auto-truncate.ts
 /**
 * Auto-truncate module: Automatically truncates conversation history
-* when it exceeds token or byte limits (OpenAI format).
+* when it exceeds token limits (OpenAI format).
 *
 * Key features:
 * - Binary search for optimal truncation point
-* - Considers both token and byte limits
+* - Token limit enforcement with learned calibration
 * - Preserves system messages
 * - Filters orphaned tool_result and tool_use messages
-* - Dynamic byte limit adjustment on 413 errors
 * - Optional smart compression of old tool_result content
 */
-function calculateLimits(model, config) {
-	if (config.targetTokenLimit !== void 0 || config.targetByteLimitBytes !== void 0) return {
-		tokenLimit: config.targetTokenLimit ?? model.capabilities?.limits?.max_context_window_tokens ?? 128e3,
-		byteLimit: config.targetByteLimitBytes ?? getEffectiveByteLimitBytes()
-	};
-	const rawTokenLimit = getEffectiveTokenLimit(model.id) ?? model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? 128e3;
-	return {
-		tokenLimit: Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100)),
-		byteLimit: getEffectiveByteLimitBytes()
-	};
+/** Default fallback for when model capabilities are not available */
+const DEFAULT_CONTEXT_WINDOW = 128e3;
+/**
+* Calculate the effective token limit for auto-truncate.
+* Uses explicit target if provided, otherwise learned limits with calibration,
+* otherwise model capabilities with safety margin.
+*/
+function calculateTokenLimit(model, config) {
+	if (config.targetTokenLimit !== void 0) return config.targetTokenLimit;
+	const learned = getLearnedLimits(model.id);
+	if (learned) {
+		const margin = computeSafetyMargin(learned.sampleCount);
+		return Math.floor(learned.tokenLimit * (1 - margin));
+	}
+	const rawTokenLimit = model.capabilities?.limits?.max_context_window_tokens ?? model.capabilities?.limits?.max_prompt_tokens ?? DEFAULT_CONTEXT_WINDOW;
+	return Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100));
 }
 /** Estimate tokens for a single message (fast approximation) */
 function estimateMessageTokens(msg) {
@@ -7793,28 +7956,12 @@ function estimateMessageTokens(msg) {
 	if (msg.tool_calls) charCount += JSON.stringify(msg.tool_calls).length;
 	return Math.ceil(charCount / 4) + 10;
 }
-/** Get byte size of a message (memoized to avoid redundant JSON.stringify) */
-const messageBytesCache = /* @__PURE__ */ new WeakMap();
-function getMessageBytes(msg) {
-	let cached = messageBytesCache.get(msg);
-	if (cached !== void 0) return cached;
-	cached = JSON.stringify(msg).length;
-	messageBytesCache.set(msg, cached);
-	return cached;
-}
-/** Calculate cumulative token and byte sums from the end of the message array */
+/** Calculate cumulative token sums from the end of the message array */
 function calculateCumulativeSums(messages) {
 	const n = messages.length;
 	const cumTokens = Array.from({ length: n + 1 }).fill(0);
-	const cumBytes = Array.from({ length: n + 1 }).fill(0);
-	for (let i = n - 1; i >= 0; i--) {
-		cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens(messages[i]);
-		cumBytes[i] = cumBytes[i + 1] + getMessageBytes(messages[i]) + 1;
-	}
-	return {
-		cumTokens,
-		cumBytes
-	};
+	for (let i = n - 1; i >= 0; i--) cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens(messages[i]);
+	return { cumTokens };
 }
 /**
 * Clean up orphaned tool messages and ensure valid conversation start.
@@ -7833,20 +7980,19 @@ function cleanupMessages(messages) {
 }
 /**
 * Smart compression strategy for OpenAI format:
-* 1. Calculate tokens/bytes from the end until reaching preservePercent of limit
+* 1. Calculate tokens from the end until reaching preservePercent of limit
 * 2. Messages before that threshold get their tool content compressed
 * 3. Returns compressed messages and stats
 *
 * @param preservePercent - Percentage of context to preserve uncompressed (0.0-1.0)
 */
-function smartCompressToolResults(messages, tokenLimit, byteLimit, preservePercent) {
+function smartCompressToolResults(messages, tokenLimit, preservePercent) {
 	const n = messages.length;
-	const { cumTokens, cumBytes } = calculateCumulativeSums(messages);
+	const { cumTokens } = calculateCumulativeSums(messages);
 	const preserveTokenLimit = Math.floor(tokenLimit * preservePercent);
-	const preserveByteLimit = Math.floor(byteLimit * preservePercent);
 	let thresholdIndex = n;
 	for (let i = n - 1; i >= 0; i--) {
-		if (cumTokens[i] > preserveTokenLimit || cumBytes[i] > preserveByteLimit) {
+		if (cumTokens[i] > preserveTokenLimit) {
 			thresholdIndex = i + 1;
 			break;
 		}
@@ -7882,21 +8028,17 @@ function smartCompressToolResults(messages, tokenLimit, byteLimit, preservePerce
 * Returns the smallest index where the preserved portion fits within limits.
 */
 function findOptimalPreserveIndex(params) {
-	const { messages, systemBytes, systemTokens, payloadOverhead, tokenLimit, byteLimit, checkTokenLimit, checkByteLimit } = params;
+	const { messages, systemTokens, tokenLimit } = params;
 	if (messages.length === 0) return 0;
-	const markerBytes = 200;
 	const availableTokens = tokenLimit - systemTokens - 50;
-	const availableBytes = byteLimit - payloadOverhead - systemBytes - markerBytes;
-	if (checkTokenLimit && availableTokens <= 0 || checkByteLimit && availableBytes <= 0) return messages.length;
+	if (availableTokens <= 0) return messages.length;
 	const n = messages.length;
-	const { cumTokens, cumBytes } = calculateCumulativeSums(messages);
+	const { cumTokens } = calculateCumulativeSums(messages);
 	let left = 0;
 	let right = n;
 	while (left < right) {
 		const mid = left + right >>> 1;
-		const tokensFit = !checkTokenLimit || cumTokens[mid] <= availableTokens;
-		const bytesFit = !checkByteLimit || cumBytes[mid] <= availableBytes;
-		if (tokensFit && bytesFit) right = mid;
+		if (cumTokens[mid] <= availableTokens) right = mid;
 		else left = mid + 1;
 	}
 	return left;
@@ -7981,11 +8123,6 @@ function buildTimedResult(ctx, result) {
 		processingTimeMs: Math.round(performance.now() - ctx.startTime)
 	};
 }
-function getReasonLabel(exceedsTokens, exceedsBytes) {
-	if (exceedsTokens && exceedsBytes) return "tokens+size";
-	if (exceedsBytes) return "size";
-	return "tokens";
-}
 /**
 * Step 1: Try compressing tool results to fit within limits.
 * First compresses old tool results, then all if needed.
@@ -7996,7 +8133,7 @@ async function tryCompressToolResults(ctx) {
 		workingMessages: ctx.payload.messages,
 		compressedCount: 0
 	};
-	const compressionResult = smartCompressToolResults(ctx.payload.messages, ctx.tokenLimit, ctx.byteLimit, ctx.cfg.preserveRecentPercent);
+	const compressionResult = smartCompressToolResults(ctx.payload.messages, ctx.tokenLimit, ctx.cfg.preserveRecentPercent);
 	let workingMessages = compressionResult.messages;
 	let compressedCount = compressionResult.compressedCount;
 	const compressedPayload = {
@@ -8005,10 +8142,9 @@ async function tryCompressToolResults(ctx) {
 	};
 	const compressedBytes = JSON.stringify(compressedPayload).length;
 	const compressedTokenCount = await getTokenCount(compressedPayload, ctx.model);
-	if (compressedTokenCount.input <= ctx.tokenLimit && compressedBytes <= ctx.byteLimit) {
-		const reason = getReasonLabel(ctx.exceedsTokens, ctx.exceedsBytes);
+	if (compressedTokenCount.input <= ctx.tokenLimit) {
 		const elapsedMs = Math.round(performance.now() - ctx.startTime);
-		consola.info(`[AutoTruncate:OpenAI] ${reason}: ${ctx.originalTokens}→${compressedTokenCount.input} tokens, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(compressedBytes)}KB (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
+		consola.info(`[AutoTruncate:OpenAI] tokens: ${ctx.originalTokens}→${compressedTokenCount.input}, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(compressedBytes)}KB (compressed ${compressedCount} tool_results) [${elapsedMs}ms]`);
 		const noticePayload = addCompressionNotice(compressedPayload, compressedCount);
 		const noticeTokenOverhead = Math.ceil(150 / 4) + 10;
 		return {
@@ -8023,7 +8159,7 @@ async function tryCompressToolResults(ctx) {
 			})
 		};
 	}
-	const allCompression = smartCompressToolResults(workingMessages, ctx.tokenLimit, ctx.byteLimit, 0);
+	const allCompression = smartCompressToolResults(workingMessages, ctx.tokenLimit, 0);
 	if (allCompression.compressedCount > 0) {
 		workingMessages = allCompression.messages;
 		compressedCount += allCompression.compressedCount;
@@ -8033,10 +8169,9 @@ async function tryCompressToolResults(ctx) {
 		};
 		const allCompressedBytes = JSON.stringify(allCompressedPayload).length;
 		const allCompressedTokenCount = await getTokenCount(allCompressedPayload, ctx.model);
-		if (allCompressedTokenCount.input <= ctx.tokenLimit && allCompressedBytes <= ctx.byteLimit) {
-			const reason = getReasonLabel(ctx.exceedsTokens, ctx.exceedsBytes);
+		if (allCompressedTokenCount.input <= ctx.tokenLimit) {
 			const elapsedMs = Math.round(performance.now() - ctx.startTime);
-			consola.info(`[AutoTruncate:OpenAI] ${reason}: ${ctx.originalTokens}→${allCompressedTokenCount.input} tokens, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(allCompressedBytes)}KB (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
+			consola.info(`[AutoTruncate:OpenAI] tokens: ${ctx.originalTokens}→${allCompressedTokenCount.input}, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(allCompressedBytes)}KB (compressed ${compressedCount} tool_results, including recent) [${elapsedMs}ms]`);
 			const noticePayload = addCompressionNotice(allCompressedPayload, compressedCount);
 			const noticeTokenOverhead = Math.ceil(150 / 4) + 10;
 			return {
@@ -8063,23 +8198,10 @@ async function tryCompressToolResults(ctx) {
 */
 async function truncateByMessageRemoval(ctx, workingMessages, compressedCount) {
 	const { systemMessages, conversationMessages } = extractOpenAISystemMessages(workingMessages);
-	const messagesBytes = workingMessages.reduce((sum, m) => sum + getMessageBytes(m) + 1, 0) + 1;
-	const payloadOverhead = JSON.stringify({
-		...ctx.payload,
-		messages: workingMessages
-	}).length - messagesBytes;
-	const systemBytes = systemMessages.reduce((sum, m) => sum + getMessageBytes(m) + 1, 0);
-	const systemTokens = systemMessages.reduce((sum, m) => sum + estimateMessageTokens(m), 0);
-	consola.debug(`[AutoTruncate:OpenAI] overhead=${bytesToKB(payloadOverhead)}KB, system=${systemMessages.length} msgs (${bytesToKB(systemBytes)}KB)`);
 	const preserveIndex = findOptimalPreserveIndex({
 		messages: conversationMessages,
-		systemBytes,
-		systemTokens,
-		payloadOverhead,
-		tokenLimit: ctx.tokenLimit,
-		byteLimit: ctx.byteLimit,
-		checkTokenLimit: ctx.cfg.checkTokenLimit,
-		checkByteLimit: ctx.cfg.checkByteLimit
+		systemTokens: systemMessages.reduce((sum, m) => sum + estimateMessageTokens(m), 0),
+		tokenLimit: ctx.tokenLimit
 	});
 	if (preserveIndex >= conversationMessages.length) {
 		consola.warn("[AutoTruncate:OpenAI] Would need to remove all messages");
@@ -8124,14 +8246,13 @@ async function truncateByMessageRemoval(ctx, workingMessages, compressedCount) {
 	};
 	const newBytes = JSON.stringify(newPayload).length;
 	const newTokenCount = await getTokenCount(newPayload, ctx.model);
-	const reason = getReasonLabel(ctx.exceedsTokens, ctx.exceedsBytes);
 	const actions = [];
 	if (removedCount > 0) actions.push(`removed ${removedCount} msgs`);
 	if (compressedCount > 0) actions.push(`compressed ${compressedCount} tool_results`);
 	const actionInfo = actions.length > 0 ? ` (${actions.join(", ")})` : "";
 	const elapsedMs = Math.round(performance.now() - ctx.startTime);
-	consola.info(`[AutoTruncate:OpenAI] ${reason}: ${ctx.originalTokens}→${newTokenCount.input} tokens, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
-	if (newBytes > ctx.byteLimit) consola.warn(`[AutoTruncate:OpenAI] Result still over byte limit (${bytesToKB(newBytes)}KB > ${bytesToKB(ctx.byteLimit)}KB)`);
+	consola.info(`[AutoTruncate:OpenAI] tokens: ${ctx.originalTokens}→${newTokenCount.input}, ${bytesToKB(ctx.originalBytes)}→${bytesToKB(newBytes)}KB${actionInfo} [${elapsedMs}ms]`);
+	if (newTokenCount.input > ctx.tokenLimit) consola.warn(`[AutoTruncate:OpenAI] Result still over token limit (${newTokenCount.input} > ${ctx.tokenLimit})`);
 	return buildTimedResult(ctx, {
 		payload: newPayload,
 		wasTruncated: true,
@@ -8155,7 +8276,7 @@ async function autoTruncateOpenAI(payload, model, config = {}) {
 		...DEFAULT_AUTO_TRUNCATE_CONFIG,
 		...config
 	};
-	const { tokenLimit, byteLimit } = calculateLimits(model, cfg);
+	const tokenLimit = calculateTokenLimit(model, cfg);
 	const originalBytes = JSON.stringify(payload).length;
 	const originalTokens = (await getTokenCount(payload, model)).input;
 	const ctx = {
@@ -8163,14 +8284,11 @@ async function autoTruncateOpenAI(payload, model, config = {}) {
 		model,
 		cfg,
 		tokenLimit,
-		byteLimit,
 		originalTokens,
 		originalBytes,
-		exceedsTokens: originalTokens > tokenLimit,
-		exceedsBytes: originalBytes > byteLimit,
 		startTime
 	};
-	if (!ctx.exceedsTokens && !ctx.exceedsBytes) return buildTimedResult(ctx, {
+	if (originalTokens <= tokenLimit) return buildTimedResult(ctx, {
 		payload,
 		wasTruncated: false,
 		originalTokens,
@@ -8201,7 +8319,7 @@ const createChatCompletions = async (payload) => {
 		...copilotHeaders(state, enableVision),
 		"X-Initiator": isAgentCall ? "agent" : "user"
 	};
-	const fetchSignal = state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
+	const fetchSignal = createFetchSignal();
 	const response = await fetch(`${copilotBaseUrl(state)}/chat/completions`, {
 		method: "POST",
 		headers,
@@ -8372,7 +8490,7 @@ async function handleCompletion(c) {
 		consola.debug(`Model name resolved: ${clientModel} → ${resolvedModel}`);
 		originalPayload.model = resolvedModel;
 	}
-	const selectedModel = state.models?.data.find((model) => model.id === originalPayload.model);
+	const selectedModel = state.modelIndex.get(originalPayload.model);
 	if (!isEndpointSupported(selectedModel, ENDPOINT.CHAT_COMPLETIONS)) {
 		const msg = `Model "${originalPayload.model}" does not support the ${ENDPOINT.CHAT_COMPLETIONS} endpoint`;
 		throw new HTTPError(msg, 400, msg);
@@ -8524,8 +8642,8 @@ async function handleStreamingResponse(opts) {
 			acc.content += marker;
 		}
 		const iterator = response[Symbol.asyncIterator]();
+		const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
 		for (;;) {
-			const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbortSignal);
 			const result = await raceIteratorNext(iterator.next(), {
 				idleTimeoutMs,
 				abortSignal
@@ -8545,7 +8663,7 @@ async function handleStreamingResponse(opts) {
 			await stream.writeSSE({
 				data: rawEvent.data ?? "",
 				event: rawEvent.event,
-				id: String(rawEvent.id),
+				id: rawEvent.id != null ? String(rawEvent.id) : void 0,
 				retry: rawEvent.retry
 			});
 		}
@@ -8630,16 +8748,13 @@ async function handleCountTokens(c) {
 		const anthropicPayload = await c.req.json();
 		anthropicPayload.model = resolveModelName(anthropicPayload.model);
 		if (tuiLogId) tuiLogger.updateRequest(tuiLogId, { model: anthropicPayload.model });
-		const selectedModel = state.models?.data.find((model) => model.id === anthropicPayload.model);
+		const selectedModel = state.modelIndex.get(anthropicPayload.model);
 		if (!selectedModel) {
 			consola.warn(`[count_tokens] Model "${anthropicPayload.model}" not found, returning input_tokens=1`);
 			return c.json({ input_tokens: 1 });
 		}
 		if (state.autoTruncate && hasKnownLimits(selectedModel.id)) {
-			const truncateCheck = await checkNeedsCompactionAnthropic(anthropicPayload, selectedModel, {
-				checkTokenLimit: true,
-				checkByteLimit: true
-			});
+			const truncateCheck = await checkNeedsCompactionAnthropic(anthropicPayload, selectedModel, { checkTokenLimit: true });
 			if (truncateCheck.needed) {
 				const contextWindow = selectedModel.capabilities?.limits?.max_context_window_tokens ?? 2e5;
 				const inflatedTokens = Math.floor(contextWindow * .95);
@@ -8716,7 +8831,7 @@ modelRoutes.get("/:model", async (c) => {
 	try {
 		if (!state.models) await cacheModels();
 		const modelId = c.req.param("model");
-		const model = state.models?.data.find((m) => m.id === modelId);
+		const model = state.modelIndex.get(modelId);
 		if (!model) return c.json({ error: {
 			message: `The model '${modelId}' does not exist`,
 			type: "invalid_request_error",
@@ -8740,7 +8855,7 @@ const createResponses = async (payload) => {
 		...copilotHeaders(state, enableVision),
 		"X-Initiator": isAgentCall ? "agent" : "user"
 	};
-	const fetchSignal = state.fetchTimeout > 0 ? AbortSignal.timeout(state.fetchTimeout * 1e3) : void 0;
+	const fetchSignal = createFetchSignal();
 	const response = await fetch(`${copilotBaseUrl(state)}/responses`, {
 		method: "POST",
 		headers,
@@ -8880,8 +8995,7 @@ async function handleResponsesCompletion(c) {
 		consola.debug(`Model name resolved: ${clientModel} → ${resolvedModel}`);
 		payload.model = resolvedModel;
 	}
-	const selectedModel = state.models?.data.find((model) => model.id === payload.model);
-	if (!isEndpointSupported(selectedModel, ENDPOINT.RESPONSES)) {
+	if (!isEndpointSupported(state.modelIndex.get(payload.model), ENDPOINT.RESPONSES)) {
 		const msg = `Model "${payload.model}" does not support the ${ENDPOINT.RESPONSES} endpoint`;
 		throw new HTTPError(msg, 400, msg);
 	}
@@ -8911,10 +9025,33 @@ async function handleResponsesCompletion(c) {
 /** Pass through to Copilot /responses endpoint directly */
 async function handleDirectResponses(opts) {
 	const { c, payload, reqCtx } = opts;
-	const inputCount = typeof payload.input === "string" ? 1 : payload.input.length;
-	consola.debug(`Responses payload: ${inputCount} input item(s), model: ${payload.model}`);
+	const adapter = {
+		format: "openai-responses",
+		sanitize: (p) => ({
+			payload: p,
+			removedCount: 0,
+			systemReminderRemovals: 0
+		}),
+		execute: (p) => executeWithAdaptiveRateLimit(() => createResponses(p)),
+		logPayloadSize: (p) => {
+			const count = typeof p.input === "string" ? 1 : p.input.length;
+			consola.debug(`Responses payload: ${count} input item(s), model: ${p.model}`);
+		}
+	};
+	const strategies = [createTokenRefreshStrategy()];
+	const selectedModel = state.modelIndex.get(payload.model);
 	try {
-		const { result: response } = await executeWithAdaptiveRateLimit(() => createResponses(payload));
+		const pipelineResult = await executeRequestPipeline({
+			adapter,
+			strategies,
+			payload,
+			originalPayload: payload,
+			model: selectedModel,
+			maxRetries: 1,
+			requestContext: reqCtx
+		});
+		const response = pipelineResult.response;
+		reqCtx.addQueueWaitMs(pipelineResult.queueWaitMs);
 		if (!payload.stream) {
 			const responsesResponse = response;
 			const content = responsesOutputToContent(responsesResponse.output);
@@ -8942,8 +9079,8 @@ async function handleDirectResponses(opts) {
 			let eventsIn = 0;
 			try {
 				const iterator = response[Symbol.asyncIterator]();
+				const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbort.signal);
 				for (;;) {
-					const abortSignal = combineAbortSignals(getShutdownSignal(), clientAbort.signal);
 					const result = await raceIteratorNext(iterator.next(), {
 						idleTimeoutMs,
 						abortSignal
@@ -8989,7 +9126,13 @@ async function handleDirectResponses(opts) {
 * Handles POST /responses and POST /v1/responses.
 */
 const responsesRoutes = new Hono();
-responsesRoutes.post("/", handleResponsesCompletion);
+responsesRoutes.post("/", async (c) => {
+	try {
+		return await handleResponsesCompletion(c);
+	} catch (error) {
+		return forwardError(c, error);
+	}
+});
 //#endregion
 //#region src/routes/token/route.ts
@@ -9078,20 +9221,39 @@ registerRoutes(server);
 function formatLimit(value) {
 	return value ? `${Math.round(value / 1e3)}k` : "?";
 }
+/**
+* Format a model as 3 lines: main info, features, and supported endpoints.
+*
+* Example output:
+*   - claude-opus-4.6-1m          Anthropic      ctx:1000k prp: 936k out:  64k
+*       features:  adaptive-thinking, thinking, streaming, vision, tool-calls
+*       endpoints: messages, completions
+*/
 function formatModelInfo(model) {
 	const limits = model.capabilities?.limits;
 	const supports = model.capabilities?.supports;
 	const contextK = formatLimit(limits?.max_context_window_tokens);
 	const promptK = formatLimit(limits?.max_prompt_tokens);
 	const outputK = formatLimit(limits?.max_output_tokens);
+	const mainLine = `  - ${model.id.length > 28 ? `${model.id.slice(0, 25)}...` : model.id.padEnd(28)} ${model.vendor.padEnd(13)} ctx:${contextK.padStart(5)} prp:${promptK.padStart(5)} out:${outputK.padStart(5)}`;
 	const features = [
 		...Object.entries(supports ?? {}).filter(([, value]) => value === true).map(([key]) => key.replaceAll("_", "-")),
 		supports?.max_thinking_budget && "thinking",
 		model.capabilities?.type === "embeddings" && "embeddings",
 		model.preview && "preview"
 	].filter(Boolean).join(", ");
-	const featureStr = features ? ` (${features})` : "";
-	return `  - ${model.id.length > 25 ? `${model.id.slice(0, 22)}...` : model.id.padEnd(25)} ctx:${contextK.padStart(5)} prp:${promptK.padStart(5)} out:${outputK.padStart(5)}` + featureStr;
+	const featLine = features ? pc.dim(`      features:  ${features}`) : "";
+	const endpoints = formatEndpoints(model.supported_endpoints);
+	return [
+		mainLine,
+		featLine,
+		pc.dim(`      endpoints: ${endpoints}`)
+	].filter(Boolean).join("\n");
+}
+/** Format endpoint paths as short display names */
+function formatEndpoints(endpoints) {
+	if (!endpoints || endpoints.length === 0) return "(legacy)";
+	return endpoints.map((e) => e.replace(/^\/(v1\/|chat\/)?/, "")).join(", ");
 }
 /** Parse an integer from a string, returning a default if the result is NaN. */
 function parseIntOrDefault(value, defaultValue) {
@@ -9165,6 +9327,7 @@ async function runServer(options) {
 		consola.warn("Failed to fetch models from Copilot API:", error instanceof Error ? error.message : error);
 	}
 	consola.info(`Available models:\n${state.models?.data.map((m) => formatModelInfo(m)).join("\n")}`);
+	await loadPersistedLimits();
 	const availableIds = state.models?.data.map((m) => m.id) ?? [];
 	const overrideLines = Object.entries(state.modelOverrides).map(([from, to]) => {
 		const resolved = resolveModelName(from);