npm - @hsupu/copilot-api - Versions diffs - 0.7.7 → 0.7.9 - Mend

@hsupu/copilot-api 0.7.7 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/main.js CHANGED Viewed

@@ -46,7 +46,7 @@ const state = {
 	accountType: "individual",
 	manualApprove: false,
 	showToken: false,
-	autoCompact: false
+	autoCompact: true
 };
 //#endregion
@@ -246,8 +246,8 @@ async function getVSCodeVersion() {
 			}
 		});
 		if (!response.ok) return FALLBACK;
-		const version = (await response.json()).tag_name;
-		if (version && /^\d+\.\d+\.\d+$/.test(version)) return version;
+		const version$1 = (await response.json()).tag_name;
+		if (version$1 && /^\d+\.\d+\.\d+$/.test(version$1)) return version$1;
 		return FALLBACK;
 	} catch {
 		return FALLBACK;
@@ -434,13 +434,13 @@ const checkUsage = defineCommand({
 			const premiumUsed = premiumTotal - premium.remaining;
 			const premiumPercentUsed = premiumTotal > 0 ? premiumUsed / premiumTotal * 100 : 0;
 			const premiumPercentRemaining = premium.percent_remaining;
-			function summarizeQuota(name, snap) {
-				if (!snap) return `${name}: N/A`;
+			function summarizeQuota(name$1, snap) {
+				if (!snap) return `${name$1}: N/A`;
 				const total = snap.entitlement;
 				const used = total - snap.remaining;
 				const percentUsed = total > 0 ? used / total * 100 : 0;
 				const percentRemaining = snap.percent_remaining;
-				return `${name}: ${used}/${total} used (${percentUsed.toFixed(1)}% used, ${percentRemaining.toFixed(1)}% remaining)`;
+				return `${name$1}: ${used}/${total} used (${percentUsed.toFixed(1)}% used, ${percentRemaining.toFixed(1)}% remaining)`;
 			}
 			const premiumLine = `Premium: ${premiumUsed}/${premiumTotal} used (${premiumPercentUsed.toFixed(1)}% used, ${premiumPercentRemaining.toFixed(1)}% remaining)`;
 			const chatLine = summarizeQuota("Chat", usage.quota_snapshots.chat);
@@ -481,9 +481,9 @@ async function checkTokenExists() {
 	}
 }
 async function getDebugInfo() {
-	const [version, tokenExists] = await Promise.all([getPackageVersion(), checkTokenExists()]);
+	const [version$1, tokenExists] = await Promise.all([getPackageVersion(), checkTokenExists()]);
 	return {
-		version,
+		version: version$1,
 		runtime: getRuntimeInfo(),
 		paths: {
 			APP_DIR: PATHS.APP_DIR,
@@ -571,8 +571,8 @@ const PATTERNS = {
 /**
 * Parse semver version string to comparable parts
 */
-function parseVersion(version) {
-	return version.split(".").map((n) => Number.parseInt(n, 10) || 0);
+function parseVersion(version$1) {
+	return version$1.split(".").map((n) => Number.parseInt(n, 10) || 0);
 }
 /**
 * Compare two semver versions
@@ -590,9 +590,9 @@ function compareVersions(a, b) {
 	}
 	return 0;
 }
-function getPatternTypeForVersion(version) {
-	if (compareVersions(version, SUPPORTED_VERSIONS.v2a.min) >= 0 && compareVersions(version, SUPPORTED_VERSIONS.v2a.max) <= 0) return "func";
-	if (compareVersions(version, SUPPORTED_VERSIONS.v2b.min) >= 0 && compareVersions(version, SUPPORTED_VERSIONS.v2b.max) <= 0) return "variable";
+function getPatternTypeForVersion(version$1) {
+	if (compareVersions(version$1, SUPPORTED_VERSIONS.v2a.min) >= 0 && compareVersions(version$1, SUPPORTED_VERSIONS.v2a.max) <= 0) return "func";
+	if (compareVersions(version$1, SUPPORTED_VERSIONS.v2b.min) >= 0 && compareVersions(version$1, SUPPORTED_VERSIONS.v2b.max) <= 0) return "variable";
 	return null;
 }
 /**
@@ -624,8 +624,8 @@ function findInVoltaTools(voltaHome) {
 	if (existsSync(packagesPath)) paths.push(packagesPath);
 	const toolsDir = join(voltaHome, "tools", "image", "node");
 	if (existsSync(toolsDir)) try {
-		for (const version of readdirSync(toolsDir)) {
-			const claudePath = join(toolsDir, version, "lib", "node_modules", "@anthropic-ai", "claude-code", "cli.js");
+		for (const version$1 of readdirSync(toolsDir)) {
+			const claudePath = join(toolsDir, version$1, "lib", "node_modules", "@anthropic-ai", "claude-code", "cli.js");
 			if (existsSync(claudePath)) paths.push(claudePath);
 		}
 	} catch {}
@@ -668,23 +668,23 @@ function getCurrentLimit(content) {
 * Check if Claude Code version is supported for patching
 */
 function checkVersionSupport(cliPath) {
-	const version = getClaudeCodeVersion(cliPath);
-	if (!version) return {
+	const version$1 = getClaudeCodeVersion(cliPath);
+	if (!version$1) return {
 		supported: false,
 		version: null,
 		patternType: null,
 		error: "Could not detect Claude Code version"
 	};
-	const patternType = getPatternTypeForVersion(version);
+	const patternType = getPatternTypeForVersion(version$1);
 	if (!patternType) return {
 		supported: false,
-		version,
+		version: version$1,
 		patternType: null,
-		error: `Version ${version} is not supported. Supported: ${getSupportedRangeString()}`
+		error: `Version ${version$1} is not supported. Supported: ${getSupportedRangeString()}`
 	};
 	return {
 		supported: true,
-		version,
+		version: version$1,
 		patternType
 	};
 }
@@ -735,8 +735,8 @@ function restoreClaudeCode(cliPath) {
 	return true;
 }
 function showStatus(cliPath, currentLimit) {
-	const version = getClaudeCodeVersion(cliPath);
-	if (version) consola.info(`Claude Code version: ${version}`);
+	const version$1 = getClaudeCodeVersion(cliPath);
+	if (version$1) consola.info(`Claude Code version: ${version$1}`);
 	if (currentLimit === null) {
 		consola.warn("Could not detect current limit - CLI may have been updated");
 		consola.info("Look for the BS9 variable or HR function pattern in cli.js");
@@ -818,6 +818,86 @@ const patchClaude = defineCommand({
 	}
 });
+//#endregion
+//#region package.json
+var name = "@hsupu/copilot-api";
+var version = "0.7.9";
+var description = "Turn GitHub Copilot into OpenAI/Anthropic API compatible server. Usable with Claude Code!";
+var keywords = [
+	"proxy",
+	"github-copilot",
+	"openai-compatible",
+	"anthropic-compatible"
+];
+var homepage = "https://github.com/puxu-msft/copilot-api-js";
+var bugs = "https://github.com/puxu-msft/copilot-api-js/issues";
+var repository = {
+	"type": "git",
+	"url": "git+https://github.com/puxu-msft/copilot-api-js.git"
+};
+var author = "hsupu";
+var type = "module";
+var bin = { "copilot-api": "dist/main.js" };
+var files = ["dist"];
+var scripts = {
+	"build": "npx tsdown",
+	"dev": "bun run --watch ./src/main.ts",
+	"knip": "knip-bun",
+	"lint": "eslint --cache",
+	"lint:all": "eslint --cache .",
+	"prepack": "npm run build",
+	"prepare": "npm run build && (command -v bun >/dev/null 2>&1 && simple-git-hooks || true)",
+	"release": "bumpp && npm publish --access public",
+	"start": "NODE_ENV=production bun run ./src/main.ts",
+	"typecheck": "tsc"
+};
+var simple_git_hooks = { "pre-commit": "bun x lint-staged" };
+var lint_staged = { "*": "bun run lint --fix" };
+var dependencies = {
+	"citty": "^0.1.6",
+	"clipboardy": "^5.0.0",
+	"consola": "^3.4.2",
+	"fetch-event-stream": "^0.1.5",
+	"gpt-tokenizer": "^3.0.1",
+	"hono": "^4.9.9",
+	"picocolors": "^1.1.1",
+	"proxy-from-env": "^1.1.0",
+	"srvx": "^0.8.9",
+	"tiny-invariant": "^1.3.3",
+	"undici": "^7.16.0"
+};
+var devDependencies = {
+	"@echristian/eslint-config": "^0.0.54",
+	"@types/bun": "^1.2.23",
+	"@types/proxy-from-env": "^1.0.4",
+	"bumpp": "^10.2.3",
+	"eslint": "^9.37.0",
+	"knip": "^5.64.1",
+	"lint-staged": "^16.2.3",
+	"prettier-plugin-packagejson": "^2.5.19",
+	"simple-git-hooks": "^2.13.1",
+	"tsdown": "^0.15.6",
+	"typescript": "^5.9.3"
+};
+var package_default = {
+	name,
+	version,
+	description,
+	keywords,
+	homepage,
+	bugs,
+	repository,
+	author,
+	type,
+	bin,
+	files,
+	scripts,
+	"simple-git-hooks": simple_git_hooks,
+	"lint-staged": lint_staged,
+	dependencies,
+	devDependencies
+};
 //#endregion
 //#region src/lib/adaptive-rate-limiter.ts
 const DEFAULT_CONFIG$1 = {
@@ -1566,8 +1646,8 @@ var ConsoleRenderer = class {
 	/**
 	* Get log prefix based on log type
 	*/
-	getLogPrefix(type) {
-		switch (type) {
+	getLogPrefix(type$1) {
+		switch (type$1) {
 			case "error":
 			case "fatal": return pc.red("✖");
 			case "warn": return pc.yellow("⚠");
@@ -2096,171 +2176,157 @@ const getTokenCount = async (payload, model) => {
 //#endregion
 //#region src/lib/auto-compact.ts
 const DEFAULT_CONFIG = {
-	targetTokens: 12e4,
 	safetyMarginPercent: 2,
 	maxRequestBodyBytes: 500 * 1024
 };
+/** Dynamic byte limit that adjusts based on 413 errors */
+let dynamicByteLimit = null;
 /**
-* Dynamic byte limit that adjusts based on 413 errors.
-* Starts at 500KB and can be adjusted when 413 errors are encountered.
-*/
-let dynamicByteLimitOverride = null;
-/**
-* Called when a 413 error is encountered with a specific payload size.
-* Adjusts the dynamic byte limit to 90% of the failing size.
+* Called when a 413 error occurs. Adjusts the byte limit to 90% of the failing size.
 */
 function onRequestTooLarge(failingBytes) {
 	const newLimit = Math.max(Math.floor(failingBytes * .9), 100 * 1024);
-	dynamicByteLimitOverride = newLimit;
-	consola.info(`[Auto-compact] Adjusted byte limit: ${Math.round(failingBytes / 1024)}KB failed, new limit: ${Math.round(newLimit / 1024)}KB`);
+	dynamicByteLimit = newLimit;
+	consola.info(`[Auto-compact] Adjusted byte limit: ${Math.round(failingBytes / 1024)}KB failed → ${Math.round(newLimit / 1024)}KB`);
 }
-/**
-* Check if payload needs compaction based on model limits OR request body size.
-* Uses a safety margin to account for token counting differences.
-*/
-async function checkNeedsCompaction(payload, model, config = {}) {
-	const cfg = {
-		...DEFAULT_CONFIG,
-		...config
-	};
-	const currentTokens = (await getTokenCount(payload, model)).input;
-	const rawLimit = model.capabilities?.limits?.max_prompt_tokens ?? 128e3;
-	const tokenLimit = Math.floor(rawLimit * (1 - cfg.safetyMarginPercent / 100));
-	const currentBytes = JSON.stringify(payload).length;
-	const byteLimit = dynamicByteLimitOverride ?? cfg.maxRequestBodyBytes;
-	const exceedsTokens = currentTokens > tokenLimit;
-	const exceedsBytes = currentBytes > byteLimit;
-	let reason;
-	if (exceedsTokens && exceedsBytes) reason = "both";
-	else if (exceedsTokens) reason = "tokens";
-	else if (exceedsBytes) reason = "bytes";
+function calculateLimits(model, config) {
+	const rawTokenLimit = model.capabilities?.limits?.max_prompt_tokens ?? 128e3;
+	const tokenLimit = Math.floor(rawTokenLimit * (1 - config.safetyMarginPercent / 100));
+	const byteLimit = dynamicByteLimit ?? config.maxRequestBodyBytes;
 	return {
-		needed: exceedsTokens || exceedsBytes,
-		currentTokens,
 		tokenLimit,
-		currentBytes,
-		byteLimit,
-		reason
+		byteLimit
 	};
 }
-/**
-* Calculate approximate token count for a single message.
-* This is a fast estimation for splitting decisions.
-*/
-function estimateMessageTokens(message) {
-	let text = "";
-	if (typeof message.content === "string") text = message.content;
-	else if (Array.isArray(message.content)) {
-		for (const part of message.content) if (part.type === "text") text += part.text;
-		else if ("image_url" in part) text += part.image_url.url;
+/** Estimate tokens for a single message (fast approximation) */
+function estimateMessageTokens(msg) {
+	let charCount = 0;
+	if (typeof msg.content === "string") charCount = msg.content.length;
+	else if (Array.isArray(msg.content)) {
+		for (const part of msg.content) if (part.type === "text") charCount += part.text.length;
+		else if ("image_url" in part) charCount += Math.min(part.image_url.url.length, 1e4);
 	}
-	if (message.tool_calls) text += JSON.stringify(message.tool_calls);
-	return Math.ceil(text.length / 4) + 10;
+	if (msg.tool_calls) charCount += JSON.stringify(msg.tool_calls).length;
+	return Math.ceil(charCount / 4) + 10;
 }
-/**
-* Extract system messages from the beginning of the message list.
-*/
+/** Get byte size of a message */
+function getMessageBytes(msg) {
+	return JSON.stringify(msg).length;
+}
+/** Extract system/developer messages from the beginning */
 function extractSystemMessages(messages) {
-	const systemMessages = [];
-	let i = 0;
-	while (i < messages.length) {
-		const msg = messages[i];
-		if (msg.role === "system" || msg.role === "developer") {
-			systemMessages.push(msg);
-			i++;
-		} else break;
+	let splitIndex = 0;
+	while (splitIndex < messages.length) {
+		const role = messages[splitIndex].role;
+		if (role !== "system" && role !== "developer") break;
+		splitIndex++;
 	}
 	return {
-		systemMessages,
-		remainingMessages: messages.slice(i)
+		systemMessages: messages.slice(0, splitIndex),
+		conversationMessages: messages.slice(splitIndex)
 	};
 }
-/**
-* Extract tool_use ids from assistant messages with tool_calls.
-*/
-function getToolUseIds(message) {
-	if (message.role === "assistant" && message.tool_calls) return message.tool_calls.map((tc) => tc.id);
+/** Get tool_use IDs from an assistant message */
+function getToolCallIds(msg) {
+	if (msg.role === "assistant" && msg.tool_calls) return msg.tool_calls.map((tc) => tc.id);
 	return [];
 }
-/**
-* Find messages to keep from the end to stay under target tokens.
-* Returns the starting index of messages to preserve.
-*/
-function findPreserveIndex(messages, targetTokens, systemTokens) {
-	const availableTokens = targetTokens - systemTokens - 500;
-	let accumulatedTokens = 0;
-	for (let i = messages.length - 1; i >= 0; i--) {
-		const msgTokens = estimateMessageTokens(messages[i]);
-		if (accumulatedTokens + msgTokens > availableTokens) return i + 1;
-		accumulatedTokens += msgTokens;
-	}
-	return 0;
-}
-/**
-* Filter out orphaned tool_result messages that don't have a matching tool_use
-* in the preserved message list. This prevents API errors when truncation
-* separates tool_use/tool_result pairs.
-*/
+/** Filter orphaned tool_result messages */
 function filterOrphanedToolResults(messages) {
-	const availableToolUseIds = /* @__PURE__ */ new Set();
-	for (const msg of messages) for (const id of getToolUseIds(msg)) availableToolUseIds.add(id);
-	const filteredMessages = [];
+	const toolUseIds = /* @__PURE__ */ new Set();
+	for (const msg of messages) for (const id of getToolCallIds(msg)) toolUseIds.add(id);
 	let removedCount = 0;
-	for (const msg of messages) {
-		if (msg.role === "tool" && msg.tool_call_id && !availableToolUseIds.has(msg.tool_call_id)) {
+	const filtered = messages.filter((msg) => {
+		if (msg.role === "tool" && msg.tool_call_id && !toolUseIds.has(msg.tool_call_id)) {
 			removedCount++;
-			continue;
+			return false;
 		}
-		filteredMessages.push(msg);
-	}
-	if (removedCount > 0) consola.info(`Auto-compact: Removed ${removedCount} orphaned tool_result message(s) without matching tool_use`);
-	return filteredMessages;
+		return true;
+	});
+	if (removedCount > 0) consola.debug(`Auto-compact: Filtered ${removedCount} orphaned tool_result`);
+	return filtered;
 }
-/**
-* Ensure the message list starts with a user message.
-* If it starts with assistant or tool messages, skip them until we find a user message.
-* This is required because OpenAI API expects conversations to start with user messages
-* (after system messages).
-*/
+/** Ensure messages start with a user message */
 function ensureStartsWithUser(messages) {
 	let startIndex = 0;
-	while (startIndex < messages.length) {
-		if (messages[startIndex].role === "user") break;
-		startIndex++;
-	}
-	if (startIndex > 0) consola.info(`Auto-compact: Skipped ${startIndex} leading non-user message(s) to ensure valid sequence`);
+	while (startIndex < messages.length && messages[startIndex].role !== "user") startIndex++;
+	if (startIndex > 0) consola.debug(`Auto-compact: Skipped ${startIndex} leading non-user messages`);
 	return messages.slice(startIndex);
 }
 /**
-* Calculate estimated tokens for system messages.
+* Find the optimal index from which to preserve messages.
+* Uses binary search with pre-calculated cumulative sums.
+* Returns the smallest index where the preserved portion fits within limits.
 */
-function estimateSystemTokens(systemMessages) {
-	return systemMessages.reduce((sum, msg) => sum + estimateMessageTokens(msg), 0);
+function findOptimalPreserveIndex(params) {
+	const { messages, systemBytes, systemTokens, payloadOverhead, tokenLimit, byteLimit } = params;
+	if (messages.length === 0) return 0;
+	const markerBytes = 200;
+	const availableTokens = tokenLimit - systemTokens - 50;
+	const availableBytes = byteLimit - payloadOverhead - systemBytes - markerBytes;
+	if (availableTokens <= 0 || availableBytes <= 0) return messages.length;
+	const n = messages.length;
+	const cumTokens = Array.from({ length: n + 1 }, () => 0);
+	const cumBytes = Array.from({ length: n + 1 }, () => 0);
+	for (let i = n - 1; i >= 0; i--) {
+		const msg = messages[i];
+		cumTokens[i] = cumTokens[i + 1] + estimateMessageTokens(msg);
+		cumBytes[i] = cumBytes[i + 1] + getMessageBytes(msg) + 1;
+	}
+	let left = 0;
+	let right = n;
+	while (left < right) {
+		const mid = left + right >>> 1;
+		if (cumTokens[mid] <= availableTokens && cumBytes[mid] <= availableBytes) right = mid;
+		else left = mid + 1;
+	}
+	return left;
 }
 /**
-* Create a truncation marker message.
+* Check if payload needs compaction based on model limits or byte size.
 */
+async function checkNeedsCompaction(payload, model, config = {}) {
+	const cfg = {
+		...DEFAULT_CONFIG,
+		...config
+	};
+	const { tokenLimit, byteLimit } = calculateLimits(model, cfg);
+	const currentTokens = (await getTokenCount(payload, model)).input;
+	const currentBytes = JSON.stringify(payload).length;
+	const exceedsTokens = currentTokens > tokenLimit;
+	const exceedsBytes = currentBytes > byteLimit;
+	let reason;
+	if (exceedsTokens && exceedsBytes) reason = "both";
+	else if (exceedsTokens) reason = "tokens";
+	else if (exceedsBytes) reason = "bytes";
+	return {
+		needed: exceedsTokens || exceedsBytes,
+		currentTokens,
+		tokenLimit,
+		currentBytes,
+		byteLimit,
+		reason
+	};
+}
+/** Create a truncation marker message */
 function createTruncationMarker(removedCount) {
 	return {
 		role: "user",
-		content: `[CONTEXT TRUNCATED: ${removedCount} earlier messages were removed to fit context limits. The conversation continues below.]`
+		content: `[CONTEXT TRUNCATED: ${removedCount} earlier messages removed to fit context limits]`
 	};
 }
 /**
-* Perform auto-compaction on a payload that exceeds token or size limits.
-* This uses simple truncation - no LLM calls required.
-* Uses iterative approach with decreasing target tokens until under limit.
+* Perform auto-compaction on a payload that exceeds limits.
+* Uses binary search to find the optimal truncation point.
 */
 async function autoCompact(payload, model, config = {}) {
 	const cfg = {
 		...DEFAULT_CONFIG,
 		...config
 	};
-	const originalTokens = (await getTokenCount(payload, model)).input;
-	const rawLimit = model.capabilities?.limits?.max_prompt_tokens ?? 128e3;
-	const tokenLimit = Math.floor(rawLimit * (1 - cfg.safetyMarginPercent / 100));
+	const { tokenLimit, byteLimit } = calculateLimits(model, cfg);
 	const originalBytes = JSON.stringify(payload).length;
-	const byteLimit = dynamicByteLimitOverride ?? cfg.maxRequestBodyBytes;
+	const originalTokens = (await getTokenCount(payload, model)).input;
 	if (originalTokens <= tokenLimit && originalBytes <= byteLimit) return {
 		payload,
 		wasCompacted: false,
@@ -2274,60 +2340,33 @@ async function autoCompact(payload, model, config = {}) {
 	if (exceedsTokens && exceedsBytes) reason = "tokens and size";
 	else if (exceedsBytes) reason = "size";
 	else reason = "tokens";
-	consola.info(`Auto-compact: Exceeds ${reason} limit (${originalTokens} tokens, ${Math.round(originalBytes / 1024)}KB), truncating...`);
-	const { systemMessages, remainingMessages } = extractSystemMessages(payload.messages);
-	const systemTokens = estimateSystemTokens(systemMessages);
-	consola.debug(`Auto-compact: ${systemMessages.length} system messages (~${systemTokens} tokens)`);
-	const MAX_ITERATIONS = 5;
-	const MIN_TARGET = 2e4;
-	let currentTarget = Math.min(cfg.targetTokens, tokenLimit);
-	let lastResult = null;
-	for (let iteration = 0; iteration < MAX_ITERATIONS; iteration++) {
-		const result = await tryCompactWithTarget({
+	consola.info(`Auto-compact: Exceeds ${reason} limit (${originalTokens} tokens, ${Math.round(originalBytes / 1024)}KB)`);
+	const { systemMessages, conversationMessages } = extractSystemMessages(payload.messages);
+	const messagesJson = JSON.stringify(payload.messages);
+	const payloadOverhead = originalBytes - messagesJson.length;
+	const systemBytes = systemMessages.reduce((sum, m) => sum + getMessageBytes(m) + 1, 0);
+	const systemTokens = systemMessages.reduce((sum, m) => sum + estimateMessageTokens(m), 0);
+	consola.debug(`Auto-compact: overhead=${Math.round(payloadOverhead / 1024)}KB, system=${systemMessages.length} msgs (${Math.round(systemBytes / 1024)}KB)`);
+	const preserveIndex = findOptimalPreserveIndex({
+		messages: conversationMessages,
+		systemBytes,
+		systemTokens,
+		payloadOverhead,
+		tokenLimit,
+		byteLimit
+	});
+	if (preserveIndex === 0) {
+		consola.warn("Auto-compact: Cannot truncate, system messages too large");
+		return {
 			payload,
-			model,
-			systemMessages,
-			remainingMessages,
-			systemTokens,
-			targetTokens: currentTarget,
-			limit: tokenLimit,
-			originalTokens
-		});
-		if (!result.wasCompacted) return result;
-		lastResult = result;
-		const resultBytes = JSON.stringify(result.payload).length;
-		const underTokenLimit = result.compactedTokens <= tokenLimit;
-		const underByteLimit = resultBytes <= byteLimit;
-		if (underTokenLimit && underByteLimit) {
-			consola.info(`Auto-compact: ${originalTokens} → ${result.compactedTokens} tokens, ${Math.round(originalBytes / 1024)}KB → ${Math.round(resultBytes / 1024)}KB (removed ${result.removedMessageCount} messages)`);
-			return result;
-		}
-		const tokenStatus = underTokenLimit ? "OK" : `${result.compactedTokens} > ${tokenLimit}`;
-		const byteStatus = underByteLimit ? "OK" : `${Math.round(resultBytes / 1024)}KB > ${Math.round(byteLimit / 1024)}KB`;
-		consola.warn(`Auto-compact: Still over limit (tokens: ${tokenStatus}, size: ${byteStatus}), trying more aggressive truncation`);
-		currentTarget = Math.floor(currentTarget * .7);
-		if (currentTarget < MIN_TARGET) {
-			consola.error("Auto-compact: Cannot reduce further, target too low");
-			return result;
-		}
+			wasCompacted: false,
+			originalTokens,
+			compactedTokens: originalTokens,
+			removedMessageCount: 0
+		};
 	}
-	consola.error(`Auto-compact: Exhausted ${MAX_ITERATIONS} iterations, returning best effort`);
-	return lastResult ?? {
-		payload,
-		wasCompacted: false,
-		originalTokens,
-		compactedTokens: originalTokens,
-		removedMessageCount: 0
-	};
-}
-/**
-* Helper to attempt compaction with a specific target token count.
-*/
-async function tryCompactWithTarget(opts) {
-	const { payload, model, systemMessages, remainingMessages, systemTokens, targetTokens, originalTokens } = opts;
-	const preserveIndex = findPreserveIndex(remainingMessages, targetTokens, systemTokens);
-	if (preserveIndex === 0) {
-		consola.warn("Auto-compact: Cannot truncate further without losing all conversation history");
+	if (preserveIndex >= conversationMessages.length) {
+		consola.warn("Auto-compact: Would need to remove all messages");
 		return {
 			payload,
 			wasCompacted: false,
@@ -2336,13 +2375,12 @@ async function tryCompactWithTarget(opts) {
 			removedMessageCount: 0
 		};
 	}
-	const removedMessages = remainingMessages.slice(0, preserveIndex);
-	let preservedMessages = remainingMessages.slice(preserveIndex);
-	preservedMessages = filterOrphanedToolResults(preservedMessages);
-	preservedMessages = ensureStartsWithUser(preservedMessages);
-	preservedMessages = filterOrphanedToolResults(preservedMessages);
-	if (preservedMessages.length === 0) {
-		consola.warn("Auto-compact: All messages were filtered out after cleanup, cannot compact");
+	let preserved = conversationMessages.slice(preserveIndex);
+	preserved = filterOrphanedToolResults(preserved);
+	preserved = ensureStartsWithUser(preserved);
+	preserved = filterOrphanedToolResults(preserved);
+	if (preserved.length === 0) {
+		consola.warn("Auto-compact: All messages filtered out after cleanup");
 		return {
 			payload,
 			wasCompacted: false,
@@ -2351,27 +2389,30 @@ async function tryCompactWithTarget(opts) {
 			removedMessageCount: 0
 		};
 	}
-	consola.debug(`Auto-compact: Removing ${removedMessages.length} messages, keeping ${preservedMessages.length}`);
-	const truncationMarker = createTruncationMarker(removedMessages.length);
+	const removedCount = conversationMessages.length - preserved.length;
+	const marker = createTruncationMarker(removedCount);
 	const newPayload = {
 		...payload,
 		messages: [
 			...systemMessages,
-			truncationMarker,
-			...preservedMessages
+			marker,
+			...preserved
 		]
 	};
+	const newBytes = JSON.stringify(newPayload).length;
 	const newTokenCount = await getTokenCount(newPayload, model);
+	consola.info(`Auto-compact: ${originalTokens} → ${newTokenCount.input} tokens, ${Math.round(originalBytes / 1024)}KB → ${Math.round(newBytes / 1024)}KB (removed ${removedCount} messages)`);
+	if (newBytes > byteLimit) consola.warn(`Auto-compact: Result still over byte limit (${Math.round(newBytes / 1024)}KB > ${Math.round(byteLimit / 1024)}KB)`);
 	return {
 		payload: newPayload,
 		wasCompacted: true,
 		originalTokens,
 		compactedTokens: newTokenCount.input,
-		removedMessageCount: removedMessages.length
+		removedMessageCount: removedCount
 	};
 }
 /**
-* Create a marker to append to responses indicating auto-compaction occurred.
+* Create a marker to prepend to responses indicating auto-compaction occurred.
 */
 function createCompactionMarker(result) {
 	if (!result.wasCompacted) return "";
@@ -2633,7 +2674,7 @@ function handleNonStreamingResponse$1(c, originalResponse, ctx) {
 				...choice$1,
 				message: {
 					...choice$1.message,
-					content: (choice$1.message.content ?? "") + marker
+					content: marker + (choice$1.message.content ?? "")
 				}
 			} : choice$1)
 		};
@@ -2694,18 +2735,13 @@ async function handleStreamingResponse$1(opts) {
 	const { stream, response, payload, ctx } = opts;
 	const acc = createStreamAccumulator();
 	try {
-		for await (const chunk of response) {
-			consola.debug("Streaming chunk:", JSON.stringify(chunk));
-			parseStreamChunk(chunk, acc);
-			await stream.writeSSE(chunk);
-		}
 		if (ctx.compactResult?.wasCompacted) {
 			const marker = createCompactionMarker(ctx.compactResult);
 			const markerChunk = {
 				id: `compact-marker-${Date.now()}`,
 				object: "chat.completion.chunk",
 				created: Math.floor(Date.now() / 1e3),
-				model: acc.model || payload.model,
+				model: payload.model,
 				choices: [{
 					index: 0,
 					delta: { content: marker },
@@ -2719,6 +2755,11 @@ async function handleStreamingResponse$1(opts) {
 			});
 			acc.content += marker;
 		}
+		for await (const chunk of response) {
+			consola.debug("Streaming chunk:", JSON.stringify(chunk));
+			parseStreamChunk(chunk, acc);
+			await stream.writeSSE(chunk);
+		}
 		recordStreamSuccess(acc, payload.model, ctx);
 		completeTracking(ctx.trackingId, acc.inputTokens, acc.outputTokens, ctx.queueWaitMs);
 	} catch (error) {
@@ -4509,7 +4550,7 @@ function handleNonStreamingResponse(opts) {
 	consola.debug("Translated Anthropic response:", JSON.stringify(anthropicResponse));
 	if (ctx.compactResult?.wasCompacted) {
 		const marker = createCompactionMarker(ctx.compactResult);
-		anthropicResponse = appendMarkerToAnthropicResponse(anthropicResponse, marker);
+		anthropicResponse = prependMarkerToAnthropicResponse(anthropicResponse, marker);
 	}
 	recordResponse(ctx.historyId, {
 		success: true,
@@ -4541,16 +4582,16 @@ function handleNonStreamingResponse(opts) {
 	});
 	return c.json(anthropicResponse);
 }
-function appendMarkerToAnthropicResponse(response, marker) {
+function prependMarkerToAnthropicResponse(response, marker) {
 	const content = [...response.content];
-	const lastTextIndex = content.findLastIndex((block) => block.type === "text");
-	if (lastTextIndex !== -1) {
-		const textBlock = content[lastTextIndex];
-		if (textBlock.type === "text") content[lastTextIndex] = {
+	const firstTextIndex = content.findIndex((block) => block.type === "text");
+	if (firstTextIndex !== -1) {
+		const textBlock = content[firstTextIndex];
+		if (textBlock.type === "text") content[firstTextIndex] = {
 			...textBlock,
-			text: textBlock.text + marker
+			text: marker + textBlock.text
 		};
-	} else content.push({
+	} else content.unshift({
 		type: "text",
 		text: marker
 	});
@@ -4580,6 +4621,11 @@ async function handleStreamingResponse(opts) {
 	};
 	const acc = createAnthropicStreamAccumulator();
 	try {
+		if (ctx.compactResult?.wasCompacted) {
+			const marker = createCompactionMarker(ctx.compactResult);
+			await sendCompactionMarkerEvent(stream, streamState, marker);
+			acc.content += marker;
+		}
 		await processStreamChunks({
 			stream,
 			response,
@@ -4587,11 +4633,6 @@ async function handleStreamingResponse(opts) {
 			streamState,
 			acc
 		});
-		if (ctx.compactResult?.wasCompacted) {
-			const marker = createCompactionMarker(ctx.compactResult);
-			await sendCompactionMarkerEvent(stream, streamState, marker);
-			acc.content += marker;
-		}
 		recordStreamingResponse(acc, anthropicPayload.model, ctx);
 		completeTracking(ctx.trackingId, acc.inputTokens, acc.outputTokens, ctx.queueWaitMs);
 	} catch (error) {
@@ -4904,6 +4945,7 @@ function formatModelInfo(model) {
 	return `  - ${model.id.padEnd(28)} context: ${contextK.padStart(5)}, output: ${outputK.padStart(4)}${featureStr}`;
 }
 async function runServer(options) {
+	consola.info(`copilot-api v${package_default.version}`);
 	if (options.proxyEnv) initProxyFromEnv();
 	if (options.verbose) {
 		consola.level = 5;
@@ -4921,7 +4963,7 @@ async function runServer(options) {
 		consecutiveSuccessesForRecovery: options.consecutiveSuccesses
 	});
 	else consola.info("Rate limiting disabled");
-	if (options.autoCompact) consola.info("Auto-compact enabled: will compress context when exceeding token limits");
+	if (!options.autoCompact) consola.info("Auto-compact disabled");
 	initHistory(options.history, options.historyLimit);
 	if (options.history) {
 		const limitText = options.historyLimit === 0 ? "unlimited" : `max ${options.historyLimit}`;
@@ -5063,10 +5105,10 @@ const start = defineCommand({
 			default: "1000",
 			description: "Maximum number of history entries to keep in memory (0 = unlimited)"
 		},
-		"auto-compact": {
+		"no-auto-compact": {
 			type: "boolean",
 			default: false,
-			description: "Automatically compress conversation history when exceeding model token limits"
+			description: "Disable automatic conversation history compression when exceeding limits"
 		}
 	},
 	run({ args }) {
@@ -5087,7 +5129,7 @@ const start = defineCommand({
 			proxyEnv: args["proxy-env"],
 			history: !args["no-history"],
 			historyLimit: Number.parseInt(args["history-limit"], 10),
-			autoCompact: args["auto-compact"]
+			autoCompact: !args["no-auto-compact"]
 		});
 	}
 });