npm - github-router - Versions diffs - 0.3.40 → 0.3.41 - Mend

github-router 0.3.40 → 0.3.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/main.js CHANGED Viewed

@@ -5851,6 +5851,221 @@ function acquireInFlightSlot() {
 	};
 }
+//#endregion
+//#region src/lib/tokenizer.ts
+const ENCODING_MAP = { // encoding name -> loader that dynamically imports that gpt-tokenizer encoding module when called
+	o200k_base: () => import("gpt-tokenizer/encoding/o200k_base"),
+	cl100k_base: () => import("gpt-tokenizer/encoding/cl100k_base"),
+	p50k_base: () => import("gpt-tokenizer/encoding/p50k_base"),
+	p50k_edit: () => import("gpt-tokenizer/encoding/p50k_edit"),
+	r50k_base: () => import("gpt-tokenizer/encoding/r50k_base")
+};
+const encodingCache = /* @__PURE__ */ new Map(); // requested encoding name -> module already loaded by getEncodeChatFunction
+/**
+* Calculate tokens for tool calls.
+*
+* Each tool call costs `constants.funcInit` plus the encoded length of its
+* JSON serialization. `constants.funcEnd` is added once after the loop,
+* including when `toolCalls` is empty.
+*
+* @param toolCalls iterable of tool-call objects; it is walked with for...of,
+*   so a null or undefined value throws a TypeError
+* @param encoder object exposing `encode(text)` whose result has a `length`
+* @param constants object providing numeric `funcInit` and `funcEnd`
+* @returns the summed token estimate
+*/
+const calculateToolCallsTokens = (toolCalls, encoder, constants) => {
+	let tokens = 0;
+	for (const toolCall of toolCalls) {
+		tokens += constants.funcInit;
+		tokens += encoder.encode(JSON.stringify(toolCall)).length;
+	}
+	tokens += constants.funcEnd;
+	return tokens;
+};
+/**
+* Calculate tokens for content parts.
+*
+* A part whose `type` is "image_url" contributes the encoded length of its
+* `image_url.url` string plus a flat 85. Any other part contributes the
+* encoded length of `part.text` when that is truthy, and nothing otherwise.
+*
+* NOTE(review): 85 is presumably a fixed per-image overhead — confirm the
+* source of this number. The URL is tokenized as plain text, so a long
+* inline data URL is counted in full.
+*
+* @param contentParts iterable of content-part objects
+* @param encoder object exposing `encode(text)` whose result has a `length`
+* @returns the summed token estimate
+*/
+const calculateContentPartsTokens = (contentParts, encoder) => {
+	let tokens = 0;
+	for (const part of contentParts) if (part.type === "image_url") tokens += encoder.encode(part.image_url.url).length + 85;
+	else if (part.text) tokens += encoder.encode(part.text).length;
+	return tokens;
+};
+/**
+* Calculate tokens for a single message
+*/
+const calculateMessageTokens = (message, encoder, constants) => {
+	const tokensPerMessage = 3;
+	const tokensPerName = 1;
+	let tokens = tokensPerMessage;
+	for (const [key, value] of Object.entries(message)) {
+		if (typeof value === "string") tokens += encoder.encode(value).length;
+		if (key === "name") tokens += tokensPerName;
+		if (key === "tool_calls") tokens += calculateToolCallsTokens(value, encoder, constants);
+		if (key === "content" && Array.isArray(value)) tokens += calculateContentPartsTokens(value, encoder);
+	}
+	return tokens;
+};
+/**
+* Calculate tokens using custom algorithm.
+*
+* Returns 0 for an empty list. Otherwise returns the sum of
+* calculateMessageTokens(...) over all messages plus a constant 3.
+*
+* NOTE(review): the trailing 3 is presumably the reply-priming overhead
+* used by the usual chat token-counting recipe — confirm.
+*
+* @param messages array of chat message objects
+* @param encoder object exposing `encode(text)` whose result has a `length`
+* @param constants model constants forwarded to the per-message counter
+* @returns token estimate for the whole list
+*/
+const calculateTokens = (messages, encoder, constants) => {
+	if (messages.length === 0) return 0;
+	let numTokens = 0;
+	for (const message of messages) numTokens += calculateMessageTokens(message, encoder, constants);
+	numTokens += 3;
+	return numTokens;
+};
+/**
+* Get the corresponding encoder module based on encoding type
+*/
+const getEncodeChatFunction = async (encoding) => {
+	if (encodingCache.has(encoding)) {
+		const cached$1 = encodingCache.get(encoding);
+		if (cached$1) return cached$1;
+	}
+	const supportedEncoding = encoding;
+	if (!(supportedEncoding in ENCODING_MAP)) {
+		const fallbackModule = await ENCODING_MAP.o200k_base();
+		encodingCache.set(encoding, fallbackModule);
+		return fallbackModule;
+	}
+	const encodingModule = await ENCODING_MAP[supportedEncoding]();
+	encodingCache.set(encoding, encodingModule);
+	return encodingModule;
+};
+/**
+* Get tokenizer type from model information.
+*
+* Returns `model.capabilities.tokenizer` when it is truthy, otherwise
+* "o200k_base" (this covers a missing `capabilities` object as well as a
+* missing or empty `tokenizer`). `model` itself must be an object:
+* `model.capabilities` is read without optional chaining.
+*
+* @param model model catalog entry
+* @returns encoding name
+*/
+const getTokenizerFromModel = (model) => {
+	return model.capabilities?.tokenizer || "o200k_base";
+};
+/**
+* Load (and cache) the encoder for an encoding name. Unknown encodings
+* fall back to o200k_base. Exposed so prompt-window budgeting code can
+* count raw-text tokens without going through the chat-payload path.
+*
+* This is a thin alias: it forwards to getEncodeChatFunction, which does
+* the loading, caching and fallback.
+*
+* @param encoding encoding name; defaults to "o200k_base"
+* @returns promise of the loaded encoding module
+*/
+const loadEncoder = async (encoding = "o200k_base") => getEncodeChatFunction(encoding);
+/**
+* Token count of a raw text string under the given encoding
+* (default o200k_base — the tokenizer every adaptive Copilot model in
+* our lineup declares via `capabilities.tokenizer`). This is the real
+* BPE count of `text`, NOT a chars-per-token or word-count
+* approximation. It covers only the text passed in, not any request
+* framing, so callers comparing it against `max_prompt_tokens` should
+* keep a reserve. Used by advisor transcript budgeting and the peer-MCP
+* prompt-window guard.
+*
+* Falsy `text` (empty string, null, undefined) returns 0 without loading
+* an encoder. A failed encoder load rejects the returned promise.
+*
+* @param text raw text to measure
+* @param encoding encoding name; defaults to "o200k_base"
+* @returns promise of the token count
+*/
+const getTextTokenCount = async (text, encoding = "o200k_base") => {
+	if (!text) return 0;
+	return (await getEncodeChatFunction(encoding)).encode(text).length;
+};
+/**
+* Get model-specific constants for token calculation.
+*
+* The two returned tables differ only in `funcInit`: 10 when `model.id` is
+* exactly "gpt-3.5-turbo" or "gpt-4", 7 for every other id. `propInit`,
+* `propKey`, `enumInit`, `enumItem` and `funcEnd` are the same in both.
+* Note that `enumInit` is negative (-3).
+*
+* @param model model catalog entry; only `id` is read
+* @returns a fresh constants object
+*/
+const getModelConstants = (model) => {
+	return model.id === "gpt-3.5-turbo" || model.id === "gpt-4" ? {
+		funcInit: 10,
+		propInit: 3,
+		propKey: 3,
+		enumInit: -3,
+		enumItem: 3,
+		funcEnd: 12
+	} : {
+		funcInit: 7,
+		propInit: 3,
+		propKey: 3,
+		enumInit: -3,
+		enumItem: 3,
+		funcEnd: 12
+	};
+};
+/**
+* Calculate tokens for a single parameter.
+*
+* Starts at `constants.propKey`. If `prop` is not a non-null object, that
+* is the whole result. Otherwise it adds:
+*   - for an array `enum`: `constants.enumInit`, then per item
+*     `constants.enumItem` plus the encoded length of `String(item)`;
+*   - the encoded length of the line `name:type:description`, where a
+*     missing type defaults to "string", a missing description to "", and
+*     one trailing "." is stripped from the description;
+*   - for every other key of the parameter (anything except `type`,
+*     `description`, `enum`): the encoded length of `key:value`, with
+*     non-string values JSON-stringified.
+*
+* @param key parameter name
+* @param prop parameter schema (any value)
+* @param context object carrying `encoder` and `constants`
+* @returns token estimate for the parameter
+*/
+const calculateParameterTokens = (key, prop, context) => {
+	const { encoder, constants } = context;
+	let tokens = constants.propKey;
+	if (typeof prop !== "object" || prop === null) return tokens;
+	const param = prop;
+	const paramName = key;
+	const paramType = param.type || "string";
+	let paramDesc = param.description || "";
+	if (param.enum && Array.isArray(param.enum)) {
+		tokens += constants.enumInit;
+		for (const item of param.enum) {
+			tokens += constants.enumItem;
+			tokens += encoder.encode(String(item)).length;
+		}
+	}
+	if (paramDesc.endsWith(".")) paramDesc = paramDesc.slice(0, -1);
+	const line = `${paramName}:${paramType}:${paramDesc}`;
+	tokens += encoder.encode(line).length;
+	const excludedKeys = new Set([
+		"type",
+		"description",
+		"enum"
+	]);
+	for (const propertyName of Object.keys(param)) if (!excludedKeys.has(propertyName)) {
+		const propertyValue = param[propertyName];
+		const propertyText = typeof propertyValue === "string" ? propertyValue : JSON.stringify(propertyValue);
+		tokens += encoder.encode(`${propertyName}:${propertyText}`).length;
+	}
+	return tokens;
+};
+/**
+* Calculate tokens for function parameters.
+*
+* Returns 0 when `parameters` is falsy or not an object. Otherwise walks
+* its entries:
+*   - `properties`: when it has at least one key, adds `constants.propInit`
+*     plus calculateParameterTokens(...) for each property;
+*   - any other key: adds the encoded length of `key:value`, with
+*     non-string values JSON-stringified.
+*
+* NOTE(review): the `properties` value is handed to Object.keys without a
+* null check, so `properties: null` throws a TypeError — confirm callers
+* never send that.
+*
+* @param parameters function parameter schema
+* @param encoder object exposing `encode(text)` whose result has a `length`
+* @param constants model constants (`propInit` is read here)
+* @returns token estimate for the schema
+*/
+const calculateParametersTokens = (parameters, encoder, constants) => {
+	if (!parameters || typeof parameters !== "object") return 0;
+	const params = parameters;
+	let tokens = 0;
+	for (const [key, value] of Object.entries(params)) if (key === "properties") {
+		const properties = value;
+		if (Object.keys(properties).length > 0) {
+			tokens += constants.propInit;
+			for (const propKey of Object.keys(properties)) tokens += calculateParameterTokens(propKey, properties[propKey], {
+				encoder,
+				constants
+			});
+		}
+	} else {
+		const paramText = typeof value === "string" ? value : JSON.stringify(value);
+		tokens += encoder.encode(`${key}:${paramText}`).length;
+	}
+	return tokens;
+};
+/**
+* Calculate tokens for a single tool.
+*
+* Starts at `constants.funcInit`, adds the encoded length of
+* `name:description` (a missing description becomes "", and one trailing
+* "." is stripped), then adds calculateParametersTokens(...) when
+* `function.parameters` is a non-null object.
+*
+* NOTE(review): `tool.function` is dereferenced unconditionally, so a tool
+* entry without a `function` object throws — confirm every tool passed in
+* is a function tool.
+*
+* @param tool tool definition carrying a `function` object
+* @param encoder object exposing `encode(text)` whose result has a `length`
+* @param constants model constants
+* @returns token estimate for the tool
+*/
+const calculateToolTokens = (tool, encoder, constants) => {
+	let tokens = constants.funcInit;
+	const func = tool.function;
+	const fName = func.name;
+	let fDesc = func.description || "";
+	if (fDesc.endsWith(".")) fDesc = fDesc.slice(0, -1);
+	const line = fName + ":" + fDesc;
+	tokens += encoder.encode(line).length;
+	if (typeof func.parameters === "object" && func.parameters !== null) tokens += calculateParametersTokens(func.parameters, encoder, constants);
+	return tokens;
+};
+/**
+* Calculate token count for tools based on model.
+*
+* Sums calculateToolTokens(...) over `tools` and adds `constants.funcEnd`
+* once at the end. The addition is unconditional, so an empty list yields
+* `funcEnd`; getTokenCount only calls this when there is at least one tool.
+*
+* @param tools iterable of tool definitions
+* @param encoder object exposing `encode(text)` whose result has a `length`
+* @param constants model constants
+* @returns token estimate for all tools
+*/
+const numTokensForTools = (tools, encoder, constants) => {
+	let funcTokenCount = 0;
+	for (const tool of tools) funcTokenCount += calculateToolTokens(tool, encoder, constants);
+	funcTokenCount += constants.funcEnd;
+	return funcTokenCount;
+};
+/**
+* Calculate the token count of messages, supporting multiple GPT encoders.
+*
+* Loads the encoder named by the model (o200k_base when the model declares
+* none), then splits `payload.messages` by role: messages whose role is
+* "assistant" are counted as output, all others as input. When
+* `payload.tools` is a non-empty array its estimate is added to the input
+* side.
+*
+* @param payload chat payload; `messages` must be an array, `tools` is optional
+* @param model model catalog entry (`id` and `capabilities.tokenizer` are read)
+* @returns promise of `{ input, output }` token estimates
+*/
+const getTokenCount = async (payload, model) => {
+	const encoder = await getEncodeChatFunction(getTokenizerFromModel(model));
+	const simplifiedMessages = payload.messages;
+	const inputMessages = simplifiedMessages.filter((msg) => msg.role !== "assistant");
+	const outputMessages = simplifiedMessages.filter((msg) => msg.role === "assistant");
+	const constants = getModelConstants(model);
+	let inputTokens = calculateTokens(inputMessages, encoder, constants);
+	if (payload.tools && payload.tools.length > 0) inputTokens += numTokensForTools(payload.tools, encoder, constants);
+	const outputTokens = calculateTokens(outputMessages, encoder, constants);
+	return {
+		input: inputTokens,
+		output: outputTokens
+	};
+};
 //#endregion
 //#region src/services/copilot/create-messages.ts
 /**
@@ -6220,8 +6435,24 @@ function browserToolsEnabled() {
 	if (!(state.browseEnabled || process.env.GH_ROUTER_ENABLE_BROWSE === "1")) return false;
 	return hasSupportedBrowserInstalled();
 }
+/**
+* The 1M-context Opus variant (`claude-opus-4.7-1m-internal`,
+* `max_prompt_tokens` 936K), gated `restricted_to: ["enterprise"]`.
+* opus_critic prefers it so it can take large artifacts in one shot
+* (the whole point of pairing it with gpt-5.5 as the big-window peers);
+* falls back to the 200K `claude-opus-4-7` when the catalog (non-
+* enterprise) doesn't carry a 1M opus slug.
+*
+* OPUS_1M_RE matches, case-insensitively, any id that contains `opus-4.7`
+* (with a literal dot) followed somewhere later by `1m`; the hyphenated
+* fallback id `claude-opus-4-7` does not match it.
+* resolveOpusCriticModel returns the id of the first `state.models.data`
+* entry that matches, or the literal "claude-opus-4-7" when the catalog is
+* missing or holds no match.
+*/
+const OPUS_1M_RE = /opus-4\.7.*1m/i;
+function resolveOpusCriticModel() {
+	const oneM = state.models?.data?.find((m) => OPUS_1M_RE.test(m.id));
+	return oneM ? oneM.id : "claude-opus-4-7";
+}
 function activePersonas() {
-	return PERSONAS_READ.filter((p) => !p.requiresGeminiCatalog || geminiAvailable());
+	return PERSONAS_READ.filter((p) => !p.requiresGeminiCatalog || geminiAvailable()).map((p) => p.toolNameHttp === "opus_critic" ? {
+		...p,
+		model: resolveOpusCriticModel()
+	} : p);
 }
 function toolEntries() {
 	const personaEntries = activePersonas().map((p) => ({
@@ -6355,6 +6586,46 @@ function predictedTooLong(persona, effort, briefBytes) {
 	return { tooLong: false };
 }
 /**
+* Tokens reserved below a peer model's `max_prompt_tokens` for the
+* per-call message framing (role wrappers, output_config, etc.) and any
+* discrepancy between our local token count (taken with the tokenizer the
+* model's catalog entry declares, o200k_base by default) and Copilot's
+* full-payload count.
+*/
+const PEER_PROMPT_TOKEN_RESERVE = 2e3;
+/**
+* Prompt-window guard. Unlike `predictedTooLong` (a JSON-path *timeout*
+* predictor in bytes), this guards the *context window*: it counts the
+* tokens of `persona.baseInstructions` + "\n" + `buildUserText(prompt,
+* context)` with the tokenizer the model's catalog entry declares
+* (o200k_base when it declares none) and compares the count against the
+* persona model's live `max_prompt_tokens` minus
+* PEER_PROMPT_TOKEN_RESERVE. Applies on BOTH the SSE and JSON
+* paths (called from `handleToolsCall`, before slot acquisition) because
+* an over-window brief 400s `model_max_prompt_tokens_exceeded` upstream
+* regardless of transport — and on SSE there is no other size bound.
+*
+* Returns an actionable message when over budget (reject, don't
+* truncate — silently dropping lines from a review artifact is worse
+* than a clear error), or undefined when it fits or the limit is unknown.
+*
+* The call is allowed (undefined is returned) when:
+*   - the resolved model id has no entry in `state.models.data`;
+*   - `max_prompt_tokens` is not a finite number greater than 0;
+*   - the UTF-8 byte length of the text is already within the budget, in
+*     which case tokenization is skipped (presumably because a token count
+*     cannot exceed the byte count — confirm for the encoders in use);
+*   - tokenization throws (logged at debug level);
+*   - the token count is within the budget.
+*
+* NOTE(review): the budget is not clamped, so a `max_prompt_tokens` at or
+* below the reserve gives a budget of 0 or less and every call for that
+* model is rejected; resolveAdvisorMaxTokens clamps its budget with
+* Math.max(1, ...) — confirm whether the difference is intended.
+*/
+async function predictedWindowOverflow(persona, prompt, context) {
+	const id = resolveModel(persona.model);
+	const entry = state.models?.data?.find((m) => m.id === id);
+	if (!entry) return void 0;
+	const maxPromptTokens = entry.capabilities?.limits?.max_prompt_tokens;
+	if (typeof maxPromptTokens !== "number" || !Number.isFinite(maxPromptTokens) || maxPromptTokens <= 0) return;
+	const budget = maxPromptTokens - PEER_PROMPT_TOKEN_RESERVE;
+	const inputText = `${persona.baseInstructions}\n${buildUserText(prompt, context)}`;
+	if (Buffer.byteLength(inputText, "utf8") <= budget) return void 0;
+	let tokens;
+	try {
+		tokens = await getTextTokenCount(inputText, getTokenizerFromModel(entry));
+	} catch (err) {
+		consola.debug("[mcp] window-guard tokenization failed; allowing call:", err);
+		return;
+	}
+	if (tokens <= budget) return void 0;
+	const opusHint = OPUS_1M_RE.test(id) ? "" : " / `opus_critic` (Opus-4.7 1M ≈ 936K tokens, when the enterprise catalog carries it)";
+	return `pre-flight rejected: this ${persona.toolNameHttp} brief is ≈${tokens} tokens, over the ${budget}-token budget for ${persona.model} (its ${maxPromptTokens}-token prompt window minus a ${PEER_PROMPT_TOKEN_RESERVE}-token framing reserve). Do NOT summarize or truncate the artifact to fit. Route the full artifact to a larger-window peer — \`codex_critic\` (gpt-5.5 ≈ 922K tokens)${opusHint} — or split it into focused sub-calls BY CONCERN and call them in parallel, then aggregate.`;
+}
+/**
 * JSON-path pre-flight predictedTooLong gate. Returns a JSON-RPC result
 * body wrapping a tool-error envelope when the call would bust the 60s
 * tools/call ceiling on the JSON path; returns undefined when the call
@@ -6516,6 +6787,10 @@ async function handleToolsCall(body) {
 		if (requestedEffort !== void 0 && !persona.allowedEfforts.includes(requestedEffort)) return rpcError(body.id, RPC_INVALID_PARAMS, `tools/call: persona "${persona.toolNameHttp}" does not accept effort="${requestedEffort}". Allowed: ${persona.allowedEfforts.join("|")}.`);
 		personaEffort = requestedEffort ?? persona.defaultEffort;
 	}
+	if (persona && personaPrompt !== void 0) {
+		const overflow = await predictedWindowOverflow(persona, personaPrompt, personaContext);
+		if (overflow) return rpcResult(body.id, toolError(overflow));
+	}
 	const release = acquireInFlightSlot();
 	if (!release) return rpcResult(body.id, {
 		content: [{
@@ -6755,10 +7030,13 @@ function acceptsEventStream(accept) {
 /**
 * SSE-streamed response for a single tools/call. Delegates the actual
 * upstream call to `handleToolsCall` (so the per-persona effort gate,
-* predictedTooLong cap, AbortController registration, telemetry, and
-* inFlight slot accounting all run identically); wraps the awaited
-* result in an SSE envelope with periodic heartbeats while the upstream
-* fetch is in flight.
+* the token-exact prompt-window guard, AbortController registration,
+* telemetry, and inFlight slot accounting all run identically); wraps
+* the awaited result in an SSE envelope with periodic heartbeats while
+* the upstream fetch is in flight. NOTE: the JSON-path `predictedTooLong`
+* byte cap is NOT applied here — it lives in `jsonPathPreflightCap`
+* (JSON path only); SSE bypasses it intentionally because heartbeats
+* keep the call alive past the ~60s tools/call ceiling it guards.
 *
 * SSE event format (per MCP Streamable HTTP):
 *   event: message
@@ -7145,15 +7423,39 @@ function injectAdvisorTool(rawBody) {
 	}];
 	return JSON.stringify(parsed);
 }
-/** Character budget for rendered conversation text passed to the
-*  advisor model. gpt-5.5 (default advisor) caps prompt input at
-*  272,000 tokens. At a conservative ~3 chars/token (mixed prose +
-*  code + JSON), 720,000 chars renders to ≈240,000 tokens, leaving
-*  ~32,000 tokens of headroom for the system prompt and per-turn
-*  framing overhead. Without this cap, long Claude Code sessions
-*  produce 400 `model_max_prompt_tokens_exceeded` from /v1/responses
-*  and the advisor falls back silently. */
+/** Fallback CHARACTER budget for `renderConversationAsText` when called
+*  without a token `measure` (unit-agnostic default = char length). Also
+*  the conservative no-catalog floor: 720,000 chars ≈ 240,000 tokens at
+*  ~3 chars/token, which fits even the smaller `/responses` models. The
+*  live path measures EXACT o200k tokens (see `runAdvisor`) and budgets
+*  against the model's real `max_prompt_tokens`, so this constant is only
+*  a safety net, never the normal path. */
 const ADVISOR_MAX_CONVERSATION_CHARS = 72e4;
+/** Token budget used when the advisor model's `max_prompt_tokens` can't
+*  be resolved from the live catalog. ≈ the 720K-char fallback in tokens
+*  (720,000 chars at ~3 chars/token). */
+const ADVISOR_FALLBACK_MAX_TOKENS = 24e4;
+/** Tokens reserved below the model's `max_prompt_tokens` for the advisor
+*  system prompt + per-call framing + any encode/wire discrepancy between
+*  our local token count (taken with the tokenizer the model's catalog
+*  entry declares, o200k_base by default) and Copilot's full-payload count.
+*  The transcript token budget is `max_prompt_tokens - reserve`. Generous
+*  on purpose: a 400 `model_max_prompt_tokens_exceeded` degrades to a
+*  silent advisor fallback, and the marginal window we give up is
+*  irrelevant next to gpt-5.5's 922K. */
+const ADVISOR_PROMPT_TOKEN_RESERVE = 8e3;
+/**
+* Derive the TOKEN budget for the rendered transcript from the advisor
+* model's live `max_prompt_tokens` (cached in `state.models` by
+* `cacheModels()` at startup). Self-correcting: tracks the model's real
+* window instead of a hardcoded guess, and honors a SMALLER window if a
+* caller overrides `advisorModel` to a tighter model. Falls back to
+* `ADVISOR_FALLBACK_MAX_TOKENS` when the catalog or field is missing.
+*
+* The fallback also applies when the field is not a finite number greater
+* than 0. Otherwise the result is `max_prompt_tokens` minus
+* ADVISOR_PROMPT_TOKEN_RESERVE, clamped to at least 1 so a window at or
+* below the reserve never yields a zero or negative budget.
+*
+* @param advisorModel model name; passed through resolveModel before the
+*   catalog lookup
+* @returns transcript token budget (always at least 1)
+*/
+function resolveAdvisorMaxTokens(advisorModel) {
+	const id = resolveModel(advisorModel);
+	const maxPromptTokens = state.models?.data?.find((m) => m.id === id)?.capabilities?.limits?.max_prompt_tokens;
+	if (typeof maxPromptTokens !== "number" || !Number.isFinite(maxPromptTokens) || maxPromptTokens <= 0) return ADVISOR_FALLBACK_MAX_TOKENS;
+	return Math.max(1, maxPromptTokens - ADVISOR_PROMPT_TOKEN_RESERVE);
+}
 /**
 * Render an Anthropic-shape conversation (messages array with
 * role/content blocks) as a single human-readable text blob. Used
@@ -7163,14 +7465,20 @@ const ADVISOR_MAX_CONVERSATION_CHARS = 72e4;
 * just needs to READ the conversation, not produce more of it).
 *
 * Front-truncates oldest turns when the rendered output would exceed
-* `maxChars`. The advisor cares more about current state (latest
+* `maxUnits`. The advisor cares more about current state (latest
 * tool calls, errors, in-flight task) than the original prompt —
 * mirrors Claude Code's own context-truncation strategy. When any
 * turns are dropped, prepends a `[TRUNCATED: N earlier turn(s)
 * omitted ...]` notice so the advisor knows the transcript is
 * partial and can flag if it needs the missing context.
+*
+* Unit-agnostic via the injected `measure` function: production passes
+* an EXACT o200k token counter and a token budget (so truncation tracks
+* the model's real `max_prompt_tokens`); the default `measure` is char
+* length, so callers/tests that pass a plain numeric budget get the
+* historical character-budget behavior.
 */
-function renderConversationAsText(conversation, maxChars = ADVISOR_MAX_CONVERSATION_CHARS) {
+function renderConversationAsText(conversation, maxUnits = ADVISOR_MAX_CONVERSATION_CHARS, measure = (s) => s.length) {
 	const turnBlocks = [];
 	for (let i = 0; i < conversation.length; i++) {
 		const msg = conversation[i];
@@ -7191,23 +7499,42 @@ function renderConversationAsText(conversation, maxChars = ADVISOR_MAX_CONVERSAT
 		block.push("");
 		turnBlocks.push(block.join("\n"));
 	}
-	let totalChars = 0;
+	let totalUnits = 0;
 	let firstKeptIdx = turnBlocks.length;
 	for (let i = turnBlocks.length - 1; i >= 0; i--) {
-		const len = turnBlocks[i].length + 1;
-		if (totalChars + len > maxChars) break;
-		totalChars += len;
+		const len = measure(turnBlocks[i]) + 1;
+		if (totalUnits + len > maxUnits) break;
+		totalUnits += len;
 		firstKeptIdx = i;
 	}
 	if (firstKeptIdx === turnBlocks.length && turnBlocks.length > 0) {
-		const tail = turnBlocks[turnBlocks.length - 1].slice(-(maxChars - 200));
-		return `[TRUNCATED: conversation too long for advisor model context; only the tail of the latest (turn ${turnBlocks.length}) is shown]\n\n` + tail;
+		const last = turnBlocks[turnBlocks.length - 1];
+		const notice = `[TRUNCATED: conversation too long for advisor model context; only the tail of the latest (turn ${turnBlocks.length}) is shown]\n\n`;
+		return notice + truncateTailToUnits(last, Math.max(0, maxUnits - measure(notice)), measure);
 	}
 	const kept = turnBlocks.slice(firstKeptIdx);
 	if (firstKeptIdx > 0) kept.unshift(`[TRUNCATED: ${firstKeptIdx} earlier turn(s) omitted to fit advisor model context budget; ${turnBlocks.length - firstKeptIdx} most-recent turn(s) shown below]\n`);
 	return kept.join("\n");
 }
 /**
+* Return the longest suffix of `text` whose `measure(...)` is ≤ `maxUnits`.
+* Binary search on the cut point — unit-agnostic (works for the token
+* `measure` in prod and the char-length default), and exact rather than
+* a chars-per-token estimate. `measure` is called O(log n) times.
+*
+* Returns "" when `maxUnits` is 0 or negative, and `text` unchanged when it
+* already fits. The search keeps `lo` as the longest suffix length known to
+* fit; the midpoint ceil((lo + hi + 1) / 2) always lies in (lo, hi], so
+* each iteration narrows the range.
+*
+* NOTE(review): "longest suffix" holds only if `measure` never decreases
+* as the suffix grows. That is true for the char-length default; for a
+* token counter it is assumed here, not checked.
+* NOTE(review): the cut point is a UTF-16 code-unit index, so the result
+* can begin with the second half of a surrogate pair — confirm that is
+* acceptable for the text sent onward.
+*
+* @param text text to truncate from the front
+* @param maxUnits budget in whatever unit `measure` returns
+* @param measure function from string to a numeric size
+* @returns a suffix of `text`
+*/
+function truncateTailToUnits(text, maxUnits, measure) {
+	if (maxUnits <= 0) return "";
+	if (measure(text) <= maxUnits) return text;
+	let lo = 0;
+	let hi = text.length;
+	while (lo < hi) {
+		const mid = Math.ceil((lo + hi + 1) / 2);
+		if (measure(text.slice(text.length - mid)) <= maxUnits) lo = mid;
+		else hi = mid - 1;
+	}
+	return text.slice(text.length - lo);
+}
+/**
 * Run the advisor model with the full conversation context. Returns
 * the advisor's text response.
 *
@@ -7227,8 +7554,20 @@ function renderConversationAsText(conversation, maxChars = ADVISOR_MAX_CONVERSAT
 async function runAdvisor(conversation, advisorModel, advisorEffort, signal) {
 	if (signal?.aborted) throw new Error("advisor call aborted before dispatch");
 	const advisorSystem = "You are an expert advisor reviewing an in-progress Claude Code session. The transcript below is the work-in-progress (turns numbered, with tool calls and results inlined). Read carefully and provide concrete, actionable advice on the next step or course-correction. Be specific — cite the parts of the transcript you're responding to. If the assistant is on the right track, say so explicitly. If they're stuck or off-track, name the specific assumption or step to revisit. Aim for 2-5 paragraphs of substantive guidance.";
-	const conversationText = renderConversationAsText(conversation);
 	const resolvedAdvisorModel = resolveModel(advisorModel);
+	let measure;
+	let maxUnits;
+	try {
+		const modelEntry = state.models?.data?.find((m) => m.id === resolvedAdvisorModel);
+		const encoder = await loadEncoder(modelEntry ? getTokenizerFromModel(modelEntry) : "o200k_base");
+		measure = (s) => encoder.encode(s).length;
+		maxUnits = resolveAdvisorMaxTokens(advisorModel);
+	} catch (err) {
+		consola.debug("advisor: tokenizer load failed; using char-length budget:", err);
+		measure = (s) => s.length;
+		maxUnits = ADVISOR_MAX_CONVERSATION_CHARS;
+	}
+	const conversationText = renderConversationAsText(conversation, maxUnits, measure);
 	if (/^(gpt-|o\d|.*codex)/i.test(resolvedAdvisorModel)) {
 		const response = await createResponses({
 			model: resolvedAdvisorModel,
@@ -9898,7 +10237,7 @@ const PERSONAS_READ = Object.freeze([
 		toolNameHttp: "codex_critic",
 		model: "gpt-5.5",
 		endpoint: "/v1/responses",
-		description: "Adversarial second opinion on plans, designs, or code tradeoffs. Backed by gpt-5.5 (OpenAI, 400K context) — strongest reasoning model in the critic lineup, different lab than Opus. Best for architecture decisions, design reviews, and tradeoff analysis where cross-lab diversity matters. Not for line-level code review (use codex_reviewer). Pass artifact verbatim.",
+		description: "Adversarial second opinion on plans, designs, or code tradeoffs. Backed by gpt-5.5 (OpenAI, ≈922K-token input window) — strongest reasoning model in the critic lineup, different lab than Opus. Best for architecture decisions, design reviews, and tradeoff analysis where cross-lab diversity matters. Not for line-level code review (use codex_reviewer). Pass artifact verbatim.",
 		baseInstructions: CRITIC_BASE,
 		agentPrompt: "",
 		writeCapable: false,
@@ -9934,7 +10273,7 @@ const PERSONAS_READ = Object.freeze([
 		toolNameHttp: "codex_reviewer",
 		model: "gpt-5.3-codex",
 		endpoint: "/v1/responses",
-		description: "Line-level review of a concrete diff or single file. Backed by gpt-5.3-codex (OpenAI, 400K context) — code-specialist, fastest critic (~16s). Surfaces bugs, edge cases, security issues, and idiom violations at specific line numbers. Not suited for architecture or design review (use codex_critic for plans). Pass artifact verbatim.",
+		description: "Line-level review of a concrete diff or single file. Backed by gpt-5.3-codex (OpenAI, ≈272K-token input window) — code-specialist, fastest critic (~16s). Surfaces bugs, edge cases, security issues, and idiom violations at specific line numbers. Not suited for architecture or design review (use codex_critic for plans). Pass artifact verbatim.",
 		baseInstructions: REVIEWER_BASE,
 		agentPrompt: "",
 		writeCapable: false,
@@ -9952,7 +10291,7 @@ const PERSONAS_READ = Object.freeze([
 		toolNameHttp: "opus_critic",
 		model: "claude-opus-4-7",
 		endpoint: "/v1/messages",
-		description: "Adversarial second opinion from a fresh-context Opus 4.7 — same lab as the lead, limited blind-spot diversity vs cross-lab critics, but has the largest context window (up to 1M tokens on enterprise tiers). Handles large artifacts without decomposition. Fast (~22s), catches confabulation and motivated reasoning. Pass artifact verbatim.",
+		description: "Adversarial second opinion from a fresh-context Opus 4.7 — same lab as the lead, limited blind-spot diversity vs cross-lab critics. On enterprise catalogs that carry Opus-4.7-1M it runs with a ≈936K-token input window and handles large artifacts without decomposition; otherwise ≈168K. Fast (~22s), catches confabulation and motivated reasoning. Pass artifact verbatim.",
 		baseInstructions: OPUS_CRITIC_BASE,
 		agentPrompt: "",
 		writeCapable: false,
@@ -10615,14 +10954,14 @@ function buildCoordinatorAgent(opts) {
 			"",
 			"- **Plan / design / architecture choice** → fan out to `codex-critic` (gpt-5.5, strongest reasoning, cross-lab)" + (opts.geminiAvailable ? " AND `gemini-critic` (third-lab triangulation, strong on formal reasoning) in parallel" : "") + ". codex-reviewer is the wrong tool for plans (it's a code-specialist, not an architecture critic).",
 			"- **Concrete diff or single file** → fan out to `codex-reviewer` (gpt-5.3-codex, line-level code specialist, fastest at ~16s)" + (opts.geminiAvailable ? " AND `gemini-critic` for cross-lab triangulation" : "") + ". For very small changes (<20 lines), one `codex-reviewer` call is enough.",
-			"- **Large artifact (>50 KB)** → prefer `opus-critic` (Opus 4.7, up to 1M context — the largest window in the lineup, no decomposition needed for most artifacts). For cross-lab diversity on large artifacts, pair with `codex-critic` and decompose the artifact into 2-4 semantic batches for codex.",
+			"- **Large artifact** → the only peers that take a large artifact WHOLE are `codex-critic` (gpt-5.5, ≈922K-token input window) and `opus-critic` (Opus-4.7-1M, ≈936K-token input on enterprise catalogs; ≈168K otherwise). Route the full artifact to those for cross-lab coverage. `codex-reviewer` (≈272K) and `gemini-critic` (≈136K) have small windows — see Decomposition below: never summarize or downsize the request to squeeze a large artifact into a small-window peer.",
 			"- **Formal reasoning, proofs, or invariants** → prefer `gemini-critic`" + (opts.geminiAvailable ? " (gemini-3.1-pro, strong on math and formally-stated properties)" : " (NOT REGISTERED in this session — gemini-3.x not in catalog)") + ".",
 			"- **Tie-breaker after codex-critic has weighed in** → call `gemini-critic`" + (opts.geminiAvailable ? "" : " (NOT REGISTERED in this session)") + " or `opus-critic` with the artifact AND codex-critic's verdict for cross-check.",
 			"- **Fast sanity check** → `opus-critic` (~22s, same lab as lead but fresh context — catches confabulation and motivated reasoning).",
 			"",
 			"## Decomposition for large artifacts",
 			"",
-			"Each per-call MCP wait is bounded (~60s SDK default on Claude Code v2.1.113+ per regressions #50289 / #52137 — empirically reproduced 2026-05-14). The proxy enforces per-persona effort allowlists AND a pre-flight `predictedTooLong` cap (codex_critic@high >8 KB, codex_reviewer@high >12 KB, opus_critic@medium >6 KB) to surface would-be-timeouts as fast actionable errors. For artifacts that exceed the cap but fit within opus-critic's context window (up to 1M tokens when available), route the full artifact to opus-critic. Otherwise, split into 2-4 logical batches BY CONCERN (not by raw size — semantic batches give better per-batch reviews) and call peers in parallel. The proxy's MCP cap allows up to 8 in-flight calls. Aggregate findings yourself before reporting back.",
+			"Route by the peer's real PROMPT WINDOW (input tokens): `codex-critic` gpt-5.5 ≈922K · `opus-critic` Opus-4.7-1M ≈936K (enterprise catalogs; ≈168K otherwise) · `codex-reviewer` gpt-5.3-codex ≈272K · `gemini-critic` gemini-3.1-pro ≈136K. The proxy REJECTS (with an actionable message) any single call whose brief exceeds the target peer's window — it will NOT silently truncate, because dropping lines from a review artifact is worse than a clear error. So: send the full artifact only to peers whose window fits it (large artifacts → `codex-critic` and/or `opus-critic`). When a peer's window is too small (commonly `gemini-critic` at ≈136K, or `codex-reviewer` at ≈272K), do NOT summarize or downsize the request to include it — either skip that peer, or split the artifact into 2-4 logical batches BY CONCERN (not by raw size — semantic batches give better per-batch reviews) that each fit, and call in parallel. Use the big-window peers for the whole and reserve a small-window peer like gemini for the concerns it can actually hold. The proxy's MCP cap allows up to 8 in-flight calls. Aggregate findings yourself before reporting back. (Separately, on the JSON transport a per-effort `predictedTooLong` byte cap still guards the ~60s tools/call timeout for non-SSE clients; Claude Code uses SSE, which streams with heartbeats and isn't subject to that cap.)",
 			"",
 			"## Aggregation contract",
 			"",
@@ -11120,7 +11459,7 @@ function initProxyFromEnv() {
 //#endregion
 //#region package.json
 var name = "github-router";
-var version = "0.3.40";
+var version = "0.3.41";
 //#endregion
 //#region src/lib/approval.ts
@@ -11276,202 +11615,6 @@ function collectToolFieldKeys(body) {
 	return [...seen].sort();
 }
-//#endregion
-//#region src/lib/tokenizer.ts
-const ENCODING_MAP = {
-	o200k_base: () => import("gpt-tokenizer/encoding/o200k_base"),
-	cl100k_base: () => import("gpt-tokenizer/encoding/cl100k_base"),
-	p50k_base: () => import("gpt-tokenizer/encoding/p50k_base"),
-	p50k_edit: () => import("gpt-tokenizer/encoding/p50k_edit"),
-	r50k_base: () => import("gpt-tokenizer/encoding/r50k_base")
-};
-const encodingCache = /* @__PURE__ */ new Map();
-/**
-* Calculate tokens for tool calls
-*/
-const calculateToolCallsTokens = (toolCalls, encoder, constants) => {
-	let tokens = 0;
-	for (const toolCall of toolCalls) {
-		tokens += constants.funcInit;
-		tokens += encoder.encode(JSON.stringify(toolCall)).length;
-	}
-	tokens += constants.funcEnd;
-	return tokens;
-};
-/**
-* Calculate tokens for content parts
-*/
-const calculateContentPartsTokens = (contentParts, encoder) => {
-	let tokens = 0;
-	for (const part of contentParts) if (part.type === "image_url") tokens += encoder.encode(part.image_url.url).length + 85;
-	else if (part.text) tokens += encoder.encode(part.text).length;
-	return tokens;
-};
-/**
-* Calculate tokens for a single message
-*/
-const calculateMessageTokens = (message, encoder, constants) => {
-	const tokensPerMessage = 3;
-	const tokensPerName = 1;
-	let tokens = tokensPerMessage;
-	for (const [key, value] of Object.entries(message)) {
-		if (typeof value === "string") tokens += encoder.encode(value).length;
-		if (key === "name") tokens += tokensPerName;
-		if (key === "tool_calls") tokens += calculateToolCallsTokens(value, encoder, constants);
-		if (key === "content" && Array.isArray(value)) tokens += calculateContentPartsTokens(value, encoder);
-	}
-	return tokens;
-};
-/**
-* Calculate tokens using custom algorithm
-*/
-const calculateTokens = (messages, encoder, constants) => {
-	if (messages.length === 0) return 0;
-	let numTokens = 0;
-	for (const message of messages) numTokens += calculateMessageTokens(message, encoder, constants);
-	numTokens += 3;
-	return numTokens;
-};
-/**
-* Get the corresponding encoder module based on encoding type
-*/
-const getEncodeChatFunction = async (encoding) => {
-	if (encodingCache.has(encoding)) {
-		const cached$1 = encodingCache.get(encoding);
-		if (cached$1) return cached$1;
-	}
-	const supportedEncoding = encoding;
-	if (!(supportedEncoding in ENCODING_MAP)) {
-		const fallbackModule = await ENCODING_MAP.o200k_base();
-		encodingCache.set(encoding, fallbackModule);
-		return fallbackModule;
-	}
-	const encodingModule = await ENCODING_MAP[supportedEncoding]();
-	encodingCache.set(encoding, encodingModule);
-	return encodingModule;
-};
-/**
-* Get tokenizer type from model information
-*/
-const getTokenizerFromModel = (model) => {
-	return model.capabilities?.tokenizer || "o200k_base";
-};
-/**
-* Get model-specific constants for token calculation
-*/
-const getModelConstants = (model) => {
-	return model.id === "gpt-3.5-turbo" || model.id === "gpt-4" ? {
-		funcInit: 10,
-		propInit: 3,
-		propKey: 3,
-		enumInit: -3,
-		enumItem: 3,
-		funcEnd: 12
-	} : {
-		funcInit: 7,
-		propInit: 3,
-		propKey: 3,
-		enumInit: -3,
-		enumItem: 3,
-		funcEnd: 12
-	};
-};
-/**
-* Calculate tokens for a single parameter
-*/
-const calculateParameterTokens = (key, prop, context) => {
-	const { encoder, constants } = context;
-	let tokens = constants.propKey;
-	if (typeof prop !== "object" || prop === null) return tokens;
-	const param = prop;
-	const paramName = key;
-	const paramType = param.type || "string";
-	let paramDesc = param.description || "";
-	if (param.enum && Array.isArray(param.enum)) {
-		tokens += constants.enumInit;
-		for (const item of param.enum) {
-			tokens += constants.enumItem;
-			tokens += encoder.encode(String(item)).length;
-		}
-	}
-	if (paramDesc.endsWith(".")) paramDesc = paramDesc.slice(0, -1);
-	const line = `${paramName}:${paramType}:${paramDesc}`;
-	tokens += encoder.encode(line).length;
-	const excludedKeys = new Set([
-		"type",
-		"description",
-		"enum"
-	]);
-	for (const propertyName of Object.keys(param)) if (!excludedKeys.has(propertyName)) {
-		const propertyValue = param[propertyName];
-		const propertyText = typeof propertyValue === "string" ? propertyValue : JSON.stringify(propertyValue);
-		tokens += encoder.encode(`${propertyName}:${propertyText}`).length;
-	}
-	return tokens;
-};
-/**
-* Calculate tokens for function parameters
-*/
-const calculateParametersTokens = (parameters, encoder, constants) => {
-	if (!parameters || typeof parameters !== "object") return 0;
-	const params = parameters;
-	let tokens = 0;
-	for (const [key, value] of Object.entries(params)) if (key === "properties") {
-		const properties = value;
-		if (Object.keys(properties).length > 0) {
-			tokens += constants.propInit;
-			for (const propKey of Object.keys(properties)) tokens += calculateParameterTokens(propKey, properties[propKey], {
-				encoder,
-				constants
-			});
-		}
-	} else {
-		const paramText = typeof value === "string" ? value : JSON.stringify(value);
-		tokens += encoder.encode(`${key}:${paramText}`).length;
-	}
-	return tokens;
-};
-/**
-* Calculate tokens for a single tool
-*/
-const calculateToolTokens = (tool, encoder, constants) => {
-	let tokens = constants.funcInit;
-	const func = tool.function;
-	const fName = func.name;
-	let fDesc = func.description || "";
-	if (fDesc.endsWith(".")) fDesc = fDesc.slice(0, -1);
-	const line = fName + ":" + fDesc;
-	tokens += encoder.encode(line).length;
-	if (typeof func.parameters === "object" && func.parameters !== null) tokens += calculateParametersTokens(func.parameters, encoder, constants);
-	return tokens;
-};
-/**
-* Calculate token count for tools based on model
-*/
-const numTokensForTools = (tools, encoder, constants) => {
-	let funcTokenCount = 0;
-	for (const tool of tools) funcTokenCount += calculateToolTokens(tool, encoder, constants);
-	funcTokenCount += constants.funcEnd;
-	return funcTokenCount;
-};
-/**
-* Calculate the token count of messages, supporting multiple GPT encoders
-*/
-const getTokenCount = async (payload, model) => {
-	const encoder = await getEncodeChatFunction(getTokenizerFromModel(model));
-	const simplifiedMessages = payload.messages;
-	const inputMessages = simplifiedMessages.filter((msg) => msg.role !== "assistant");
-	const outputMessages = simplifiedMessages.filter((msg) => msg.role === "assistant");
-	const constants = getModelConstants(model);
-	let inputTokens = calculateTokens(inputMessages, encoder, constants);
-	if (payload.tools && payload.tools.length > 0) inputTokens += numTokensForTools(payload.tools, encoder, constants);
-	const outputTokens = calculateTokens(outputMessages, encoder, constants);
-	return {
-		input: inputTokens,
-		output: outputTokens
-	};
-};
 //#endregion
 //#region src/routes/chat-completions/handler.ts
 const ENCODER$1 = new TextEncoder();