npm - github-router - Versions diffs - 0.3.22 → 0.3.24 - Mend

github-router 0.3.22 → 0.3.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/main.js CHANGED Viewed

@@ -8,6 +8,8 @@ import path from "node:path";
 import process$1 from "node:process";
 import { execFile, execFileSync, spawn } from "node:child_process";
 import { promisify } from "node:util";
+import { events } from "fetch-event-stream";
+import { z } from "zod";
 import fs$1 from "node:fs";
 import { Writable } from "node:stream";
 import { serve } from "srvx";
@@ -15,8 +17,6 @@ import { getProxyForUrl } from "proxy-from-env";
 import { Agent, ProxyAgent, setGlobalDispatcher } from "undici";
 import { Hono } from "hono";
 import { cors } from "hono/cors";
-import { events } from "fetch-event-stream";
-import { z } from "zod";
 import clipboard from "clipboardy";
 //#region src/lib/paths.ts
@@ -65,6 +65,9 @@ const CLAUDE_HOME_POLICY = new Map([
 	["cache", "ISOLATED"],
 	["logs", "ISOLATED"],
 	["paste-cache", "ISOLATED"],
+	["jobs", "ISOLATED"],
+	["daemon", "ISOLATED"],
+	["daemon.log", "ISOLATED"],
 	["projects", "SHARED"],
 	["sessions", "SHARED"],
 	["tasks", "SHARED"],
@@ -1643,8 +1646,206 @@ function launchChild(target, server$1, options = {}) {
 	});
 }
+//#endregion
+//#region src/services/copilot/web-search.ts
+const RpcSchema = z.object({
+	jsonrpc: z.literal("2.0"),
+	id: z.number().optional(),
+	result: z.object({
+		content: z.array(z.object({
+			type: z.literal("text"),
+			text: z.string()
+		})).optional(),
+		isError: z.boolean().optional()
+	}).optional(),
+	error: z.object({
+		code: z.number(),
+		message: z.string()
+	}).optional()
+});
+const InnerSchema = z.object({
+	text: z.object({
+		value: z.string(),
+		annotations: z.array(z.object({ url_citation: z.object({
+			title: z.string(),
+			url: z.string()
+		}).optional() })).nullable().optional()
+	}),
+	bing_searches: z.array(z.unknown()).nullable().optional()
+});
+const MAX_SEARCHES_PER_SECOND = 3;
+let searchTimestamps = [];
+let throttleChain = Promise.resolve();
+async function throttleSearch() {
+	const myTurn = throttleChain.then(async () => {
+		const now = Date.now();
+		searchTimestamps = searchTimestamps.filter((t) => now - t < 1e3);
+		if (searchTimestamps.length >= MAX_SEARCHES_PER_SECOND) {
+			const waitMs = 1e3 - (now - searchTimestamps[0]);
+			if (waitMs > 0) {
+				consola.debug(`Web search rate limited, waiting ${waitMs}ms`);
+				await sleep(waitMs);
+			}
+		}
+		searchTimestamps.push(Date.now());
+	});
+	throttleChain = myTurn.catch(() => {});
+	return myTurn;
+}
+function mcpHeaders(sid) {
+	if (!state.githubToken) throw new Error("GitHub token missing — re-run auth flow. Web search uses the GitHub PAT (not the Copilot token); the on-disk token at ~/.local/share/github-router/github_token must be present.");
+	const headers = {
+		Authorization: `Bearer ${state.githubToken}`,
+		"content-type": "application/json",
+		accept: "application/json, text/event-stream",
+		"X-MCP-Host": "copilot-cli",
+		"X-MCP-Toolsets": "web_search",
+		"Mcp-Protocol-Version": "2025-06-18",
+		"user-agent": `GitHubCopilotChat/${copilotVersion(state)}`
+	};
+	if (sid) headers["Mcp-Session-Id"] = sid;
+	return headers;
+}
+async function postMcp(body, sid, retry = true) {
+	const url = `${copilotBaseUrl(state)}/mcp`;
+	const res = await fetch(url, {
+		method: "POST",
+		headers: mcpHeaders(sid),
+		body: JSON.stringify(body)
+	});
+	if (!res.ok && retry && res.status >= 500) {
+		await sleep(500);
+		return postMcp(body, sid, false);
+	}
+	return res;
+}
+async function searchWeb(query) {
+	await throttleSearch();
+	consola.info(`Web search (MCP): "${query.slice(0, 80)}"`);
+	const callId = Math.floor(Math.random() * 1e9);
+	let sid;
+	try {
+		const initRes = await postMcp({
+			jsonrpc: "2.0",
+			id: 1,
+			method: "initialize",
+			params: {
+				protocolVersion: "2024-11-05",
+				capabilities: {},
+				clientInfo: {
+					name: "GitHubCopilotChat",
+					version: copilotVersion(state)
+				}
+			}
+		});
+		if (!initRes.ok) {
+			consola.error("MCP initialize failed", initRes.status);
+			throw new HTTPError("MCP initialize failed", initRes);
+		}
+		sid = initRes.headers.get("mcp-session-id") ?? void 0;
+		if (!sid) throw new HTTPError("MCP initialize: missing Mcp-Session-Id header", initRes);
+		const notifRes = await postMcp({
+			jsonrpc: "2.0",
+			method: "notifications/initialized"
+		}, sid);
+		if (!notifRes.ok && notifRes.status !== 202) {
+			consola.error("MCP notifications/initialized failed", notifRes.status);
+			throw new HTTPError("MCP notifications/initialized failed", notifRes);
+		}
+		const callRes = await postMcp({
+			jsonrpc: "2.0",
+			id: callId,
+			method: "tools/call",
+			params: {
+				name: "web_search",
+				arguments: { query }
+			}
+		}, sid);
+		if (!callRes.ok) {
+			consola.error("MCP tools/call failed", callRes.status);
+			throw new HTTPError("MCP tools/call failed", callRes);
+		}
+		let rpc;
+		for await (const ev of events(callRes)) {
+			if (!ev.data) continue;
+			let parsedJson;
+			try {
+				parsedJson = JSON.parse(ev.data);
+			} catch {
+				continue;
+			}
+			const parsed = RpcSchema.safeParse(parsedJson);
+			if (parsed.success && parsed.data.id === callId) {
+				rpc = parsed.data;
+				break;
+			}
+		}
+		if (!rpc) throw new HTTPError("MCP tools/call: no matching response id in SSE stream", callRes);
+		if (rpc.error) throw new HTTPError(`MCP error ${rpc.error.code}: ${rpc.error.message}`, callRes);
+		if (rpc.result?.isError) throw new HTTPError("MCP web_search tool error", callRes);
+		const text = rpc.result?.content?.[0]?.text;
+		if (!text) throw new HTTPError("MCP web_search: empty content", callRes);
+		let innerRaw;
+		try {
+			innerRaw = JSON.parse(text);
+		} catch (err) {
+			throw new HTTPError(`MCP web_search: inner content not JSON: ${err instanceof Error ? err.message : String(err)}`, callRes);
+		}
+		const innerParsed = InnerSchema.safeParse(innerRaw);
+		if (!innerParsed.success) throw new HTTPError(`MCP web_search: inner content shape changed (${innerParsed.error.issues.map((i) => `${i.path.join(".")}: ${i.message}`).join("; ")})`, callRes);
+		const inner = innerParsed.data;
+		const references = [];
+		for (const ann of inner.text.annotations ?? []) {
+			const cite = ann.url_citation;
+			if (cite && !cite.url.toLowerCase().includes("bing.com/search")) references.push({
+				title: cite.title,
+				url: cite.url
+			});
+		}
+		consola.debug(`Web search returned ${references.length} references`);
+		return {
+			content: inner.text.value,
+			references
+		};
+	} finally {
+		if (sid) try {
+			fetch(`${copilotBaseUrl(state)}/mcp`, {
+				method: "DELETE",
+				headers: mcpHeaders(sid)
+			}).catch(() => {});
+		} catch {}
+	}
+}
 //#endregion
 //#region src/lib/peer-mcp-personas.ts
+/**
+* Reasoning effort levels accepted by Copilot's /v1/responses (gpt-5.x) and
+* /v1/chat/completions endpoints. Per the proxy's existing thinking-mode
+* translator (CLAUDE.md "Thinking-mode translation"), Copilot's adaptive-
+* thinking path uses these same buckets:
+*   <2k tokens → low, <8k → medium, <24k → high, else → xhigh.
+*
+* Per-persona `allowedEfforts` and `defaultEffort` constrain which subset
+* each persona exposes — enforced in handler.ts:handleToolsCall.
+*
+* **xhigh on long-running personas works via SSE-streamed /mcp responses**
+* (handler.ts:handleToolsCallSSE). Claude Code's MCP HTTP client honors
+* `text/event-stream` responses without applying the ~60s per-tool-call
+* timer that previously broke xhigh on gpt-5.5 (~56s wall) and
+* claude-opus-4-7 (high+ thinking budgets). Each persona's own
+* `allowedEfforts` / `defaultEffort` fields are authoritative (tiers and
+* defaults differ per persona); SSE handles the long tail transparently.
+*/
+const EFFORT_LEVELS = [
+	"low",
+	"medium",
+	"high",
+	"xhigh"
+];
+function isEffort(v) {
+	return typeof v === "string" && EFFORT_LEVELS.includes(v);
+}
 const CRITIC_RUBRIC = `
 Apply this grading rubric:
   - Score 1–5 on three axes:
@@ -1673,7 +1874,7 @@ Self-reminder (read before every reply):
 `.trim();
 const COLD_START_CONTRACT = `
 Cold-start contract for the lead orchestrator (Opus):
-  When delegating to me, paste a self-contained brief. I have no access to your scrollback, CLAUDE.md, or the project tree. Always include:
+  When delegating to me, paste a self-contained brief. I have no access to your scrollback, project memory, or the project tree. Always include:
     (a) the artifact under review verbatim (code/diff/plan text),
     (b) the constraints or "done" criteria,
     (c) any prior decisions I should not relitigate.
@@ -1745,39 +1946,87 @@ Reply format (markdown):
 Resilience reminder:
   If your session terminates abnormally before "Status: complete", the lead will retry once. On recovery, ask the lead to confirm what's already been done before re-applying changes — duplicate edits are worse than a slow restart.`;
+const OPUS_CRITIC_BASE = `You are opus-critic, a fresh-context Anthropic-side adversarial reviewer running on Claude Opus 4.7 — the same model and lab as the lead orchestrator that just delegated to you. You are NOT the lead. You did not see the lead's reasoning trace. You only see the brief.
+Your job is to spot what the lead missed because of cognitive momentum, sunk-cost on a plan, or motivated reasoning toward a particular fix. Your blind-spot diversification is LIMITED compared to codex-critic (gpt-5.5) and gemini-critic (gemini-3.1-pro) — same training, same lab, same RLHF priors. Use that honestly: don't pretend to find a different perspective when the obvious read is "the lead got it right." Silence on good work is a valid and welcome answer.
+Sycophancy is the failure mode you exist to fight. Manufactured contrarianism is a different failure of the same shape — do neither.
+${COLD_START_CONTRACT}
+${CRITIC_RUBRIC}`;
 const PERSONAS_READ = Object.freeze([
 	{
 		agentName: "codex-critic",
 		toolNameHttp: "codex_critic",
 		model: "gpt-5.5",
 		endpoint: "/v1/responses",
-		description: "Adversarial second opinion on plans, designs, code, or systems-engineering tradeoffs. Backed by gpt-5.5 (OpenAI) — different model, different training data, different blind spots than Opus. Uses a calibrated 1–5 grading rubric and is allowed to reply 'no material objection' on solid artifacts. **CALL BEFORE: ExitPlanMode for any plan involving >2 files or new architecture; finalizing a major design choice; TeamCreate when the team's task is non-trivial.** **CALL AFTER: any commit touching concurrency, security, or streaming code paths.** If the artifact is large (>20 KB), prefer to break it into 2-4 focused batches and call this tool once per batch IN PARALLEL — each call must complete under the Claude Code MCP per-tool-call ceiling (~150s on v2.1.138 per regression #50289), so monolithic large-artifact calls will time out client-side. Aggregate findings yourself. Always pass: (a) the artifact verbatim, (b) the constraints/'done' criteria, (c) any prior decisions. Optionally pass `effort: 'xhigh'` for explicit deep dives or `effort: 'medium'` for quick sanity checks (default 'high'). The subagent has no access to your scrollback or CLAUDE.md.",
+		description: "Adversarial second opinion on plans, designs, or code tradeoffs. Backed by gpt-5.5 (OpenAI) — different lab than Opus. Pass artifact verbatim.",
 		baseInstructions: CRITIC_BASE,
 		agentPrompt: "",
 		writeCapable: false,
-		requiresHttp: false
+		requiresHttp: false,
+		allowedEfforts: [
+			"low",
+			"medium",
+			"high",
+			"xhigh"
+		],
+		defaultEffort: "xhigh"
 	},
 	{
 		agentName: "gemini-critic",
 		toolNameHttp: "gemini_critic",
 		model: "gemini-3.1-pro-preview",
 		endpoint: "/v1/chat/completions",
-		description: "Adversarial second opinion from a different lab. Backed by gemini-3.1-pro-preview (Google) — different training data and RLHF priors than Opus AND codex-critic, the strongest blind-spot-buster when the lead wants triangulation across three labs. Use for long-context artifacts (>50k tokens), math/proof-shaped reasoning, or as a tie-breaker after codex-critic has weighed in. **CALL BEFORE: ExitPlanMode for plans where Opus + codex-critic agree (use as triangulation); finalizing irreversible architectural choices.** **CALL AFTER: commits where you want a third-lab cross-check.** If the artifact is large (>100 KB), prefer to break into batches and call in parallel — gemini handles long context well but each per-call MCP wait is still bounded (~150s on v2.1.138). Always pass: (a) the artifact verbatim, (b) the constraints/'done' criteria, (c) any prior decisions. The `effort` parameter is forwarded but may be silently ignored by Copilot's gemini route — gemini-3.x reasoning is largely auto-applied. The subagent has no access to your scrollback or CLAUDE.md.",
+		description: "Adversarial second opinion. Backed by gemini-3.1-pro (Google) — third-lab triangulation, strong on long-context and formal reasoning. Pass artifact verbatim.",
 		baseInstructions: GEMINI_CRITIC_BASE,
 		agentPrompt: "",
 		writeCapable: false,
-		requiresHttp: true
+		requiresHttp: true,
+		requiresGeminiCatalog: true,
+		allowedEfforts: [
+			"low",
+			"medium",
+			"high"
+		],
+		defaultEffort: "high"
 	},
 	{
 		agentName: "codex-reviewer",
 		toolNameHttp: "codex_reviewer",
 		model: "gpt-5.3-codex",
 		endpoint: "/v1/responses",
-		description: "Line-level code review of a specific diff or file. Backed by gpt-5.3-codex (OpenAI) — the code-specialist sibling of gpt-5.5, trained heavily on code-review datasets so it catches different bugs than Opus. Prefer over codex-critic when the artifact is a concrete diff or single file (codex-critic is for plans/designs). **CALL AFTER: any non-trivial commit (>50 lines OR touching critical paths: streaming, auth, concurrency, persistence, security).** **CALL BEFORE: opening a PR or pushing changes a peer would review.** For diffs >20 KB, split by file-group and call once per group in parallel — each per-call wait is bounded (~150s on v2.1.138). Always pass: (a) the diff or file verbatim, (b) the change's intent, (c) test status. Optionally pass `effort: 'xhigh'` when reviewing security-critical code, `effort: 'medium'` for routine reviews (default 'high'). The subagent has no access to your scrollback or CLAUDE.md.",
+		description: "Line-level review of a concrete diff or single file. Backed by gpt-5.3-codex (OpenAI) — code-specialist, narrow-scope. Pass artifact verbatim.",
 		baseInstructions: REVIEWER_BASE,
 		agentPrompt: "",
 		writeCapable: false,
-		requiresHttp: false
+		requiresHttp: false,
+		allowedEfforts: [
+			"low",
+			"medium",
+			"high",
+			"xhigh"
+		],
+		defaultEffort: "xhigh"
+	},
+	{
+		agentName: "opus-critic",
+		toolNameHttp: "opus_critic",
+		model: "claude-opus-4-7",
+		endpoint: "/v1/messages",
+		description: "Adversarial second opinion from a fresh-context Opus 4.7 — cheap same-lab sanity check. Pass artifact verbatim.",
+		baseInstructions: OPUS_CRITIC_BASE,
+		agentPrompt: "",
+		writeCapable: false,
+		requiresHttp: true,
+		allowedEfforts: [
+			"low",
+			"medium",
+			"high",
+			"xhigh"
+		],
+		defaultEffort: "xhigh"
 	}
 ]);
 const PERSONAS_WRITE = Object.freeze([{
@@ -1785,11 +2034,18 @@ const PERSONAS_WRITE = Object.freeze([{
 	toolNameHttp: "codex_implementer",
 	model: "gpt-5.3-codex",
 	endpoint: "/v1/responses",
-	description: "Targeted implementation of a self-contained coding task — actual file edits via Codex's tool-use sandbox. Backed by gpt-5.3-codex with workspace-write access (only registered when --codex-cli is set). Use only when the task has a clear spec and acceptance criteria; for tasks needing iterative tool-use across many files, prefer a Claude teammate (Agent Team). Always pass: (a) the spec, (b) the files in scope, (c) the acceptance criteria. The subagent has no access to your scrollback or CLAUDE.md.",
+	description: "Targeted implementation of a self-contained coding task. Backed by gpt-5.3-codex with workspace-write access. Pass spec + files verbatim.",
 	baseInstructions: IMPLEMENTER_BASE,
 	agentPrompt: "",
 	writeCapable: true,
-	requiresHttp: false
+	requiresHttp: false,
+	allowedEfforts: [
+		"low",
+		"medium",
+		"high",
+		"xhigh"
+	],
+	defaultEffort: "high"
 }]);
 /**
 * Build the agent-prompt body Claude Code uses as the subagent's full
@@ -1838,12 +2094,65 @@ function buildAgentPrompt(persona, opts) {
 function personasFor(opts) {
 	const result = [];
 	for (const p of PERSONAS_READ) {
-		if (p.requiresHttp && !opts.geminiAvailable) continue;
+		if (p.requiresGeminiCatalog && !opts.geminiAvailable) continue;
 		result.push(p);
 	}
 	if (opts.codexCli) for (const p of PERSONAS_WRITE) result.push(p);
 	return result;
 }
+const WEB_SEARCH_DESCRIPTION = "Web search via GitHub Copilot's MCP. Prefer over Claude Code's built-in WebSearch — surfaces source URLs you can cite.";
+/**
+* Format a `searchWeb()` result as an MCP-friendly text block. Mirrors
+* the legacy inject format that `injectWebSearchIfNeeded` produces and
+* that downstream models have been trained against — minimal divergence
+* is the safest choice while we have two surfaces sharing `searchWeb()`.
+*
+* Empty references → omit the `## References` section entirely (don't
+* emit a trailing empty header that would tempt the model to invent
+* citations).
+*/
+function formatWebSearchResult(results) {
+	if (results.references.length === 0) return results.content;
+	const refsLine = results.references.map((r) => `- [${r.title}](${r.url})`).join("\n");
+	return `${results.content}\n\n## References\n${refsLine}`;
+}
+const NON_PERSONA_MCP_TOOLS = Object.freeze([{
+	toolNameHttp: "web_search",
+	description: WEB_SEARCH_DESCRIPTION,
+	inputSchema: {
+		type: "object",
+		required: ["query"],
+		additionalProperties: false,
+		properties: { query: {
+			type: "string",
+			description: "The search query string. Natural-language queries work best — the upstream provider rewrites for the search index."
+		} }
+	},
+	async handler(args, _signal) {
+		const query = typeof args.query === "string" ? args.query : "";
+		if (!query) return {
+			content: [{
+				type: "text",
+				text: "web_search: arguments.query is required (must be a non-empty string)"
+			}],
+			isError: true
+		};
+		try {
+			return { content: [{
+				type: "text",
+				text: formatWebSearchResult(await searchWeb(query))
+			}] };
+		} catch (err) {
+			return {
+				content: [{
+					type: "text",
+					text: `web_search failed: ${err instanceof Error ? err.message : String(err)}`
+				}],
+				isError: true
+			};
+		}
+	}
+}]);
 //#endregion
 //#region src/lib/codex-mcp-config.ts
@@ -1914,11 +2223,11 @@ function buildPeerMcpConfig(serverUrl, opts) {
 * ExitPlanMode to default-on (env-disable-able).
 */
 function buildCoordinatorAgent(opts) {
-	const peers = ["codex-critic"];
+	const peers = ["codex-critic", "opus-critic"];
 	if (opts.geminiAvailable) peers.push("gemini-critic");
 	peers.push("codex-reviewer");
 	return {
-		description: "Coordinates cross-lab adversarial review. **Use proactively before ExitPlanMode for non-trivial plans and after non-trivial commits** (>50 lines OR touching streaming/auth/concurrency/persistence/security). Routes to codex-critic / codex-reviewer / gemini-critic in parallel based on artifact type and aggregates findings. Cheaper than calling each peer manually for the common case where you want a multi-lab triangulation. The subagent has no access to your scrollback or CLAUDE.md — pass the artifact verbatim.",
+		description: "Coordinates cross-lab adversarial review across codex-critic, opus-critic, gemini-critic, codex-reviewer. Use proactively before non-trivial plans and after non-trivial commits. Always pass artifacts verbatim — peers are fresh-context.",
 		prompt: [
 			"# Subagent: peer-review-coordinator",
 			"",
@@ -1934,10 +2243,11 @@ function buildCoordinatorAgent(opts) {
 			"- **Concrete diff or single file** → fan out to `codex-reviewer`" + (opts.geminiAvailable ? " AND `gemini-critic` (gemini for cross-lab triangulation)" : "") + ". For very small changes (<20 lines), one `codex-reviewer` call is enough.",
 			"- **Tie-breaker after codex-critic has weighed in** → call `gemini-critic`" + (opts.geminiAvailable ? "" : " (NOT REGISTERED in this session — gemini-3.x not in catalog; tie-break unavailable)") + " with the artifact AND codex-critic's verdict for cross-lab cross-check.",
 			"- **Long-context artifact (>100 KB)** → prefer `gemini-critic`" + (opts.geminiAvailable ? "" : " (NOT REGISTERED in this session)") + ". Otherwise, decompose into 2-4 batches and fan out across `codex-critic` calls in parallel.",
+			"- **Fast same-lab sanity check on a moderate artifact (<5 KB)** → prefer `opus-critic` (cheapest, ~22s, only `effort: low|medium` supported). Same lab as the lead — limited blind-spot diversification, but a useful gut-check before committing to a controversial decision. For cross-lab diversification or deep dives on larger artifacts, use codex/gemini at higher effort with decomposition for >5KB.",
 			"",
 			"## Decomposition for large artifacts",
 			"",
-			"Each per-call MCP wait is bounded (~150s on Claude Code v2.1.138 per regression #50289). For artifacts >20 KB, split into 2-4 logical batches BY CONCERN (not by raw size — semantic batches give better per-batch reviews) and call peers in parallel. The proxy's MCP cap allows up to 8 in-flight calls. Aggregate findings yourself before reporting back.",
+			"Each per-call MCP wait is bounded (~60s SDK default on Claude Code v2.1.113+ per regressions #50289 / #52137 — empirically reproduced 2026-05-14). The proxy enforces per-persona effort allowlists AND a pre-flight `predictedTooLong` cap (codex_critic@high >8 KB, codex_reviewer@high >12 KB, opus_critic@medium >6 KB) to surface would-be-timeouts as fast actionable errors. For artifacts that exceed the cap, split into 2-4 logical batches BY CONCERN (not by raw size — semantic batches give better per-batch reviews) and call peers in parallel. The proxy's MCP cap allows up to 8 in-flight calls. Aggregate findings yourself before reporting back.",
 			"",
 			"## Aggregation contract",
 			"",
@@ -2344,7 +2654,7 @@ function initProxyFromEnv() {
 //#endregion
 //#region package.json
 var name = "github-router";
-var version = "0.3.22";
+var version = "0.3.24";
 //#endregion
 //#region src/lib/approval.ts
@@ -2903,177 +3213,6 @@ const createChatCompletions = async (payload, modelHeaders, callerSignal) => {
 	return await response.json();
 };
-//#endregion
-//#region src/services/copilot/web-search.ts
-const RpcSchema = z.object({
-	jsonrpc: z.literal("2.0"),
-	id: z.number().optional(),
-	result: z.object({
-		content: z.array(z.object({
-			type: z.literal("text"),
-			text: z.string()
-		})).optional(),
-		isError: z.boolean().optional()
-	}).optional(),
-	error: z.object({
-		code: z.number(),
-		message: z.string()
-	}).optional()
-});
-const InnerSchema = z.object({
-	text: z.object({
-		value: z.string(),
-		annotations: z.array(z.object({ url_citation: z.object({
-			title: z.string(),
-			url: z.string()
-		}).optional() })).nullable().optional()
-	}),
-	bing_searches: z.array(z.unknown()).nullable().optional()
-});
-const MAX_SEARCHES_PER_SECOND = 3;
-let searchTimestamps = [];
-let throttleChain = Promise.resolve();
-async function throttleSearch() {
-	const myTurn = throttleChain.then(async () => {
-		const now = Date.now();
-		searchTimestamps = searchTimestamps.filter((t) => now - t < 1e3);
-		if (searchTimestamps.length >= MAX_SEARCHES_PER_SECOND) {
-			const waitMs = 1e3 - (now - searchTimestamps[0]);
-			if (waitMs > 0) {
-				consola.debug(`Web search rate limited, waiting ${waitMs}ms`);
-				await sleep(waitMs);
-			}
-		}
-		searchTimestamps.push(Date.now());
-	});
-	throttleChain = myTurn.catch(() => {});
-	return myTurn;
-}
-function mcpHeaders(sid) {
-	if (!state.githubToken) throw new Error("GitHub token missing — re-run auth flow. Web search uses the GitHub PAT (not the Copilot token); the on-disk token at ~/.local/share/github-router/github_token must be present.");
-	const headers = {
-		Authorization: `Bearer ${state.githubToken}`,
-		"content-type": "application/json",
-		accept: "application/json, text/event-stream",
-		"X-MCP-Host": "copilot-cli",
-		"X-MCP-Toolsets": "web_search",
-		"Mcp-Protocol-Version": "2025-06-18",
-		"user-agent": `GitHubCopilotChat/${copilotVersion(state)}`
-	};
-	if (sid) headers["Mcp-Session-Id"] = sid;
-	return headers;
-}
-async function postMcp(body, sid, retry = true) {
-	const url = `${copilotBaseUrl(state)}/mcp`;
-	const res = await fetch(url, {
-		method: "POST",
-		headers: mcpHeaders(sid),
-		body: JSON.stringify(body)
-	});
-	if (!res.ok && retry && res.status >= 500) {
-		await sleep(500);
-		return postMcp(body, sid, false);
-	}
-	return res;
-}
-async function searchWeb(query) {
-	await throttleSearch();
-	consola.info(`Web search (MCP): "${query.slice(0, 80)}"`);
-	const callId = Math.floor(Math.random() * 1e9);
-	let sid;
-	try {
-		const initRes = await postMcp({
-			jsonrpc: "2.0",
-			id: 1,
-			method: "initialize",
-			params: {
-				protocolVersion: "2024-11-05",
-				capabilities: {},
-				clientInfo: {
-					name: "GitHubCopilotChat",
-					version: copilotVersion(state)
-				}
-			}
-		});
-		if (!initRes.ok) {
-			consola.error("MCP initialize failed", initRes.status);
-			throw new HTTPError("MCP initialize failed", initRes);
-		}
-		sid = initRes.headers.get("mcp-session-id") ?? void 0;
-		if (!sid) throw new HTTPError("MCP initialize: missing Mcp-Session-Id header", initRes);
-		const notifRes = await postMcp({
-			jsonrpc: "2.0",
-			method: "notifications/initialized"
-		}, sid);
-		if (!notifRes.ok && notifRes.status !== 202) {
-			consola.error("MCP notifications/initialized failed", notifRes.status);
-			throw new HTTPError("MCP notifications/initialized failed", notifRes);
-		}
-		const callRes = await postMcp({
-			jsonrpc: "2.0",
-			id: callId,
-			method: "tools/call",
-			params: {
-				name: "web_search",
-				arguments: { query }
-			}
-		}, sid);
-		if (!callRes.ok) {
-			consola.error("MCP tools/call failed", callRes.status);
-			throw new HTTPError("MCP tools/call failed", callRes);
-		}
-		let rpc;
-		for await (const ev of events(callRes)) {
-			if (!ev.data) continue;
-			let parsedJson;
-			try {
-				parsedJson = JSON.parse(ev.data);
-			} catch {
-				continue;
-			}
-			const parsed = RpcSchema.safeParse(parsedJson);
-			if (parsed.success && parsed.data.id === callId) {
-				rpc = parsed.data;
-				break;
-			}
-		}
-		if (!rpc) throw new HTTPError("MCP tools/call: no matching response id in SSE stream", callRes);
-		if (rpc.error) throw new HTTPError(`MCP error ${rpc.error.code}: ${rpc.error.message}`, callRes);
-		if (rpc.result?.isError) throw new HTTPError("MCP web_search tool error", callRes);
-		const text = rpc.result?.content?.[0]?.text;
-		if (!text) throw new HTTPError("MCP web_search: empty content", callRes);
-		let innerRaw;
-		try {
-			innerRaw = JSON.parse(text);
-		} catch (err) {
-			throw new HTTPError(`MCP web_search: inner content not JSON: ${err instanceof Error ? err.message : String(err)}`, callRes);
-		}
-		const innerParsed = InnerSchema.safeParse(innerRaw);
-		if (!innerParsed.success) throw new HTTPError(`MCP web_search: inner content shape changed (${innerParsed.error.issues.map((i) => `${i.path.join(".")}: ${i.message}`).join("; ")})`, callRes);
-		const inner = innerParsed.data;
-		const references = [];
-		for (const ann of inner.text.annotations ?? []) {
-			const cite = ann.url_citation;
-			if (cite && !cite.url.toLowerCase().includes("bing.com/search")) references.push({
-				title: cite.title,
-				url: cite.url
-			});
-		}
-		consola.debug(`Web search returned ${references.length} references`);
-		return {
-			content: inner.text.value,
-			references
-		};
-	} finally {
-		if (sid) try {
-			fetch(`${copilotBaseUrl(state)}/mcp`, {
-				method: "DELETE",
-				headers: mcpHeaders(sid)
-			}).catch(() => {});
-		} catch {}
-	}
-}
 //#endregion
 //#region src/routes/chat-completions/handler.ts
 const ENCODER$2 = new TextEncoder();
@@ -3299,6 +3438,125 @@ embeddingRoutes.post("/", async (c) => {
 	}
 });
+//#endregion
+//#region src/services/copilot/create-messages.ts
+/**
+* Build headers that match what VS Code Copilot Chat sends to the Copilot API.
+*
+* copilotHeaders() provides: Authorization, content-type, copilot-integration-id,
+* editor-version, editor-plugin-version, user-agent, openai-intent,
+* x-github-api-version, x-request-id, x-vscode-user-agent-library-version.
+*
+* We add the remaining headers VS Code sends for /v1/messages:
+* - X-Initiator (VS Code sets dynamically; "agent" is safe for CLI use)
+* - anthropic-version (VS Code's Anthropic SDK sends this)
+* - X-Interaction-Id (VS Code sends a session-scoped UUID)
+*
+* We intentionally omit copilot-vision-request — VS Code only sends it when
+* images are present, and the native /v1/messages endpoint handles vision
+* without requiring the header.
+*
+* extraHeaders allows callers to forward client-supplied beta headers
+* (anthropic-beta) so Copilot enables extended features.
+*/
+function buildHeaders(extraHeaders) {
+	return {
+		...copilotHeaders(state),
+		accept: "application/json",
+		"openai-intent": "messages-proxy",
+		"x-interaction-type": "conversation-agent",
+		"X-Initiator": "agent",
+		"anthropic-version": "2023-06-01",
+		"X-Interaction-Id": randomUUID(),
+		...extraHeaders
+	};
+}
+/**
+* Forward an Anthropic Messages API request to Copilot's native /v1/messages endpoint.
+* Returns the raw Response so callers can handle streaming vs non-streaming.
+*
+* `callerSignal` (optional) is composed with the standard
+* UPSTREAM_FETCH_TIMEOUT_MS via AbortSignal.any so callers (e.g. the
+* peer-MCP `opus-critic` persona) can cancel the upstream call when
+* Claude Code's MCP per-tool-call ceiling fires. Mirrors the pattern
+* in createResponses / createChatCompletions.
+*/
+async function createMessages(body, extraHeaders, callerSignal) {
+	if (!state.copilotToken) throw new Error("Copilot token not found");
+	const url = `${copilotBaseUrl(state)}/v1/messages?beta=true`;
+	consola.debug(`Forwarding to ${url}`);
+	const doFetch = () => {
+		const fetchInit = {
+			method: "POST",
+			headers: buildHeaders(extraHeaders),
+			body
+		};
+		const signals = [];
+		if (UPSTREAM_FETCH_TIMEOUT_MS > 0) signals.push(AbortSignal.timeout(UPSTREAM_FETCH_TIMEOUT_MS));
+		if (callerSignal) signals.push(callerSignal);
+		if (signals.length === 1) fetchInit.signal = signals[0];
+		else if (signals.length > 1) fetchInit.signal = AbortSignal.any(signals);
+		return fetch(url, fetchInit);
+	};
+	const response = await tryRefreshAndRetry(doFetch, "/v1/messages");
+	if (!response.ok) {
+		let errorBody = "";
+		try {
+			errorBody = await response.text();
+		} catch {
+			errorBody = "(could not read error body)";
+		}
+		consola.error(`Copilot /v1/messages error: ${response.status} ${errorBody}`);
+		throw new HTTPError("Copilot messages request failed", new Response(errorBody, {
+			status: response.status,
+			statusText: response.statusText,
+			headers: response.headers
+		}));
+	}
+	return response;
+}
+/**
+* Forward an Anthropic count_tokens request to Copilot's native endpoint.
+* Returns the raw Response.
+*
+* `callerSignal` is composed with UPSTREAM_FETCH_TIMEOUT_MS — same pattern
+* as createMessages.
+*/
+async function countTokens(body, extraHeaders, callerSignal) {
+	if (!state.copilotToken) throw new Error("Copilot token not found");
+	const url = `${copilotBaseUrl(state)}/v1/messages/count_tokens?beta=true`;
+	consola.debug(`Forwarding to ${url}`);
+	const doFetch = () => {
+		const fetchInit = {
+			method: "POST",
+			headers: buildHeaders(extraHeaders),
+			body
+		};
+		const signals = [];
+		if (UPSTREAM_FETCH_TIMEOUT_MS > 0) signals.push(AbortSignal.timeout(UPSTREAM_FETCH_TIMEOUT_MS));
+		if (callerSignal) signals.push(callerSignal);
+		if (signals.length === 1) fetchInit.signal = signals[0];
+		else if (signals.length > 1) fetchInit.signal = AbortSignal.any(signals);
+		return fetch(url, fetchInit);
+	};
+	const response = await tryRefreshAndRetry(doFetch, "/v1/messages/count_tokens");
+	if (!response.ok) {
+		let errorBody = "";
+		try {
+			errorBody = await response.text();
+		} catch {
+			errorBody = "(could not read error body)";
+		}
+		consola.error(`Copilot count_tokens error: ${response.status} ${errorBody}`);
+		throw new HTTPError("Copilot count_tokens request failed", new Response(errorBody, {
+			status: response.status,
+			statusText: response.statusText,
+			headers: response.headers
+		}));
+	}
+	return response;
+}
 //#endregion
 //#region src/services/copilot/create-responses.ts
 const createResponses = async (payload, modelHeaders, callerSignal) => {
@@ -3360,27 +3618,6 @@ function detectAgentCall(input) {
 const MCP_PROTOCOL_VERSION = "2025-06-18";
 const SERVER_NAME = "github-router-peers";
 const SERVER_VERSION = "1";
-/**
-* Reasoning effort levels accepted by Copilot's /v1/responses (gpt-5.x) and
-* /v1/chat/completions endpoints. Per the proxy's existing thinking-mode
-* translator (CLAUDE.md "Thinking-mode translation"), Copilot's adaptive-
-* thinking path uses these same buckets:
-*   <2k tokens → low, <8k → medium, <24k → high, else → xhigh.
-*
-* Default `high` for peer reviews — adversarial-by-design but still cost-
-* conscious. Callers can pass `xhigh` explicitly for deep dives, or `medium`
-* for quick sanity checks.
-*/
-const EFFORT_LEVELS = [
-	"low",
-	"medium",
-	"high",
-	"xhigh"
-];
-const DEFAULT_EFFORT = "high";
-function isEffort(v) {
-	return typeof v === "string" && EFFORT_LEVELS.includes(v);
-}
 /** Bounded concurrency. Originally capped at 2 (commit 4317a25) as a defensive
 *  pre-launch guess against Opus's natural pattern of fanning out to all three
 *  critics at once. Raised to 8 (Phase 2D of the peer-MCP plan) so the
@@ -3485,10 +3722,10 @@ function geminiAvailable() {
 	return models.some((m) => /^gemini-3\..*pro/i.test(m.id));
 }
 function activePersonas() {
-	return PERSONAS_READ.filter((p) => !p.requiresHttp || geminiAvailable());
+	// Keep every persona that does not need the Gemini catalog; personas that
+	// do need it are kept only while geminiAvailable() reports true.
+	return PERSONAS_READ.filter((p) => !p.requiresGeminiCatalog || geminiAvailable());
 }
 function toolEntries() {
-	return activePersonas().map((p) => ({
+	const personaEntries = activePersonas().map((p) => ({
 		name: p.toolNameHttp,
 		description: p.description,
 		inputSchema: {
@@ -3506,12 +3743,18 @@ function toolEntries() {
 				},
 				effort: {
 					type: "string",
-					enum: [...EFFORT_LEVELS],
-					description: `Reasoning depth (low | medium | high | xhigh). Default "${DEFAULT_EFFORT}". Use 'xhigh' for explicit deep dives where you want maximum reasoning. Use 'medium' for quick sanity checks. Note: for non-OpenAI models routed via /v1/chat/completions (gemini-3.x), the upstream may silently ignore this knob.`
+					enum: [...p.allowedEfforts],
+					description: `Reasoning depth (${p.allowedEfforts.join(" | ")}). Default "${p.defaultEffort}". Higher tiers cost more wall-clock; lower tiers are quicker sanity checks. ` + (p.endpoint === "/v1/chat/completions" ? "Note: for gemini routed via /v1/chat/completions, the upstream may silently ignore this knob." : "")
 				}
 			}
 		}
 	}));
+	const nonPersonaEntries = NON_PERSONA_MCP_TOOLS.map((t) => ({
+		name: t.toolNameHttp,
+		description: t.description,
+		inputSchema: t.inputSchema
+	}));
+	return [...personaEntries, ...nonPersonaEntries];
 }
 function buildUserText(prompt, context) {
 	if (!context) return prompt;
@@ -3539,6 +3782,11 @@ function extractChatCompletionText(response) {
 	const c = choice.message?.content;
 	return typeof c === "string" ? c : "";
 }
+/**
+* Concatenate the text blocks of a parsed /v1/messages response body
+* (callPersona feeds it the JSON returned by createMessages).
+*
+* Walks `response.content` (treated as empty when null/undefined) and
+* joins, with no separator, the `text` of every block whose `type` is
+* "text" and whose `text` is a string. All other blocks are skipped.
+*
+* @param response - Parsed response object; only `content` is read.
+* @returns The joined text, or "" when no qualifying block exists.
+*/
+function extractMessagesText(response) {
+	const out = [];
+	for (const block of response.content ?? []) if (block.type === "text" && typeof block.text === "string") out.push(block.text);
+	return out.join("");
+}
 function toolError(message) {
 	return {
 		content: [{
@@ -3548,6 +3796,94 @@ function toolError(message) {
 		isError: true
 	};
 }
+/**
+* Empirical pre-flight cap table, used to convert
+* "would-bust-the-60s-MCP-ceiling" calls into fast actionable errors
+* instead of slot-leaking timeouts.
+*
+* Probed live against Copilot 2026-05-14:
+*   gpt-5.5 high on a ~600B prompt = 23.8s → ~76s on 8KB (rough linear)
+*   gpt-5.3-codex high on ~600B = 16.0s → ~64s on 12KB
+*   claude-opus-4-7 medium (thinking=3000) on a trivial prompt = 22.5s
+*     but model self-paces budget → ~50s+ on a real ~6KB review
+* The three entries below use those brief sizes (8 KB, 12 KB, 6 KB).
+*
+* Each entry is `{ toolName, effort, maxBriefBytes }`. predictedTooLong
+* returns `{tooLong: true, capBytes}` when the (persona, effort, briefBytes)
+* tuple is empirically predicted to bust the 60s ceiling, i.e. when an
+* entry matches the tool name and effort exactly and the brief is larger
+* than maxBriefBytes.
+*
+* NOTE(review): matching is exact on effort, so a tool called at any tier
+* not listed here (including tiers above the listed one, if the persona
+* allows them) is never capped — confirm this is intended.
+*
+* SCOPE: the cap is JSON-PATH ONLY. Callers (handleMcpPost) MUST gate
+* the call site by `!acceptsEventStream(...)`. The SSE path
+* (handleToolsCallSSE) keeps the connection open past the 60s ceiling
+* via heartbeats — size-based pre-flight rejection there would just
+* lock SSE clients out of their primary advantage. JSON-path clients
+* (raw curl with `Accept: application/json`, older MCP clients without
+* SSE awareness) DO still hit the underlying tools/call timer, so the
+* cap is the only way to surface a fast actionable error there
+* instead of a slot-leaking timeout.
+*
+* INVARIANT: pre-flight MUST fire BEFORE inFlightToolsCall++ — the
+* slot must not be acquired for a rejected pre-flight. handleMcpPost
+* runs the check before delegating to handleRpc → handleToolsCall (the
+* function that increments the counter). Documented in CLAUDE.md.
+*
+* gemini_critic has no cap (long-context model + Copilot may auto-pace).
+*/
+const PRE_FLIGHT_CAPS = [
+	{
+		toolName: "codex_critic",
+		effort: "high",
+		maxBriefBytes: 8 * 1024
+	},
+	{
+		toolName: "codex_reviewer",
+		effort: "high",
+		maxBriefBytes: 12 * 1024
+	},
+	{
+		toolName: "opus_critic",
+		effort: "medium",
+		maxBriefBytes: 6 * 1024
+	}
+];
+/**
+* Check a call against the PRE_FLIGHT_CAPS table.
+*
+* Scans the table for an entry whose `toolName` equals
+* `persona.toolNameHttp` and whose `effort` equals `effort` exactly, and
+* reports a hit only when `briefBytes` is strictly greater than that
+* entry's `maxBriefBytes`. A brief exactly at the cap passes.
+*
+* @param persona - Persona object; only `toolNameHttp` is read.
+* @param effort - Effort tier string compared against the table.
+* @param briefBytes - Size of the brief in bytes.
+* @returns `{ tooLong: true, capBytes }` on a hit, otherwise
+*   `{ tooLong: false }`.
+*/
+function predictedTooLong(persona, effort, briefBytes) {
+	for (const cap of PRE_FLIGHT_CAPS) if (cap.toolName === persona.toolNameHttp && cap.effort === effort && briefBytes > cap.maxBriefBytes) return {
+		tooLong: true,
+		capBytes: cap.maxBriefBytes
+	};
+	return { tooLong: false };
+}
+/**
+* JSON-path pre-flight predictedTooLong gate. Returns a JSON-RPC result
+* body wrapping a tool-error envelope when the call would bust the 60s
+* tools/call ceiling on the JSON path; returns undefined when the call
+* should proceed normally.
+*
+* Skips the check (returns undefined) for any shape problem so the
+* normal handleRpc → handleToolsCall path can return the canonical
+* JSON-RPC error code instead:
+*   - notification (no id) → handleRpc returns 202 + empty body
+*   - missing name          → RPC_INVALID_PARAMS (-32602)
+*   - unknown name          → RPC_METHOD_NOT_FOUND (-32601)
+*   - missing/empty prompt  → RPC_INVALID_PARAMS (-32602)
+*   - invalid effort string → RPC_INVALID_PARAMS (-32602)
+*   - effort not in persona.allowedEfforts → RPC_INVALID_PARAMS (-32602)
+* A name that is not an active persona (for example a non-persona tool)
+* also skips the check: caps exist only for personas.
+*
+* @param body - Parsed JSON-RPC tools/call request object.
+* @returns An rpcResult envelope carrying a toolError, or undefined when
+*   no cap applies.
+*/
+function jsonPathPreflightCap(body) {
+	if (body.id === void 0) return void 0;
+	const params = body.params ?? {};
+	const name$1 = typeof params.name === "string" ? params.name : "";
+	const args = params.arguments ?? {};
+	const prompt = typeof args.prompt === "string" ? args.prompt : "";
+	const context = typeof args.context === "string" ? args.context : void 0;
+	const rawEffort = args.effort;
+	if (!name$1 || !prompt) return void 0;
+	const persona = activePersonas().find((p) => p.toolNameHttp === name$1);
+	if (!persona) return void 0;
+	if (rawEffort !== void 0 && !isEffort(rawEffort)) return void 0;
+	const effortMaybe = rawEffort;
+	// Bare `return` here is equivalent to the `return void 0` used elsewhere.
+	if (effortMaybe !== void 0 && !persona.allowedEfforts.includes(effortMaybe)) return;
+	const effort = effortMaybe ?? persona.defaultEffort;
+	// Measure the UTF-8 size of the text built from prompt + context.
+	const briefBytes = Buffer.byteLength(buildUserText(prompt, context), "utf8");
+	const verdict = predictedTooLong(persona, effort, briefBytes);
+	if (!verdict.tooLong) return void 0;
+	return rpcResult(body.id, toolError(`pre-flight rejected: ${persona.toolNameHttp} at effort=${effort} on a ${briefBytes}-byte brief is empirically predicted to exceed the JSON tools/call timeout (cap=${verdict.capBytes} bytes for this tier). Either drop to a lower effort tier, split the brief into 2-4 parallel sub-calls per the decomposition guidance, or send Accept: text/event-stream to use the SSE path which bypasses this cap.`));
+}
 async function callPersona(persona, prompt, context, effort, signal) {
 	const resolvedModel = resolveModel(persona.model);
 	const userText = buildUserText(prompt, context);
@@ -3571,6 +3907,25 @@ async function callPersona(persona, prompt, context, effort, signal) {
 			text: text$1
 		}] };
 	}
+	if (persona.endpoint === "/v1/messages") {
+		const maxTokens = effort === "low" ? 4096 : effort === "medium" ? 8192 : effort === "high" ? 16384 : 32768;
+		const text$1 = extractMessagesText(await (await createMessages(JSON.stringify({
+			model: resolvedModel,
+			max_tokens: maxTokens,
+			system: persona.baseInstructions,
+			thinking: { type: "adaptive" },
+			output_config: { effort },
+			messages: [{
+				role: "user",
+				content: userText
+			}]
+		}), void 0, signal)).json());
+		if (!text$1) return toolError(`persona ${persona.agentName}: empty assistant output`);
+		return { content: [{
+			type: "text",
+			text: text$1
+		}] };
+	}
 	const text = extractChatCompletionText(await createChatCompletions({
 		model: resolvedModel,
 		messages: [{
@@ -3604,17 +3959,23 @@ async function handleToolsCall(body) {
 	const params = body.params ?? {};
 	const name$1 = typeof params.name === "string" ? params.name : "";
 	const args = params.arguments ?? {};
-	const prompt = typeof args.prompt === "string" ? args.prompt : "";
-	const context = typeof args.context === "string" ? args.context : void 0;
-	let effort = DEFAULT_EFFORT;
-	if (args.effort !== void 0) {
-		if (!isEffort(args.effort)) return rpcError(body.id, RPC_INVALID_PARAMS, `tools/call: arguments.effort must be one of ${EFFORT_LEVELS.join("|")}; got ${JSON.stringify(args.effort)}`);
-		effort = args.effort;
-	}
 	if (!name$1) return rpcError(body.id, RPC_INVALID_PARAMS, "tools/call missing name");
 	const persona = activePersonas().find((p) => p.toolNameHttp === name$1);
-	if (!persona) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
-	if (!prompt) return rpcError(body.id, RPC_INVALID_PARAMS, `tools/call: arguments.prompt is required`);
+	const nonPersonaTool = persona ? void 0 : NON_PERSONA_MCP_TOOLS.find((t) => t.toolNameHttp === name$1);
+	if (!persona && !nonPersonaTool) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
+	let personaPrompt;
+	let personaContext;
+	let personaEffort;
+	if (persona) {
+		if (args.effort !== void 0 && !isEffort(args.effort)) return rpcError(body.id, RPC_INVALID_PARAMS, `tools/call: arguments.effort must be one of ${EFFORT_LEVELS.join("|")}; got ${JSON.stringify(args.effort)}`);
+		const requestedEffort = args.effort;
+		const prompt = typeof args.prompt === "string" ? args.prompt : "";
+		if (!prompt) return rpcError(body.id, RPC_INVALID_PARAMS, `tools/call: arguments.prompt is required`);
+		personaPrompt = prompt;
+		personaContext = typeof args.context === "string" ? args.context : void 0;
+		if (requestedEffort !== void 0 && !persona.allowedEfforts.includes(requestedEffort)) return rpcError(body.id, RPC_INVALID_PARAMS, `tools/call: persona "${persona.toolNameHttp}" does not accept effort="${requestedEffort}". Allowed: ${persona.allowedEfforts.join("|")}.`);
+		personaEffort = requestedEffort ?? persona.defaultEffort;
+	}
 	if (inFlightToolsCall >= MAX_INFLIGHT_TOOLS_CALL) return rpcResult(body.id, {
 		content: [{
 			type: "text",
@@ -3630,11 +3991,13 @@ async function handleToolsCall(body) {
 		aborter = new AbortController();
 		inflightAborts.set(abortKey, aborter);
 	}
+	const telemetryName = persona ? persona.agentName : nonPersonaTool.toolNameHttp;
+	const telemetryModel = persona ? persona.model : "(non-persona)";
 	try {
-		const result = await callPersona(persona, prompt, context, effort, aborter?.signal);
+		const result = persona ? await callPersona(persona, personaPrompt, personaContext, personaEffort, aborter?.signal) : await nonPersonaTool.handler(args, aborter?.signal);
 		logTelemetry({
-			name: persona.agentName,
-			model: persona.model,
+			name: telemetryName,
+			model: telemetryModel,
 			durationMs: Date.now() - startedAt,
 			result: result.isError ? "isError" : "ok"
 		});
@@ -3642,8 +4005,8 @@ async function handleToolsCall(body) {
 	} catch (err) {
 		const message = err instanceof Error ? err.message : String(err);
 		logTelemetry({
-			name: persona.agentName,
-			model: persona.model,
+			name: telemetryName,
+			model: telemetryModel,
 			durationMs: Date.now() - startedAt,
 			result: "exception",
 			errorMessage: message
@@ -3651,7 +4014,7 @@ async function handleToolsCall(body) {
 		return rpcResult(body.id, {
 			content: [{
 				type: "text",
-				text: `persona ${persona.agentName} failed: ${message}`
+				text: persona ? `persona ${persona.agentName} failed: ${message}` : `tool ${nonPersonaTool.toolNameHttp} failed: ${message}`
 			}],
 			isError: true
 		});
@@ -3814,6 +4177,11 @@ async function handleMcpPost(c) {
 		consola.debug("/mcp parse error:", err);
 		return c.json(rpcError(null, RPC_PARSE_ERROR, "request body is not valid JSON"), 200);
 	}
+	if (typeof body === "object" && body !== null && !Array.isArray(body) && body.method === "tools/call" && acceptsEventStream(c.req.header("accept"))) return handleToolsCallSSE(body);
+	if (typeof body === "object" && body !== null && !Array.isArray(body) && body.method === "tools/call") {
+		const preflight = jsonPathPreflightCap(body);
+		if (preflight) return c.json(preflight, 200);
+	}
 	try {
 		const { status, body: respBody } = await handleRpc(c, body);
 		if (respBody === null) return c.body(null, status);
@@ -3824,6 +4192,111 @@ async function handleMcpPost(c) {
 		return c.json(rpcError(echoId, RPC_INTERNAL_ERROR, err instanceof Error ? err.message : String(err)), 200);
 	}
 }
+/**
+* Accept-header parsing for MCP Streamable HTTP. Per MCP 2025-06-18
+* spec, clients send `Accept: application/json, text/event-stream` to
+* indicate they can consume either response shape. Server picks; for
+* tools/call we pick SSE because Claude Code's per-tool-call timer
+* (~60s on v2.1.113+) does not fire on streamed responses.
+*
+* Lenient parse: lowercase the header, split on commas, keep only the
+* part before the first `;` of each entry (dropping q-values, charset
+* and other params), trim, then look for an exact "text/event-stream"
+* entry. Wildcard media ranges are not treated as a match.
+*
+* NOTE(review): because params are discarded, an entry such as
+* `text/event-stream;q=0` still counts as accepting SSE — confirm that
+* is acceptable.
+*
+* @param accept - Raw Accept header value, possibly undefined.
+* @returns true when the SSE media type is listed; false on undefined
+*   / empty / strict-JSON-only Accept.
+*/
+function acceptsEventStream(accept) {
+	if (!accept) return false;
+	return accept.toLowerCase().split(",").map((t) => t.split(";")[0].trim()).includes("text/event-stream");
+}
+/** Delay between heartbeat frames on the SSE path, in milliseconds (5 s). */
+const SSE_HEARTBEAT_INTERVAL_MS = 5e3;
+/**
+* SSE-streamed response for a single tools/call. Delegates the actual
+* upstream call to `handleToolsCall`, so the validation, AbortController
+* registration, telemetry, and inFlight slot accounting done there apply
+* here as well; wraps the awaited result in an SSE envelope with
+* periodic heartbeats while the call is in flight.
+*
+* The JSON-path predictedTooLong pre-flight cap is NOT applied on this
+* path: handleMcpPost dispatches SSE-capable tools/call requests here
+* before it reaches jsonPathPreflightCap.
+*
+* SSE event format (per MCP Streamable HTTP):
+*   event: message
+*   data: <json-rpc-2.0 message>\n\n
+*
+* - Heartbeats are JSON-RPC `notifications/progress` notifications with
+*   the request id as `progressToken` (null when the request has no id)
+*   and a constant `progress: 0`. One is sent immediately, then one
+*   every SSE_HEARTBEAT_INTERVAL_MS.
+*   NOTE(review): the MCP progress spec ties `progressToken` to the token
+*   the client supplied in `_meta.progressToken` and expects `progress`
+*   to increase between notifications — confirm clients tolerate this
+*   heartbeat shape.
+* - The final message is the JSON-RPC response envelope returned by
+*   handleToolsCall — same structure as the JSON-path response. If that
+*   promise rejects, an RPC_INTERNAL_ERROR envelope is sent instead.
+* - On consumer cancel (ReadableStream.cancel), the AbortController
+*   registered in inflightAborts under the request id is signalled, if
+*   there is one. The heartbeat interval is cleared in the `finally`
+*   block once the call settles; until then any enqueue that throws
+*   flips `closed`, so later writes become no-ops.
+*
+* Per CLAUDE.md "Stream lifecycle" / "The smoking gun" rules: every
+* controller.enqueue/close is wrapped in a try/catch that swallows the
+* "Invalid state: Controller is already closed" race, logging it at
+* debug level only.
+*
+* @param body - Parsed JSON-RPC tools/call request object.
+* @returns A 200 Response whose body is the text/event-stream stream.
+*/
+async function handleToolsCallSSE(body) {
+	const encoder = new TextEncoder();
+	// Started before the stream is constructed, so the call is already
+	// running by the time start() awaits it.
+	const callPromise = handleToolsCall(body);
+	const stream = new ReadableStream({
+		async start(controller) {
+			let closed = false;
+			const safeEnqueue = (chunk) => {
+				if (closed) return;
+				try {
+					controller.enqueue(chunk);
+				} catch (err) {
+					consola.debug("/mcp SSE enqueue after close (expected race):", err);
+					closed = true;
+				}
+			};
+			const safeClose = () => {
+				if (closed) return;
+				closed = true;
+				try {
+					controller.close();
+				} catch (err) {
+					consola.debug("/mcp SSE close after close:", err);
+				}
+			};
+			const sseFrame = (rpcMessage) => encoder.encode(`event: message\ndata: ${JSON.stringify(rpcMessage)}\n\n`);
+			const heartbeatFrame = () => sseFrame({
+				jsonrpc: "2.0",
+				method: "notifications/progress",
+				params: {
+					progressToken: body.id ?? null,
+					progress: 0,
+					message: "in flight"
+				}
+			});
+			safeEnqueue(heartbeatFrame());
+			const heartbeatHandle = setInterval(() => safeEnqueue(heartbeatFrame()), SSE_HEARTBEAT_INTERVAL_MS);
+			try {
+				safeEnqueue(sseFrame(await callPromise));
+			} catch (err) {
+				consola.error("/mcp SSE upstream error:", err);
+				safeEnqueue(sseFrame(rpcError(body.id ?? null, RPC_INTERNAL_ERROR, err instanceof Error ? err.message : String(err))));
+			} finally {
+				clearInterval(heartbeatHandle);
+				safeClose();
+			}
+		},
+		cancel() {
+			// Requests without an id (undefined or null) have no lookup key,
+			// so there is nothing to abort from here.
+			const abortKey = body.id !== void 0 && body.id !== null ? body.id : void 0;
+			if (abortKey !== void 0) {
+				const aborter = inflightAborts.get(abortKey);
+				if (aborter) aborter.abort(/* @__PURE__ */ new Error("client disconnected SSE stream"));
+			}
+		}
+	});
+	return new Response(stream, {
+		status: 200,
+		headers: {
+			"Content-Type": "text/event-stream",
+			"Cache-Control": "no-cache, no-transform",
+			"Connection": "keep-alive",
+			"X-Accel-Buffering": "no"
+		}
+	});
+}
 function handleMcpDelete(c) {
 	const auth$1 = checkAuth(c);
 	if (!auth$1.ok) return c.json(rpcError(null, RPC_INVALID_REQUEST, auth$1.reason), auth$1.status);
@@ -3848,108 +4321,6 @@ mcpRoutes.delete("/", (c) => {
 	}
 });
-//#endregion
-//#region src/services/copilot/create-messages.ts
-/**
-* Build headers that match what VS Code Copilot Chat sends to the Copilot API.
-*
-* copilotHeaders() provides: Authorization, content-type, copilot-integration-id,
-* editor-version, editor-plugin-version, user-agent, openai-intent,
-* x-github-api-version, x-request-id, x-vscode-user-agent-library-version.
-*
-* We add the remaining headers VS Code sends for /v1/messages:
-* - X-Initiator (VS Code sets dynamically; "agent" is safe for CLI use)
-* - anthropic-version (VS Code's Anthropic SDK sends this)
-* - X-Interaction-Id (VS Code sends a session-scoped UUID)
-*
-* We intentionally omit copilot-vision-request — VS Code only sends it when
-* images are present, and the native /v1/messages endpoint handles vision
-* without requiring the header.
-*
-* extraHeaders allows callers to forward client-supplied beta headers
-* (anthropic-beta) so Copilot enables extended features.
-*/
-function buildHeaders(extraHeaders) {
-	return {
-		...copilotHeaders(state),
-		accept: "application/json",
-		"openai-intent": "messages-proxy",
-		"x-interaction-type": "conversation-agent",
-		"X-Initiator": "agent",
-		"anthropic-version": "2023-06-01",
-		"X-Interaction-Id": randomUUID(),
-		...extraHeaders
-	};
-}
-/**
-* Forward an Anthropic Messages API request to Copilot's native /v1/messages endpoint.
-* Returns the raw Response so callers can handle streaming vs non-streaming.
-*/
-async function createMessages(body, extraHeaders) {
-	if (!state.copilotToken) throw new Error("Copilot token not found");
-	const url = `${copilotBaseUrl(state)}/v1/messages?beta=true`;
-	consola.debug(`Forwarding to ${url}`);
-	const doFetch = () => {
-		const fetchInit = {
-			method: "POST",
-			headers: buildHeaders(extraHeaders),
-			body
-		};
-		if (UPSTREAM_FETCH_TIMEOUT_MS > 0) fetchInit.signal = AbortSignal.timeout(UPSTREAM_FETCH_TIMEOUT_MS);
-		return fetch(url, fetchInit);
-	};
-	const response = await tryRefreshAndRetry(doFetch, "/v1/messages");
-	if (!response.ok) {
-		let errorBody = "";
-		try {
-			errorBody = await response.text();
-		} catch {
-			errorBody = "(could not read error body)";
-		}
-		consola.error(`Copilot /v1/messages error: ${response.status} ${errorBody}`);
-		throw new HTTPError("Copilot messages request failed", new Response(errorBody, {
-			status: response.status,
-			statusText: response.statusText,
-			headers: response.headers
-		}));
-	}
-	return response;
-}
-/**
-* Forward an Anthropic count_tokens request to Copilot's native endpoint.
-* Returns the raw Response.
-*/
-async function countTokens(body, extraHeaders) {
-	if (!state.copilotToken) throw new Error("Copilot token not found");
-	const url = `${copilotBaseUrl(state)}/v1/messages/count_tokens?beta=true`;
-	consola.debug(`Forwarding to ${url}`);
-	const doFetch = () => {
-		const fetchInit = {
-			method: "POST",
-			headers: buildHeaders(extraHeaders),
-			body
-		};
-		if (UPSTREAM_FETCH_TIMEOUT_MS > 0) fetchInit.signal = AbortSignal.timeout(UPSTREAM_FETCH_TIMEOUT_MS);
-		return fetch(url, fetchInit);
-	};
-	const response = await tryRefreshAndRetry(doFetch, "/v1/messages/count_tokens");
-	if (!response.ok) {
-		let errorBody = "";
-		try {
-			errorBody = await response.text();
-		} catch {
-			errorBody = "(could not read error body)";
-		}
-		consola.error(`Copilot count_tokens error: ${response.status} ${errorBody}`);
-		throw new HTTPError("Copilot count_tokens request failed", new Response(errorBody, {
-			status: response.status,
-			statusText: response.statusText,
-			headers: response.headers
-		}));
-	}
-	return response;
-}
 //#endregion
 //#region src/services/advisor/advisor.ts
 const ENCODER$1 = new TextEncoder();
@@ -5003,7 +5374,7 @@ async function handleCompletion(c) {
 			type: "error",
 			error: {
 				type: "invalid_request_error",
-				message: "Inline `mcp_servers` body field is not supported by github-router (Copilot returns 400 'Extra inputs are not permitted'; the proxy would need a multi-turn tool-loop translation that has unresolved design holes — see Phase G in the plan). Configure your remote MCP servers as local stdio entries in `~/.claude/mcp.json` instead — Claude Code will spawn them locally and the proxy passes their tool calls through transparently. (https://docs.claude.com/en/docs/claude-code/mcp)"
+				message: "Inline `mcp_servers` body field is not supported by github-router. Configure remote MCP servers as local stdio entries in `~/.claude/mcp.json` instead — Claude Code will spawn them locally and the proxy passes their tool calls through transparently. (https://docs.claude.com/en/docs/claude-code/mcp)"
 			}
 		}, 400);
 	} catch {}
@@ -5971,6 +6342,7 @@ function getClaudeCodeEnvVars(serverUrl, model) {
 		ANTHROPIC_BASE_URL: serverUrl,
 		CLAUDE_CONFIG_DIR: PATHS.CLAUDE_CONFIG_DIR,
 		MCP_TIMEOUT: "600000",
+		MCP_TOOL_TIMEOUT: "600000",
 		DISABLE_NON_ESSENTIAL_MODEL_CALLS: "1",
 		CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: "1",
 		DISABLE_TELEMETRY: "1"