github-router 0.3.40 → 0.3.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/main.js CHANGED
@@ -5851,6 +5851,221 @@ function acquireInFlightSlot() {
5851
5851
  };
5852
5852
  }
5853
5853
 
5854
+ //#endregion
5855
+ //#region src/lib/tokenizer.ts
5856
// Lazy loaders for the supported gpt-tokenizer encodings. Each entry returns the
// dynamic `import()` promise for that encoding's module, so a module is only
// loaded when its loader is actually invoked.
+ const ENCODING_MAP = {
5857
+ o200k_base: () => import("gpt-tokenizer/encoding/o200k_base"),
5858
+ cl100k_base: () => import("gpt-tokenizer/encoding/cl100k_base"),
5859
+ p50k_base: () => import("gpt-tokenizer/encoding/p50k_base"),
5860
+ p50k_edit: () => import("gpt-tokenizer/encoding/p50k_edit"),
5861
+ r50k_base: () => import("gpt-tokenizer/encoding/r50k_base")
5862
+ };
5863
// Loaded encoding modules keyed by the requested encoding name; written and read
// by getEncodeChatFunction.
+ const encodingCache = /* @__PURE__ */ new Map();
5864
/**
 * Count the tokens contributed by a message's `tool_calls`.
 *
 * Each call costs `constants.funcInit` plus the encoded length of its JSON
 * serialization; `constants.funcEnd` is added once after the loop (also for
 * an empty array, which keeps the existing result for `[]`).
 *
 * @param toolCalls - Array of tool-call objects. A missing or non-array value
 *   (for example `tool_calls: null`) counts as 0 tokens instead of throwing
 *   from the `for...of` loop.
 * @param encoder - Object exposing `encode(text)` that returns a token array.
 * @param constants - Overhead constants; reads `funcInit` and `funcEnd`.
 * @returns Total token count for the tool calls.
 */
const calculateToolCallsTokens = (toolCalls, encoder, constants) => {
	// Callers forward whatever sits under `tool_calls`; tolerate null/undefined.
	if (!Array.isArray(toolCalls)) return 0;
	let tokens = 0;
	for (const toolCall of toolCalls) {
		tokens += constants.funcInit;
		tokens += encoder.encode(JSON.stringify(toolCall)).length;
	}
	tokens += constants.funcEnd;
	return tokens;
};
5876
/**
 * Count the tokens of an array-form message `content`.
 *
 * - `image_url` parts cost a flat 85 tokens plus the encoded length of the
 *   URL string when one is present.
 * - Any other part contributes the encoded length of its truthy `text`.
 * - `null`/non-object parts and image parts without a string URL no longer
 *   throw; they contribute nothing beyond the flat image cost.
 *
 * @param contentParts - Iterable of content-part objects.
 * @param encoder - Object exposing `encode(text)` that returns a token array.
 * @returns Total token count for all parts.
 */
const calculateContentPartsTokens = (contentParts, encoder) => {
	let tokens = 0;
	for (const part of contentParts) {
		// Malformed entries (null, primitives) carry no countable content.
		if (!part || typeof part !== "object") continue;
		if (part.type === "image_url") {
			tokens += 85;
			const url = part.image_url?.url;
			if (typeof url === "string") tokens += encoder.encode(url).length;
		} else if (part.text) tokens += encoder.encode(part.text).length;
	}
	return tokens;
};
5885
/**
 * Count the tokens of one chat message.
 *
 * Starts from a fixed 3-token per-message overhead, then walks every own
 * field of the message:
 * - string values add their encoded length;
 * - a `name` field adds 1 extra token;
 * - `tool_calls` is delegated to `calculateToolCallsTokens`;
 * - array-form `content` is delegated to `calculateContentPartsTokens`.
 *
 * @param message - Chat message object.
 * @param encoder - Object exposing `encode(text)` that returns a token array.
 * @param constants - Overhead constants forwarded to the tool-call counter.
 * @returns Token count for the message.
 */
const calculateMessageTokens = (message, encoder, constants) => {
	const perMessageOverhead = 3;
	const perNameOverhead = 1;
	return Object.entries(message).reduce((runningTotal, [field, fieldValue]) => {
		let subtotal = runningTotal;
		if (typeof fieldValue === "string") subtotal += encoder.encode(fieldValue).length;
		if (field === "name") subtotal += perNameOverhead;
		if (field === "tool_calls") subtotal += calculateToolCallsTokens(fieldValue, encoder, constants);
		if (field === "content" && Array.isArray(fieldValue)) subtotal += calculateContentPartsTokens(fieldValue, encoder);
		return subtotal;
	}, perMessageOverhead);
};
5900
/**
 * Sum the token counts of a list of messages.
 *
 * An empty list costs 0. Otherwise the result is the sum of
 * `calculateMessageTokens` over every message plus a fixed 3-token trailer.
 *
 * @param messages - Array of chat messages.
 * @param encoder - Object exposing `encode(text)` that returns a token array.
 * @param constants - Overhead constants forwarded to the per-message counter.
 * @returns Total token count.
 */
const calculateTokens = (messages, encoder, constants) => {
	if (messages.length === 0) return 0;
	const trailerTokens = 3;
	const messageTokens = messages.reduce(
		(sum, message) => sum + calculateMessageTokens(message, encoder, constants),
		0
	);
	return messageTokens + trailerTokens;
};
5910
/**
 * Load (and cache) the encoder module for an encoding name.
 *
 * Encodings that are not own keys of `ENCODING_MAP` fall back to
 * `o200k_base`; the loaded module is cached under the *requested* name, so a
 * repeated unknown name does not trigger another load.
 *
 * The membership test uses an own-property check rather than `in`: `in` also
 * matches inherited keys such as "constructor" or "toString", which would
 * invoke an `Object.prototype` member and cache its result as if it were an
 * encoder.
 *
 * @param encoding - Encoding name, e.g. "o200k_base".
 * @returns Promise of the encoder module.
 */
const getEncodeChatFunction = async (encoding) => {
	const cachedModule = encodingCache.get(encoding);
	if (cachedModule) return cachedModule;
	const isSupported = Object.prototype.hasOwnProperty.call(ENCODING_MAP, encoding);
	const loadModule = isSupported ? ENCODING_MAP[encoding] : ENCODING_MAP.o200k_base;
	const encodingModule = await loadModule();
	encodingCache.set(encoding, encodingModule);
	return encodingModule;
};
5928
/**
 * Read the tokenizer name a model declares.
 *
 * @param model - Model catalog entry; `capabilities` may be absent.
 * @returns `model.capabilities.tokenizer` when it is truthy, otherwise
 *   "o200k_base".
 */
const getTokenizerFromModel = (model) => {
	const declaredTokenizer = model.capabilities?.tokenizer;
	if (declaredTokenizer) return declaredTokenizer;
	return "o200k_base";
};
5934
/**
 * Public entry point for obtaining an encoder by encoding name.
 *
 * Thin wrapper over `getEncodeChatFunction`, so caching and the fallback for
 * unknown names are whatever that function provides. Lets budgeting code
 * count raw-text tokens without building a chat payload.
 *
 * @param encoding - Encoding name; defaults to "o200k_base".
 * @returns Promise of the encoder module.
 */
const loadEncoder = async (encoding = "o200k_base") => {
	const encoderModule = await getEncodeChatFunction(encoding);
	return encoderModule;
};
5940
/**
 * Token count of a raw text string under the given encoding.
 *
 * The count comes from actually encoding the text with the encoder module
 * (not from a chars-per-token estimate). Falsy text (empty string, null,
 * undefined) short-circuits to 0 without loading an encoder.
 *
 * @param text - Text to measure.
 * @param encoding - Encoding name; defaults to "o200k_base".
 * @returns Promise of the number of tokens.
 */
const getTextTokenCount = async (text, encoding = "o200k_base") => {
	if (!text) return 0;
	const encoder = await getEncodeChatFunction(encoding);
	const tokenIds = encoder.encode(text);
	return tokenIds.length;
};
5953
/**
 * Per-model overhead constants used by the tool/function token estimator.
 *
 * Only `funcInit` varies: 10 for the ids "gpt-3.5-turbo" and "gpt-4", 7 for
 * every other model. All remaining constants are shared.
 *
 * @param model - Model catalog entry; only `id` is read.
 * @returns Fresh object with `funcInit`, `propInit`, `propKey`, `enumInit`,
 *   `enumItem` and `funcEnd`.
 */
const getModelConstants = (model) => {
	const usesLegacyFuncInit = ["gpt-3.5-turbo", "gpt-4"].includes(model.id);
	return {
		funcInit: usesLegacyFuncInit ? 10 : 7,
		propInit: 3,
		propKey: 3,
		enumInit: -3,
		enumItem: 3,
		funcEnd: 12
	};
};
5973
/**
 * Estimate the tokens of a single function-parameter schema entry.
 *
 * Cost model:
 * - `constants.propKey` for the parameter itself (the only cost when `prop`
 *   is not an object);
 * - for an `enum` array: `constants.enumInit` once, plus `constants.enumItem`
 *   and the encoded length of each item;
 * - the encoded length of `name:type:description` (type defaults to "string",
 *   one trailing "." is stripped from the description);
 * - every other own key as the encoded length of `key:value`, with non-string
 *   values JSON-serialized.
 *
 * A `description` that is not a string (for example a number) is treated as
 * absent; previously a truthy non-string description threw on `.endsWith`.
 *
 * @param key - Parameter name.
 * @param prop - Parameter schema (may be any JSON value).
 * @param context - `{ encoder, constants }`.
 * @returns Estimated token count for the parameter.
 */
const calculateParameterTokens = (key, prop, context) => {
	const { encoder, constants } = context;
	let tokens = constants.propKey;
	if (typeof prop !== "object" || prop === null) return tokens;
	const paramType = prop.type || "string";
	let paramDesc = typeof prop.description === "string" ? prop.description : "";
	if (Array.isArray(prop.enum)) {
		tokens += constants.enumInit;
		for (const item of prop.enum) {
			tokens += constants.enumItem;
			tokens += encoder.encode(String(item)).length;
		}
	}
	if (paramDesc.endsWith(".")) paramDesc = paramDesc.slice(0, -1);
	tokens += encoder.encode(`${key}:${paramType}:${paramDesc}`).length;
	// These keys are already accounted for by the header line / enum loop.
	const excludedKeys = new Set([
		"type",
		"description",
		"enum"
	]);
	for (const [propertyName, propertyValue] of Object.entries(prop)) {
		if (excludedKeys.has(propertyName)) continue;
		const propertyText = typeof propertyValue === "string" ? propertyValue : JSON.stringify(propertyValue);
		tokens += encoder.encode(`${propertyName}:${propertyText}`).length;
	}
	return tokens;
};
6006
/**
 * Estimate the tokens of a function's `parameters` schema.
 *
 * - A falsy or non-object `parameters` costs 0.
 * - A non-empty `properties` map costs `constants.propInit` plus
 *   `calculateParameterTokens` for each property; an empty map costs nothing.
 * - Every other top-level key costs the encoded length of `key:value`, with
 *   non-string values JSON-serialized.
 *
 * @param parameters - JSON-schema-like parameters object.
 * @param encoder - Object exposing `encode(text)` that returns a token array.
 * @param constants - Overhead constants.
 * @returns Estimated token count.
 */
const calculateParametersTokens = (parameters, encoder, constants) => {
	if (!parameters || typeof parameters !== "object") return 0;
	let total = 0;
	for (const [field, fieldValue] of Object.entries(parameters)) {
		if (field !== "properties") {
			const rendered = typeof fieldValue === "string" ? fieldValue : JSON.stringify(fieldValue);
			total += encoder.encode(`${field}:${rendered}`).length;
			continue;
		}
		const propertyNames = Object.keys(fieldValue);
		if (propertyNames.length === 0) continue;
		total += constants.propInit;
		for (const propertyName of propertyNames) {
			total += calculateParameterTokens(propertyName, fieldValue[propertyName], {
				encoder,
				constants
			});
		}
	}
	return total;
};
6028
/**
 * Estimate the tokens of one tool definition.
 *
 * Cost is `constants.funcInit`, plus the encoded length of
 * `name:description` (one trailing "." stripped from the description), plus
 * `calculateParametersTokens` when `function.parameters` is a non-null object.
 *
 * @param tool - Tool definition; reads `tool.function`.
 * @param encoder - Object exposing `encode(text)` that returns a token array.
 * @param constants - Overhead constants.
 * @returns Estimated token count for the tool.
 */
const calculateToolTokens = (tool, encoder, constants) => {
	const definition = tool.function;
	const rawDescription = definition.description || "";
	const description = rawDescription.endsWith(".") ? rawDescription.slice(0, -1) : rawDescription;
	const header = definition.name + ":" + description;
	let total = constants.funcInit + encoder.encode(header).length;
	const schema = definition.parameters;
	if (schema !== null && typeof schema === "object") total += calculateParametersTokens(schema, encoder, constants);
	return total;
};
6042
/**
 * Estimate the tokens of a tool list: the sum of `calculateToolTokens` over
 * every tool plus a single `constants.funcEnd`.
 *
 * @param tools - Array of tool definitions.
 * @param encoder - Object exposing `encode(text)` that returns a token array.
 * @param constants - Overhead constants.
 * @returns Estimated token count for the whole list.
 */
const numTokensForTools = (tools, encoder, constants) => {
	const perToolTotal = tools.reduce(
		(sum, tool) => sum + calculateToolTokens(tool, encoder, constants),
		0
	);
	return perToolTotal + constants.funcEnd;
};
6051
/**
 * Estimate input and output token counts for a chat payload.
 *
 * Messages with role "assistant" are counted as output; every other message
 * is counted as input. When the payload carries a non-empty `tools` array,
 * its estimate is added to the input side. The encoder is chosen from the
 * model's declared tokenizer.
 *
 * @param payload - Chat payload; reads `messages` and optional `tools`.
 * @param model - Model catalog entry.
 * @returns Promise of `{ input, output }` token counts.
 */
const getTokenCount = async (payload, model) => {
	const encoder = await getEncodeChatFunction(getTokenizerFromModel(model));
	const promptSide = [];
	const completionSide = [];
	for (const msg of payload.messages) {
		if (msg.role === "assistant") completionSide.push(msg);
		else promptSide.push(msg);
	}
	const constants = getModelConstants(model);
	let input = calculateTokens(promptSide, encoder, constants);
	const hasTools = payload.tools && payload.tools.length > 0;
	if (hasTools) input += numTokensForTools(payload.tools, encoder, constants);
	const output = calculateTokens(completionSide, encoder, constants);
	return {
		input,
		output
	};
};
6068
+
5854
6069
  //#endregion
5855
6070
  //#region src/services/copilot/create-messages.ts
5856
6071
  /**
@@ -6220,8 +6435,24 @@ function browserToolsEnabled() {
6220
6435
  if (!(state.browseEnabled || process.env.GH_ROUTER_ENABLE_BROWSE === "1")) return false;
6221
6436
  return hasSupportedBrowserInstalled();
6222
6437
  }
6438
/**
 * Model selection for the `opus_critic` persona.
 *
 * `OPUS_1M_RE` recognizes ids of the 1M-context Opus 4.7 variant (e.g.
 * `claude-opus-4.7-1m-internal`; per the catalog notes an enterprise-only
 * slug with `max_prompt_tokens` around 936K). `resolveOpusCriticModel`
 * returns the id of the first cached catalog entry matching that pattern so
 * the persona can take large artifacts in one call, and falls back to
 * `claude-opus-4-7` when the catalog is missing or has no such entry.
 */
const OPUS_1M_RE = /opus-4\.7.*1m/i;
function resolveOpusCriticModel() {
	const bigWindowEntry = state.models?.data?.find((entry) => OPUS_1M_RE.test(entry.id));
	if (bigWindowEntry) return bigWindowEntry.id;
	return "claude-opus-4-7";
}
6223
6451
  function activePersonas() {
6224
- return PERSONAS_READ.filter((p) => !p.requiresGeminiCatalog || geminiAvailable());
6452
+ return PERSONAS_READ.filter((p) => !p.requiresGeminiCatalog || geminiAvailable()).map((p) => p.toolNameHttp === "opus_critic" ? {
6453
+ ...p,
6454
+ model: resolveOpusCriticModel()
6455
+ } : p);
6225
6456
  }
6226
6457
  function toolEntries() {
6227
6458
  const personaEntries = activePersonas().map((p) => ({
@@ -6355,6 +6586,46 @@ function predictedTooLong(persona, effort, briefBytes) {
6355
6586
  return { tooLong: false };
6356
6587
  }
6357
6588
  /**
6589
+ * Tokens reserved below a peer model's `max_prompt_tokens` for the
6590
+ * per-call message framing (role wrappers, output_config, etc.) and any
6591
+ * discrepancy between our o200k count and Copilot's full-payload count.
6592
+ */
6593
+ const PEER_PROMPT_TOKEN_RESERVE = 2e3;
6594
+ /**
6595
+ * Prompt-window guard. Unlike `predictedTooLong` (a JSON-path *timeout*
6596
+ * predictor in bytes), this guards the *context window*: it counts the
6597
+ * EXACT o200k tokens of the text actually sent to the peer (system
6598
+ * instructions + prompt + context) and compares against the persona
6599
+ * model's live `max_prompt_tokens`. Applies on BOTH the SSE and JSON
6600
+ * paths (called from `handleToolsCall`, before slot acquisition) because
6601
+ * an over-window brief 400s `model_max_prompt_tokens_exceeded` upstream
6602
+ * regardless of transport — and on SSE there is no other size bound.
6603
+ *
6604
+ * Returns an actionable message when over budget (reject, don't
6605
+ * truncate — silently dropping lines from a review artifact is worse
6606
+ * than a clear error), or undefined when it fits or the limit is unknown.
6607
+ */
6608
+ async function predictedWindowOverflow(persona, prompt, context) {
6609
+ const id = resolveModel(persona.model);
6610
+ const entry = state.models?.data?.find((m) => m.id === id);
6611
+ if (!entry) return void 0;
6612
+ const maxPromptTokens = entry.capabilities?.limits?.max_prompt_tokens;
6613
+ if (typeof maxPromptTokens !== "number" || !Number.isFinite(maxPromptTokens) || maxPromptTokens <= 0) return;
6614
+ const budget = maxPromptTokens - PEER_PROMPT_TOKEN_RESERVE;
6615
+ const inputText = `${persona.baseInstructions}\n${buildUserText(prompt, context)}`;
6616
+ if (Buffer.byteLength(inputText, "utf8") <= budget) return void 0;
6617
+ let tokens;
6618
+ try {
6619
+ tokens = await getTextTokenCount(inputText, getTokenizerFromModel(entry));
6620
+ } catch (err) {
6621
+ consola.debug("[mcp] window-guard tokenization failed; allowing call:", err);
6622
+ return;
6623
+ }
6624
+ if (tokens <= budget) return void 0;
6625
+ const opusHint = OPUS_1M_RE.test(id) ? "" : " / `opus_critic` (Opus-4.7 1M ≈ 936K tokens, when the enterprise catalog carries it)";
6626
+ return `pre-flight rejected: this ${persona.toolNameHttp} brief is ≈${tokens} tokens, over the ${budget}-token budget for ${persona.model} (its ${maxPromptTokens}-token prompt window minus a ${PEER_PROMPT_TOKEN_RESERVE}-token framing reserve). Do NOT summarize or truncate the artifact to fit. Route the full artifact to a larger-window peer — \`codex_critic\` (gpt-5.5 ≈ 922K tokens)${opusHint} — or split it into focused sub-calls BY CONCERN and call them in parallel, then aggregate.`;
6627
+ }
6628
+ /**
6358
6629
  * JSON-path pre-flight predictedTooLong gate. Returns a JSON-RPC result
6359
6630
  * body wrapping a tool-error envelope when the call would bust the 60s
6360
6631
  * tools/call ceiling on the JSON path; returns undefined when the call
@@ -6516,6 +6787,10 @@ async function handleToolsCall(body) {
6516
6787
  if (requestedEffort !== void 0 && !persona.allowedEfforts.includes(requestedEffort)) return rpcError(body.id, RPC_INVALID_PARAMS, `tools/call: persona "${persona.toolNameHttp}" does not accept effort="${requestedEffort}". Allowed: ${persona.allowedEfforts.join("|")}.`);
6517
6788
  personaEffort = requestedEffort ?? persona.defaultEffort;
6518
6789
  }
6790
+ if (persona && personaPrompt !== void 0) {
6791
+ const overflow = await predictedWindowOverflow(persona, personaPrompt, personaContext);
6792
+ if (overflow) return rpcResult(body.id, toolError(overflow));
6793
+ }
6519
6794
  const release = acquireInFlightSlot();
6520
6795
  if (!release) return rpcResult(body.id, {
6521
6796
  content: [{
@@ -6755,10 +7030,13 @@ function acceptsEventStream(accept) {
6755
7030
  /**
6756
7031
  * SSE-streamed response for a single tools/call. Delegates the actual
6757
7032
  * upstream call to `handleToolsCall` (so the per-persona effort gate,
6758
- * predictedTooLong cap, AbortController registration, telemetry, and
6759
- * inFlight slot accounting all run identically); wraps the awaited
6760
- * result in an SSE envelope with periodic heartbeats while the upstream
6761
- * fetch is in flight.
7033
+ * the token-exact prompt-window guard, AbortController registration,
7034
+ * telemetry, and inFlight slot accounting all run identically); wraps
7035
+ * the awaited result in an SSE envelope with periodic heartbeats while
7036
+ * the upstream fetch is in flight. NOTE: the JSON-path `predictedTooLong`
7037
+ * byte cap is NOT applied here — it lives in `jsonPathPreflightCap`
7038
+ * (JSON path only); SSE bypasses it intentionally because heartbeats
7039
+ * keep the call alive past the ~60s tools/call ceiling it guards.
6762
7040
  *
6763
7041
  * SSE event format (per MCP Streamable HTTP):
6764
7042
  * event: message
@@ -7145,15 +7423,39 @@ function injectAdvisorTool(rawBody) {
7145
7423
  }];
7146
7424
  return JSON.stringify(parsed);
7147
7425
  }
7148
- /** Character budget for rendered conversation text passed to the
7149
- * advisor model. gpt-5.5 (default advisor) caps prompt input at
7150
- * 272,000 tokens. At a conservative ~3 chars/token (mixed prose +
7151
- * code + JSON), 720,000 chars renders to ≈240,000 tokens, leaving
7152
- * ~32,000 tokens of headroom for the system prompt and per-turn
7153
- * framing overhead. Without this cap, long Claude Code sessions
7154
- * produce 400 `model_max_prompt_tokens_exceeded` from /v1/responses
7155
- * and the advisor falls back silently. */
7426
+ /** Fallback CHARACTER budget for `renderConversationAsText` when called
7427
+ * without a token `measure` (unit-agnostic default = char length). Also
7428
+ * the conservative no-catalog floor: 720,000 chars ≈ 240,000 tokens at
7429
+ * ~3 chars/token, which fits even the smaller `/responses` models. The
7430
+ * live path measures EXACT o200k tokens (see `runAdvisor`) and budgets
7431
+ * against the model's real `max_prompt_tokens`, so this constant is only
7432
+ * a safety net, never the normal path. */
7156
7433
  const ADVISOR_MAX_CONVERSATION_CHARS = 72e4;
7434
+ /** Token budget used when the advisor model's `max_prompt_tokens` can't
7435
+ * be resolved from the live catalog. ≈ the 720K-char fallback in tokens. */
7436
+ const ADVISOR_FALLBACK_MAX_TOKENS = 24e4;
7437
+ /** Tokens reserved below the model's `max_prompt_tokens` for the advisor
7438
+ * system prompt + per-call framing + any encode/wire discrepancy between
7439
+ * our o200k count and Copilot's full-payload count. The transcript token
7440
+ * budget is `max_prompt_tokens - reserve`. Generous on purpose: a 400
7441
+ * `model_max_prompt_tokens_exceeded` degrades to a silent advisor
7442
+ * fallback, and the marginal window we give up is irrelevant next to
7443
+ * gpt-5.5's 922K. */
7444
+ const ADVISOR_PROMPT_TOKEN_RESERVE = 8e3;
7445
/**
 * Token budget for the rendered advisor transcript.
 *
 * Looks up the resolved advisor model in the cached catalog (`state.models`)
 * and returns its `max_prompt_tokens` minus `ADVISOR_PROMPT_TOKEN_RESERVE`,
 * never less than 1. Because the limit is read from the catalog, a model with
 * a smaller window automatically gets a smaller budget.
 *
 * @param advisorModel - Advisor model name as supplied by the caller.
 * @returns The token budget, or `ADVISOR_FALLBACK_MAX_TOKENS` when the catalog
 *   entry or a usable (finite, positive) limit is missing.
 */
function resolveAdvisorMaxTokens(advisorModel) {
	const resolvedId = resolveModel(advisorModel);
	const catalogEntry = state.models?.data?.find((candidate) => candidate.id === resolvedId);
	const windowTokens = catalogEntry?.capabilities?.limits?.max_prompt_tokens;
	const hasUsableLimit = typeof windowTokens === "number" && Number.isFinite(windowTokens) && windowTokens > 0;
	if (!hasUsableLimit) return ADVISOR_FALLBACK_MAX_TOKENS;
	return Math.max(1, windowTokens - ADVISOR_PROMPT_TOKEN_RESERVE);
}
7157
7459
  /**
7158
7460
  * Render an Anthropic-shape conversation (messages array with
7159
7461
  * role/content blocks) as a single human-readable text blob. Used
@@ -7163,14 +7465,20 @@ const ADVISOR_MAX_CONVERSATION_CHARS = 72e4;
7163
7465
  * just needs to READ the conversation, not produce more of it).
7164
7466
  *
7165
7467
  * Front-truncates oldest turns when the rendered output would exceed
7166
- * `maxChars`. The advisor cares more about current state (latest
7468
+ * `maxUnits`. The advisor cares more about current state (latest
7167
7469
  * tool calls, errors, in-flight task) than the original prompt —
7168
7470
  * mirrors Claude Code's own context-truncation strategy. When any
7169
7471
  * turns are dropped, prepends a `[TRUNCATED: N earlier turn(s)
7170
7472
  * omitted ...]` notice so the advisor knows the transcript is
7171
7473
  * partial and can flag if it needs the missing context.
7474
+ *
7475
+ * Unit-agnostic via the injected `measure` function: production passes
7476
+ * an EXACT o200k token counter and a token budget (so truncation tracks
7477
+ * the model's real `max_prompt_tokens`); the default `measure` is char
7478
+ * length, so callers/tests that pass a plain numeric budget get the
7479
+ * historical character-budget behavior.
7172
7480
  */
7173
- function renderConversationAsText(conversation, maxChars = ADVISOR_MAX_CONVERSATION_CHARS) {
7481
+ function renderConversationAsText(conversation, maxUnits = ADVISOR_MAX_CONVERSATION_CHARS, measure = (s) => s.length) {
7174
7482
  const turnBlocks = [];
7175
7483
  for (let i = 0; i < conversation.length; i++) {
7176
7484
  const msg = conversation[i];
@@ -7191,23 +7499,42 @@ function renderConversationAsText(conversation, maxChars = ADVISOR_MAX_CONVERSAT
7191
7499
  block.push("");
7192
7500
  turnBlocks.push(block.join("\n"));
7193
7501
  }
7194
- let totalChars = 0;
7502
+ let totalUnits = 0;
7195
7503
  let firstKeptIdx = turnBlocks.length;
7196
7504
  for (let i = turnBlocks.length - 1; i >= 0; i--) {
7197
- const len = turnBlocks[i].length + 1;
7198
- if (totalChars + len > maxChars) break;
7199
- totalChars += len;
7505
+ const len = measure(turnBlocks[i]) + 1;
7506
+ if (totalUnits + len > maxUnits) break;
7507
+ totalUnits += len;
7200
7508
  firstKeptIdx = i;
7201
7509
  }
7202
7510
  if (firstKeptIdx === turnBlocks.length && turnBlocks.length > 0) {
7203
- const tail = turnBlocks[turnBlocks.length - 1].slice(-(maxChars - 200));
7204
- return `[TRUNCATED: conversation too long for advisor model context; only the tail of the latest (turn ${turnBlocks.length}) is shown]\n\n` + tail;
7511
+ const last = turnBlocks[turnBlocks.length - 1];
7512
+ const notice = `[TRUNCATED: conversation too long for advisor model context; only the tail of the latest (turn ${turnBlocks.length}) is shown]\n\n`;
7513
+ return notice + truncateTailToUnits(last, Math.max(0, maxUnits - measure(notice)), measure);
7205
7514
  }
7206
7515
  const kept = turnBlocks.slice(firstKeptIdx);
7207
7516
  if (firstKeptIdx > 0) kept.unshift(`[TRUNCATED: ${firstKeptIdx} earlier turn(s) omitted to fit advisor model context budget; ${turnBlocks.length - firstKeptIdx} most-recent turn(s) shown below]\n`);
7208
7517
  return kept.join("\n");
7209
7518
  }
7210
7519
  /**
7520
+ * Return the longest suffix of `text` whose `measure(...)` is ≤ `maxUnits`.
7521
+ * Binary search on the cut point — unit-agnostic (works for the token
7522
+ * `measure` in prod and the char-length default), and exact rather than
7523
+ * a chars-per-token estimate. `measure` is called O(log n) times.
7524
+ */
7525
+ function truncateTailToUnits(text, maxUnits, measure) {
7526
+ if (maxUnits <= 0) return "";
7527
+ if (measure(text) <= maxUnits) return text;
7528
+ let lo = 0;
7529
+ let hi = text.length;
7530
+ while (lo < hi) {
7531
+ const mid = Math.ceil((lo + hi + 1) / 2);
7532
+ if (measure(text.slice(text.length - mid)) <= maxUnits) lo = mid;
7533
+ else hi = mid - 1;
7534
+ }
7535
+ return text.slice(text.length - lo);
7536
+ }
7537
+ /**
7211
7538
  * Run the advisor model with the full conversation context. Returns
7212
7539
  * the advisor's text response.
7213
7540
  *
@@ -7227,8 +7554,20 @@ function renderConversationAsText(conversation, maxChars = ADVISOR_MAX_CONVERSAT
7227
7554
  async function runAdvisor(conversation, advisorModel, advisorEffort, signal) {
7228
7555
  if (signal?.aborted) throw new Error("advisor call aborted before dispatch");
7229
7556
  const advisorSystem = "You are an expert advisor reviewing an in-progress Claude Code session. The transcript below is the work-in-progress (turns numbered, with tool calls and results inlined). Read carefully and provide concrete, actionable advice on the next step or course-correction. Be specific — cite the parts of the transcript you're responding to. If the assistant is on the right track, say so explicitly. If they're stuck or off-track, name the specific assumption or step to revisit. Aim for 2-5 paragraphs of substantive guidance.";
7230
- const conversationText = renderConversationAsText(conversation);
7231
7557
  const resolvedAdvisorModel = resolveModel(advisorModel);
7558
+ let measure;
7559
+ let maxUnits;
7560
+ try {
7561
+ const modelEntry = state.models?.data?.find((m) => m.id === resolvedAdvisorModel);
7562
+ const encoder = await loadEncoder(modelEntry ? getTokenizerFromModel(modelEntry) : "o200k_base");
7563
+ measure = (s) => encoder.encode(s).length;
7564
+ maxUnits = resolveAdvisorMaxTokens(advisorModel);
7565
+ } catch (err) {
7566
+ consola.debug("advisor: tokenizer load failed; using char-length budget:", err);
7567
+ measure = (s) => s.length;
7568
+ maxUnits = ADVISOR_MAX_CONVERSATION_CHARS;
7569
+ }
7570
+ const conversationText = renderConversationAsText(conversation, maxUnits, measure);
7232
7571
  if (/^(gpt-|o\d|.*codex)/i.test(resolvedAdvisorModel)) {
7233
7572
  const response = await createResponses({
7234
7573
  model: resolvedAdvisorModel,
@@ -9898,7 +10237,7 @@ const PERSONAS_READ = Object.freeze([
9898
10237
  toolNameHttp: "codex_critic",
9899
10238
  model: "gpt-5.5",
9900
10239
  endpoint: "/v1/responses",
9901
- description: "Adversarial second opinion on plans, designs, or code tradeoffs. Backed by gpt-5.5 (OpenAI, 400K context) — strongest reasoning model in the critic lineup, different lab than Opus. Best for architecture decisions, design reviews, and tradeoff analysis where cross-lab diversity matters. Not for line-level code review (use codex_reviewer). Pass artifact verbatim.",
10240
+ description: "Adversarial second opinion on plans, designs, or code tradeoffs. Backed by gpt-5.5 (OpenAI, ≈922K-token input window) — strongest reasoning model in the critic lineup, different lab than Opus. Best for architecture decisions, design reviews, and tradeoff analysis where cross-lab diversity matters. Not for line-level code review (use codex_reviewer). Pass artifact verbatim.",
9902
10241
  baseInstructions: CRITIC_BASE,
9903
10242
  agentPrompt: "",
9904
10243
  writeCapable: false,
@@ -9934,7 +10273,7 @@ const PERSONAS_READ = Object.freeze([
9934
10273
  toolNameHttp: "codex_reviewer",
9935
10274
  model: "gpt-5.3-codex",
9936
10275
  endpoint: "/v1/responses",
9937
- description: "Line-level review of a concrete diff or single file. Backed by gpt-5.3-codex (OpenAI, 400K context) — code-specialist, fastest critic (~16s). Surfaces bugs, edge cases, security issues, and idiom violations at specific line numbers. Not suited for architecture or design review (use codex_critic for plans). Pass artifact verbatim.",
10276
+ description: "Line-level review of a concrete diff or single file. Backed by gpt-5.3-codex (OpenAI, ≈272K-token input window) — code-specialist, fastest critic (~16s). Surfaces bugs, edge cases, security issues, and idiom violations at specific line numbers. Not suited for architecture or design review (use codex_critic for plans). Pass artifact verbatim.",
9938
10277
  baseInstructions: REVIEWER_BASE,
9939
10278
  agentPrompt: "",
9940
10279
  writeCapable: false,
@@ -9952,7 +10291,7 @@ const PERSONAS_READ = Object.freeze([
9952
10291
  toolNameHttp: "opus_critic",
9953
10292
  model: "claude-opus-4-7",
9954
10293
  endpoint: "/v1/messages",
9955
- description: "Adversarial second opinion from a fresh-context Opus 4.7 — same lab as the lead, limited blind-spot diversity vs cross-lab critics, but has the largest context window (up to 1M tokens on enterprise tiers). Handles large artifacts without decomposition. Fast (~22s), catches confabulation and motivated reasoning. Pass artifact verbatim.",
10294
+ description: "Adversarial second opinion from a fresh-context Opus 4.7 — same lab as the lead, limited blind-spot diversity vs cross-lab critics. On enterprise catalogs that carry Opus-4.7-1M it runs with a ≈936K-token input window and handles large artifacts without decomposition; otherwise ≈168K. Fast (~22s), catches confabulation and motivated reasoning. Pass artifact verbatim.",
9956
10295
  baseInstructions: OPUS_CRITIC_BASE,
9957
10296
  agentPrompt: "",
9958
10297
  writeCapable: false,
@@ -10615,14 +10954,14 @@ function buildCoordinatorAgent(opts) {
10615
10954
  "",
10616
10955
  "- **Plan / design / architecture choice** → fan out to `codex-critic` (gpt-5.5, strongest reasoning, cross-lab)" + (opts.geminiAvailable ? " AND `gemini-critic` (third-lab triangulation, strong on formal reasoning) in parallel" : "") + ". codex-reviewer is the wrong tool for plans (it's a code-specialist, not an architecture critic).",
10617
10956
  "- **Concrete diff or single file** → fan out to `codex-reviewer` (gpt-5.3-codex, line-level code specialist, fastest at ~16s)" + (opts.geminiAvailable ? " AND `gemini-critic` for cross-lab triangulation" : "") + ". For very small changes (<20 lines), one `codex-reviewer` call is enough.",
10618
- "- **Large artifact (>50 KB)** prefer `opus-critic` (Opus 4.7, up to 1M context the largest window in the lineup, no decomposition needed for most artifacts). For cross-lab diversity on large artifacts, pair with `codex-critic` and decompose the artifact into 2-4 semantic batches for codex.",
10957
+ "- **Large artifact** → the only peers that take a large artifact WHOLE are `codex-critic` (gpt-5.5, ≈922K-token input window) and `opus-critic` (Opus-4.7-1M, ≈936K-token input on enterprise catalogs; ≈168K otherwise). Route the full artifact to those for cross-lab coverage. `codex-reviewer` (≈272K) and `gemini-critic` (≈136K) have small windows — see Decomposition below: never summarize or downsize the request to squeeze a large artifact into a small-window peer.",
10619
10958
  "- **Formal reasoning, proofs, or invariants** → prefer `gemini-critic`" + (opts.geminiAvailable ? " (gemini-3.1-pro, strong on math and formally-stated properties)" : " (NOT REGISTERED in this session — gemini-3.x not in catalog)") + ".",
10620
10959
  "- **Tie-breaker after codex-critic has weighed in** → call `gemini-critic`" + (opts.geminiAvailable ? "" : " (NOT REGISTERED in this session)") + " or `opus-critic` with the artifact AND codex-critic's verdict for cross-check.",
10621
10960
  "- **Fast sanity check** → `opus-critic` (~22s, same lab as lead but fresh context — catches confabulation and motivated reasoning).",
10622
10961
  "",
10623
10962
  "## Decomposition for large artifacts",
10624
10963
  "",
10625
- "Each per-call MCP wait is bounded (~60s SDK default on Claude Code v2.1.113+ per regressions #50289 / #52137 empirically reproduced 2026-05-14). The proxy enforces per-persona effort allowlists AND a pre-flight `predictedTooLong` cap (codex_critic@high >8 KB, codex_reviewer@high >12 KB, opus_critic@medium >6 KB) to surface would-be-timeouts as fast actionable errors. For artifacts that exceed the cap but fit within opus-critic's context window (up to 1M tokens when available), route the full artifact to opus-critic. Otherwise, split into 2-4 logical batches BY CONCERN (not by raw size — semantic batches give better per-batch reviews) and call peers in parallel. The proxy's MCP cap allows up to 8 in-flight calls. Aggregate findings yourself before reporting back.",
10964
+ "Route by the peer's real PROMPT WINDOW (input tokens): `codex-critic` gpt-5.5 ≈922K · `opus-critic` Opus-4.7-1M ≈936K (enterprise catalogs; ≈168K otherwise) · `codex-reviewer` gpt-5.3-codex ≈272K · `gemini-critic` gemini-3.1-pro ≈136K. The proxy REJECTS (with an actionable message) any single call whose brief exceeds the target peer's window — it will NOT silently truncate, because dropping lines from a review artifact is worse than a clear error. So: send the full artifact only to peers whose window fits it (large artifacts → `codex-critic` and/or `opus-critic`). When a peer's window is too small (commonly `gemini-critic` at ≈136K, or `codex-reviewer` at ≈272K), do NOT summarize or downsize the request to include it — either skip that peer, or split the artifact into 2-4 logical batches BY CONCERN (not by raw size — semantic batches give better per-batch reviews) that each fit, and call in parallel. Use the big-window peers for the whole and reserve a small-window peer like gemini for the concerns it can actually hold. The proxy's MCP cap allows up to 8 in-flight calls. Aggregate findings yourself before reporting back. (Separately, on the JSON transport a per-effort `predictedTooLong` byte cap still guards the ~60s tools/call timeout for non-SSE clients; Claude Code uses SSE, which streams with heartbeats and isn't subject to that cap.)",
10626
10965
  "",
10627
10966
  "## Aggregation contract",
10628
10967
  "",
@@ -11120,7 +11459,7 @@ function initProxyFromEnv() {
11120
11459
  //#endregion
11121
11460
  //#region package.json
11122
11461
  var name = "github-router";
11123
- var version = "0.3.40";
11462
+ var version = "0.3.41";
11124
11463
 
11125
11464
  //#endregion
11126
11465
  //#region src/lib/approval.ts
@@ -11276,202 +11615,6 @@ function collectToolFieldKeys(body) {
11276
11615
  return [...seen].sort();
11277
11616
  }
11278
11617
 
11279
- //#endregion
11280
- //#region src/lib/tokenizer.ts
11281
- const ENCODING_MAP = {
11282
- o200k_base: () => import("gpt-tokenizer/encoding/o200k_base"),
11283
- cl100k_base: () => import("gpt-tokenizer/encoding/cl100k_base"),
11284
- p50k_base: () => import("gpt-tokenizer/encoding/p50k_base"),
11285
- p50k_edit: () => import("gpt-tokenizer/encoding/p50k_edit"),
11286
- r50k_base: () => import("gpt-tokenizer/encoding/r50k_base")
11287
- };
11288
- const encodingCache = /* @__PURE__ */ new Map();
11289
- /**
11290
- * Calculate tokens for tool calls
11291
- */
11292
- const calculateToolCallsTokens = (toolCalls, encoder, constants) => {
11293
- let tokens = 0;
11294
- for (const toolCall of toolCalls) {
11295
- tokens += constants.funcInit;
11296
- tokens += encoder.encode(JSON.stringify(toolCall)).length;
11297
- }
11298
- tokens += constants.funcEnd;
11299
- return tokens;
11300
- };
11301
- /**
11302
- * Calculate tokens for content parts
11303
- */
11304
- const calculateContentPartsTokens = (contentParts, encoder) => {
11305
- let tokens = 0;
11306
- for (const part of contentParts) if (part.type === "image_url") tokens += encoder.encode(part.image_url.url).length + 85;
11307
- else if (part.text) tokens += encoder.encode(part.text).length;
11308
- return tokens;
11309
- };
11310
- /**
11311
- * Calculate tokens for a single message
11312
- */
11313
- const calculateMessageTokens = (message, encoder, constants) => {
11314
- const tokensPerMessage = 3;
11315
- const tokensPerName = 1;
11316
- let tokens = tokensPerMessage;
11317
- for (const [key, value] of Object.entries(message)) {
11318
- if (typeof value === "string") tokens += encoder.encode(value).length;
11319
- if (key === "name") tokens += tokensPerName;
11320
- if (key === "tool_calls") tokens += calculateToolCallsTokens(value, encoder, constants);
11321
- if (key === "content" && Array.isArray(value)) tokens += calculateContentPartsTokens(value, encoder);
11322
- }
11323
- return tokens;
11324
- };
11325
- /**
11326
- * Calculate tokens using custom algorithm
11327
- */
11328
- const calculateTokens = (messages, encoder, constants) => {
11329
- if (messages.length === 0) return 0;
11330
- let numTokens = 0;
11331
- for (const message of messages) numTokens += calculateMessageTokens(message, encoder, constants);
11332
- numTokens += 3;
11333
- return numTokens;
11334
- };
11335
- /**
11336
- * Get the corresponding encoder module based on encoding type
11337
- */
11338
- const getEncodeChatFunction = async (encoding) => {
11339
- if (encodingCache.has(encoding)) {
11340
- const cached$1 = encodingCache.get(encoding);
11341
- if (cached$1) return cached$1;
11342
- }
11343
- const supportedEncoding = encoding;
11344
- if (!(supportedEncoding in ENCODING_MAP)) {
11345
- const fallbackModule = await ENCODING_MAP.o200k_base();
11346
- encodingCache.set(encoding, fallbackModule);
11347
- return fallbackModule;
11348
- }
11349
- const encodingModule = await ENCODING_MAP[supportedEncoding]();
11350
- encodingCache.set(encoding, encodingModule);
11351
- return encodingModule;
11352
- };
11353
- /**
11354
- * Get tokenizer type from model information
11355
- */
11356
- const getTokenizerFromModel = (model) => {
11357
- return model.capabilities?.tokenizer || "o200k_base";
11358
- };
11359
- /**
11360
- * Get model-specific constants for token calculation
11361
- */
11362
- const getModelConstants = (model) => {
11363
- return model.id === "gpt-3.5-turbo" || model.id === "gpt-4" ? {
11364
- funcInit: 10,
11365
- propInit: 3,
11366
- propKey: 3,
11367
- enumInit: -3,
11368
- enumItem: 3,
11369
- funcEnd: 12
11370
- } : {
11371
- funcInit: 7,
11372
- propInit: 3,
11373
- propKey: 3,
11374
- enumInit: -3,
11375
- enumItem: 3,
11376
- funcEnd: 12
11377
- };
11378
- };
11379
- /**
11380
- * Calculate tokens for a single parameter
11381
- */
11382
- const calculateParameterTokens = (key, prop, context) => {
11383
- const { encoder, constants } = context;
11384
- let tokens = constants.propKey;
11385
- if (typeof prop !== "object" || prop === null) return tokens;
11386
- const param = prop;
11387
- const paramName = key;
11388
- const paramType = param.type || "string";
11389
- let paramDesc = param.description || "";
11390
- if (param.enum && Array.isArray(param.enum)) {
11391
- tokens += constants.enumInit;
11392
- for (const item of param.enum) {
11393
- tokens += constants.enumItem;
11394
- tokens += encoder.encode(String(item)).length;
11395
- }
11396
- }
11397
- if (paramDesc.endsWith(".")) paramDesc = paramDesc.slice(0, -1);
11398
- const line = `${paramName}:${paramType}:${paramDesc}`;
11399
- tokens += encoder.encode(line).length;
11400
- const excludedKeys = new Set([
11401
- "type",
11402
- "description",
11403
- "enum"
11404
- ]);
11405
- for (const propertyName of Object.keys(param)) if (!excludedKeys.has(propertyName)) {
11406
- const propertyValue = param[propertyName];
11407
- const propertyText = typeof propertyValue === "string" ? propertyValue : JSON.stringify(propertyValue);
11408
- tokens += encoder.encode(`${propertyName}:${propertyText}`).length;
11409
- }
11410
- return tokens;
11411
- };
11412
- /**
11413
- * Calculate tokens for function parameters
11414
- */
11415
- const calculateParametersTokens = (parameters, encoder, constants) => {
11416
- if (!parameters || typeof parameters !== "object") return 0;
11417
- const params = parameters;
11418
- let tokens = 0;
11419
- for (const [key, value] of Object.entries(params)) if (key === "properties") {
11420
- const properties = value;
11421
- if (Object.keys(properties).length > 0) {
11422
- tokens += constants.propInit;
11423
- for (const propKey of Object.keys(properties)) tokens += calculateParameterTokens(propKey, properties[propKey], {
11424
- encoder,
11425
- constants
11426
- });
11427
- }
11428
- } else {
11429
- const paramText = typeof value === "string" ? value : JSON.stringify(value);
11430
- tokens += encoder.encode(`${key}:${paramText}`).length;
11431
- }
11432
- return tokens;
11433
- };
11434
- /**
11435
- * Calculate tokens for a single tool
11436
- */
11437
- const calculateToolTokens = (tool, encoder, constants) => {
11438
- let tokens = constants.funcInit;
11439
- const func = tool.function;
11440
- const fName = func.name;
11441
- let fDesc = func.description || "";
11442
- if (fDesc.endsWith(".")) fDesc = fDesc.slice(0, -1);
11443
- const line = fName + ":" + fDesc;
11444
- tokens += encoder.encode(line).length;
11445
- if (typeof func.parameters === "object" && func.parameters !== null) tokens += calculateParametersTokens(func.parameters, encoder, constants);
11446
- return tokens;
11447
- };
11448
- /**
11449
- * Calculate token count for tools based on model
11450
- */
11451
- const numTokensForTools = (tools, encoder, constants) => {
11452
- let funcTokenCount = 0;
11453
- for (const tool of tools) funcTokenCount += calculateToolTokens(tool, encoder, constants);
11454
- funcTokenCount += constants.funcEnd;
11455
- return funcTokenCount;
11456
- };
11457
- /**
11458
- * Calculate the token count of messages, supporting multiple GPT encoders
11459
- */
11460
- const getTokenCount = async (payload, model) => {
11461
- const encoder = await getEncodeChatFunction(getTokenizerFromModel(model));
11462
- const simplifiedMessages = payload.messages;
11463
- const inputMessages = simplifiedMessages.filter((msg) => msg.role !== "assistant");
11464
- const outputMessages = simplifiedMessages.filter((msg) => msg.role === "assistant");
11465
- const constants = getModelConstants(model);
11466
- let inputTokens = calculateTokens(inputMessages, encoder, constants);
11467
- if (payload.tools && payload.tools.length > 0) inputTokens += numTokensForTools(payload.tools, encoder, constants);
11468
- const outputTokens = calculateTokens(outputMessages, encoder, constants);
11469
- return {
11470
- input: inputTokens,
11471
- output: outputTokens
11472
- };
11473
- };
11474
-
11475
11618
  //#endregion
11476
11619
  //#region src/routes/chat-completions/handler.ts
11477
11620
  const ENCODER$1 = new TextEncoder();