github-router 0.3.27 → 0.3.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/main.js CHANGED
@@ -1,16 +1,20 @@
1
1
  #!/usr/bin/env node
2
+ import { createRequire } from "node:module";
2
3
  import { defineCommand, runMain } from "citty";
3
4
  import consola from "consola";
4
5
  import { randomBytes, randomUUID, timingSafeEqual } from "node:crypto";
5
6
  import fs from "node:fs/promises";
6
7
  import os from "node:os";
8
+ import * as path$1 from "node:path";
7
9
  import path from "node:path";
8
10
  import process$1 from "node:process";
9
11
  import { execFile, execFileSync, spawn } from "node:child_process";
10
12
  import { promisify } from "node:util";
13
+ import fs$1, { existsSync, readFileSync, realpathSync, statSync } from "node:fs";
14
+ import { createInterface } from "node:readline";
15
+ import Parser from "web-tree-sitter";
11
16
  import { events } from "fetch-event-stream";
12
17
  import { z } from "zod";
13
- import fs$1 from "node:fs";
14
18
  import { Writable } from "node:stream";
15
19
  import { serve } from "srvx";
16
20
  import { getProxyForUrl } from "proxy-from-env";
@@ -19,6 +23,10 @@ import { Hono } from "hono";
19
23
  import { cors } from "hono/cors";
20
24
  import clipboard from "clipboardy";
21
25
 
26
//#region rolldown:runtime
// CommonJS-style `require` bound to this module's URL, so the ESM bundle can
// load and resolve CommonJS packages relative to its own location.
const __require = /* @__PURE__ */ createRequire(import.meta.url);

//#endregion
22
30
  //#region src/lib/paths.ts
23
31
  function appDir() {
24
32
  return path.join(os.homedir(), ".local", "share", "github-router");
@@ -1786,6 +1794,1250 @@ function launchChild(target, server$1, options = {}) {
1786
1794
  });
1787
1795
  }
1788
1796
 
1797
+ //#endregion
1798
+ //#region src/lib/code-search.ts
1799
/**
 * Term-frequency saturation parameter (`k1`) for BM25. The value is
 * Lucene's default; the Robertson & Zaragoza 2009 monograph
 * recommends the 1.2-2.0 range, and Lucene and Elasticsearch both
 * ship 1.2.
 */
const BM25F_K1 = 1.2;
/**
 * BM25F boost weight for each scored field (`b_f` in the CIKM 2004
 * paper). Ordering follows Sourcegraph Zoekt's published signal
 * priorities: the matched line outranks symbol context, which
 * outranks the file path, which outranks surrounding context lines.
 */
const FIELD_BOOSTS = {
  match_line: 3,
  symbol_context: 2.5,
  file_path: 2,
  context: 1
};
/**
 * Length-normalization strength for each field (`l_f`). A value of
 * 0 switches normalization off, which suits short, uniform fields;
 * the prose-like `context` field uses Lucene's default `b=0.75`.
 */
const FIELD_LEN_NORMS = {
  match_line: 0,
  symbol_context: 0,
  file_path: 0,
  context: 0.75
};
1828
/**
 * Shoulder cut: results scoring below this fraction of the top score
 * are dropped. 0.5 follows the learning-to-rank convention (Burges
 * 2010) and is deliberately defined in this one place only.
 */
const SHOULDER_THRESHOLD = 0.5;
// Input-size limits and defaults for the code_search tool.
const MAX_QUERY_LEN = 1024;
const MAX_GLOB_LEN = 512;
const DEFAULT_LIMIT = 20;
const MAX_CONTEXT_LINES = 10;
const DEFAULT_CONTEXT_LINES = 2;
const MAX_SNIPPET_BYTES = 2 * 1024;
// Cap on ripgrep stdout consumed per search (10 MiB).
const MAX_STDOUT_BYTES = 10 * 1024 * 1024;
// Overall wall-clock allowance for a search, in milliseconds.
const WALL_TIME_MS = 30 * 1000;
/**
 * Structural-pass settings. The wall-clock budget is tested between
 * files, never mid-parse (the web-tree-sitter binding in use offers
 * no usable cancel hook), so one pathological file can overrun the
 * budget by its own parse time. A typical source file parses in well
 * under 50ms, so 200ms leaves headroom for roughly 5-10 files even
 * on a cold cache.
 */
const STRUCTURAL_BUDGET_MS = 200;
const STRUCTURAL_TOPN_FULL = 50;
const STRUCTURAL_TOPN_FAST = 10;
/**
 * Largest file the structural pass will parse. 1MB covers all
 * reasonable hand-written sources; anything bigger is almost always
 * generated code or a vendored bundle whose AST adds nothing useful
 * when ranking real definitions.
 */
const STRUCTURAL_MAX_FILE_BYTES = 1024 * 1024;
/**
 * Upper bound on the LRU cache of parsed trees. Each Tree pins
 * roughly the size of its source plus tree-sitter's internal node
 * arena. 64 stays comfortably under typical Node heap budgets, and
 * trees are `.delete()`-ed eagerly when evicted.
 */
const STRUCTURAL_CACHE_MAX = 64;
1867
/**
 * Regex heuristic for the `symbol_context` field: applied to a
 * matched line (leading whitespace already stripped) to decide
 * whether the match sits on a definition. It is the fallback used
 * when (a) tree-sitter cannot handle the file (unsupported language,
 * grammar load failure, parse error), (b) the file is outside the
 * structural pass's top-N slice, or (c) the structural budget fired.
 *
 * The pattern is assembled from word lists; the resulting source is
 * `^` + optional `export`/`default`/`async` + any run of access or
 * storage modifiers + one definition keyword + whitespace + the
 * first character of an identifier.
 */
const SYMBOL_REGEX = (() => {
  const leadingKeywords = ["export", "default", "async"];
  const modifiers = ["public", "private", "protected", "static", "abstract", "readonly"];
  const definitionKeywords = [
    "function", "class", "interface", "type", "enum", "def", "fn",
    "trait", "impl", "module", "namespace", "const", "let", "var"
  ];
  const optionalPrefix = leadingKeywords.map((word) => `(?:${word}\\s+)?`).join("");
  const modifierRun = `(?:${modifiers.map((word) => `${word}\\s+`).join("|")})*`;
  const keywordGroup = `(?:${definitionKeywords.join("|")})`;
  return new RegExp(`^${optionalPrefix}${modifierRun}${keywordGroup}\\s+[A-Za-z_$]`);
})();
1877
// Memoized result of resolveRipgrep(); undefined until the first successful resolution.
let _rgResolution;
/**
 * Locate a usable ripgrep binary, trying three tiers in order and
 * memoizing the first success. Mirrors cc-backup
 * `src/utils/ripgrep.ts:31-65`.
 *
 *   1. System rg on PATH — reported as the literal command name
 *      `"rg"` (NOT an absolute path). This leverages
 *      NoDefaultCurrentDirectoryInExePath on Windows, preventing
 *      PATH-hijacking via a malicious ./rg.exe in the proxy's cwd.
 *   2. Bundled via `@vscode/ripgrep` — the per-platform binary that
 *      `optionalDependencies` installed, accepted only when its
 *      reported path exists on disk.
 *   3. Throw — surfaced to the caller as an MCP isError response.
 *
 * @returns {{rgPath: string, source: "system" | "bundled"}}
 * @throws {Error} when neither a system nor a bundled binary is found.
 */
function resolveRipgrep() {
  if (!_rgResolution) {
    if (hasSystemRipgrep()) {
      _rgResolution = { rgPath: "rg", source: "system" };
    } else {
      try {
        const { rgPath } = __require("@vscode/ripgrep");
        if (rgPath && existsSync(rgPath)) _rgResolution = { rgPath, source: "bundled" };
      } catch {
        // Optional dependency missing or unloadable: fall through to the error below.
      }
    }
    if (!_rgResolution) throw new Error("ripgrep not found. Either install rg system-wide (brew/apt/winget) or reinstall the proxy so @vscode/ripgrep's per-platform binary is fetched. See README's code_search section.");
  }
  return _rgResolution;
}
1911
/**
 * Report whether an `rg` executable is discoverable on PATH, by
 * running `where rg` (Windows) or `which rg` (elsewhere) with a
 * one-second timeout.
 *
 * @returns {boolean} true when the lookup command succeeds and prints
 *   something on stdout; false on any failure (non-zero exit, timeout,
 *   lookup tool missing).
 */
function hasSystemRipgrep() {
  const lookupCommand = process.platform === "win32" ? "where" : "which";
  try {
    const located = execFileSync(lookupCommand, ["rg"], {
      // Only stdout is of interest; stdin and stderr are discarded.
      stdio: ["ignore", "pipe", "ignore"],
      timeout: 1000
    });
    return located.length > 0;
  } catch {
    return false;
  }
}
1925
/**
 * Validate the raw `code_search` arguments.
 *
 * Checks run in a fixed order (query, workspace, mode, file_glob,
 * limit, context_lines) and the first failure wins.
 *
 * @param {object} input - raw tool arguments (`query`, `workspace`,
 *   and optional `mode`, `file_glob`, `limit`, `context_lines`).
 * @returns {string | null} a human-readable error message, or `null`
 *   when every check passes.
 */
function validateInputs(input) {
  const {
    query,
    workspace,
    mode,
    file_glob: fileGlob,
    limit,
    context_lines: contextLines
  } = input;
  // NUL, CR and LF are rejected in both the query and the glob.
  const hasForbiddenChars = (value) => /[\0\r\n]/.test(value);

  if (typeof query !== "string" || query.length === 0) return "code_search: arguments.query is required (non-empty string)";
  if (query.length > MAX_QUERY_LEN) return `code_search: query exceeds ${MAX_QUERY_LEN} chars`;
  if (hasForbiddenChars(query)) return "code_search: query contains null byte or newline (rejected)";

  if (typeof workspace !== "string" || workspace.length === 0) return "code_search: arguments.workspace is required (absolute path)";

  const allowedModes = ["ranked", "literal", "regex"];
  if (mode && !allowedModes.includes(mode)) return `code_search: mode must be one of "ranked", "literal", "regex"`;

  if (fileGlob !== undefined) {
    if (typeof fileGlob !== "string") return "code_search: file_glob must be a string";
    if (fileGlob.length > MAX_GLOB_LEN) return `code_search: file_glob exceeds ${MAX_GLOB_LEN} chars`;
    if (hasForbiddenChars(fileGlob)) return "code_search: file_glob contains null byte or newline";
  }

  // Number.isInteger is false for every non-number, so it also covers the type check.
  if (limit !== undefined && !(Number.isInteger(limit) && limit >= 1)) return "code_search: limit must be a positive integer";
  if (contextLines !== undefined && !(Number.isInteger(contextLines) && contextLines >= 0)) return "code_search: context_lines must be a non-negative integer";

  return null;
}
1948
/**
 * Validate the `workspace` argument and canonicalize it.
 *
 * The proxy runs as the user, so any directory the process can
 * `stat` is acceptable — the same reach Claude Code's Read / Bash
 * tools already have. An earlier allow-set + secret-shape denylist
 * was dropped because the holistic threat model showed those were
 * inconsistent guardrails (the model already has filesystem access
 * through its other tools).
 *
 * Still enforced:
 *   - The path is absolute (relative paths are an integration-error
 *     footgun).
 *   - The path is canonicalized with realpath (symlinks resolved;
 *     output paths are reported relative to the result).
 *   - The path exists AND is a directory.
 *
 * Error messages never echo the rejected path, because code_search
 * output flows upstream to the model provider (consistent with the
 * COPILOT_HOST_ALLOWLIST pattern in `src/lib/utils.ts`).
 *
 * @param {string} workspace - candidate workspace path.
 * @returns {{ok: true, canonical: string} | {ok: false, error: string}}
 */
function validateWorkspace(workspace) {
  const reject = (error) => ({ ok: false, error });

  if (!path$1.isAbsolute(workspace)) return reject("workspace must be an absolute path");

  let canonical;
  let isDirectory;
  try {
    canonical = realpathSync(workspace);
    isDirectory = statSync(canonical).isDirectory();
  } catch {
    // realpath and stat failures are reported identically.
    return reject("workspace path is not accessible");
  }

  if (!isDirectory) return reject("workspace must be a directory");
  return { ok: true, canonical };
}
1997
/**
 * Rule-based identifier splitter per the ESEC/FSE 2021 benchmark.
 *
 * Pipeline:
 *   1. Break the text on runs of non-alphanumeric ASCII characters.
 *   2. Inside each chunk, split on case boundaries with acronym
 *      lookahead — `HTTPSConnection` → [`HTTPS`, `Connection`].
 *   3. Keep trailing digit runs attached to the letters before them —
 *      `parseV2Handler` → [`parse`, `V2`, `Handler`].
 *   4. Lowercase every token.
 *   5. Discard tokens shorter than 2 characters (single-char noise).
 *
 * Limitation: ASCII only. Unicode identifiers (Cyrillic, CJK, etc.)
 * produce no tokens; documented as MVP scope.
 *
 * @param {string} text - arbitrary source text.
 * @returns {string[]} lowercase tokens in order of appearance.
 */
function tokenize(text) {
  const wordShape = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+[0-9]*|[A-Z]+[0-9]*|[0-9]+/g;
  return text
    .split(/[^A-Za-z0-9]+/)
    .flatMap((chunk) => chunk.match(wordShape) ?? [])
    .map((word) => word.toLowerCase())
    .filter((word) => word.length >= 2);
}
2026
/**
 * Platform-aware child termination.
 *
 * - Windows: `taskkill /T /F`, because child.kill() doesn't reliably
 *   terminate descendants — a long search with worker threads would
 *   leak rg.exe processes.
 * - Elsewhere: SIGTERM, then SIGKILL after a grace period if the
 *   process has still not exited.
 *
 * The escalation checks `exitCode` / `signalCode` rather than
 * `child.killed`: `killed` only records that a signal was delivered
 * by `kill()`, so it is already true right after SIGTERM and cannot
 * tell a terminated process from one that ignored the signal.
 *
 * @param {import("node:child_process").ChildProcess} child - process to stop.
 * @param {number} [graceMs=500] - delay between SIGTERM and SIGKILL.
 * @returns {void}
 */
function killChild(child, graceMs = 500) {
  // Nothing to do for a child that never spawned or was already signalled.
  if (!child.pid || child.killed) return;
  if (process.platform === "win32") {
    try {
      execFile("taskkill", [
        "/T",
        "/F",
        "/PID",
        String(child.pid)
      ], () => {});
    } catch {
      // Best effort: there is no further fallback if taskkill cannot be started.
    }
    return;
  }
  try {
    child.kill("SIGTERM");
  } catch {
    // Best effort: the process may already be gone.
  }
  setTimeout(() => {
    // Both fields stay null while the process is still running.
    const hasExited = child.exitCode != null || child.signalCode != null;
    if (hasExited) return;
    try {
      child.kill("SIGKILL");
    } catch {
      // Best effort: the process may have exited in the meantime.
    }
  }, graceMs).unref(); // unref: a pending escalation must not keep the proxy alive
}
2054
/**
 * Matches a query that is exactly one identifier: an ASCII letter
 * followed by up to 127 ASCII letters, digits, `_` or `-`. Only such
 * queries are expanded into naming-convention variants; whitespace,
 * regex metacharacters or structural punctuation defeat the
 * expansion and the search falls through to the original rg
 * behavior. ASCII-only on purpose, matching the tokenizer's scope
 * (Unicode identifiers are MVP-out).
 */
const SINGLE_IDENTIFIER_REGEX = /^[A-Za-z][A-Za-z0-9_-]{0,127}$/;
2062
/**
 * Break an identifier into its word-pieces. Recognized shapes:
 *
 *   - snake_case   (split on `_`)
 *   - kebab-case   (split on `-`)
 *   - camelCase    (split at lowercase→uppercase boundaries)
 *   - PascalCase   (each capitalized run is a piece)
 *   - acronym runs (HTTPSConnection → [HTTPS, Connection])
 *   - trailing digits stay attached to letters (parseV2 → [parse, V2])
 *
 * Pieces come back in source order with their original casing, so
 * callers can re-case each piece when composing other skeletons.
 *
 * @param {string} identifier - identifier to split.
 * @returns {string[]} pieces in order; empty when nothing matches.
 */
function splitIdentifierPieces(identifier) {
  const wordShape = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+[0-9]*|[A-Z]+[0-9]*|[0-9]+/g;
  return identifier
    .split(/[-_]/)
    .flatMap((segment) => segment.match(wordShape) ?? []);
}
2084
/**
 * Produce the naming-convention variants of an identifier query.
 *
 * Returns `null` when the query is not a single identifier, or when
 * it splits into fewer than two pieces (there is no skeleton to
 * vary) — the caller then falls through to the literal-search path.
 *
 * Otherwise the result holds the query itself followed by:
 *
 *   getUserName     (lowerCamelCase)
 *   GetUserName     (UpperCamelCase / PascalCase)
 *   get_user_name   (snake_case)
 *   get-user-name   (kebab-case)
 *   GET_USER_NAME   (UPPER_SNAKE_CASE)
 *
 * Duplicates are removed (first occurrence kept) so queries whose
 * skeletons collapse don't bloat the regex pointlessly.
 *
 * @param {string} query - raw search query.
 * @returns {string[] | null} de-duplicated variants, or `null`.
 */
function expandIdentifierVariants(query) {
  if (!SINGLE_IDENTIFIER_REGEX.test(query)) return null;

  const pieces = splitIdentifierPieces(query);
  if (pieces.length < 2) return null;

  const lowered = pieces.map((piece) => piece.toLowerCase());
  const shouted = pieces.map((piece) => piece.toUpperCase());
  const capitalized = lowered.map((piece) => piece.charAt(0).toUpperCase() + piece.slice(1));

  // Set preserves insertion order, so the query always comes first.
  return [...new Set([
    query,
    lowered[0] + capitalized.slice(1).join(""),
    capitalized.join(""),
    lowered.join("_"),
    lowered.join("-"),
    shouted.join("_")
  ])];
}
2118
/**
 * Turn a list of identifier variants into one rg regex pattern: a
 * non-capturing alternation of the variants. The variants are plain
 * identifiers (no regex metacharacters), so no escaping is applied.
 * Word boundaries are intentionally NOT added — searching for
 * "getUserName" is expected to match as a substring anywhere, which
 * is also what `-F getUserName` did.
 *
 * @param {string[]} variants - identifier variants to alternate over.
 * @returns {string} pattern of the form `(?:a|b|c)`.
 */
function buildExpansionPattern(variants) {
  const alternation = variants.join("|");
  return `(?:${alternation})`;
}
2128
/**
 * Assemble the ripgrep argument vector for one search.
 *
 * Always emits `--json --no-follow`, then optionally:
 *   - `-C <n>` when `contextLines` is positive;
 *   - `-F` (fixed-string search) for the "literal" and "ranked" modes,
 *     unless an expansion pattern is supplied, in which case the
 *     pattern is passed as a regex;
 *   - `-g <glob>` when a glob other than the catch-all is given.
 * The pattern and the search root `.` follow a `--` separator.
 *
 * @param {{query: string, mode?: string, contextLines: number,
 *          fileGlob?: string, expansionPattern?: string | null}} input
 * @returns {string[]} arguments to pass to rg.
 */
function buildRgArgs(input) {
  const { contextLines, expansionPattern, fileGlob, mode, query } = input;
  const wantsFixedString = !expansionPattern && (mode === "literal" || mode === "ranked");
  const hasCustomGlob = Boolean(fileGlob) && fileGlob !== "**/*";
  return [
    "--json",
    "--no-follow",
    ...(contextLines > 0 ? ["-C", String(contextLines)] : []),
    ...(wantsFixedString ? ["-F"] : []),
    ...(hasCustomGlob ? ["-g", fileGlob] : []),
    "--",
    expansionPattern ?? query,
    "."
  ];
}
2136
/**
 * Stream-parse ripgrep `--json` output into hit records.
 *
 * Load-bearing behaviors:
 *
 *   1. GLOBAL limit cap (NOT per-file — MEDIUM-10). Once `limit`
 *      hits have been collected, the next match triggers a single
 *      termination request and no further hits are recorded. Context
 *      lines that are already buffered are still consumed so the
 *      last hit keeps its trailing context.
 *
 *   2. CANCEL RACE short-circuit (HIGH-9, 3-lab confirmed). As soon
 *      as `signal.aborted` flips, the line reader is closed and the
 *      loop exits. A half-flushed truncated JSON line never reaches
 *      JSON.parse.
 *
 *   3. Non-UTF-8 data. In `--json` mode ripgrep encodes a path or
 *      line as `{ text }` when it is valid UTF-8 and as
 *      `{ bytes: <base64> }` otherwise. Both shapes are accepted;
 *      `bytes` payloads are decoded leniently instead of crashing on
 *      a missing `text` field.
 *
 * Lines that are not valid JSON, or that parse to something other
 * than an object, are skipped.
 *
 * @param {import("node:child_process").ChildProcess} child - running rg process.
 * @param {{signal: AbortSignal, limit: number, contextLines: number}} opts
 * @returns {Promise<{hits: object[], scannedFiles: number, truncated: boolean,
 *   cancelled: boolean, stdoutBytes: number}>} `truncated` is true when
 *   the stdout cap fired or at least `limit` hits were collected.
 */
async function parseRgJsonStream(child, opts) {
  const hits = [];
  let stdoutBytes = 0;
  let truncatedByCap = false;
  let cancelled = false;
  let scannedFiles = 0;
  if (opts.signal.aborted) {
    killChild(child);
    return {
      hits,
      scannedFiles: 0,
      truncated: false,
      cancelled: true,
      stdoutBytes: 0
    };
  }
  if (!child.stdout) return {
    hits,
    scannedFiles: 0,
    truncated: false,
    cancelled: false,
    stdoutBytes: 0
  };
  // Decode ripgrep's "arbitrary data" object: `{text}` or `{bytes: base64}`.
  const decodeRgData = (field) => {
    if (!field) return undefined;
    if (typeof field.text === "string") return field.text;
    if (typeof field.bytes === "string") return Buffer.from(field.bytes, "base64").toString("utf8");
    return undefined;
  };
  const pendingContextBefore = [];
  let lastHitForContext;
  // Ensures the over-limit termination request is issued only once.
  let limitKillSent = false;
  const rl = createInterface({
    input: child.stdout,
    crlfDelay: Infinity
  });
  const onAbort = () => {
    cancelled = true;
    rl.close();
    killChild(child);
  };
  opts.signal.addEventListener("abort", onAbort, { once: true });
  try {
    for await (const rawLine of rl) {
      if (cancelled) break;
      // Count real UTF-8 bytes (+1 for the newline readline stripped).
      stdoutBytes += Buffer.byteLength(rawLine, "utf8") + 1;
      if (stdoutBytes > MAX_STDOUT_BYTES) {
        truncatedByCap = true;
        killChild(child);
        break;
      }
      if (rawLine.length === 0) continue;
      let evt;
      try {
        evt = JSON.parse(rawLine);
      } catch {
        continue;
      }
      // Valid JSON that is not an event object (e.g. `null`) is ignored.
      if (evt === null || typeof evt !== "object") continue;
      const data = evt.data ?? {};
      switch (evt.type) {
        case "begin":
          // New file: context never carries over between files.
          scannedFiles += 1;
          pendingContextBefore.length = 0;
          lastHitForContext = void 0;
          break;
        case "context": {
          const text = stripTrailingNewline(decodeRgData(data.lines) ?? "");
          // Fill the previous hit's trailing context first; anything beyond
          // that is leading context for a possible next hit (sliding window).
          if (lastHitForContext && lastHitForContext.context_after.length < opts.contextLines) lastHitForContext.context_after.push(text);
          else {
            pendingContextBefore.push(text);
            if (pendingContextBefore.length > opts.contextLines) pendingContextBefore.shift();
          }
          break;
        }
        case "match": {
          if (hits.length >= opts.limit) {
            if (!limitKillSent) {
              limitKillSent = true;
              killChild(child);
            }
            break;
          }
          const sub = data.submatches?.[0];
          const file = decodeRgData(data.path);
          const lineText = decodeRgData(data.lines);
          if (file === undefined || lineText === undefined || !data.line_number || !sub) break;
          const hit = {
            file,
            line: data.line_number,
            matched_line: stripTrailingNewline(lineText),
            match_start: sub.start,
            match_end: sub.end,
            context_before: [...pendingContextBefore],
            context_after: []
          };
          pendingContextBefore.length = 0;
          lastHitForContext = hit;
          hits.push(hit);
          break;
        }
        case "end":
        case "summary":
        default: break;
      }
    }
  } finally {
    opts.signal.removeEventListener("abort", onAbort);
  }
  return {
    hits,
    scannedFiles,
    truncated: truncatedByCap || hits.length >= opts.limit,
    cancelled,
    stdoutBytes
  };
}
2250
/**
 * Remove a single trailing line terminator (`\r\n` or `\n`) from a
 * string. A lone trailing `\r` and any earlier newlines are kept.
 *
 * @param {string} s - text that may end in a newline.
 * @returns {string} the text without its final line terminator.
 */
function stripTrailingNewline(s) {
  // `$` without the `m` flag anchors only at the very end of the string.
  return s.replace(/\r?\n$/, "");
}
2255
/**
 * File extension → grammar key. Files whose extension is absent from
 * this map skip structural parsing (their hits fall back to the
 * SYMBOL_REGEX heuristic for `symbol_context`). Keep the grammar keys
 * aligned with `GRAMMAR_FILES`: supporting a new language needs both
 * an extension mapping here and a `.wasm` to load.
 *
 * Declared per language and flattened into an extension-keyed object.
 */
const EXTENSION_TO_LANG = Object.fromEntries([
  ["typescript", [".ts"]],
  ["tsx", [".tsx"]],
  ["javascript", [".js", ".mjs", ".cjs", ".jsx"]],
  ["python", [".py"]],
  ["go", [".go"]],
  ["rust", [".rs"]],
  ["java", [".java"]],
  ["c", [".c", ".h"]],
  ["cpp", [".cpp", ".cc", ".cxx", ".hpp", ".hxx"]]
].flatMap(([lang, extensions]) => extensions.map((ext) => [ext, lang])));
2281
/**
 * Grammar key → wasm filename under
 * `node_modules/tree-sitter-wasms/out/`. Resolved at runtime from
 * `node_modules`; the paths are stable because `tree-sitter-wasms`
 * ships prebuilt binaries (no per-install codegen).
 *
 * Every filename follows the `tree-sitter-<key>.wasm` pattern, so the
 * table is derived from the list of keys.
 */
const GRAMMAR_FILES = Object.fromEntries([
  "typescript",
  "tsx",
  "javascript",
  "python",
  "go",
  "rust",
  "java",
  "c",
  "cpp"
].map((key) => [key, `tree-sitter-${key}.wasm`]));
2298
/**
 * Definition-shaped AST node types, per language. A matched
 * identifier that sits inside one of these nodes AND occupies the
 * node's "name" position is AST-confirmed evidence that the line is
 * an identifier-definition site. The lists cover the brief's
 * enumeration plus a handful of language-idiomatic extras (e.g.,
 * `mod_item` for Rust modules).
 *
 * The lookup is keyed by language so a node type that means
 * "definition" in one grammar but "reference" in another cannot
 * cross-pollute. TypeScript and TSX share one list but each gets its
 * own Set instance.
 */
const DEFINITION_NODE_TYPES = (() => {
  const typescriptLike = [
    "function_declaration",
    "function_signature",
    "function_expression",
    "method_definition",
    "method_signature",
    "class_declaration",
    "interface_declaration",
    "type_alias_declaration",
    "enum_declaration",
    "variable_declarator",
    "generator_function_declaration",
    "abstract_method_signature",
    "public_field_definition",
    "property_signature"
  ];
  return {
    typescript: new Set(typescriptLike),
    tsx: new Set(typescriptLike),
    javascript: new Set([
      "function_declaration",
      "function_expression",
      "method_definition",
      "class_declaration",
      "variable_declarator",
      "generator_function_declaration"
    ]),
    python: new Set(["function_definition", "class_definition", "decorated_definition"]),
    go: new Set([
      "function_declaration",
      "method_declaration",
      "type_spec",
      "type_alias",
      "const_spec",
      "var_spec"
    ]),
    rust: new Set([
      "function_item",
      "impl_item",
      "trait_item",
      "struct_item",
      "enum_item",
      "mod_item",
      "type_item",
      "const_item",
      "static_item",
      "macro_definition"
    ]),
    java: new Set([
      "class_declaration",
      "interface_declaration",
      "method_declaration",
      "constructor_declaration",
      "enum_declaration",
      "field_declaration",
      "annotation_type_declaration"
    ]),
    c: new Set([
      "function_definition",
      "declaration",
      "struct_specifier",
      "enum_specifier",
      "union_specifier",
      "type_definition"
    ]),
    cpp: new Set([
      "function_definition",
      "declaration",
      "struct_specifier",
      "class_specifier",
      "enum_specifier",
      "union_specifier",
      "type_definition",
      "namespace_definition",
      "template_declaration"
    ])
  };
})();
2405
/**
 * AST node types that mean "this token is an identifier". The
 * match-position lookup uses the set to discard parent-node hits
 * before the definition-site predicate is evaluated.
 */
const IDENTIFIER_NODE_TYPES = new Set([
  "identifier", "type_identifier", "field_identifier", "property_identifier",
  "shorthand_property_identifier_pattern", "shorthand_property_identifier",
  "scoped_identifier", "name"
]);
2420
+ let _grammarBundle;
2421
/**
 * Locate the `out/` directory of the installed `tree-sitter-wasms`
 * package by resolving its `package.json`. Resolution is wrapped in
 * try/catch so environments where node_modules has been pruned to
 * runtime-only dependencies degrade gracefully.
 *
 * @returns {string | null} path of the `out` directory next to the
 *   package manifest, or `null` when the package cannot be resolved.
 */
function resolveGrammarRoot() {
  let manifestPath;
  try {
    manifestPath = __require.resolve("tree-sitter-wasms/package.json");
  } catch {
    return null;
  }
  const packageDir = path$1.dirname(manifestPath);
  return path$1.join(packageDir, "out");
}
2435
/**
 * Return the memoized grammar bundle, creating it on first use.
 *
 * `bundle.ready` is a Promise for a Map of grammar key → loaded
 * tree-sitter Language. Loading is kicked off once (see the call
 * right below, which runs at module-init time so the first search
 * doesn't pay a ~500ms cold-start cost) and awaited by each caller.
 * Failures never reject the Promise:
 *   - if `Parser.init()` fails or the wasm package cannot be
 *     resolved, a warning is logged and the Map is empty;
 *   - if a single grammar fails to load, a warning is logged and
 *     only that grammar is missing, so one broken grammar can't take
 *     the whole tool down.
 *
 * @returns {{ready: Promise<Map<string, object>>}}
 */
function getGrammarBundle() {
  if (_grammarBundle) return _grammarBundle;

  const loadGrammars = async () => {
    const loaded = /* @__PURE__ */ new Map();
    try {
      await Parser.init();
    } catch (err) {
      consola.warn(`[code_search] tree-sitter Parser.init failed; structural ranking disabled: ${err.message}`);
      return loaded;
    }
    const grammarDir = resolveGrammarRoot();
    if (!grammarDir) {
      consola.warn("[code_search] tree-sitter-wasms package not resolvable; structural ranking disabled");
      return loaded;
    }
    // Grammars are loaded one after another, in GRAMMAR_FILES order.
    for (const key of Object.keys(GRAMMAR_FILES)) {
      const filename = GRAMMAR_FILES[key];
      try {
        loaded.set(key, await Parser.Language.load(path$1.join(grammarDir, filename)));
      } catch (err) {
        consola.warn(`[code_search] failed to load tree-sitter grammar '${key}' from ${filename}: ${err.message}`);
      }
    }
    return loaded;
  };

  _grammarBundle = { ready: loadGrammars() };
  return _grammarBundle;
}
// Warm the grammars at import time; the no-op catch guards against an unhandled rejection.
getGrammarBundle().ready.catch(() => {});
2470
/**
 * Map a file path to its grammar key using the (case-insensitive)
 * file extension.
 *
 * @param {string} filePath - relative or absolute file path.
 * @returns {string | null} the grammar key from EXTENSION_TO_LANG,
 *   or `null` when the extension has no mapping.
 */
function getLanguageKeyForPath(filePath) {
  const extension = path$1.extname(filePath).toLowerCase();
  const langKey = EXTENSION_TO_LANG[extension];
  return langKey ?? null;
}
2473
// Parsed-tree cache keyed by absolute path. Map insertion order doubles as
// recency order: the first key is the least recently used entry.
const _treeCache = /* @__PURE__ */ new Map();
/**
 * Look up the cached parse for a file.
 *
 * A hit requires the stored `mtimeMs` to equal the one supplied; the
 * entry is then moved to the most-recently-used position. A stale
 * entry (different mtime) is removed and its tree, if any, is
 * `.delete()`-ed.
 *
 * @param {string} absPath - cache key (absolute file path).
 * @param {number} mtimeMs - modification time the caller observed.
 * @returns {object | undefined} the cache entry, or `undefined` on a
 *   miss or a stale entry.
 */
function cacheGet(absPath, mtimeMs) {
  const entry = _treeCache.get(absPath);
  if (!entry) return undefined;

  // Removed in both outcomes: re-inserted below when fresh, dropped when stale.
  _treeCache.delete(absPath);

  if (entry.mtimeMs === mtimeMs) {
    _treeCache.set(absPath, entry);
    return entry;
  }

  if (entry.tree) try {
    entry.tree.delete();
  } catch {
    // Freeing the tree is best effort; the stale entry is gone either way.
  }
  return undefined;
}
2488
/**
 * Store a cache entry, first evicting least-recently-used entries
 * until the cache holds fewer than STRUCTURAL_CACHE_MAX items. Each
 * evicted entry's tree, if any, is `.delete()`-ed.
 *
 * @param {string} absPath - cache key (absolute file path).
 * @param {{mtimeMs: number, tree: object | null, source: string | null}} entry
 * @returns {void}
 */
function cachePut(absPath, entry) {
  // Map iteration follows insertion order, so the oldest keys come first.
  // Deleting the current key while iterating a Map is well-defined.
  for (const oldestKey of _treeCache.keys()) {
    if (_treeCache.size < STRUCTURAL_CACHE_MAX) break;
    const stale = _treeCache.get(oldestKey);
    if (stale?.tree) try {
      stale.tree.delete();
    } catch {
      // Freeing the tree is best effort; the entry is evicted regardless.
    }
    _treeCache.delete(oldestKey);
  }
  _treeCache.set(absPath, entry);
}
2500
/**
 * Find the offset at which line `lineNumber1` begins in `source`.
 *
 * Lines are delimited by LF only; in CRLF files the `\r` counts as
 * the last character of the preceding line, so line starts are the
 * same as in LF files. `lineNumber1` is 1-indexed to match
 * ripgrep's output.
 *
 * Note: the returned value is an index into the JavaScript string
 * (the scan uses `indexOf` on `source`), despite the "Byte" in the
 * function name.
 *
 * @param {string} source - full file contents.
 * @param {number} lineNumber1 - 1-indexed line number.
 * @returns {number} start offset of the line; 0 for any line number
 *   <= 1; -1 when the line lies past the end of the text.
 */
function lineStartByte(source, lineNumber1) {
  if (lineNumber1 <= 1) return 0;
  let currentLine = 1;
  let newlineAt = source.indexOf("\n");
  while (newlineAt !== -1) {
    currentLine += 1;
    if (currentLine === lineNumber1) return newlineAt + 1;
    newlineAt = source.indexOf("\n", newlineAt + 1);
  }
  return -1;
}
2516
/**
 * Decide whether a matched identifier node is the name being defined
 * by a nearby definition-shaped ancestor.
 *
 * Up to 6 ancestors are inspected, starting at the node's parent.
 * For each ancestor whose type is a definition type for `langKey`,
 * the match counts as a definition site when either:
 *   - the ancestor's `name` field contains the matched node, or
 *   - the ancestor's `declarator` field (checked first) or `type`
 *     field contains the matched node AND the first identifier leaf
 *     of that field starts where the matched node starts.
 * This excludes identifiers that merely appear in parameter types,
 * bodies, or a parent's signature.
 *
 * The depth bound is deliberate: definition names sit very close to
 * their definition node in every supported grammar, and deeper walks
 * risk false positives (e.g., a `name` inside the body of an
 * enclosing function).
 *
 * @param {object} matchedNode - tree-sitter node the rg submatch landed on.
 * @param {string} langKey - key into DEFINITION_NODE_TYPES.
 * @returns {boolean} true iff the node is at a definition's name slot.
 */
function isDefiningSite(matchedNode, langKey) {
  const definitionTypes = DEFINITION_NODE_TYPES[langKey];
  if (!definitionTypes) return false;

  // True when `fieldNode` encloses the match and its first identifier leaf IS the match.
  const fieldLeadsWithMatch = (fieldNode) => {
    if (!fieldNode || !containsByteRange(fieldNode, matchedNode)) return false;
    const leadingIdentifier = firstIdentifierLeaf(fieldNode);
    return Boolean(leadingIdentifier) && leadingIdentifier.startIndex === matchedNode.startIndex;
  };

  let ancestor = matchedNode.parent;
  for (let depth = 0; ancestor && depth < 6; depth += 1) {
    if (definitionTypes.has(ancestor.type)) {
      const nameField = ancestor.childForFieldName("name");
      if (nameField && containsByteRange(nameField, matchedNode)) return true;
      if (fieldLeadsWithMatch(ancestor.childForFieldName("declarator"))) return true;
      if (fieldLeadsWithMatch(ancestor.childForFieldName("type"))) return true;
    }
    ancestor = ancestor.parent;
  }
  return false;
}
2554
/**
 * Test whether `outer`'s [startIndex, endIndex] span fully encloses
 * `inner`'s span (equal bounds count as enclosed).
 *
 * @param {{startIndex: number, endIndex: number}} outer
 * @param {{startIndex: number, endIndex: number}} inner
 * @returns {boolean}
 */
function containsByteRange(outer, inner) {
  const startsInside = inner.startIndex >= outer.startIndex;
  const endsInside = inner.endIndex <= outer.endIndex;
  return startsInside && endsInside;
}
2557
/**
 * Find the first identifier-typed node in a pre-order walk of
 * `node`'s named-children subtree (the node itself is checked
 * first). The walk does not descend into a node once it has been
 * recognized as an identifier.
 *
 * @param {{type: string, namedChildren: object[]}} node - subtree root.
 * @returns {object | null} the first node whose type is in
 *   IDENTIFIER_NODE_TYPES, or `null` when the subtree has none.
 */
function firstIdentifierLeaf(node) {
  // Explicit stack; children are pushed in reverse so they pop left-to-right.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    if (IDENTIFIER_NODE_TYPES.has(current.type)) return current;
    const children = current.namedChildren;
    for (let i = children.length - 1; i >= 0; i -= 1) pending.push(children[i]);
  }
  return null;
}
2565
/**
 * Resolve the identifier node a text match landed on.
 *
 * Starts from the smallest node covering `[startIndex, endIndex]`. If that
 * node is not identifier-typed, it looks for an identifier leaf beginning
 * exactly at `startIndex` inside the node, then inside up to two ancestors.
 *
 * @param {object} tree - Parsed tree exposing `rootNode.descendantForIndex`.
 * @param {number} startIndex - Start offset of the match within the source.
 * @param {number} endIndex - End offset of the match within the source.
 * @returns {object|null} An identifier-typed node, or `null` when none is found.
 */
function resolveIdentifierNodeAt(tree, startIndex, endIndex) {
  let node;
  try {
    node = tree.rootNode.descendantForIndex(startIndex, endIndex);
  } catch {
    // A rejected lookup simply means there is no node to confirm.
    return null;
  }
  for (let depth = 0; node && !IDENTIFIER_NODE_TYPES.has(node.type) && depth < 3; depth += 1) {
    const leaf = firstIdentifierLeaf(node);
    if (leaf && leaf.startIndex === startIndex) return leaf;
    node = node.parent;
  }
  return node && IDENTIFIER_NODE_TYPES.has(node.type) ? node : null;
}

/**
 * Run the structural-confirmation pass over the top-N already-ranked
 * BM25F hits. Wall-clock-bounded — the budget is checked between files,
 * not mid-parse.
 *
 * Per-file failure modes (file too big, language unsupported, parse
 * error, I/O error) are silent apart from debug logging: the file's hits
 * are simply not confirmed. Only the wall-clock budget sets the
 * user-visible `fallback` message.
 *
 * @param {object} opts
 * @param {Array<{hit: object, index: number}>} opts.hitsRanked - Ranked hits with their original index.
 * @param {number} opts.topN - Maximum number of ranked hits to examine.
 * @param {number} opts.budgetMs - Wall-clock budget in milliseconds.
 * @param {string} opts.workspaceRoot - Directory that hit file paths are relative to.
 * @param {{aborted: boolean}} opts.signal - Abort signal; checked before each file.
 * @returns {Promise<{confirmedHitIndexes: Set<number>, fallback: string|null}>}
 *   Indexes of hits confirmed as defining sites, plus the budget notice (if any).
 */
async function runStructuralPass(opts) {
  const result = {
    confirmedHitIndexes: /* @__PURE__ */ new Set(),
    fallback: null
  };
  if (opts.hitsRanked.length === 0) return result;
  if (opts.signal.aborted) return result;
  const grammars = await getGrammarBundle().ready;
  if (grammars.size === 0) return result;

  // Group the top hits by file so each file is read and parsed at most once.
  const cap = Math.min(opts.hitsRanked.length, opts.topN);
  const byFile = /* @__PURE__ */ new Map();
  for (let i = 0; i < cap; i++) {
    const entry = opts.hitsRanked[i];
    const list = byFile.get(entry.hit.file);
    if (list) list.push(entry);
    else byFile.set(entry.hit.file, [entry]);
  }

  const t0 = Date.now();
  let filesParsed = 0;
  const parsersUsed = /* @__PURE__ */ new Map();
  try {
    for (const [relFile, entries] of byFile) {
      if (opts.signal.aborted) break;
      if (Date.now() - t0 >= opts.budgetMs) {
        // Report files against files: `filesParsed` counts parsed files, so the
        // denominator must be the number of candidate files, not the hit count.
        result.fallback = `structural budget exceeded after parsing ${filesParsed}/${byFile.size} files; retry with structural: "topN" or narrow your query`;
        break;
      }
      const langKey = getLanguageKeyForPath(relFile);
      if (!langKey) continue;
      const lang = grammars.get(langKey);
      if (!lang) continue;
      const absPath = path$1.join(opts.workspaceRoot, relFile);
      let mtimeMs;
      let size;
      try {
        const st = statSync(absPath);
        mtimeMs = st.mtimeMs;
        size = st.size;
      } catch (err) {
        consola.debug(`[code_search] structural skip ${relFile} (stat failed: ${err.message})`);
        continue;
      }
      if (size > STRUCTURAL_MAX_FILE_BYTES) {
        consola.debug(`[code_search] structural skip ${relFile} (${size} bytes > cap)`);
        continue;
      }
      let cached = cacheGet(absPath, mtimeMs);
      if (!cached) {
        let source;
        try {
          source = readFileSync(absPath, "utf8");
        } catch (err) {
          consola.debug(`[code_search] structural skip ${relFile} (read failed: ${err.message})`);
          // Cache the failure so the same mtime is not re-read on the next call.
          cachePut(absPath, {
            mtimeMs,
            tree: null,
            source: null
          });
          continue;
        }
        // One parser per language, reused across files and released in `finally`.
        let parser = parsersUsed.get(langKey);
        if (!parser) {
          parser = new Parser();
          parser.setLanguage(lang);
          parsersUsed.set(langKey, parser);
        }
        let tree = null;
        try {
          tree = parser.parse(source);
        } catch (err) {
          consola.debug(`[code_search] tree-sitter parse failed for ${relFile}: ${err.message}`);
        }
        cached = {
          mtimeMs,
          tree,
          source: tree ? source : null
        };
        cachePut(absPath, cached);
        filesParsed += 1;
      }
      if (!cached.tree || !cached.source) continue;
      for (const entry of entries) {
        const lineStart = lineStartByte(cached.source, entry.hit.line);
        if (lineStart < 0) continue;
        const node = resolveIdentifierNodeAt(
          cached.tree,
          lineStart + entry.hit.match_start,
          lineStart + entry.hit.match_end
        );
        if (!node) continue;
        if (isDefiningSite(node, langKey)) result.confirmedHitIndexes.add(entry.index);
      }
    }
  } finally {
    for (const parser of parsersUsed.values()) {
      try {
        parser.delete();
      } catch {
        // Best-effort release: a failing delete must not mask the pass result.
      }
    }
    parsersUsed.clear();
  }
  return result;
}
2694
/**
 * Split a hit into the four text fields that get tokenized for scoring.
 *
 * - `match_line`: the matched line verbatim.
 * - `context`: the before/after context lines joined with newlines.
 * - `file_path`: the file path with `/` and `\` turned into spaces.
 * - `symbol_context`: for AST-confirmed hits, the matched slice of the line
 *   (or the whole line when that slice is empty); otherwise the whole line
 *   when its left-trimmed text matches `SYMBOL_REGEX`, else the empty string.
 *
 * @param {object} hit - Search hit with `file`, `matched_line`, `match_start`,
 *   `match_end`, `context_before` and `context_after`.
 * @param {boolean} astConfirmed - Whether the structural pass confirmed this hit.
 * @returns {{match_line: string, context: string, file_path: string, symbol_context: string}}
 */
function extractFields(hit, astConfirmed) {
  const line = hit.matched_line;
  const pickSymbolContext = () => {
    if (astConfirmed) {
      const ident = line.slice(hit.match_start, hit.match_end);
      return ident || line;
    }
    return SYMBOL_REGEX.test(line.trimStart()) ? line : "";
  };
  return {
    match_line: line,
    context: hit.context_before.concat(hit.context_after).join("\n"),
    file_path: hit.file.replace(/[/\\]/g, " "),
    symbol_context: pickSymbolContext()
  };
}
2709
/**
 * BM25F score for the given hit set against the tokenized query.
 *
 *   BM25F(q, h) = Σ_t IDF(t) · w_t,h / (w_t,h + k1)
 *
 *   w_t,h  = Σ_field b_field · tf_t,field,h /
 *              ((1 − l_field) + l_field · len_field,h / avglen_field)
 *
 *   IDF(t) = log( 1 + (M − df(t) + 0.5) / (df(t) + 0.5) )
 *
 * The `1 +` inside the logarithm keeps IDF non-negative even when a term
 * occurs in every file.
 *
 * Corpus stats are derived from the hit set itself — there is no
 * persistent index. M = number of distinct files in the hit set; df(t) =
 * how many of those files contain token `t` in any field of any of their
 * hits; avglen_field = mean tokenized length of the field, sampled from
 * the first hit of each file.
 *
 * @param {Array<object>} hits - Search hits; each is passed through `extractFields`.
 * @param {string[]} queryTokens - Tokenized query; repeated tokens are scored once per occurrence.
 * @param {{has(index: number): boolean}} [astConfirmedHits] - Indexes (into `hits`)
 *   that the structural pass confirmed.
 * @returns {Array<{hit: object, score: number, field_contributions: object}>}
 *   One entry per hit, in input order; `score` is the sum of the per-field contributions.
 */
function bm25fScore(hits, queryTokens, astConfirmedHits) {
  const zeroContributions = () => ({
    match_line: 0,
    symbol_context: 0,
    file_path: 0,
    context: 0
  });
  if (hits.length === 0 || queryTokens.length === 0) {
    return hits.map((hit) => ({
      hit,
      score: 0,
      field_contributions: zeroContributions()
    }));
  }

  const perHitTokens = hits.map((hit, i) => {
    const fields = extractFields(hit, astConfirmedHits?.has(i) ?? false);
    return {
      match_line: tokenize(fields.match_line),
      context: tokenize(fields.context),
      file_path: tokenize(fields.file_path),
      symbol_context: tokenize(fields.symbol_context)
    };
  });

  // Per file: the union of tokens over every field of every hit (for df),
  // and the index of the file's first hit (for the avglen sample).
  const tokensByFile = /* @__PURE__ */ new Map();
  const firstHitOfFile = [];
  hits.forEach((hit, i) => {
    let bucket = tokensByFile.get(hit.file);
    if (!bucket) {
      bucket = /* @__PURE__ */ new Set();
      tokensByFile.set(hit.file, bucket);
      firstHitOfFile.push(i);
    }
    for (const fieldTokens of Object.values(perHitTokens[i])) {
      for (const tok of fieldTokens) bucket.add(tok);
    }
  });
  const M = tokensByFile.size;

  const avglen = {};
  for (const fname of ["match_line", "context", "file_path", "symbol_context"]) {
    let sum = 0;
    for (const i of firstHitOfFile) sum += perHitTokens[i][fname].length;
    const mean = sum / firstHitOfFile.length;
    // A field that is empty everywhere would otherwise divide by zero below.
    avglen[fname] = mean === 0 ? 1 : mean;
  }

  const idf = /* @__PURE__ */ new Map();
  for (const qt of queryTokens) {
    let d = 0;
    for (const bucket of tokensByFile.values()) if (bucket.has(qt)) d += 1;
    idf.set(qt, Math.log((M - d + .5) / (d + .5) + 1));
  }

  return hits.map((hit, i) => {
    const tokens = perHitTokens[i];
    const contributions = zeroContributions();
    for (const qt of queryTokens) {
      let w = 0;
      const perField = zeroContributions();
      for (const fname of Object.keys(FIELD_BOOSTS)) {
        let tf = 0;
        for (const tok of tokens[fname]) if (tok === qt) tf += 1;
        if (tf === 0) continue;
        const len = tokens[fname].length || 1;
        const l = FIELD_LEN_NORMS[fname];
        const norm = 1 - l + l * (len / (avglen[fname] || 1));
        const fieldContrib = FIELD_BOOSTS[fname] * (tf / norm);
        w += fieldContrib;
        perField[fname] = fieldContrib;
      }
      if (w === 0) continue;
      const termScore = (idf.get(qt) ?? 0) * (w / (w + BM25F_K1));
      // Attribute the saturated term score back to fields in proportion
      // to each field's share of the unsaturated weight.
      for (const fname of Object.keys(perField)) {
        contributions[fname] += termScore * (perField[fname] / w);
      }
    }
    const score = Object.values(contributions).reduce((a, b) => a + b, 0);
    return {
      hit,
      score,
      field_contributions: contributions
    };
  });
}
2842
/**
 * Sort scored hits and drop the tail that falls below the score "shoulder".
 *
 * `scored` is sorted IN PLACE: by score descending, then file name
 * ascending, then line ascending. When the best score is positive, the
 * first entry scoring below `topScore * SHOULDER_THRESHOLD` and everything
 * after it is cut. When the best score is zero or negative nothing is
 * pruned and the (sorted) input array itself is returned as `kept`.
 *
 * @param {Array<{score: number, hit: {file: string, line: number}}>} scored
 * @returns {{kept: Array, prunedBelowShoulder: number}} Surviving entries and the number cut.
 */
function shoulderPrune(scored) {
  if (scored.length === 0) {
    return { kept: [], prunedBelowShoulder: 0 };
  }
  const byScoreThenLocation = (left, right) => {
    if (left.score !== right.score) return right.score - left.score;
    const leftFile = left.hit.file;
    const rightFile = right.hit.file;
    if (leftFile === rightFile) return left.hit.line - right.hit.line;
    return leftFile < rightFile ? -1 : 1;
  };
  scored.sort(byScoreThenLocation);

  const best = scored[0].score;
  if (best <= 0) {
    return { kept: scored, prunedBelowShoulder: 0 };
  }
  const floor = best * SHOULDER_THRESHOLD;
  const firstBelow = scored.findIndex((entry) => entry.score < floor);
  const keepCount = firstBelow === -1 ? scored.length : firstBelow;
  return {
    kept: scored.slice(0, keepCount),
    prunedBelowShoulder: scored.length - keepCount
  };
}
2868
/**
 * Render a hit as a display snippet: context-before lines, the matched
 * line, then context-after lines, joined with newlines.
 *
 * Snippets larger than `MAX_SNIPPET_BYTES` (UTF-8) keep their head and tail
 * with a truncation marker in between. The marker's real byte length is
 * reserved so the result never exceeds the cap, and cut points are moved
 * onto UTF-8 character boundaries so no character is split into U+FFFD.
 *
 * @param {{context_before: string[], matched_line: string, context_after: string[]}} hit
 * @returns {string} The snippet, truncated in the middle when it exceeds the cap.
 */
function renderSnippet(hit) {
  const snippet = [
    ...hit.context_before,
    hit.matched_line,
    ...hit.context_after
  ].join("\n");
  const buf = Buffer.from(snippet, "utf8");
  if (buf.length <= MAX_SNIPPET_BYTES) return snippet;

  const marker = "\n... [truncated] ...\n";
  const halfCap = Math.max(0, Math.floor((MAX_SNIPPET_BYTES - Buffer.byteLength(marker, "utf8")) / 2));
  // UTF-8 continuation bytes look like 0b10xxxxxx. If a cut lands on one, the
  // head shrinks and the tail starts later, so both sides stay within halfCap.
  const isContinuation = (byte) => (byte & 192) === 128;
  let headEnd = halfCap;
  while (headEnd > 0 && isContinuation(buf[headEnd])) headEnd -= 1;
  let tailStart = buf.length - halfCap;
  while (tailStart < buf.length && isContinuation(buf[tailStart])) tailStart += 1;
  return buf.toString("utf8", 0, headEnd) + marker + buf.toString("utf8", tailStart);
}
2881
/**
 * Search a workspace with ripgrep and return formatted hits.
 *
 * Flow: validate input and workspace, spawn ripgrep, stream-parse its JSON
 * output under a wall-clock timer and the optional external abort signal,
 * then either rank the hits (mode "ranked": BM25F, structural pass, BM25F
 * again with the confirmed hits, shoulder pruning, `limit`) or return them
 * in the order ripgrep produced them.
 *
 * @param {object} rawInput - `query`, `workspace`, and optional `mode`
 *   ("ranked" by default), `structural` ("full" by default), `limit`,
 *   `context_lines`, `file_glob`.
 * @param {AbortSignal} [externalSignal] - Caller-side cancellation.
 * @returns {Promise<object>} `{ results, truncated, pruned_below_shoulder,
 *   scanned_files, elapsed_ms, ranking, notice }`.
 * @throws {Error} On invalid input or workspace, when ripgrep cannot be
 *   resolved or spawned, when the search is aborted before any hit was
 *   collected, or when ripgrep exits with an error code and no hits.
 */
async function searchCode(rawInput, externalSignal) {
  const t0 = Date.now();
  const inputErr = validateInputs(rawInput);
  if (inputErr) throw new Error(inputErr);
  const ws = validateWorkspace(rawInput.workspace);
  if (!ws.ok || !ws.canonical) throw new Error(ws.error ?? "workspace validation failed");

  const mode = rawInput.mode ?? "ranked";
  const structuralMode = rawInput.structural ?? "full";
  const limit = rawInput.limit ?? DEFAULT_LIMIT;
  const contextLines = Math.min(rawInput.context_lines ?? DEFAULT_CONTEXT_LINES, MAX_CONTEXT_LINES);
  const expansion = mode === "regex" ? null : expandIdentifierVariants(rawInput.query);
  const expansionPattern = expansion ? buildExpansionPattern(expansion) : void 0;

  const ac = new AbortController();
  const onExternal = () => ac.abort("external");
  if (externalSignal) {
    if (externalSignal.aborted) ac.abort("external");
    else externalSignal.addEventListener("abort", onExternal, { once: true });
  }
  const wallTimer = setTimeout(() => ac.abort("timeout"), WALL_TIME_MS);
  wallTimer.unref();

  const STDERR_TEXT_CAP = 64 * 1024;
  const STDERR_ABORT_BYTES = 1024 * 1024;
  let stderrBytes = 0;
  let stderrText = "";
  let exitCode = null;
  let exitPromise = Promise.resolve();
  let spawnFailure = null;
  let parseResult;
  let rgResolution;
  let child;
  try {
    rgResolution = resolveRipgrep();
    const args = buildRgArgs({
      mode,
      fileGlob: rawInput.file_glob,
      contextLines,
      query: rawInput.query,
      expansionPattern
    });
    try {
      child = spawn(rgResolution.rgPath, args, {
        cwd: ws.canonical,
        shell: false,
        stdio: [
          "ignore",
          "pipe",
          "pipe"
        ]
      });
    } catch (err) {
      throw new Error(`failed to spawn ripgrep: ${err.message}`);
    }
    // Spawn failures such as ENOENT arrive asynchronously as an 'error' event,
    // not as a throw from spawn(); an unhandled 'error' event would crash the
    // process. A missing pid distinguishes a failed spawn from later errors
    // (e.g. a failed kill), which must not fail an otherwise good search.
    child.on("error", (err) => {
      if (child.pid === void 0) spawnFailure = err;
    });
    if (child.stderr) {
      child.stderr.setEncoding("utf8");
      child.stderr.on("data", (chunk) => {
        // `chunk` is a decoded string here; measure real bytes, not UTF-16 units.
        stderrBytes += Buffer.byteLength(chunk, "utf8");
        if (stderrText.length < STDERR_TEXT_CAP) stderrText = (stderrText + chunk).slice(0, STDERR_TEXT_CAP);
        if (stderrBytes > STDERR_ABORT_BYTES) ac.abort("stderr_cap");
      });
    }
    exitPromise = new Promise((resolve) => {
      child.on("close", (code) => {
        exitCode = code;
        resolve();
      });
    });
    parseResult = await parseRgJsonStream(child, {
      limit,
      contextLines,
      signal: ac.signal
    });
  } finally {
    // Single cleanup point: runs on success and on every failure path above,
    // including failures before the child process exists.
    clearTimeout(wallTimer);
    if (externalSignal) externalSignal.removeEventListener("abort", onExternal);
    if (child && !child.killed) killChild(child);
  }

  if (spawnFailure) throw new Error(`failed to spawn ripgrep: ${spawnFailure.message}`);
  if (ac.signal.aborted && parseResult.hits.length === 0) {
    const reason = String(ac.signal.reason ?? "aborted");
    throw new Error(`code_search aborted (${reason})`);
  }
  if (!ac.signal.aborted) await exitPromise;
  // Exit codes 0 and 1 are not treated as errors; other codes only fail the
  // call when nothing usable was collected.
  if (exitCode !== null && exitCode !== 0 && exitCode !== 1 && !ac.signal.aborted && parseResult.hits.length === 0) {
    const trimmed = stderrText.trim();
    const detail = trimmed.length > 0 ? trimmed.replace(/^rg:\s*/i, "").slice(0, 600) : `ripgrep exited with code ${exitCode}`;
    throw new Error(`code_search: ${detail}`);
  }

  let kept;
  let prunedBelowShoulder;
  let notice = null;
  if (mode === "ranked") {
    const queryTokens = tokenize(rawInput.query);
    // Pass 1: score without AST confirmation to choose the structural candidates.
    const pass1 = bm25fScore(parseResult.hits, queryTokens);
    pass1.sort((a, b) => b.score - a.score);
    const topN = structuralMode === "topN" ? STRUCTURAL_TOPN_FAST : STRUCTURAL_TOPN_FULL;
    const indexByHit = new Map(parseResult.hits.map((hit, i) => [hit, i]));
    const structural = await runStructuralPass({
      hitsRanked: pass1.slice(0, Math.min(topN, pass1.length)).map((sh) => ({
        hit: sh.hit,
        index: indexByHit.get(sh.hit) ?? -1
      })).filter((e) => e.index >= 0),
      workspaceRoot: ws.canonical,
      topN,
      budgetMs: STRUCTURAL_BUDGET_MS,
      signal: ac.signal
    });
    notice = structural.fallback;
    // Pass 2: re-score with the confirmed hits, then prune and cap.
    const pruned = shoulderPrune(bm25fScore(parseResult.hits, queryTokens, structural.confirmedHitIndexes));
    kept = pruned.kept.slice(0, limit);
    prunedBelowShoulder = pruned.prunedBelowShoulder;
  } else {
    kept = parseResult.hits.map((h) => ({
      hit: h,
      score: 0,
      field_contributions: {}
    }));
  }

  const results = kept.map((sh) => {
    // Normalize to a forward-slash path without a leading "./".
    let file = sh.hit.file;
    if (file.startsWith("./") || file.startsWith(".\\")) file = file.slice(2);
    file = file.replace(/\\/g, "/");
    const baseHit = {
      file,
      line: sh.hit.line,
      snippet: renderSnippet(sh.hit),
      match_byte_range: [sh.hit.match_start, sh.hit.match_end]
    };
    if (mode === "ranked") {
      baseHit.score = round4(sh.score);
      baseHit.field_contributions = {
        match_line: round4(sh.field_contributions.match_line ?? 0),
        symbol_context: round4(sh.field_contributions.symbol_context ?? 0),
        file_path: round4(sh.field_contributions.file_path ?? 0),
        context: round4(sh.field_contributions.context ?? 0)
      };
    } else {
      baseHit.field_contributions = null;
    }
    return baseHit;
  });

  const elapsed_ms = Date.now() - t0;
  // Query and workspace are only logged when explicitly opted in.
  const debugLog = process.env.GH_ROUTER_DEBUG_CODE_SEARCH === "1";
  consola.info(`[code_search] mode=${mode} structural=${structuralMode} expansion=${expansion ? expansion.length : 0} results=${results.length} truncated=${parseResult.truncated} scanned_files=${parseResult.scannedFiles} elapsed_ms=${elapsed_ms} abort=${parseResult.cancelled} rg=${rgResolution.source} notice=${notice ? "yes" : "no"}` + (debugLog ? ` query="${rawInput.query}" workspace="${ws.canonical}"` : ""));
  return {
    results,
    truncated: parseResult.truncated,
    pruned_below_shoulder: mode === "ranked" ? prunedBelowShoulder : void 0,
    scanned_files: parseResult.scannedFiles,
    elapsed_ms,
    ranking: mode === "ranked" ? {
      algorithm: "BM25F",
      citation: "Robertson, Zaragoza, Taylor 2004",
      k1: BM25F_K1
    } : { algorithm: "ripgrep_document_order" },
    notice
  };
}
3037
/**
 * Round a number to four decimal places.
 *
 * @param {number} x - Value to round.
 * @returns {number} `x` scaled by 10^4, rounded with `Math.round`, and scaled back.
 */
function round4(x) {
  const scale = 1e4;
  const scaled = Math.round(x * scale);
  return scaled / scale;
}
3040
+
1789
3041
  //#endregion
1790
3042
  //#region src/services/copilot/web-search.ts
1791
3043
  const RpcSchema = z.object({
@@ -2263,7 +3515,7 @@ function buildPeerAwarenessSnippet(opts) {
2263
3515
  return [
2264
3516
  "## Peer review and advisor",
2265
3517
  "",
2266
- `Cross-lab peer critics under \`mcp__gh-router-peers__*\` — ${criticList.join(", ")} — plus the \`peer-review-coordinator\` fan-out subagent, and Claude Code's built-in \`advisor\` tool, are available at your discretion for second opinions and adversarial review. Subagents you spawn inherit them.${codexCliClause}`
3518
+ `Cross-lab peer critics under \`mcp__gh-router-peers__*\` — ${criticList.join(", ")} — plus the \`peer-review-coordinator\` fan-out subagent, and Claude Code's built-in \`advisor\` tool, are available at your discretion for second opinions and adversarial review. Subagents you spawn inherit them.${codexCliClause} Also \`mcp__gh-router-peers__code_search\` for accurate ranked code discovery (BM25F + tree-sitter) — prefer it over \`Grep\` when finding definitions or call sites.`
2267
3519
  ].join("\n");
2268
3520
  }
2269
3521
  /** Convenience: every persona that should be registered for the given mode. */
@@ -2328,6 +3580,94 @@ const NON_PERSONA_MCP_TOOLS = Object.freeze([{
2328
3580
  };
2329
3581
  }
2330
3582
  }
3583
+ }, {
3584
+ toolNameHttp: "code_search",
3585
+ description: "Fast structured code search over a local workspace. Returns ranked, deduplicated hits with snippets. Ranks with BM25F across matched-line / file-path / surrounding-context / symbol-context fields, then refines `symbol-context` with tree-sitter AST analysis on the top hits so identifier definitions outrank incidental string matches. Prefer this over Grep/Bash+grep for ranked discovery (\"where is X defined\", \"which files reference Y\", \"find code that does Z\") — ranked mode surfaces the few right answers instead of every match. Use Grep for exact-pattern enumeration when you need every hit unranked, and Glob for file-name patterns (no content match). `workspace` is any absolute path the proxy process can read — typically the project root or a sub-tree you're working in.",
3586
+ inputSchema: {
3587
+ type: "object",
3588
+ required: ["query", "workspace"],
3589
+ additionalProperties: false,
3590
+ properties: {
3591
+ query: {
3592
+ type: "string",
3593
+ description: "Search text. In 'ranked' (default) and 'literal' modes, interpreted as a literal string. In 'regex' mode, interpreted as a PCRE2 regex. In 'ranked' and 'literal' modes, single-identifier queries are auto-expanded across camelCase / snake_case / kebab-case / SCREAMING_SNAKE skeletons so `getUserName` also matches `get_user_name`."
3594
+ },
3595
+ workspace: {
3596
+ type: "string",
3597
+ description: "Absolute path to the project root (or sub-tree) to search."
3598
+ },
3599
+ mode: {
3600
+ type: "string",
3601
+ enum: [
3602
+ "ranked",
3603
+ "literal",
3604
+ "regex"
3605
+ ],
3606
+ description: "Ranking mode. 'ranked' (default): BM25F + tree-sitter structural boost; results ordered by score with shoulder pruning (drops results below 50% of the top score). 'literal': fixed-string search, ripgrep document order. 'regex': PCRE2 search, ripgrep document order."
3607
+ },
3608
+ file_glob: {
3609
+ type: "string",
3610
+ description: "Optional ripgrep glob filter (e.g. 'src/**/*.ts')."
3611
+ },
3612
+ limit: {
3613
+ type: "number",
3614
+ description: "Max hits to return (default 20)."
3615
+ },
3616
+ structural: {
3617
+ type: "string",
3618
+ enum: ["full", "topN"],
3619
+ description: "Structural-ranking depth (ranked mode only). 'full' (default) runs tree-sitter on the top 50 BM25F hits — best signal, fine for typical repos. 'topN' restricts to the top 10 for tighter latency on very large workspaces. Both modes share a 200ms wall-clock budget; on budget exhaustion the response includes `notice` and remaining hits fall back to the regex symbol heuristic."
3620
+ }
3621
+ }
3622
+ },
3623
+ async handler(args, signal) {
3624
+ try {
3625
+ const result = await searchCode({
3626
+ query: typeof args.query === "string" ? args.query : "",
3627
+ workspace: typeof args.workspace === "string" ? args.workspace : "",
3628
+ mode: args.mode === "literal" || args.mode === "regex" || args.mode === "ranked" ? args.mode : void 0,
3629
+ file_glob: typeof args.file_glob === "string" ? args.file_glob : void 0,
3630
+ limit: typeof args.limit === "number" ? args.limit : void 0,
3631
+ structural: args.structural === "full" || args.structural === "topN" ? args.structural : void 0
3632
+ }, signal);
3633
+ const SIZE_CAP_BYTES = 256 * 1024;
3634
+ const trimmedHits = [];
3635
+ let totalBytes = 0;
3636
+ let sizeCapped = false;
3637
+ for (const hit of result.results) {
3638
+ const next = {
3639
+ file: hit.file,
3640
+ line: hit.line,
3641
+ snippet: hit.snippet
3642
+ };
3643
+ const nextBytes = Buffer.byteLength(JSON.stringify(next), "utf8");
3644
+ if (trimmedHits.length > 0 && totalBytes + nextBytes > SIZE_CAP_BYTES) {
3645
+ sizeCapped = true;
3646
+ break;
3647
+ }
3648
+ trimmedHits.push(next);
3649
+ totalBytes += nextBytes;
3650
+ }
3651
+ const minimal = {
3652
+ results: trimmedHits,
3653
+ truncated: result.truncated || sizeCapped
3654
+ };
3655
+ if (sizeCapped) minimal.notice = `response size limit reached at ${trimmedHits.length} hits (~${Math.round(totalBytes / 1024)}KB); narrow your query or lower 'limit' to get all relevant matches`;
3656
+ else if (typeof result.notice === "string") minimal.notice = result.notice;
3657
+ return { content: [{
3658
+ type: "text",
3659
+ text: JSON.stringify(minimal)
3660
+ }] };
3661
+ } catch (err) {
3662
+ return {
3663
+ content: [{
3664
+ type: "text",
3665
+ text: `code_search failed: ${err instanceof Error ? err.message : String(err)}`
3666
+ }],
3667
+ isError: true
3668
+ };
3669
+ }
3670
+ }
2331
3671
  }]);
2332
3672
 
2333
3673
  //#endregion
@@ -2835,8 +4175,8 @@ const ENDPOINT_ALIASES = {
2835
4175
  * - the model has no `supported_endpoints` field (backward-compat)
2836
4176
  * - the endpoint is listed in `supported_endpoints`
2837
4177
  */
2838
- function modelSupportsEndpoint(modelId, path$1) {
2839
- const endpoint = ENDPOINT_ALIASES[path$1] ?? path$1;
4178
+ function modelSupportsEndpoint(modelId, path$2) {
4179
+ const endpoint = ENDPOINT_ALIASES[path$2] ?? path$2;
2840
4180
  const model = state.models?.data.find((m) => m.id === modelId);
2841
4181
  if (!model) return true;
2842
4182
  const supported = model.supported_endpoints;
@@ -2847,17 +4187,17 @@ function modelSupportsEndpoint(modelId, path$1) {
2847
4187
  * Log an error when a model is used on an endpoint it doesn't support.
2848
4188
  * Returns `true` if a mismatch was detected (for testing).
2849
4189
  */
2850
function logEndpointMismatch(modelId, path$2) {
  const isSupported = modelSupportsEndpoint(modelId, path$2);
  if (isSupported) return false;
  // Look the model up again only to list its endpoints in the error message.
  const entry = state.models?.data.find((candidate) => candidate.id === modelId);
  const endpoints = entry?.supported_endpoints ?? [];
  consola.error(`Model "${modelId}" does not support ${path$2}. Supported endpoints: ${endpoints.join(", ")}`);
  return true;
}
2856
4196
  /**
2857
4197
  * Return model IDs that support the given endpoint.
2858
4198
  */
2859
- function listModelsForEndpoint(path$1) {
2860
- const endpoint = ENDPOINT_ALIASES[path$1] ?? path$1;
4199
+ function listModelsForEndpoint(path$2) {
4200
+ const endpoint = ENDPOINT_ALIASES[path$2] ?? path$2;
2861
4201
  return (state.models?.data ?? []).filter((m) => {
2862
4202
  const supported = m.supported_endpoints;
2863
4203
  if (!supported || supported.length === 0) return true;
@@ -2914,7 +4254,7 @@ function initProxyFromEnv() {
2914
4254
  //#endregion
2915
4255
  //#region package.json
2916
4256
  var name = "github-router";
2917
- var version = "0.3.27";
4257
+ var version = "0.3.28";
2918
4258
 
2919
4259
  //#endregion
2920
4260
  //#region src/lib/approval.ts