github-router 0.3.27 → 0.3.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -0
- package/dist/main.js +1350 -10
- package/dist/main.js.map +1 -1
- package/package.json +7 -1
package/dist/main.js
CHANGED
|
@@ -1,16 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import { createRequire } from "node:module";
|
|
2
3
|
import { defineCommand, runMain } from "citty";
|
|
3
4
|
import consola from "consola";
|
|
4
5
|
import { randomBytes, randomUUID, timingSafeEqual } from "node:crypto";
|
|
5
6
|
import fs from "node:fs/promises";
|
|
6
7
|
import os from "node:os";
|
|
8
|
+
import * as path$1 from "node:path";
|
|
7
9
|
import path from "node:path";
|
|
8
10
|
import process$1 from "node:process";
|
|
9
11
|
import { execFile, execFileSync, spawn } from "node:child_process";
|
|
10
12
|
import { promisify } from "node:util";
|
|
13
|
+
import fs$1, { existsSync, readFileSync, realpathSync, statSync } from "node:fs";
|
|
14
|
+
import { createInterface } from "node:readline";
|
|
15
|
+
import Parser from "web-tree-sitter";
|
|
11
16
|
import { events } from "fetch-event-stream";
|
|
12
17
|
import { z } from "zod";
|
|
13
|
-
import fs$1 from "node:fs";
|
|
14
18
|
import { Writable } from "node:stream";
|
|
15
19
|
import { serve } from "srvx";
|
|
16
20
|
import { getProxyForUrl } from "proxy-from-env";
|
|
@@ -19,6 +23,10 @@ import { Hono } from "hono";
|
|
|
19
23
|
import { cors } from "hono/cors";
|
|
20
24
|
import clipboard from "clipboardy";
|
|
21
25
|
|
|
26
|
+
//#region rolldown:runtime
|
|
27
|
+
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
28
|
+
|
|
29
|
+
//#endregion
|
|
22
30
|
//#region src/lib/paths.ts
|
|
23
31
|
function appDir() {
|
|
24
32
|
return path.join(os.homedir(), ".local", "share", "github-router");
|
|
@@ -1786,6 +1794,1250 @@ function launchChild(target, server$1, options = {}) {
|
|
|
1786
1794
|
});
|
|
1787
1795
|
}
|
|
1788
1796
|
|
|
1797
|
+
//#endregion
|
|
1798
|
+
//#region src/lib/code-search.ts
|
|
1799
|
+
/**
 * `k1`, the BM25 term-frequency saturation parameter. 1.2 is the value
 * Lucene and Elasticsearch ship, and it sits inside the 1.2-2.0 range
 * recommended by the Robertson & Zaragoza 2009 monograph.
 */
const BM25F_K1 = 1.2;
/**
 * BM25F per-field boost weights. Ordering follows Sourcegraph Zoekt's
 * published signal priorities: the matched line outranks symbol context,
 * which outranks the file path, which outranks surrounding context lines.
 */
const FIELD_BOOSTS = {
  match_line: 3,
  symbol_context: 2.5,
  file_path: 2,
  context: 1
};
/**
 * BM25F per-field length-normalization parameters. 0 switches length
 * normalization off (used for the short, uniform fields); 0.75 is
 * Lucene's default `b` and is applied to the prose-like `context` field.
 */
const FIELD_LEN_NORMS = {
  match_line: 0,
  symbol_context: 0,
  file_path: 0,
  context: 0.75
};
/**
 * Shoulder cut: results scoring below this fraction of the top score are
 * dropped. 0.5 follows the learning-to-rank convention (Burges 2010) and
 * is deliberately defined in exactly one place.
 */
const SHOULDER_THRESHOLD = 0.5;
/** Input-size limits and defaults for a single code_search request. */
const MAX_QUERY_LEN = 1024;
const MAX_GLOB_LEN = 512;
const DEFAULT_LIMIT = 20;
const MAX_CONTEXT_LINES = 10;
const DEFAULT_CONTEXT_LINES = 2;
const MAX_SNIPPET_BYTES = 2048;
/** Upper bound on ripgrep output consumed per search (10 MiB). */
const MAX_STDOUT_BYTES = 10 * 2 ** 20;
/** Wall-clock ceiling for one search, in milliseconds (30 s). */
const WALL_TIME_MS = 30_000;
/**
 * Structural (tree-sitter) pass settings. The wall-clock budget is checked
 * between files, not mid-parse, because the web-tree-sitter binding in use
 * exposes no usable cancel hook; one pathological file can therefore
 * overrun the budget by its own parse time. A typical source file parses
 * in well under 50ms, so 200ms leaves headroom for roughly 5-10 files even
 * on a cold cache.
 */
const STRUCTURAL_BUDGET_MS = 200;
const STRUCTURAL_TOPN_FULL = 50;
const STRUCTURAL_TOPN_FAST = 10;
/**
 * Largest file (1 MiB) the structural pass will parse. Anything bigger is
 * almost always generated code or a vendored bundle whose AST adds nothing
 * to ranking real definitions.
 */
const STRUCTURAL_MAX_FILE_BYTES = 2 ** 20;
/**
 * Capacity of the parsed-tree LRU cache. Each cached tree pins roughly the
 * size of its source plus tree-sitter's node arena; 64 entries stays
 * comfortably within typical Node heap budgets. Evicted trees are
 * `.delete()`-ed eagerly.
 */
const STRUCTURAL_CACHE_MAX = 64;
/**
 * Regex heuristic for "this line is a definition", applied to the matched
 * line after its leading whitespace is stripped. It is the fallback for the
 * `symbol_context` field whenever tree-sitter cannot answer: unsupported
 * language, grammar load failure, parse error, a file outside the
 * structural pass's top-N slice, or an exhausted structural budget.
 */
const SYMBOL_REGEX = /^(?:export\s+)?(?:default\s+)?(?:async\s+)?(?:public\s+|private\s+|protected\s+|static\s+|abstract\s+|readonly\s+)*(?:function|class|interface|type|enum|def|fn|trait|impl|module|namespace|const|let|var)\s+[A-Za-z_$]/;
|
|
1877
|
+
/** Memoized result of `resolveRipgrep()`; stays undefined until the first successful resolution. */
let _rgResolution;
/**
 * Locate a ripgrep binary, caching the answer for the life of the process.
 * Mirrors cc-backup `src/utils/ripgrep.ts:31-65`.
 *
 * Resolution order:
 *   1. A system `rg` on PATH. The bare command name `"rg"` is returned
 *      rather than an absolute path so that, on Windows,
 *      NoDefaultCurrentDirectoryInExePath applies and a malicious
 *      `./rg.exe` in the proxy's cwd cannot hijack the lookup.
 *   2. The per-platform binary shipped by the optional `@vscode/ripgrep`
 *      dependency, if the module loads and its `rgPath` exists on disk.
 *   3. Otherwise throw; the caller surfaces this as an MCP isError response.
 *
 * @returns {{ rgPath: string, source: "system" | "bundled" }}
 * @throws {Error} when neither a system nor a bundled ripgrep is available.
 */
function resolveRipgrep() {
  if (_rgResolution) return _rgResolution;
  if (hasSystemRipgrep()) {
    _rgResolution = { rgPath: "rg", source: "system" };
    return _rgResolution;
  }
  let bundledPath;
  try {
    // The require itself may throw when the optional dependency is absent.
    const candidate = __require("@vscode/ripgrep").rgPath;
    if (candidate && existsSync(candidate)) bundledPath = candidate;
  } catch {
    // Deliberate best-effort: fall through to the "not found" error below.
  }
  if (bundledPath) {
    _rgResolution = { rgPath: bundledPath, source: "bundled" };
    return _rgResolution;
  }
  throw new Error("ripgrep not found. Either install rg system-wide (brew/apt/winget) or reinstall the proxy so @vscode/ripgrep's per-platform binary is fetched. See README's code_search section.");
}
|
|
1911
|
+
/**
 * Report whether an `rg` executable is discoverable on PATH.
 *
 * Runs `where rg` on Windows and `which rg` elsewhere with a one-second
 * timeout, capturing stdout only. Any failure (non-zero exit, missing
 * locator tool, timeout) is treated as "not installed".
 *
 * @returns {boolean} true when the locator printed at least one byte.
 */
function hasSystemRipgrep() {
  const locator = process.platform === "win32" ? "where" : "which";
  const spawnOptions = {
    stdio: ["ignore", "pipe", "ignore"],
    timeout: 1e3
  };
  try {
    const located = execFileSync(locator, ["rg"], spawnOptions);
    return located.length > 0;
  } catch {
    return false;
  }
}
|
|
1925
|
+
/**
 * Validate the raw arguments of a code_search call.
 *
 * Checks run in a fixed order and the first failure wins: query, workspace
 * presence, mode, file_glob, limit, context_lines. Workspace *path*
 * validity is not checked here, only that it is a non-empty string.
 *
 * @param {object} input - raw tool arguments (`query`, `workspace`, and the
 *   optional `mode`, `file_glob`, `limit`, `context_lines`).
 * @returns {string | null} a human-readable error message, or null when
 *   every check passes.
 */
function validateInputs(input) {
  const {
    query,
    workspace,
    mode,
    file_glob: fileGlob,
    limit,
    context_lines: contextLines
  } = input;
  // NUL / CR / LF are rejected in both the query and the glob.
  const hasForbiddenChar = (value) => /[\0\r\n]/.test(value);

  if (typeof query !== "string" || query.length === 0) return "code_search: arguments.query is required (non-empty string)";
  if (query.length > MAX_QUERY_LEN) return `code_search: query exceeds ${MAX_QUERY_LEN} chars`;
  if (hasForbiddenChar(query)) return "code_search: query contains null byte or newline (rejected)";

  if (typeof workspace !== "string" || workspace.length === 0) return "code_search: arguments.workspace is required (absolute path)";

  const modeIsKnown = mode === "ranked" || mode === "literal" || mode === "regex";
  if (mode && !modeIsKnown) return `code_search: mode must be one of "ranked", "literal", "regex"`;

  if (fileGlob !== void 0) {
    if (typeof fileGlob !== "string") return "code_search: file_glob must be a string";
    if (fileGlob.length > MAX_GLOB_LEN) return `code_search: file_glob exceeds ${MAX_GLOB_LEN} chars`;
    if (hasForbiddenChar(fileGlob)) return "code_search: file_glob contains null byte or newline";
  }

  // Number.isInteger is false for every non-number, so it subsumes the typeof check.
  if (limit !== void 0 && !(Number.isInteger(limit) && limit >= 1)) return "code_search: limit must be a positive integer";
  if (contextLines !== void 0 && !(Number.isInteger(contextLines) && contextLines >= 0)) return "code_search: context_lines must be a non-negative integer";

  return null;
}
|
|
1948
|
+
/**
 * Validate and canonicalize the `workspace` argument.
 *
 * The proxy runs as the user, so any directory the process can stat is an
 * acceptable workspace; that matches what Claude Code's Read / Bash tools
 * can already reach. An earlier allow-set and secret-file denylist were
 * removed because the model has filesystem access through its other tools
 * anyway, which made them inconsistent guardrails.
 *
 * What is still enforced:
 *   - the path is absolute (relative paths signal an integration error);
 *   - the path resolves via realpath (symlinks are followed; output paths
 *     are reported relative to the canonical result);
 *   - the resolved path exists and is a directory.
 *
 * Error messages never echo the rejected path, because code_search output
 * flows upstream to the model provider (same policy as the
 * COPILOT_HOST_ALLOWLIST handling in `src/lib/utils.ts`).
 *
 * @param {string} workspace - candidate workspace path.
 * @returns {{ ok: true, canonical: string } | { ok: false, error: string }}
 */
function validateWorkspace(workspace) {
  const reject = (error) => ({ ok: false, error });

  if (!path$1.isAbsolute(workspace)) return reject("workspace must be an absolute path");

  let canonical;
  let isDirectory;
  try {
    canonical = realpathSync(workspace);
    isDirectory = statSync(canonical).isDirectory();
  } catch {
    // Both a failed realpath and a failed stat report the same opaque error.
    return reject("workspace path is not accessible");
  }

  if (!isDirectory) return reject("workspace must be a directory");
  return { ok: true, canonical };
}
|
|
1997
|
+
/**
 * Rule-based identifier tokenizer (per the ESEC/FSE 2021 benchmark).
 *
 * Pipeline:
 *   1. Cut the text at every run of non-alphanumeric characters.
 *   2. Inside each chunk, cut at case boundaries, keeping acronym runs
 *      together: `HTTPSConnection` -> `HTTPS`, `Connection`.
 *   3. Keep a trailing digit run glued to the letters before it:
 *      `parseV2Handler` -> `parse`, `V2`, `Handler`.
 *   4. Lowercase every token.
 *   5. Discard tokens shorter than two characters.
 *
 * Only ASCII letters and digits are recognized; identifiers written in
 * other scripts (Cyrillic, CJK, ...) produce no tokens. That is accepted
 * MVP scope.
 *
 * @param {string} text
 * @returns {string[]} lowercase tokens in source order.
 */
function tokenize(text) {
  const wordPattern = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+[0-9]*|[A-Z]+[0-9]*|[0-9]+/g;
  return text
    .split(/[^A-Za-z0-9]+/)
    .flatMap((chunk) => chunk.match(wordPattern) ?? [])
    .map((word) => word.toLowerCase())
    .filter((word) => word.length >= 2);
}
|
|
2026
|
+
/**
 * Terminate a spawned child process in a platform-appropriate way.
 *
 * Unix: send SIGTERM, then escalate to SIGKILL if the child has not exited
 * once the grace period elapses. Windows: run `taskkill /T /F`, because
 * `child.kill()` does not reliably terminate descendants there and a long
 * search with worker threads would leak rg.exe processes.
 *
 * Whether the child has exited is judged by `exitCode` / `signalCode`, not
 * by `child.killed`: `killed` only records that a signal was *sent*, so it
 * is already true right after the SIGTERM and can never gate the SIGKILL.
 *
 * All failures are swallowed on purpose; this is best-effort cleanup.
 *
 * @param {import("node:child_process").ChildProcess} child - process to stop.
 * @param {number} [graceMs=500] - delay before escalating to SIGKILL (Unix only).
 * @returns {void}
 */
function killChild(child, graceMs = 500) {
  const hasExited = () => child.exitCode != null || child.signalCode != null;
  // Nothing to do: never spawned, already signalled by us, or already gone.
  if (!child.pid || child.killed || hasExited()) return;
  if (process.platform === "win32") {
    try {
      execFile("taskkill", [
        "/T",
        "/F",
        "/PID",
        String(child.pid)
      ], () => {});
    } catch {}
    return;
  }
  try {
    child.kill("SIGTERM");
  } catch {}
  // unref() so a pending escalation never keeps the event loop alive.
  setTimeout(() => {
    if (hasExited()) return;
    try {
      child.kill("SIGKILL");
    } catch {}
  }, graceMs).unref();
}
|
|
2054
|
+
/**
 * Matches a query that looks like exactly one identifier: an ASCII letter
 * followed by up to 127 ASCII letters, digits, `_` or `-` (128 characters
 * at most). Only such queries are expanded into naming-convention
 * variants; whitespace, regex metacharacters or other punctuation make the
 * test fail and the query goes to ripgrep unchanged. Note that a leading
 * `_` or a `$` anywhere also fails the test. ASCII-only on purpose, to
 * match the tokenizer's scope (Unicode identifiers are out of MVP scope).
 */
const SINGLE_IDENTIFIER_REGEX = /^[A-Za-z][A-Za-z0-9_-]{0,127}$/;
|
|
2062
|
+
/**
 * Break an identifier into its word-pieces, preserving each piece's
 * original casing and source order. Handles:
 *
 *   - snake_case and kebab-case (cut at `_` and `-`)
 *   - camelCase / PascalCase (cut at lower-to-upper boundaries)
 *   - acronym runs (`HTTPSConnection` -> `HTTPS`, `Connection`)
 *   - digits trailing a word stay attached (`parseV2` -> `parse`, `V2`)
 *
 * Callers re-case the pieces to compose other naming-convention skeletons.
 *
 * @param {string} identifier
 * @returns {string[]} the pieces; empty when nothing alphanumeric is present.
 */
function splitIdentifierPieces(identifier) {
  const piecePattern = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+[0-9]*|[A-Z]+[0-9]*|[0-9]+/g;
  return identifier
    .split(/[-_]/)
    .flatMap((segment) => segment.match(piecePattern) ?? []);
}
|
|
2084
|
+
/**
 * Expand a single-identifier query into its naming-convention variants.
 *
 * Returns `null` when the query is not a lone identifier, or when it splits
 * into fewer than two pieces (nothing to vary); the caller then falls back
 * to the plain literal search.
 *
 * For a query such as `getUserName` the result covers, in this order:
 *
 *   the query as typed
 *   getUserName      (lowerCamelCase)
 *   GetUserName      (UpperCamelCase / PascalCase)
 *   get_user_name    (snake_case)
 *   get-user-name    (kebab-case)
 *   GET_USER_NAME    (UPPER_SNAKE_CASE)
 *
 * Duplicates are removed, so a query already written in one of these forms
 * does not inflate the resulting regex.
 *
 * @param {string} query
 * @returns {string[] | null}
 */
function expandIdentifierVariants(query) {
  if (!SINGLE_IDENTIFIER_REGEX.test(query)) return null;

  const pieces = splitIdentifierPieces(query);
  if (pieces.length < 2) return null;

  const lowered = pieces.map((piece) => piece.toLowerCase());
  const shouted = pieces.map((piece) => piece.toUpperCase());
  const capitalized = lowered.map((piece) => piece.charAt(0).toUpperCase() + piece.slice(1));
  const [firstLowered] = lowered;

  // A Set keeps first-insertion order while dropping repeated spellings.
  const unique = new Set([
    query,
    firstLowered + capitalized.slice(1).join(""),
    capitalized.join(""),
    lowered.join("_"),
    lowered.join("-"),
    shouted.join("_")
  ]);
  return [...unique];
}
|
|
2118
|
+
/**
 * Turn a list of identifier variants into one ripgrep regex: a
 * non-capturing alternation of the variants.
 *
 * The variants are expected to be plain identifiers, so no escaping is
 * applied. Word boundaries are deliberately left out: searching for
 * `getUserName` is meant to match the substring anywhere, which is also
 * what the previous `-F getUserName` search did.
 *
 * @param {string[]} variants
 * @returns {string} e.g. `(?:getUserName|get_user_name)`.
 */
function buildExpansionPattern(variants) {
  const alternation = variants.join("|");
  return `(?:${alternation})`;
}
|
|
2128
|
+
/**
 * Assemble the ripgrep argument vector for one search.
 *
 * Always emits `--json --no-follow`, then optionally:
 *   - `-C <n>` when `contextLines` is positive;
 *   - `-F` (fixed-string) for "literal" / "ranked" mode, unless an
 *     expansion pattern is present (that pattern is a regex);
 *   - `-g <glob>` when a glob other than the match-everything `**` + `/*`
 *     is given.
 * The pattern and the search root `.` follow a `--` separator, so a
 * pattern that starts with `-` is not parsed as a flag.
 *
 * @param {{ query: string, mode?: string, contextLines: number,
 *           fileGlob?: string, expansionPattern?: string | null }} input
 * @returns {string[]}
 */
function buildRgArgs(input) {
  const { contextLines, expansionPattern, fileGlob, mode, query } = input;
  const fixedString = !expansionPattern && (mode === "literal" || mode === "ranked");
  const restrictByGlob = Boolean(fileGlob) && fileGlob !== "**/*";
  return [
    "--json",
    "--no-follow",
    ...(contextLines > 0 ? ["-C", String(contextLines)] : []),
    ...(fixedString ? ["-F"] : []),
    ...(restrictByGlob ? ["-g", fileGlob] : []),
    "--",
    expansionPattern ?? query,
    "."
  ];
}
|
|
2136
|
+
/**
 * Stream-parse `rg --json` output from a spawned child into hit records.
 *
 * Behaviors callers rely on:
 *
 *   1. GLOBAL limit (not per-file). When a match arrives after `opts.limit`
 *      hits have been collected, the child is killed and reading stops at
 *      once, so later context lines cannot be attached to an accepted hit.
 *   2. CANCEL short-circuit. As soon as `opts.signal` aborts, the readline
 *      interface is closed and the child killed; a half-flushed JSON line
 *      is never handed to JSON.parse.
 *   3. Output cap. Reading stops once more than MAX_STDOUT_BYTES have been
 *      consumed. NOTE(review): the counter adds `rawLine.length + 1`, i.e.
 *      UTF-16 code units, so it undercounts bytes for non-ASCII output.
 *
 * Lines that are not valid JSON objects are skipped. Match events whose
 * path or line content has no `text` field (ripgrep sends base64 `bytes`
 * for non-UTF-8 data) are skipped rather than crashing the whole search.
 *
 * @param {import("node:child_process").ChildProcess} child - running rg process.
 * @param {{ signal: AbortSignal, limit: number, contextLines: number }} opts
 * @returns {Promise<{ hits: object[], scannedFiles: number, truncated: boolean,
 *   cancelled: boolean, stdoutBytes: number }>} `truncated` is true when the
 *   output cap fired or when `hits.length` reached `opts.limit`.
 */
async function parseRgJsonStream(child, opts) {
  const hits = [];
  let stdoutBytes = 0;
  let truncatedByCap = false;
  let cancelled = false;
  let scannedFiles = 0;
  if (opts.signal.aborted) {
    killChild(child);
    return {
      hits,
      scannedFiles: 0,
      truncated: false,
      cancelled: true,
      stdoutBytes: 0
    };
  }
  // Context lines seen since the last match; they become the next hit's context_before.
  const pendingContextBefore = [];
  // Most recent hit in the current file; receives context_after lines.
  let lastHitForContext;
  if (!child.stdout) return {
    hits,
    scannedFiles: 0,
    truncated: false,
    cancelled: false,
    stdoutBytes: 0
  };
  const rl = createInterface({
    input: child.stdout,
    crlfDelay: Infinity
  });
  const onAbort = () => {
    cancelled = true;
    rl.close();
    killChild(child);
  };
  opts.signal.addEventListener("abort", onAbort, { once: true });
  try {
    lines: for await (const rawLine of rl) {
      if (cancelled) break;
      stdoutBytes += rawLine.length + 1;
      if (stdoutBytes > MAX_STDOUT_BYTES) {
        truncatedByCap = true;
        killChild(child);
        break;
      }
      if (rawLine.length === 0) continue;
      let evt;
      try {
        evt = JSON.parse(rawLine);
      } catch {
        continue;
      }
      // JSON.parse also accepts `null`, numbers, strings...; only objects are events.
      if (evt === null || typeof evt !== "object") continue;
      const data = evt.data ?? {};
      switch (evt.type) {
        case "begin":
          // New file: context never crosses a file boundary.
          scannedFiles += 1;
          pendingContextBefore.length = 0;
          lastHitForContext = void 0;
          break;
        case "context": {
          const text = stripTrailingNewline(data.lines?.text ?? "");
          if (lastHitForContext && lastHitForContext.context_after.length < opts.contextLines) lastHitForContext.context_after.push(text);
          else {
            pendingContextBefore.push(text);
            if (pendingContextBefore.length > opts.contextLines) pendingContextBefore.shift();
          }
          break;
        }
        case "match": {
          if (hits.length >= opts.limit) {
            killChild(child);
            // Leave the read loop, not just the switch.
            break lines;
          }
          const sub = data.submatches?.[0];
          const file = data.path?.text;
          const lineText = data.lines?.text;
          if (typeof file !== "string" || typeof lineText !== "string" || !data.line_number || !sub) break;
          const hit = {
            file,
            line: data.line_number,
            matched_line: stripTrailingNewline(lineText),
            match_start: sub.start,
            match_end: sub.end,
            context_before: [...pendingContextBefore],
            context_after: []
          };
          pendingContextBefore.length = 0;
          lastHitForContext = hit;
          hits.push(hit);
          break;
        }
        case "end":
        case "summary":
        default: break;
      }
    }
  } finally {
    opts.signal.removeEventListener("abort", onAbort);
  }
  return {
    hits,
    scannedFiles,
    truncated: truncatedByCap || hits.length >= opts.limit,
    cancelled,
    stdoutBytes
  };
}
|
|
2250
|
+
/**
 * Remove one trailing line terminator (`\r\n` or `\n`) from a string.
 * At most one terminator is removed; a lone trailing `\r` is left alone.
 *
 * @param {string} s
 * @returns {string}
 */
function stripTrailingNewline(s) {
  // CRLF is tested first so "\r\n" is removed as a unit.
  for (const terminator of ["\r\n", "\n"]) {
    if (s.endsWith(terminator)) return s.slice(0, -terminator.length);
  }
  return s;
}
|
|
2255
|
+
/**
 * File extension (lowercase, with the dot) -> tree-sitter grammar key.
 * Files whose extension is absent here skip structural parsing, and the
 * hit falls back to the SYMBOL_REGEX heuristic for `symbol_context`.
 * Keep in step with `GRAMMAR_FILES`: a new language needs both an
 * extension entry here and a `.wasm` grammar to load.
 */
const EXTENSION_TO_LANG = Object.fromEntries([
  ["typescript", [".ts"]],
  ["tsx", [".tsx"]],
  ["javascript", [".js", ".mjs", ".cjs", ".jsx"]],
  ["python", [".py"]],
  ["go", [".go"]],
  ["rust", [".rs"]],
  ["java", [".java"]],
  ["c", [".c", ".h"]],
  ["cpp", [".cpp", ".cc", ".cxx", ".hpp", ".hxx"]]
].flatMap(([lang, extensions]) => extensions.map((extension) => [extension, lang])));
|
|
2281
|
+
/**
 * Grammar key -> wasm filename inside `node_modules/tree-sitter-wasms/out/`.
 * The directory is resolved from `node_modules` at runtime. The filenames
 * are stable because `tree-sitter-wasms` ships prebuilt binaries (no
 * per-install codegen); every one follows `tree-sitter-<key>.wasm`.
 */
const GRAMMAR_FILES = Object.fromEntries([
  "typescript",
  "tsx",
  "javascript",
  "python",
  "go",
  "rust",
  "java",
  "c",
  "cpp"
].map((key) => [key, `tree-sitter-${key}.wasm`]));
|
|
2298
|
+
/**
 * Per-language sets of AST node types that have the shape of a definition.
 * A matched identifier that sits inside one of these nodes, at the node's
 * "name" position, is AST-confirmed evidence that the line defines that
 * identifier. The lists follow the brief's enumeration plus a few
 * language-idiomatic extras (e.g. `mod_item` for Rust modules).
 *
 * Lookup is keyed by language so that a node type meaning "definition" in
 * one grammar and "reference" in another cannot cross-pollute.
 */
const DEFINITION_NODE_TYPES = (() => {
  // TypeScript and TSX share one list; each still gets its own Set instance.
  const typescriptFamily = [
    "function_declaration",
    "function_signature",
    "function_expression",
    "method_definition",
    "method_signature",
    "class_declaration",
    "interface_declaration",
    "type_alias_declaration",
    "enum_declaration",
    "variable_declarator",
    "generator_function_declaration",
    "abstract_method_signature",
    "public_field_definition",
    "property_signature"
  ];
  const byLanguage = {
    typescript: typescriptFamily,
    tsx: typescriptFamily,
    javascript: [
      "function_declaration",
      "function_expression",
      "method_definition",
      "class_declaration",
      "variable_declarator",
      "generator_function_declaration"
    ],
    python: [
      "function_definition",
      "class_definition",
      "decorated_definition"
    ],
    go: [
      "function_declaration",
      "method_declaration",
      "type_spec",
      "type_alias",
      "const_spec",
      "var_spec"
    ],
    rust: [
      "function_item",
      "impl_item",
      "trait_item",
      "struct_item",
      "enum_item",
      "mod_item",
      "type_item",
      "const_item",
      "static_item",
      "macro_definition"
    ],
    java: [
      "class_declaration",
      "interface_declaration",
      "method_declaration",
      "constructor_declaration",
      "enum_declaration",
      "field_declaration",
      "annotation_type_declaration"
    ],
    c: [
      "function_definition",
      "declaration",
      "struct_specifier",
      "enum_specifier",
      "union_specifier",
      "type_definition"
    ],
    cpp: [
      "function_definition",
      "declaration",
      "struct_specifier",
      "class_specifier",
      "enum_specifier",
      "union_specifier",
      "type_definition",
      "namespace_definition",
      "template_declaration"
    ]
  };
  return Object.fromEntries(Object.entries(byLanguage).map(([lang, nodeTypes]) => [lang, new Set(nodeTypes)]));
})();
|
|
2405
|
+
/**
 * AST node types that mean "this token is an identifier". The
 * match-position lookup filters on these first, discarding parent-node
 * hits, before the definition-site predicate is evaluated.
 */
const IDENTIFIER_NODE_TYPES = new Set([
  // plain and typed identifiers
  "identifier", "type_identifier",
  // member-style identifiers
  "field_identifier", "property_identifier",
  // object shorthand forms
  "shorthand_property_identifier_pattern", "shorthand_property_identifier",
  // qualified names and the generic `name` node
  "scoped_identifier", "name"
]);
|
|
2420
|
+
/** Memoized grammar bundle; assigned on the first `getGrammarBundle()` call. */
let _grammarBundle;
/**
 * Find the `out/` directory of the installed `tree-sitter-wasms` package.
 *
 * The package is located by resolving its `package.json`; when that
 * resolution fails (for example in an install where the package is
 * missing), the function reports failure instead of throwing.
 *
 * @returns {string | null} absolute path of the `out` directory, or null
 *   when the package cannot be resolved.
 */
function resolveGrammarRoot() {
  let manifestPath;
  try {
    manifestPath = __require.resolve("tree-sitter-wasms/package.json");
  } catch {
    return null;
  }
  const packageDir = path$1.dirname(manifestPath);
  return path$1.join(packageDir, "out");
}
|
|
2435
|
+
/**
 * Return the process-wide grammar bundle, starting the load on first use.
 *
 * `ready` resolves to a Map of grammar key -> loaded tree-sitter Language.
 * It never rejects through the paths handled here: a failed `Parser.init`
 * or an unresolvable grammar directory yields an empty Map (structural
 * ranking disabled), and each grammar is loaded in its own try/catch so a
 * single broken `.wasm` only removes that one language. Every failure is
 * logged with `consola.warn`.
 *
 * @returns {{ ready: Promise<Map<string, object>> }}
 */
function getGrammarBundle() {
  if (_grammarBundle) return _grammarBundle;

  const loadGrammars = async () => {
    const loaded = /* @__PURE__ */ new Map();
    try {
      await Parser.init();
    } catch (err) {
      consola.warn(`[code_search] tree-sitter Parser.init failed; structural ranking disabled: ${err.message}`);
      return loaded;
    }
    const grammarDir = resolveGrammarRoot();
    if (!grammarDir) {
      consola.warn("[code_search] tree-sitter-wasms package not resolvable; structural ranking disabled");
      return loaded;
    }
    // Grammars are loaded one after another, in GRAMMAR_FILES order.
    for (const key of Object.keys(GRAMMAR_FILES)) {
      const filename = GRAMMAR_FILES[key];
      const wasmPath = path$1.join(grammarDir, filename);
      try {
        loaded.set(key, await Parser.Language.load(wasmPath));
      } catch (err) {
        consola.warn(`[code_search] failed to load tree-sitter grammar '${key}' from ${filename}: ${err.message}`);
      }
    }
    return loaded;
  };

  _grammarBundle = { ready: loadGrammars() };
  return _grammarBundle;
}
// Kick the load off at module-init time so the first search does not pay
// the ~500ms cold start; the no-op catch keeps the promise from ever being
// reported as an unhandled rejection.
getGrammarBundle().ready.catch(() => {});
|
|
2470
|
+
/**
 * Map a file path to its tree-sitter grammar key via the file extension
 * (compared case-insensitively).
 *
 * @param {string} filePath
 * @returns {string | null} the grammar key, or null for an unmapped or
 *   missing extension.
 */
function getLanguageKeyForPath(filePath) {
  const extension = path$1.extname(filePath).toLowerCase();
  const langKey = EXTENSION_TO_LANG[extension];
  return langKey ?? null;
}
|
|
2473
|
+
/**
 * Parsed-tree cache keyed by absolute path. Map insertion order doubles as
 * recency order: the first key is the least recently used entry.
 */
const _treeCache = /* @__PURE__ */ new Map();
/**
 * Look up a cached entry for `absPath`, valid only for the given mtime.
 *
 * A hit is moved to the most-recently-used position. An entry whose mtime
 * differs is stale: its tree (if any) is released with `.delete()`, the
 * entry is dropped, and the lookup reports a miss.
 *
 * @param {string} absPath
 * @param {number} mtimeMs - modification time the caller observed.
 * @returns {object | undefined} the cached entry, or undefined on a miss.
 */
function cacheGet(absPath, mtimeMs) {
  const entry = _treeCache.get(absPath);
  if (!entry) return void 0;

  // Removed in both outcomes: a fresh entry is re-inserted at the MRU end,
  // a stale one stays gone.
  _treeCache.delete(absPath);

  if (entry.mtimeMs === mtimeMs) {
    _treeCache.set(absPath, entry);
    return entry;
  }

  if (entry.tree) {
    try {
      entry.tree.delete();
    } catch {}
  }
  return void 0;
}
|
|
2488
|
+
/**
 * Insert (or replace) the cache entry for `absPath`, evicting
 * least-recently-used entries so the cache never exceeds
 * STRUCTURAL_CACHE_MAX.
 *
 * Replacing an existing key first releases the previous entry's tree
 * (unless the new entry reuses the same tree object) and removes the old
 * slot. Without that step the old tree would never be `.delete()`-ed, and
 * a full cache would evict an unrelated entry even though the key count
 * does not grow. The new entry always lands in the most-recently-used
 * position.
 *
 * @param {string} absPath
 * @param {object} entry - cache record; its `tree`, when present, is
 *   released via `.delete()` on eviction.
 * @returns {void}
 */
function cachePut(absPath, entry) {
  const previous = _treeCache.get(absPath);
  if (previous !== void 0 || _treeCache.has(absPath)) {
    if (previous?.tree && previous.tree !== entry?.tree) try {
      previous.tree.delete();
    } catch {}
    _treeCache.delete(absPath);
  }
  while (_treeCache.size >= STRUCTURAL_CACHE_MAX) {
    // Map iteration order is insertion order, so the first key is the LRU one.
    const firstKey = _treeCache.keys().next().value;
    if (firstKey === void 0) break;
    const evicted = _treeCache.get(firstKey);
    if (evicted?.tree) try {
      evicted.tree.delete();
    } catch {}
    _treeCache.delete(firstKey);
  }
  _treeCache.set(absPath, entry);
}
|
|
2500
|
+
/**
 * Find the offset in `source` at which line `lineNumber1` begins.
 *
 * The offset is an index into the JavaScript string (UTF-16 code units),
 * so it equals a byte offset only for ASCII-only text before that point.
 * Lines are delimited by LF alone; in CRLF files the `\r` stays at the end
 * of the preceding line, so line starts are the same as for LF files.
 *
 * @param source full file text
 * @param lineNumber1 1-indexed line number; values <= 1 map to offset 0
 * @returns the start offset, or -1 when the text has fewer lines
 */
function lineStartByte(source, lineNumber1) {
	if (lineNumber1 <= 1) return 0;
	let line = 1;
	let newlineAt = source.indexOf("\n");
	while (newlineAt !== -1) {
		line += 1;
		if (line === lineNumber1) return newlineAt + 1;
		newlineAt = source.indexOf("\n", newlineAt + 1);
	}
	return -1;
}
|
|
2516
|
+
/**
 * Decide whether `matchedNode` is the name being declared by one of its
 * nearby ancestors.
 *
 * Up to six ancestors are inspected. For each ancestor whose type is in the
 * language's DEFINITION_NODE_TYPES set, the match counts as a definition
 * when either:
 *   - the ancestor's `name` field spans the matched node, or
 *   - the ancestor's `declarator` or `type` field spans the matched node
 *     and the first identifier leaf inside that field starts where the
 *     matched node starts.
 * The walk is kept shallow so that an identifier deep inside a body is not
 * attributed to an enclosing definition.
 *
 * @param matchedNode syntax node the search hit resolved to
 * @param langKey key into DEFINITION_NODE_TYPES
 * @returns true when a qualifying ancestor is found, false otherwise
 */
function isDefiningSite(matchedNode, langKey) {
	const defTypes = DEFINITION_NODE_TYPES[langKey];
	if (!defTypes) return false;
	const MAX_ANCESTOR_HOPS = 6;
	// True when `field` spans the match and its first identifier leaf begins at the match.
	const leadsWithMatch = (field) => {
		if (!field || !containsByteRange(field, matchedNode)) return false;
		const leaf = firstIdentifierLeaf(field);
		return Boolean(leaf) && leaf.startIndex === matchedNode.startIndex;
	};
	let ancestor = matchedNode.parent;
	for (let hops = 0; ancestor && hops < MAX_ANCESTOR_HOPS; hops += 1) {
		if (defTypes.has(ancestor.type)) {
			const nameField = ancestor.childForFieldName("name");
			if (nameField && containsByteRange(nameField, matchedNode)) return true;
			if (leadsWithMatch(ancestor.childForFieldName("declarator"))) return true;
			if (leadsWithMatch(ancestor.childForFieldName("type"))) return true;
		}
		ancestor = ancestor.parent;
	}
	return false;
}
|
|
2554
|
+
/**
 * Report whether the `[startIndex, endIndex]` span of `inner` lies entirely
 * within the span of `outer` (bounds inclusive).
 */
function containsByteRange(outer, inner) {
	const startsInside = inner.startIndex >= outer.startIndex;
	const endsInside = inner.endIndex <= outer.endIndex;
	return startsInside && endsInside;
}
|
|
2557
|
+
/**
 * Return the first node, in depth-first pre-order over `namedChildren`,
 * whose type is listed in IDENTIFIER_NODE_TYPES. The starting node itself
 * is considered first.
 *
 * @param node root of the subtree to search
 * @returns the matching node, or null when the subtree has none
 */
function firstIdentifierLeaf(node) {
	const pending = [node];
	while (pending.length > 0) {
		const current = pending.pop();
		if (IDENTIFIER_NODE_TYPES.has(current.type)) return current;
		// Push children right-to-left so the leftmost child is visited next.
		const children = current.namedChildren;
		for (let i = children.length - 1; i >= 0; i -= 1) pending.push(children[i]);
	}
	return null;
}
|
|
2565
|
+
/**
 * Structural-confirmation pass over the leading ranked hits.
 *
 * The first `min(hitsRanked.length, topN)` hits are grouped by file. Each
 * file is stat'ed, parsed with tree-sitter (or served from the tree cache
 * when its mtime is unchanged) and every hit in it is checked with
 * isDefiningSite. The wall-clock budget and the abort signal are checked
 * between files only, never in the middle of a parse.
 *
 * Per-file problems (no grammar for the extension, stat/read failure, file
 * above STRUCTURAL_MAX_FILE_BYTES, parse failure) just skip that file; only
 * budget exhaustion sets the user-visible `fallback` message.
 *
 * @param opts.hitsRanked entries of shape `{ hit, index }`, best first
 * @param opts.workspaceRoot directory the hit file paths are joined onto
 * @param opts.topN maximum number of hits to examine
 * @param opts.budgetMs wall-clock budget in milliseconds
 * @param opts.signal abort signal; aborting stops the pass between files
 * @returns `{ confirmedHitIndexes: Set, fallback: string | null }`
 */
async function runStructuralPass(opts) {
	const { hitsRanked, signal, topN, budgetMs, workspaceRoot } = opts;
	const outcome = {
		confirmedHitIndexes: /* @__PURE__ */ new Set(),
		fallback: null
	};
	if (hitsRanked.length === 0 || signal.aborted) return outcome;
	const grammars = await getGrammarBundle().ready;
	if (grammars.size === 0) return outcome;

	// Group the examined hits by file so each file is parsed at most once.
	const cap = Math.min(hitsRanked.length, topN);
	const hitsByFile = /* @__PURE__ */ new Map();
	for (let rank = 0; rank < cap; rank += 1) {
		const ranked = hitsRanked[rank];
		const bucket = hitsByFile.get(ranked.hit.file);
		if (bucket) bucket.push(ranked);
		else hitsByFile.set(ranked.hit.file, [ranked]);
	}

	// One parser per language, created on first use and released in `finally`.
	const parsers = /* @__PURE__ */ new Map();
	const parserFor = (langKey, lang) => {
		let parser = parsers.get(langKey);
		if (!parser) {
			parser = new Parser();
			parser.setLanguage(lang);
			parsers.set(langKey, parser);
		}
		return parser;
	};

	// Resolve the identifier node covering [start, end]. When the smallest
	// covering node is not an identifier, probe it and up to two ancestors
	// for a first identifier leaf that begins exactly at `start`.
	const identifierAt = (tree, start, end) => {
		let node;
		try {
			node = tree.rootNode.descendantForIndex(start, end);
		} catch {
			node = null;
		}
		if (!node) return null;
		if (IDENTIFIER_NODE_TYPES.has(node.type)) return node;
		let probe = node;
		for (let hops = 0; probe && !IDENTIFIER_NODE_TYPES.has(probe.type) && hops < 3; hops += 1) {
			const leaf = firstIdentifierLeaf(probe);
			if (leaf && leaf.startIndex === start) {
				probe = leaf;
				break;
			}
			probe = probe.parent;
		}
		return probe && IDENTIFIER_NODE_TYPES.has(probe.type) ? probe : null;
	};

	const startedAt = Date.now();
	let filesParsed = 0;
	try {
		for (const [relFile, fileHits] of hitsByFile) {
			if (signal.aborted) break;
			if (Date.now() - startedAt >= budgetMs) {
				outcome.fallback = `structural budget exceeded after parsing ${filesParsed}/${cap} hits; retry with structural: "topN" or narrow your query`;
				break;
			}
			const langKey = getLanguageKeyForPath(relFile);
			const lang = langKey ? grammars.get(langKey) : void 0;
			if (!lang) continue;

			const absPath = path$1.join(workspaceRoot, relFile);
			let stats;
			try {
				stats = statSync(absPath);
			} catch (err) {
				consola.debug(`[code_search] structural skip ${relFile} (stat failed: ${err.message})`);
				continue;
			}
			const { mtimeMs, size } = stats;
			if (size > STRUCTURAL_MAX_FILE_BYTES) {
				consola.debug(`[code_search] structural skip ${relFile} (${size} bytes > cap)`);
				continue;
			}

			let cached = cacheGet(absPath, mtimeMs);
			if (!cached) {
				let source;
				try {
					source = readFileSync(absPath, "utf8");
				} catch (err) {
					consola.debug(`[code_search] structural skip ${relFile} (read failed: ${err.message})`);
					// Remember the failure for this mtime so the read is not retried.
					cachePut(absPath, {
						mtimeMs,
						tree: null,
						source: null
					});
					continue;
				}
				const parser = parserFor(langKey, lang);
				let tree = null;
				try {
					tree = parser.parse(source);
				} catch (err) {
					consola.debug(`[code_search] tree-sitter parse failed for ${relFile}: ${err.message}`);
				}
				cached = {
					mtimeMs,
					tree,
					source: tree ? source : null
				};
				cachePut(absPath, cached);
				filesParsed += 1;
			}
			if (!cached.tree || !cached.source) continue;

			for (const ranked of fileHits) {
				const lineStart = lineStartByte(cached.source, ranked.hit.line);
				if (lineStart < 0) continue;
				const node = identifierAt(cached.tree, lineStart + ranked.hit.match_start, lineStart + ranked.hit.match_end);
				if (node && isDefiningSite(node, langKey)) outcome.confirmedHitIndexes.add(ranked.index);
			}
		}
	} finally {
		for (const parser of parsers.values()) {
			try {
				parser.delete();
			} catch {}
		}
		parsers.clear();
	}
	return outcome;
}
|
|
2694
|
+
/**
 * Build the four text fields that are scored for one hit.
 *
 * - `match_line`: the matched line as-is.
 * - `context`: context lines before and after, joined with newlines.
 * - `file_path`: the file path with `/` and `\` replaced by spaces.
 * - `symbol_context`: for an AST-confirmed hit, the matched substring (or
 *   the whole line if that substring is empty); otherwise the whole line
 *   when its left-trimmed text matches SYMBOL_REGEX, else an empty string.
 *
 * @param hit parsed search hit
 * @param astConfirmed whether the structural pass confirmed this hit
 */
function extractFields(hit, astConfirmed) {
	const { matched_line: matchedLine } = hit;
	const surrounding = [...hit.context_before, ...hit.context_after].join("\n");
	const pickSymbolContext = () => {
		if (astConfirmed) {
			const ident = matchedLine.slice(hit.match_start, hit.match_end);
			return ident.length > 0 ? ident : matchedLine;
		}
		return SYMBOL_REGEX.test(matchedLine.trimStart()) ? matchedLine : "";
	};
	const symbolContext = pickSymbolContext();
	return {
		match_line: matchedLine,
		context: surrounding,
		file_path: hit.file.replace(/[/\\]/g, " "),
		symbol_context: symbolContext
	};
}
|
|
2709
|
+
/**
 * BM25F score for the given hit set against the tokenized query.
 *
 *   BM25F(q, f) = Σ_t IDF(t) · w_t,f / (w_t,f + k1)
 *
 *   w_t,f = Σ_field b_field · tf_t,field,f /
 *                   ((1 − l_field) + l_field · len_field,f/avglen_field)
 *
 *   IDF(t) = log( 1 + (M − df(t) + 0.5) / (df(t) + 0.5) )
 *
 * The `+ 1` inside the logarithm keeps IDF positive even for a term that
 * occurs in more than half of the files.
 *
 * Corpus stats are derived from the hit set itself; there is no persistent
 * index. M = number of distinct files in the hit set; df(t) = how many of
 * those files contain token `t` in any field of any of their hits;
 * avglen_field = mean tokenized length of the field, taken over the first
 * hit of each file. Each term's score is split across fields in proportion
 * to the field's share of w_t,f, which yields `field_contributions`.
 *
 * @param hits parsed search hits
 * @param queryTokens tokenized query
 * @param astConfirmedHits optional Set of indexes into `hits` that the
 *   structural pass confirmed; forwarded to extractFields
 * @returns one `{ hit, score, field_contributions }` per hit, in input
 *   order; all scores are 0 when `hits` or `queryTokens` is empty
 */
function bm25fScore(hits, queryTokens, astConfirmedHits) {
	const zeroContributions = () => ({
		match_line: 0,
		symbol_context: 0,
		file_path: 0,
		context: 0
	});
	if (hits.length === 0 || queryTokens.length === 0) return hits.map((h) => ({
		hit: h,
		score: 0,
		field_contributions: zeroContributions()
	}));

	// Tokenize every field of every hit once.
	const perHitTokens = [];
	for (let i = 0; i < hits.length; i++) {
		const fields = extractFields(hits[i], astConfirmedHits?.has(i) ?? false);
		perHitTokens.push({
			match_line: tokenize(fields.match_line),
			context: tokenize(fields.context),
			file_path: tokenize(fields.file_path),
			symbol_context: tokenize(fields.symbol_context)
		});
	}

	// Per file: the union of tokens over all fields of all its hits (for df),
	// and the index of its first hit (for avglen).
	const tokensByFile = /* @__PURE__ */ new Map();
	const firstHitOfFile = [];
	for (let i = 0; i < hits.length; i++) {
		const file = hits[i].file;
		let bucket = tokensByFile.get(file);
		if (!bucket) {
			bucket = /* @__PURE__ */ new Set();
			tokensByFile.set(file, bucket);
			firstHitOfFile.push(i);
		}
		const t = perHitTokens[i];
		for (const fname of Object.keys(t)) for (const tok of t[fname]) bucket.add(tok);
	}
	const M = tokensByFile.size;

	const df = /* @__PURE__ */ new Map();
	for (const qt of queryTokens) {
		let count = 0;
		for (const tokSet of tokensByFile.values()) if (tokSet.has(qt)) count += 1;
		df.set(qt, count);
	}

	const avglen = {
		match_line: 0,
		context: 0,
		file_path: 0,
		symbol_context: 0
	};
	for (const fname of Object.keys(avglen)) {
		const lens = firstHitOfFile.map((i) => perHitTokens[i][fname].length);
		avglen[fname] = lens.length > 0 ? lens.reduce((a, b) => a + b, 0) / lens.length : 1;
		// A field that is empty everywhere would otherwise divide by zero below.
		if (avglen[fname] === 0) avglen[fname] = 1;
	}

	const idf = /* @__PURE__ */ new Map();
	for (const qt of queryTokens) {
		const d = df.get(qt) ?? 0;
		idf.set(qt, Math.log((M - d + .5) / (d + .5) + 1));
	}

	const out = [];
	for (let i = 0; i < hits.length; i++) {
		const tokens = perHitTokens[i];
		const contributions = zeroContributions();
		for (const qt of queryTokens) {
			let w = 0;
			const perField = zeroContributions();
			for (const fname of Object.keys(FIELD_BOOSTS)) {
				const tf = tokens[fname].filter((t) => t === qt).length;
				if (tf === 0) continue;
				const len = tokens[fname].length || 1;
				const l = FIELD_LEN_NORMS[fname];
				const norm = 1 - l + l * (len / (avglen[fname] || 1));
				const fieldContrib = FIELD_BOOSTS[fname] * (tf / norm);
				w += fieldContrib;
				perField[fname] = fieldContrib;
			}
			if (w === 0) continue;
			const termScore = (idf.get(qt) ?? 0) * (w / (w + BM25F_K1));
			// Attribute the term's score to fields by their share of w.
			for (const fname of Object.keys(perField)) {
				const share = perField[fname] / w;
				contributions[fname] += termScore * share;
			}
		}
		const total = Object.values(contributions).reduce((a, b) => a + b, 0);
		out.push({
			hit: hits[i],
			score: total,
			field_contributions: contributions
		});
	}
	return out;
}
|
|
2842
|
+
/**
 * Sort scored hits and drop the tail that falls below the "shoulder".
 *
 * `scored` is sorted in place: score descending, then file ascending, then
 * line ascending. Entries scoring below `topScore * SHOULDER_THRESHOLD` are
 * cut. When the best score is not positive nothing is cut and the (sorted)
 * input array itself is returned as `kept`.
 *
 * @param scored array of `{ hit, score, ... }`; mutated by the sort
 * @returns `{ kept, prunedBelowShoulder }`
 */
function shoulderPrune(scored) {
	if (scored.length === 0) return {
		kept: [],
		prunedBelowShoulder: 0
	};
	const byScoreThenLocation = (a, b) => {
		if (a.score !== b.score) return b.score - a.score;
		if (a.hit.file === b.hit.file) return a.hit.line - b.hit.line;
		return a.hit.file < b.hit.file ? -1 : 1;
	};
	scored.sort(byScoreThenLocation);
	const best = scored[0].score;
	if (best <= 0) return {
		kept: scored,
		prunedBelowShoulder: 0
	};
	const floor = best * SHOULDER_THRESHOLD;
	const firstBelow = scored.findIndex((entry) => entry.score < floor);
	const cut = firstBelow === -1 ? scored.length : firstBelow;
	return {
		kept: scored.slice(0, cut),
		prunedBelowShoulder: scored.length - cut
	};
}
|
|
2868
|
+
/**
 * Render a hit as a text snippet: context-before lines, the matched line
 * and context-after lines, joined with newlines.
 *
 * A snippet larger than MAX_SNIPPET_BYTES (measured in UTF-8 bytes) is
 * shortened to a head and a tail joined by a truncation marker. The marker's
 * own byte length is reserved from the budget, and both cut points are moved
 * onto UTF-8 character boundaries, so the result stays within the cap and
 * never contains a split multi-byte character. (If the cap is smaller than
 * the marker itself, only the marker is returned.)
 *
 * @param hit parsed hit with `context_before`, `matched_line`, `context_after`
 * @returns the snippet text
 */
function renderSnippet(hit) {
	const snippet = [
		...hit.context_before,
		hit.matched_line,
		...hit.context_after
	].join("\n");
	const buf = Buffer.from(snippet, "utf8");
	if (buf.length <= MAX_SNIPPET_BYTES) return snippet;
	const marker = "\n... [truncated] ...\n";
	const halfCap = Math.max(0, Math.floor((MAX_SNIPPET_BYTES - Buffer.byteLength(marker, "utf8")) / 2));
	// UTF-8 continuation bytes look like 0b10xxxxxx; a cut may only sit in front of a non-continuation byte.
	const isContinuation = (byte) => (byte & 192) === 128;
	let headEnd = halfCap;
	while (headEnd > 0 && isContinuation(buf[headEnd])) headEnd -= 1;
	let tailStart = buf.length - halfCap;
	while (tailStart < buf.length && isContinuation(buf[tailStart])) tailStart += 1;
	return buf.toString("utf8", 0, headEnd) + marker + buf.toString("utf8", tailStart);
}
|
|
2881
|
+
/**
 * Run one code search: validate the request, spawn ripgrep in the
 * workspace, parse its JSON stream, optionally rank the hits, and shape the
 * response.
 *
 * In "ranked" mode hits are scored with bm25fScore, the best ones go through
 * runStructuralPass, the hits are re-scored with the confirmed set, pruned
 * with shoulderPrune and cut to `limit`. In other modes hits keep the order
 * in which they were parsed and carry no scores.
 *
 * A wall-clock timer, the caller's signal and an excessive amount of
 * ripgrep stderr output can each abort the run through an internal
 * AbortController. An aborted run that still produced hits returns them.
 *
 * @param rawInput request: `query`, `workspace`, and optional `mode`,
 *   `structural`, `limit`, `context_lines`, `file_glob`
 * @param externalSignal optional AbortSignal from the caller
 * @returns `{ results, truncated, pruned_below_shoulder, scanned_files,
 *   elapsed_ms, ranking, notice }`
 * @throws Error on invalid input or workspace, when ripgrep cannot be
 *   resolved or spawned, when the run is aborted before any hit was
 *   collected, or when ripgrep exits with a code other than 0/1 and
 *   produced no hits
 */
async function searchCode(rawInput, externalSignal) {
	const t0 = Date.now();
	const inputErr = validateInputs(rawInput);
	if (inputErr) throw new Error(inputErr);
	const ws = validateWorkspace(rawInput.workspace);
	if (!ws.ok || !ws.canonical) throw new Error(ws.error ?? "workspace validation failed");

	const mode = rawInput.mode ?? "ranked";
	const isRanked = mode === "ranked";
	const structuralMode = rawInput.structural ?? "full";
	const limit = rawInput.limit ?? DEFAULT_LIMIT;
	const contextLines = Math.min(rawInput.context_lines ?? DEFAULT_CONTEXT_LINES, MAX_CONTEXT_LINES);
	const expansion = mode === "regex" ? null : expandIdentifierVariants(rawInput.query);
	const expansionPattern = expansion ? buildExpansionPattern(expansion) : void 0;

	// Every abort source funnels into this controller.
	const ac = new AbortController();
	const onExternal = () => ac.abort("external");
	if (externalSignal) {
		if (externalSignal.aborted) ac.abort("external");
		else externalSignal.addEventListener("abort", onExternal, { once: true });
	}
	const wallTimer = setTimeout(() => ac.abort("timeout"), WALL_TIME_MS);
	wallTimer.unref();
	const releaseGuards = () => {
		clearTimeout(wallTimer);
		if (externalSignal) externalSignal.removeEventListener("abort", onExternal);
	};

	let rgResolution;
	try {
		rgResolution = resolveRipgrep();
	} catch (err) {
		releaseGuards();
		throw err;
	}
	const args = buildRgArgs({
		mode,
		fileGlob: rawInput.file_glob,
		contextLines,
		query: rawInput.query,
		expansionPattern
	});
	let child;
	try {
		child = spawn(rgResolution.rgPath, args, {
			cwd: ws.canonical,
			shell: false,
			stdio: [
				"ignore",
				"pipe",
				"pipe"
			]
		});
	} catch (err) {
		releaseGuards();
		throw new Error(`failed to spawn ripgrep: ${err.message}`);
	}

	// Keep the first 64 KiB of stderr for error reporting; abort once the
	// running total passes 1 MiB.
	const STDERR_TEXT_CAP = 64 * 1024;
	let stderrBytes = 0;
	let stderrText = "";
	child.stderr?.setEncoding("utf8");
	child.stderr?.on("data", (chunk) => {
		stderrBytes += chunk.length;
		if (stderrText.length < STDERR_TEXT_CAP) stderrText = (stderrText + chunk).slice(0, STDERR_TEXT_CAP);
		if (stderrBytes > 1024 * 1024) ac.abort("stderr_cap");
	});

	let exitCode = null;
	const exitPromise = new Promise((resolve) => {
		child.on("close", (code) => {
			exitCode = code;
			resolve();
		});
	});

	let parseResult;
	try {
		parseResult = await parseRgJsonStream(child, {
			limit,
			contextLines,
			signal: ac.signal
		});
	} finally {
		releaseGuards();
		if (!child.killed) killChild(child);
	}

	const noHits = parseResult.hits.length === 0;
	if (ac.signal.aborted && noHits) {
		const reason = String(ac.signal.reason ?? "aborted");
		throw new Error(`code_search aborted (${reason})`);
	}
	if (!ac.signal.aborted) await exitPromise;
	// ripgrep exit codes 0 and 1 are not treated as failures here.
	const rgFailed = exitCode !== null && exitCode !== 0 && exitCode !== 1;
	if (rgFailed && !ac.signal.aborted && noHits) {
		const trimmed = stderrText.trim();
		const detail = trimmed.length > 0 ? trimmed.replace(/^rg:\s*/i, "").slice(0, 600) : `ripgrep exited with code ${exitCode}`;
		throw new Error(`code_search: ${detail}`);
	}

	let kept;
	let prunedBelowShoulder;
	let notice = null;
	if (isRanked) {
		const queryTokens = tokenize(rawInput.query);
		// Pass 1: score with the regex symbol heuristic to pick structural candidates.
		const pass1 = bm25fScore(parseResult.hits, queryTokens);
		pass1.sort((a, b) => b.score - a.score);
		const topN = structuralMode === "topN" ? STRUCTURAL_TOPN_FAST : STRUCTURAL_TOPN_FULL;
		const indexByHit = /* @__PURE__ */ new Map();
		parseResult.hits.forEach((hit, i) => indexByHit.set(hit, i));
		const candidates = pass1.slice(0, Math.min(topN, pass1.length)).map((sh) => ({
			hit: sh.hit,
			index: indexByHit.get(sh.hit) ?? -1
		})).filter((e) => e.index >= 0);
		const structural = await runStructuralPass({
			hitsRanked: candidates,
			workspaceRoot: ws.canonical,
			topN,
			budgetMs: STRUCTURAL_BUDGET_MS,
			signal: ac.signal
		});
		notice = structural.fallback;
		// Pass 2: re-score with the AST-confirmed set, then prune and cap.
		const rescored = bm25fScore(parseResult.hits, queryTokens, structural.confirmedHitIndexes);
		const pruned = shoulderPrune(rescored);
		kept = pruned.kept.slice(0, limit);
		prunedBelowShoulder = pruned.prunedBelowShoulder;
	} else {
		kept = parseResult.hits.map((h) => ({
			hit: h,
			score: 0,
			field_contributions: {}
		}));
	}

	const results = kept.map((sh) => {
		// Normalize to a forward-slash path without a leading "./".
		let file = sh.hit.file;
		if (file.startsWith("./") || file.startsWith(".\\")) file = file.slice(2);
		file = file.replace(/\\/g, "/");
		const scoring = isRanked ? {
			score: round4(sh.score),
			field_contributions: {
				match_line: round4(sh.field_contributions.match_line ?? 0),
				symbol_context: round4(sh.field_contributions.symbol_context ?? 0),
				file_path: round4(sh.field_contributions.file_path ?? 0),
				context: round4(sh.field_contributions.context ?? 0)
			}
		} : { field_contributions: null };
		return {
			file,
			line: sh.hit.line,
			snippet: renderSnippet(sh.hit),
			match_byte_range: [sh.hit.match_start, sh.hit.match_end],
			...scoring
		};
	});

	const elapsed_ms = Date.now() - t0;
	// Query and workspace are logged only when explicitly enabled.
	const debugLog = process.env.GH_ROUTER_DEBUG_CODE_SEARCH === "1";
	consola.info(`[code_search] mode=${mode} structural=${structuralMode} expansion=${expansion ? expansion.length : 0} results=${results.length} truncated=${parseResult.truncated} scanned_files=${parseResult.scannedFiles} elapsed_ms=${elapsed_ms} abort=${parseResult.cancelled} rg=${rgResolution.source} notice=${notice ? "yes" : "no"}` + (debugLog ? ` query="${rawInput.query}" workspace="${ws.canonical}"` : ""));
	return {
		results,
		truncated: parseResult.truncated,
		pruned_below_shoulder: isRanked ? prunedBelowShoulder : void 0,
		scanned_files: parseResult.scannedFiles,
		elapsed_ms,
		ranking: isRanked ? {
			algorithm: "BM25F",
			citation: "Robertson, Zaragoza, Taylor 2004",
			k1: BM25F_K1
		} : { algorithm: "ripgrep_document_order" },
		notice
	};
}
|
|
3037
|
+
/** Round `x` to four decimal places using Math.round. */
function round4(x) {
	const scale = 1e4;
	const scaled = Math.round(x * scale);
	return scaled / scale;
}
|
|
3040
|
+
|
|
1789
3041
|
//#endregion
|
|
1790
3042
|
//#region src/services/copilot/web-search.ts
|
|
1791
3043
|
const RpcSchema = z.object({
|
|
@@ -2263,7 +3515,7 @@ function buildPeerAwarenessSnippet(opts) {
|
|
|
2263
3515
|
return [
|
|
2264
3516
|
"## Peer review and advisor",
|
|
2265
3517
|
"",
|
|
2266
|
-
`Cross-lab peer critics under \`mcp__gh-router-peers__*\` — ${criticList.join(", ")} — plus the \`peer-review-coordinator\` fan-out subagent, and Claude Code's built-in \`advisor\` tool, are available at your discretion for second opinions and adversarial review. Subagents you spawn inherit them.${codexCliClause}
|
|
3518
|
+
`Cross-lab peer critics under \`mcp__gh-router-peers__*\` — ${criticList.join(", ")} — plus the \`peer-review-coordinator\` fan-out subagent, and Claude Code's built-in \`advisor\` tool, are available at your discretion for second opinions and adversarial review. Subagents you spawn inherit them.${codexCliClause} Also \`mcp__gh-router-peers__code_search\` for accurate ranked code discovery (BM25F + tree-sitter) — prefer it over \`Grep\` when finding definitions or call sites.`
|
|
2267
3519
|
].join("\n");
|
|
2268
3520
|
}
|
|
2269
3521
|
/** Convenience: every persona that should be registered for the given mode. */
|
|
@@ -2328,6 +3580,94 @@ const NON_PERSONA_MCP_TOOLS = Object.freeze([{
|
|
|
2328
3580
|
};
|
|
2329
3581
|
}
|
|
2330
3582
|
}
|
|
3583
|
+
}, {
|
|
3584
|
+
toolNameHttp: "code_search",
|
|
3585
|
+
description: "Fast structured code search over a local workspace. Returns ranked, deduplicated hits with snippets. Ranks with BM25F across matched-line / file-path / surrounding-context / symbol-context fields, then refines `symbol-context` with tree-sitter AST analysis on the top hits so identifier definitions outrank incidental string matches. Prefer this over Grep/Bash+grep for ranked discovery (\"where is X defined\", \"which files reference Y\", \"find code that does Z\") — ranked mode surfaces the few right answers instead of every match. Use Grep for exact-pattern enumeration when you need every hit unranked, and Glob for file-name patterns (no content match). `workspace` is any absolute path the proxy process can read — typically the project root or a sub-tree you're working in.",
|
|
3586
|
+
inputSchema: {
|
|
3587
|
+
type: "object",
|
|
3588
|
+
required: ["query", "workspace"],
|
|
3589
|
+
additionalProperties: false,
|
|
3590
|
+
properties: {
|
|
3591
|
+
query: {
|
|
3592
|
+
type: "string",
|
|
3593
|
+
description: "Search text. In 'ranked' (default) and 'literal' modes, interpreted as a literal string. In 'regex' mode, interpreted as a PCRE2 regex. In 'ranked' and 'literal' modes, single-identifier queries are auto-expanded across camelCase / snake_case / kebab-case / SCREAMING_SNAKE skeletons so `getUserName` also matches `get_user_name`."
|
|
3594
|
+
},
|
|
3595
|
+
workspace: {
|
|
3596
|
+
type: "string",
|
|
3597
|
+
description: "Absolute path to the project root (or sub-tree) to search."
|
|
3598
|
+
},
|
|
3599
|
+
mode: {
|
|
3600
|
+
type: "string",
|
|
3601
|
+
enum: [
|
|
3602
|
+
"ranked",
|
|
3603
|
+
"literal",
|
|
3604
|
+
"regex"
|
|
3605
|
+
],
|
|
3606
|
+
description: "Ranking mode. 'ranked' (default): BM25F + tree-sitter structural boost; results ordered by score with shoulder pruning (drops results below 50% of the top score). 'literal': fixed-string search, ripgrep document order. 'regex': PCRE2 search, ripgrep document order."
|
|
3607
|
+
},
|
|
3608
|
+
file_glob: {
|
|
3609
|
+
type: "string",
|
|
3610
|
+
description: "Optional ripgrep glob filter (e.g. 'src/**/*.ts')."
|
|
3611
|
+
},
|
|
3612
|
+
limit: {
|
|
3613
|
+
type: "number",
|
|
3614
|
+
description: "Max hits to return (default 20)."
|
|
3615
|
+
},
|
|
3616
|
+
structural: {
|
|
3617
|
+
type: "string",
|
|
3618
|
+
enum: ["full", "topN"],
|
|
3619
|
+
description: "Structural-ranking depth (ranked mode only). 'full' (default) runs tree-sitter on the top 50 BM25F hits — best signal, fine for typical repos. 'topN' restricts to the top 10 for tighter latency on very large workspaces. Both modes share a 200ms wall-clock budget; on budget exhaustion the response includes `notice` and remaining hits fall back to the regex symbol heuristic."
|
|
3620
|
+
}
|
|
3621
|
+
}
|
|
3622
|
+
},
|
|
3623
|
+
async handler(args, signal) {
|
|
3624
|
+
try {
|
|
3625
|
+
const result = await searchCode({
|
|
3626
|
+
query: typeof args.query === "string" ? args.query : "",
|
|
3627
|
+
workspace: typeof args.workspace === "string" ? args.workspace : "",
|
|
3628
|
+
mode: args.mode === "literal" || args.mode === "regex" || args.mode === "ranked" ? args.mode : void 0,
|
|
3629
|
+
file_glob: typeof args.file_glob === "string" ? args.file_glob : void 0,
|
|
3630
|
+
limit: typeof args.limit === "number" ? args.limit : void 0,
|
|
3631
|
+
structural: args.structural === "full" || args.structural === "topN" ? args.structural : void 0
|
|
3632
|
+
}, signal);
|
|
3633
|
+
const SIZE_CAP_BYTES = 256 * 1024;
|
|
3634
|
+
const trimmedHits = [];
|
|
3635
|
+
let totalBytes = 0;
|
|
3636
|
+
let sizeCapped = false;
|
|
3637
|
+
for (const hit of result.results) {
|
|
3638
|
+
const next = {
|
|
3639
|
+
file: hit.file,
|
|
3640
|
+
line: hit.line,
|
|
3641
|
+
snippet: hit.snippet
|
|
3642
|
+
};
|
|
3643
|
+
const nextBytes = Buffer.byteLength(JSON.stringify(next), "utf8");
|
|
3644
|
+
if (trimmedHits.length > 0 && totalBytes + nextBytes > SIZE_CAP_BYTES) {
|
|
3645
|
+
sizeCapped = true;
|
|
3646
|
+
break;
|
|
3647
|
+
}
|
|
3648
|
+
trimmedHits.push(next);
|
|
3649
|
+
totalBytes += nextBytes;
|
|
3650
|
+
}
|
|
3651
|
+
const minimal = {
|
|
3652
|
+
results: trimmedHits,
|
|
3653
|
+
truncated: result.truncated || sizeCapped
|
|
3654
|
+
};
|
|
3655
|
+
if (sizeCapped) minimal.notice = `response size limit reached at ${trimmedHits.length} hits (~${Math.round(totalBytes / 1024)}KB); narrow your query or lower 'limit' to get all relevant matches`;
|
|
3656
|
+
else if (typeof result.notice === "string") minimal.notice = result.notice;
|
|
3657
|
+
return { content: [{
|
|
3658
|
+
type: "text",
|
|
3659
|
+
text: JSON.stringify(minimal)
|
|
3660
|
+
}] };
|
|
3661
|
+
} catch (err) {
|
|
3662
|
+
return {
|
|
3663
|
+
content: [{
|
|
3664
|
+
type: "text",
|
|
3665
|
+
text: `code_search failed: ${err instanceof Error ? err.message : String(err)}`
|
|
3666
|
+
}],
|
|
3667
|
+
isError: true
|
|
3668
|
+
};
|
|
3669
|
+
}
|
|
3670
|
+
}
|
|
2331
3671
|
}]);
|
|
2332
3672
|
|
|
2333
3673
|
//#endregion
|
|
@@ -2835,8 +4175,8 @@ const ENDPOINT_ALIASES = {
|
|
|
2835
4175
|
* - the model has no `supported_endpoints` field (backward-compat)
|
|
2836
4176
|
* - the endpoint is listed in `supported_endpoints`
|
|
2837
4177
|
*/
|
|
2838
|
-
function modelSupportsEndpoint(modelId, path$
|
|
2839
|
-
const endpoint = ENDPOINT_ALIASES[path$
|
|
4178
|
+
function modelSupportsEndpoint(modelId, path$2) {
|
|
4179
|
+
const endpoint = ENDPOINT_ALIASES[path$2] ?? path$2;
|
|
2840
4180
|
const model = state.models?.data.find((m) => m.id === modelId);
|
|
2841
4181
|
if (!model) return true;
|
|
2842
4182
|
const supported = model.supported_endpoints;
|
|
@@ -2847,17 +4187,17 @@ function modelSupportsEndpoint(modelId, path$1) {
|
|
|
2847
4187
|
* Log an error when a model is used on an endpoint it doesn't support.
|
|
2848
4188
|
* Returns `true` if a mismatch was detected (for testing).
|
|
2849
4189
|
*/
|
|
2850
|
-
function logEndpointMismatch(modelId, path$
|
|
2851
|
-
if (modelSupportsEndpoint(modelId, path$
|
|
4190
|
+
function logEndpointMismatch(modelId, path$2) {
|
|
4191
|
+
if (modelSupportsEndpoint(modelId, path$2)) return false;
|
|
2852
4192
|
const supported = (state.models?.data.find((m) => m.id === modelId))?.supported_endpoints ?? [];
|
|
2853
|
-
consola.error(`Model "${modelId}" does not support ${path$
|
|
4193
|
+
consola.error(`Model "${modelId}" does not support ${path$2}. Supported endpoints: ${supported.join(", ")}`);
|
|
2854
4194
|
return true;
|
|
2855
4195
|
}
|
|
2856
4196
|
/**
|
|
2857
4197
|
* Return model IDs that support the given endpoint.
|
|
2858
4198
|
*/
|
|
2859
|
-
function listModelsForEndpoint(path$
|
|
2860
|
-
const endpoint = ENDPOINT_ALIASES[path$
|
|
4199
|
+
function listModelsForEndpoint(path$2) {
|
|
4200
|
+
const endpoint = ENDPOINT_ALIASES[path$2] ?? path$2;
|
|
2861
4201
|
return (state.models?.data ?? []).filter((m) => {
|
|
2862
4202
|
const supported = m.supported_endpoints;
|
|
2863
4203
|
if (!supported || supported.length === 0) return true;
|
|
@@ -2914,7 +4254,7 @@ function initProxyFromEnv() {
|
|
|
2914
4254
|
//#endregion
|
|
2915
4255
|
//#region package.json
|
|
2916
4256
|
var name = "github-router";
|
|
2917
|
-
var version = "0.3.
|
|
4257
|
+
var version = "0.3.28";
|
|
2918
4258
|
|
|
2919
4259
|
//#endregion
|
|
2920
4260
|
//#region src/lib/approval.ts
|