rbxstudio-mcp 2.3.1 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -14
- package/dist/__tests__/bridge-service.test.js +25 -13
- package/dist/__tests__/bridge-service.test.js.map +1 -1
- package/dist/__tests__/bridge-session.test.d.ts +2 -0
- package/dist/__tests__/bridge-session.test.d.ts.map +1 -0
- package/dist/__tests__/bridge-session.test.js +171 -0
- package/dist/__tests__/bridge-session.test.js.map +1 -0
- package/dist/__tests__/chunker.test.d.ts +2 -0
- package/dist/__tests__/chunker.test.d.ts.map +1 -0
- package/dist/__tests__/chunker.test.js +201 -0
- package/dist/__tests__/chunker.test.js.map +1 -0
- package/dist/__tests__/docs-core.test.d.ts +2 -0
- package/dist/__tests__/docs-core.test.d.ts.map +1 -0
- package/dist/__tests__/docs-core.test.js +137 -0
- package/dist/__tests__/docs-core.test.js.map +1 -0
- package/dist/__tests__/docs-fetcher.test.d.ts +2 -0
- package/dist/__tests__/docs-fetcher.test.d.ts.map +1 -0
- package/dist/__tests__/docs-fetcher.test.js +173 -0
- package/dist/__tests__/docs-fetcher.test.js.map +1 -0
- package/dist/__tests__/helpers.d.ts +8 -0
- package/dist/__tests__/helpers.d.ts.map +1 -0
- package/dist/__tests__/helpers.js +23 -0
- package/dist/__tests__/helpers.js.map +1 -0
- package/dist/__tests__/http-routes.test.d.ts +2 -0
- package/dist/__tests__/http-routes.test.d.ts.map +1 -0
- package/dist/__tests__/http-routes.test.js +233 -0
- package/dist/__tests__/http-routes.test.js.map +1 -0
- package/dist/__tests__/http-server.test.js +13 -6
- package/dist/__tests__/http-server.test.js.map +1 -1
- package/dist/__tests__/integration.test.js +9 -4
- package/dist/__tests__/integration.test.js.map +1 -1
- package/dist/__tests__/semantic-search.test.d.ts +2 -0
- package/dist/__tests__/semantic-search.test.d.ts.map +1 -0
- package/dist/__tests__/semantic-search.test.js +202 -0
- package/dist/__tests__/semantic-search.test.js.map +1 -0
- package/dist/__tests__/smoke.test.js +7 -3
- package/dist/__tests__/smoke.test.js.map +1 -1
- package/dist/__tests__/studio-client.test.d.ts +2 -0
- package/dist/__tests__/studio-client.test.d.ts.map +1 -0
- package/dist/__tests__/studio-client.test.js +25 -0
- package/dist/__tests__/studio-client.test.js.map +1 -0
- package/dist/__tests__/tool-nudges.test.d.ts +2 -0
- package/dist/__tests__/tool-nudges.test.d.ts.map +1 -0
- package/dist/__tests__/tool-nudges.test.js +60 -0
- package/dist/__tests__/tool-nudges.test.js.map +1 -0
- package/dist/__tests__/tool-registry.test.d.ts +2 -0
- package/dist/__tests__/tool-registry.test.d.ts.map +1 -0
- package/dist/__tests__/tool-registry.test.js +365 -0
- package/dist/__tests__/tool-registry.test.js.map +1 -0
- package/dist/__tests__/tools-bridge.test.d.ts +2 -0
- package/dist/__tests__/tools-bridge.test.d.ts.map +1 -0
- package/dist/__tests__/tools-bridge.test.js +396 -0
- package/dist/__tests__/tools-bridge.test.js.map +1 -0
- package/dist/__tests__/tools-docs.test.d.ts +2 -0
- package/dist/__tests__/tools-docs.test.d.ts.map +1 -0
- package/dist/__tests__/tools-docs.test.js +112 -0
- package/dist/__tests__/tools-docs.test.js.map +1 -0
- package/dist/__tests__/tools-guards.test.d.ts +2 -0
- package/dist/__tests__/tools-guards.test.d.ts.map +1 -0
- package/dist/__tests__/tools-guards.test.js +131 -0
- package/dist/__tests__/tools-guards.test.js.map +1 -0
- package/dist/__tests__/tools-runtime.test.d.ts +2 -0
- package/dist/__tests__/tools-runtime.test.d.ts.map +1 -0
- package/dist/__tests__/tools-runtime.test.js +214 -0
- package/dist/__tests__/tools-runtime.test.js.map +1 -0
- package/dist/__tests__/tools-visual.test.d.ts +2 -0
- package/dist/__tests__/tools-visual.test.d.ts.map +1 -0
- package/dist/__tests__/tools-visual.test.js +149 -0
- package/dist/__tests__/tools-visual.test.js.map +1 -0
- package/dist/bridge-service.d.ts +99 -12
- package/dist/bridge-service.d.ts.map +1 -1
- package/dist/bridge-service.js +238 -21
- package/dist/bridge-service.js.map +1 -1
- package/dist/docs/cache.d.ts +50 -0
- package/dist/docs/cache.d.ts.map +1 -0
- package/dist/docs/cache.js +123 -0
- package/dist/docs/cache.js.map +1 -0
- package/dist/docs/embeddings/chunker.d.ts +120 -0
- package/dist/docs/embeddings/chunker.d.ts.map +1 -0
- package/dist/docs/embeddings/chunker.js +395 -0
- package/dist/docs/embeddings/chunker.js.map +1 -0
- package/dist/docs/embeddings/embedder.d.ts +41 -0
- package/dist/docs/embeddings/embedder.d.ts.map +1 -0
- package/dist/docs/embeddings/embedder.js +113 -0
- package/dist/docs/embeddings/embedder.js.map +1 -0
- package/dist/docs/embeddings/index.d.ts +102 -0
- package/dist/docs/embeddings/index.d.ts.map +1 -0
- package/dist/docs/embeddings/index.js +250 -0
- package/dist/docs/embeddings/index.js.map +1 -0
- package/dist/docs/embeddings/manager.d.ts +68 -0
- package/dist/docs/embeddings/manager.d.ts.map +1 -0
- package/dist/docs/embeddings/manager.js +97 -0
- package/dist/docs/embeddings/manager.js.map +1 -0
- package/dist/docs/fetcher.d.ts +29 -0
- package/dist/docs/fetcher.d.ts.map +1 -0
- package/dist/docs/fetcher.js +244 -0
- package/dist/docs/fetcher.js.map +1 -0
- package/dist/docs/reference.d.ts +37 -0
- package/dist/docs/reference.d.ts.map +1 -0
- package/dist/docs/reference.js +108 -0
- package/dist/docs/reference.js.map +1 -0
- package/dist/docs/search.d.ts +194 -0
- package/dist/docs/search.d.ts.map +1 -0
- package/dist/docs/search.js +733 -0
- package/dist/docs/search.js.map +1 -0
- package/dist/http-server.d.ts.map +1 -1
- package/dist/http-server.js +52 -5
- package/dist/http-server.js.map +1 -1
- package/dist/index.d.ts +8 -9
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +35 -1035
- package/dist/index.js.map +1 -1
- package/dist/instructions.d.ts +15 -0
- package/dist/instructions.d.ts.map +1 -0
- package/dist/instructions.js +26 -0
- package/dist/instructions.js.map +1 -0
- package/dist/tools/defs/attributes.d.ts +6 -0
- package/dist/tools/defs/attributes.d.ts.map +1 -0
- package/dist/tools/defs/attributes.js +85 -0
- package/dist/tools/defs/attributes.js.map +1 -0
- package/dist/tools/defs/docs.d.ts +17 -0
- package/dist/tools/defs/docs.d.ts.map +1 -0
- package/dist/tools/defs/docs.js +151 -0
- package/dist/tools/defs/docs.js.map +1 -0
- package/dist/tools/defs/execute.d.ts +6 -0
- package/dist/tools/defs/execute.d.ts.map +1 -0
- package/dist/tools/defs/execute.js +21 -0
- package/dist/tools/defs/execute.js.map +1 -0
- package/dist/tools/defs/inspection.d.ts +7 -0
- package/dist/tools/defs/inspection.d.ts.map +1 -0
- package/dist/tools/defs/inspection.js +202 -0
- package/dist/tools/defs/inspection.js.map +1 -0
- package/dist/tools/defs/objects.d.ts +6 -0
- package/dist/tools/defs/objects.d.ts.map +1 -0
- package/dist/tools/defs/objects.js +111 -0
- package/dist/tools/defs/objects.js.map +1 -0
- package/dist/tools/defs/properties.d.ts +6 -0
- package/dist/tools/defs/properties.d.ts.map +1 -0
- package/dist/tools/defs/properties.js +71 -0
- package/dist/tools/defs/properties.js.map +1 -0
- package/dist/tools/defs/runtime.d.ts +6 -0
- package/dist/tools/defs/runtime.d.ts.map +1 -0
- package/dist/tools/defs/runtime.js +145 -0
- package/dist/tools/defs/runtime.js.map +1 -0
- package/dist/tools/defs/scripts.d.ts +18 -0
- package/dist/tools/defs/scripts.d.ts.map +1 -0
- package/dist/tools/defs/scripts.js +163 -0
- package/dist/tools/defs/scripts.js.map +1 -0
- package/dist/tools/defs/tags.d.ts +6 -0
- package/dist/tools/defs/tags.d.ts.map +1 -0
- package/dist/tools/defs/tags.js +74 -0
- package/dist/tools/defs/tags.js.map +1 -0
- package/dist/tools/defs/visual.d.ts +7 -0
- package/dist/tools/defs/visual.d.ts.map +1 -0
- package/dist/tools/defs/visual.js +208 -0
- package/dist/tools/defs/visual.js.map +1 -0
- package/dist/tools/index.d.ts +101 -25
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +580 -63
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/nudges.d.ts +25 -0
- package/dist/tools/nudges.d.ts.map +1 -0
- package/dist/tools/nudges.js +34 -0
- package/dist/tools/nudges.js.map +1 -0
- package/dist/tools/registry.d.ts +20 -0
- package/dist/tools/registry.d.ts.map +1 -0
- package/dist/tools/registry.js +65 -0
- package/dist/tools/registry.js.map +1 -0
- package/dist/tools/types.d.ts +24 -0
- package/dist/tools/types.d.ts.map +1 -0
- package/dist/tools/types.js +2 -0
- package/dist/tools/types.js.map +1 -0
- package/package.json +7 -6
- package/studio-plugin/MCPPlugin.rbxmx +3 -238
- package/studio-plugin/plugin.luau +2041 -365
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
import { promises as fs } from 'fs';
|
|
2
|
+
import * as path from 'path';
|
|
3
|
+
import { contentRoot } from './cache.js';
|
|
4
|
+
import { getOrBuild } from './embeddings/manager.js';
|
|
5
|
+
import { encodeOne, dot } from './embeddings/embedder.js';
|
|
6
|
+
// File extensions (lower-case, no leading dot) that are searched when the
// caller does not supply `options.extensions`.
const DEFAULT_EXTENSIONS = ['md', 'yaml', 'yml'];
|
|
7
|
+
/**
 * Escape every regex metacharacter in `literal` so the result can be used
 * as a `RegExp` source that matches the input text verbatim.
 *
 * @param {string} literal - Raw text that should be matched literally.
 * @returns {string} `literal` with each metacharacter prefixed by a backslash.
 */
function escapeRegex(literal) {
    const metaChars = /[.*+?^${}()|[\]\\]/g;
    return literal.replace(metaChars, (ch) => `\\${ch}`);
}
|
|
10
|
+
/**
 * Compile a search pattern into a global `RegExp`.
 *
 * @param {string} pattern - Regex source when `opts.useRegex` is truthy,
 *   otherwise literal text (escaped before compilation).
 * @param {{useRegex?: boolean, caseSensitive?: boolean}} opts - Match options.
 * @returns {RegExp} Regex carrying the `g` flag, plus `i` unless
 *   `opts.caseSensitive` is truthy. Because of `g`, callers that use
 *   `.test()` repeatedly must reset `lastIndex` themselves.
 * @throws {SyntaxError} When `opts.useRegex` is set and `pattern` is not a
 *   valid regular expression.
 */
function buildPattern(pattern, opts) {
    let source = pattern;
    if (!opts.useRegex) {
        source = escapeRegex(pattern);
    }
    let flags = 'g';
    if (!opts.caseSensitive) {
        flags += 'i';
    }
    return new RegExp(source, flags);
}
|
|
15
|
+
/**
 * Split a query into shell-style tokens.
 *
 * Whitespace separates tokens, except that a double-quoted run such as
 * `"foo bar"` is kept as a single token with the quotes removed. This lets
 * callers force a phrase match (e.g. `"Class.Motor6D"` for the literal
 * dotted form). There is no escape syntax inside quotes: a quoted token
 * simply ends at the next `"`. An unterminated quote is treated as an
 * ordinary character of a bare token.
 *
 * @param {string} query - Raw user query.
 * @returns {string[]} Tokens in order of appearance; empty for an empty or
 *   whitespace-only query. Empty quoted strings (`""`) are dropped.
 */
function tokenize(query) {
    // Alternative 1: "...quoted phrase..." (captured without the quotes).
    // Alternative 2: a bare run of non-whitespace characters.
    const tokenRe = /"([^"]*)"|(\S+)/g;
    const found = [];
    for (const [, quoted, bare] of String(query).matchAll(tokenRe)) {
        const piece = quoted ?? bare;
        if (piece) {
            found.push(piece);
        }
    }
    return found;
}
|
|
36
|
+
/**
|
|
37
|
+
* Common English stopwords that wreck keyword AND-recall on natural-
|
|
38
|
+
* language queries like "how do I rotate a body part smoothly" — words
|
|
39
|
+
* like "how", "do", "I", "a" almost never appear together with the
|
|
40
|
+
* meaningful tokens in API docs, so requiring them in the AND filter
|
|
41
|
+
* zeroes out every hit.
|
|
42
|
+
*
|
|
43
|
+
* We strip these from the KEYWORD filter only — the semantic embedding
|
|
44
|
+
* still sees the full natural-language query (the model benefits from
|
|
45
|
+
* the question framing).
|
|
46
|
+
*
|
|
47
|
+
* Punctuation and short tokens (1 char) are also dropped because they
|
|
48
|
+
* don't carry meaning and dilute keyword density. Keep this list
|
|
49
|
+
* conservative: anything domain-specific (`set`, `get`, `play`, etc.)
|
|
50
|
+
* stays in — we'd rather under-strip than lose a Roblox-relevant term.
|
|
51
|
+
*/
|
|
52
|
+
const STOPWORDS = new Set([
|
|
53
|
+
'a', 'an', 'the',
|
|
54
|
+
'i', 'me', 'my', 'you', 'your', 'we', 'our', 'us', 'they', 'them', 'their', 'it', 'its',
|
|
55
|
+
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'whose',
|
|
56
|
+
'do', 'does', 'did', 'doing',
|
|
57
|
+
'is', 'am', 'are', 'was', 'were', 'be', 'been', 'being',
|
|
58
|
+
'have', 'has', 'had', 'having',
|
|
59
|
+
'can', 'could', 'should', 'would', 'may', 'might', 'will', 'shall', 'must',
|
|
60
|
+
'to', 'of', 'for', 'in', 'on', 'at', 'by', 'with', 'from', 'as', 'into', 'onto', 'about',
|
|
61
|
+
'and', 'or', 'but', 'if', 'so', 'than', 'then', 'because',
|
|
62
|
+
'that', 'this', 'these', 'those',
|
|
63
|
+
'there', 'here',
|
|
64
|
+
'not', 'no',
|
|
65
|
+
'some', 'any', 'all', 'each', 'every',
|
|
66
|
+
'just', 'only', 'also', 'very', 'really', 'still',
|
|
67
|
+
'one', 'two',
|
|
68
|
+
'something',
|
|
69
|
+
]);
|
|
70
|
+
/**
|
|
71
|
+
* Strip stopwords (and 1-char garbage) from a token list, returning the
|
|
72
|
+
* "meaningful" tokens. Always preserves at least the longest input
|
|
73
|
+
* token even if it happens to be a stopword — better to AND on
|
|
74
|
+
* something than nothing.
|
|
75
|
+
*/
|
|
76
|
+
function meaningfulTokens(tokens) {
|
|
77
|
+
const kept = tokens.filter((t) => t.length >= 2 && !STOPWORDS.has(t.toLowerCase()));
|
|
78
|
+
if (kept.length > 0)
|
|
79
|
+
return kept;
|
|
80
|
+
// Degenerate: query was all stopwords (e.g. "what is the"). Fall
|
|
81
|
+
// back to the longest input token so something gets through.
|
|
82
|
+
const longest = tokens.slice().sort((a, b) => b.length - a.length)[0];
|
|
83
|
+
return longest ? [longest] : [];
|
|
84
|
+
}
|
|
85
|
+
/**
 * Count the set bits of `n`, interpreted as a 32-bit integer.
 *
 * Each iteration clears the lowest set bit (`x & (x - 1)`), so the loop
 * runs once per set bit.
 *
 * @param {number} n - Bitmask to inspect.
 * @returns {number} Number of 1-bits in `n`.
 */
function popcount(n) {
    let bits = 0;
    for (let rest = n; rest; rest &= rest - 1) {
        bits += 1;
    }
    return bits;
}
|
|
93
|
+
/**
 * Lazily walk a directory tree and yield the path of every regular file
 * whose extension is listed in `exts`.
 *
 * The walk uses an explicit stack instead of recursion, so deep trees do
 * not grow the call stack and the full file list is never held in memory.
 * Directories that cannot be read are skipped silently (best-effort).
 *
 * @param {string} root - Directory that bounds the walk.
 * @param {string | undefined} scope - Optional sub-path of `root` to start
 *   from. A scope that resolves outside `root` (e.g. `..`) yields nothing.
 * @param {Set<string>} exts - Allowed extensions, lower-case, without the
 *   leading dot.
 * @yields {string} File paths built by joining onto `root` (and `scope`).
 */
async function* walkFiles(root, scope, exts) {
    const start = scope ? path.join(root, scope) : root;
    // Containment guard: `scope` comes from the caller, and a value such as
    // `../..` would otherwise let the walk read files outside the docs root.
    // Only the resolved forms are used for the check; `start` itself is left
    // untouched so yielded paths are unchanged for legitimate scopes.
    const relToRoot = path.relative(path.resolve(root), path.resolve(start));
    const escapesRoot = relToRoot === '..'
        || relToRoot.startsWith(`..${path.sep}`)
        || path.isAbsolute(relToRoot);
    if (escapesRoot) {
        return;
    }
    const pending = [start];
    while (pending.length > 0) {
        const dir = pending.pop();
        let entries;
        try {
            entries = await fs.readdir(dir, { withFileTypes: true });
        }
        catch {
            // Missing or unreadable directory (including a `scope` that
            // names a file or does not exist): skip it and keep walking.
            continue;
        }
        for (const entry of entries) {
            const fullPath = path.join(dir, entry.name);
            if (entry.isDirectory()) {
                pending.push(fullPath);
            }
            else if (entry.isFile()) {
                const ext = path.extname(entry.name).slice(1).toLowerCase();
                if (exts.has(ext)) {
                    yield fullPath;
                }
            }
        }
    }
}
|
|
121
|
+
/**
 * Top-level search entry point: picks a search strategy and dispatches.
 *
 * Strategy selection:
 *   - `options.useRegex` truthy → literal/regex search on `pattern` as-is.
 *   - Fewer than two tokens after tokenizing → literal search. A single
 *     token is searched in its tokenized form, so a query wrapped entirely
 *     in double quotes is matched without the quote characters.
 *   - Two or more tokens → multi-token search on the stopword-stripped
 *     tokens: hybrid (keyword AND + semantic rerank) unless
 *     `options.semantic` is falsy, in which case plain token-AND.
 *
 * Why token-AND for multi-word queries? A plain substring search for
 * `Motor6D C0` almost never matches, because the docs use dotted/quoted
 * forms like `Class.Motor6D.C0|C0`. Token-AND instead looks for places
 * where both `Motor6D` and `C0` occur near each other.
 *
 * @param {string} cacheDir - Docs cache directory.
 * @param {string} pattern - User query (or regex source with `useRegex`).
 * @param {object} [options] - Search options (`useRegex`, `semantic`, and
 *   whatever the selected strategy reads).
 * @param {object} [internal] - Internal parameters forwarded to the hybrid
 *   search only.
 * @returns {Promise<object>} Result object of the selected strategy.
 */
export async function searchDocs(cacheDir, pattern, options = {}, internal = {}) {
    if (options.useRegex) {
        return searchDocsLiteral(cacheDir, pattern, options);
    }
    const tokens = tokenize(pattern);
    if (tokens.length < 2) {
        // With exactly one token, search for the token rather than the raw
        // pattern so surrounding quotes/whitespace are not part of the needle.
        const literalPattern = tokens.length === 1 ? tokens[0] : pattern;
        return searchDocsLiteral(cacheDir, literalPattern, options);
    }
    // Both multi-token modes filter on the stopword-stripped tokens so that
    // "how do I ..." boilerplate does not wipe out every candidate.
    const keywordTokens = meaningfulTokens(tokens);
    const semantic = options.semantic ?? true;
    if (semantic) {
        // The hybrid path also receives the full pattern and token list: the
        // embedder is fed the original natural-language query.
        return searchDocsHybrid(cacheDir, pattern, tokens, keywordTokens, options, internal);
    }
    const andTokens = keywordTokens.length >= 1 ? keywordTokens : tokens;
    return searchDocsTokenAnd(cacheDir, andTokens, options);
}
|
|
163
|
+
/**
 * Line-by-line literal (or regex) search over the cached docs.
 *
 * Every file yielded by `walkFiles` is read as UTF-8, split into lines and
 * each line is tested against the compiled pattern. One hit is produced
 * per matching line.
 *
 * Options read here:
 *   - `extensions`: file extensions to scan (default `DEFAULT_EXTENSIONS`).
 *   - `contextLines`: lines of context on each side, clamped to 0..5.
 *   - `maxHits`: cap on returned hits, clamped to 1..1000 (default 200).
 *   - `scope`: forwarded to `walkFiles`.
 *   - `useRegex` / `caseSensitive`: forwarded to `buildPattern`.
 *
 * @param {string} cacheDir - Docs cache directory.
 * @param {string} pattern - Literal text or regex source.
 * @param {object} options - See above.
 * @returns {Promise<object>} `{ totalHits, truncated, hits, filesScanned,
 *   durationMs, mode: 'literal' }`. The scan stops at the first match found
 *   after the cap is reached, so when `truncated` is true `totalHits` is a
 *   lower bound, not the full count.
 */
async function searchDocsLiteral(cacheDir, pattern, options) {
    const startedAt = Date.now();
    const root = contentRoot(cacheDir);
    const matcher = buildPattern(pattern, options);
    const exts = new Set((options.extensions ?? DEFAULT_EXTENSIONS).map((e) => e.toLowerCase()));
    const contextRadius = Math.max(0, Math.min(options.contextLines ?? 0, 5));
    const hitCap = Math.max(1, Math.min(options.maxHits ?? 200, 1000));
    // Reported text never carries trailing whitespace.
    const stripTrailing = (s) => s.replace(/\s+$/, '');
    const hits = [];
    let totalHits = 0;
    let filesScanned = 0;
    let truncated = false;
    scan: for await (const filePath of walkFiles(root, options.scope, exts)) {
        filesScanned += 1;
        let raw;
        try {
            raw = await fs.readFile(filePath, 'utf8');
        }
        catch {
            // Unreadable file: skip it (it still counts as scanned).
            continue;
        }
        // Binary guard: any NUL character anywhere in the content means the
        // file is treated as binary and skipped.
        if (raw.includes('\0')) {
            continue;
        }
        const rel = path.relative(root, filePath);
        const lines = raw.split(/\r?\n/);
        for (let idx = 0; idx < lines.length; idx++) {
            // The pattern has the `g` flag, so `.test()` is stateful; rewind
            // before every line.
            matcher.lastIndex = 0;
            if (!matcher.test(lines[idx])) {
                continue;
            }
            totalHits += 1;
            if (hits.length >= hitCap) {
                truncated = true;
                break scan;
            }
            const hit = {
                path: rel,
                line: idx + 1,
                text: stripTrailing(lines[idx]),
            };
            if (contextRadius > 0) {
                const from = Math.max(0, idx - contextRadius);
                const to = Math.min(lines.length, idx + contextRadius + 1);
                const surround = [];
                for (let k = from; k < to; k++) {
                    if (k !== idx) {
                        surround.push({ line: k + 1, text: stripTrailing(lines[k]) });
                    }
                }
                hit.context = surround;
            }
            hits.push(hit);
        }
    }
    const durationMs = Date.now() - startedAt;
    return { totalHits, truncated, hits, filesScanned, durationMs, mode: 'literal' };
}
|
|
228
|
+
/**
 * Multi-token AND search.
 *
 * For each file:
 *   1. Compute a per-line bitmask of which tokens match that line.
 *   2. Slide a window of `±windowLines` over the file. At each position
 *      `i`, OR the masks in `[i-w, i+w]`. If the union covers every token,
 *      the window is a hit.
 *   3. Report the line in that window with the most token coverage
 *      (ties → lowest line number), so `text` is the most informative
 *      line rather than an arbitrary middle.
 *   4. After a hit, continue from `reportedLine + windowLines + 1` to
 *      reduce repeated hits for the same passage.
 *
 * Implementation notes:
 *   - Bitmasks cap the token count at 31; extra tokens are dropped and the
 *     response's `tokens` field shows the tokens actually used.
 *   - `context` always contains the whole matched window AND at least
 *     `±max(windowLines, contextLines)` lines around the reported line
 *     (`contextLines` capped at 10), so every token of the match is
 *     visible to the reader even when the reported line sits at the edge
 *     of the window.
 *
 * Options read here: `extensions`, `maxHits` (1..1000, default 200),
 * `windowLines` (1..10, default 3), `contextLines`, `caseSensitive`,
 * `scope`.
 *
 * @param {string} cacheDir - Docs cache directory.
 * @param {string[]} tokensIn - Tokens that must all occur within a window.
 * @param {object} options - See above.
 * @returns {Promise<object>} `{ totalHits, truncated, hits, filesScanned,
 *   durationMs, mode: 'token-and', tokens }`.
 */
async function searchDocsTokenAnd(cacheDir, tokensIn, options) {
    const t0 = Date.now();
    const root = contentRoot(cacheDir);
    const exts = new Set((options.extensions ?? DEFAULT_EXTENSIONS).map((e) => e.toLowerCase()));
    const maxHits = Math.max(1, Math.min(options.maxHits ?? 200, 1000));
    const w = Math.max(1, Math.min(options.windowLines ?? 3, 10));
    const ctx = Math.max(w, Math.min(options.contextLines ?? 0, 10));
    // Bitmask cap — one bit per token in a 32-bit signed integer.
    const tokens = tokensIn.slice(0, 31);
    // No `g` flag, so `.test()` is stateless and needs no lastIndex reset.
    const tokenRes = tokens.map((t) => new RegExp(escapeRegex(t), options.caseSensitive ? '' : 'i'));
    // Needles for the whole-file quick reject, prepared once for all files.
    const needles = options.caseSensitive ? tokens : tokens.map((t) => t.toLowerCase());
    const allMask = tokens.length === 31 ? 0x7fffffff : (1 << tokens.length) - 1;
    const stripTrailing = (s) => s.replace(/\s+$/, '');
    const hits = [];
    let totalHits = 0;
    let filesScanned = 0;
    let truncated = false;
    outer: for await (const filePath of walkFiles(root, options.scope, exts)) {
        filesScanned++;
        let raw;
        try {
            raw = await fs.readFile(filePath, 'utf8');
        }
        catch {
            // Unreadable file: skip it (it still counts as scanned).
            continue;
        }
        // Binary guard: skip content containing a NUL character.
        if (raw.includes('\0')) {
            continue;
        }
        // Quick reject: the file must contain ALL tokens at least once.
        // Cheap substring scans avoid the per-line work for hopeless files.
        const haystack = options.caseSensitive ? raw : raw.toLowerCase();
        if (!needles.every((needle) => haystack.includes(needle))) {
            continue;
        }
        const rel = path.relative(root, filePath);
        const lines = raw.split(/\r?\n/);
        // Pass 1: per-line token bitmask.
        const tokenMask = new Array(lines.length).fill(0);
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            for (let t = 0; t < tokens.length; t++) {
                if (tokenRes[t].test(line)) {
                    tokenMask[i] |= 1 << t;
                }
            }
        }
        // Pass 2: sliding-window union. Skip ahead after a hit.
        let i = 0;
        while (i < lines.length) {
            let combined = 0;
            let bestLine = i;
            let bestCount = -1;
            const lo = Math.max(0, i - w);
            const hi = Math.min(lines.length - 1, i + w);
            for (let j = lo; j <= hi; j++) {
                combined |= tokenMask[j];
                const cnt = popcount(tokenMask[j]);
                if (cnt > bestCount || (cnt === bestCount && j < bestLine)) {
                    bestCount = cnt;
                    bestLine = j;
                }
            }
            const windowMatches = (combined & allMask) === allMask && bestCount > 0;
            if (!windowMatches) {
                i++;
                continue;
            }
            totalHits++;
            if (hits.length >= maxHits) {
                truncated = true;
                break outer;
            }
            // Context must cover the matched window [lo, hi] as well as
            // ±ctx around the reported line. The reported line can sit at
            // the window's edge, in which case ±ctx around it alone would
            // cut off the lines that carry the remaining tokens.
            const ctxLo = Math.max(0, Math.min(lo, bestLine - ctx));
            const ctxHi = Math.min(lines.length - 1, Math.max(hi, bestLine + ctx));
            const surround = [];
            for (let j = ctxLo; j <= ctxHi; j++) {
                if (j !== bestLine) {
                    surround.push({ line: j + 1, text: stripTrailing(lines[j]) });
                }
            }
            const matched = tokens.filter((_, t) => (tokenMask[bestLine] & (1 << t)) !== 0);
            hits.push({
                path: rel,
                line: bestLine + 1,
                text: stripTrailing(lines[bestLine]),
                context: surround,
                matchedTokens: matched,
            });
            // Advance past the reported line's neighbourhood so one passage
            // is not reported once per window position.
            // NOTE(review): when bestLine < i this advances the window by
            // fewer than w + 1 positions (bestLine + w + 1 can be as small
            // as i + 1), so alternating-token passages can still yield
            // several overlapping hits — confirm whether that is intended.
            i = bestLine + w + 1;
        }
    }
    return {
        totalHits,
        truncated,
        hits,
        filesScanned,
        durationMs: Date.now() - t0,
        mode: 'token-and',
        tokens,
    };
}
|
|
363
|
+
/**
 * Hybrid search: token-AND keyword filter + semantic rerank.
 *
 * Pipeline:
 *   1. Run the token-AND keyword search on the stopword-stripped tokens to
 *      get a high-recall candidate pool. `maxHits` is raised for this
 *      stage (3× the user cap, at least 50, at most 300); the user-facing
 *      cap is applied after scoring.
 *   2. Load (or build) the semantic index for `internal.docsSha`.
 *   3. Embed the full original query.
 *   4. If the keyword stage found nothing, delegate to the pure-semantic
 *      fallback.
 *   5. Otherwise score each candidate:
 *        finalScore = α · cosine(query, chunk)              // semantic
 *                   + β · (matchedTokens / totalTokens)     // keyword density
 *                   + γ · pathBoost                         // reference > guide for API-ish queries
 *      with α=0.7, β=0.2, γ=0.1. These were picked by eyeballing a handful
 *      of golden queries — see test/golden-queries.test.ts.
 *   6. Sort by finalScore, slice to the user cap, return.
 *
 * Keyword-only fallbacks (search stays usable, `semanticUsed: false`):
 *   - no `internal.docsSha`, so no index can be loaded;
 *   - index load/build throws;
 *   - query embedding throws.
 *
 * @param {string} cacheDir - Docs cache directory.
 * @param {string} pattern - Full original query (fed to the embedder).
 * @param {string[]} allTokens - All query tokens, stopwords included.
 * @param {string[]} keywordTokens - Tokens used for the keyword AND filter.
 * @param {object} options - Search options; `maxHits` is clamped to
 *   1..1000 (default 200), the rest is forwarded to the keyword stage.
 * @param {{docsSha?: string}} internal - Internal parameters.
 * @returns {Promise<object>} Result with `mode: 'hybrid'`. `truncated` is
 *   true whenever fewer hits are returned than were found.
 */
async function searchDocsHybrid(cacheDir, pattern, allTokens, keywordTokens, options, internal) {
    const t0 = Date.now();
    const userMaxHits = Math.max(1, Math.min(options.maxHits ?? 200, 1000));
    // High-recall pool: the rerank wants more candidates to choose from,
    // but the pool is capped so we never score an excessive number of hits.
    const poolMaxHits = Math.min(Math.max(userMaxHits * 3, 50), 300);
    // 1. Keyword pass on the STOPWORD-STRIPPED tokens with the larger cap.
    //    ANDing on stopwords ("how", "do", "I", "a") would empty the pool.
    const keyword = keywordTokens.length >= 1
        ? await searchDocsTokenAnd(cacheDir, keywordTokens, {
            ...options,
            maxHits: poolMaxHits,
        })
        : {
            // No usable keyword tokens at all (degenerate): use an empty
            // keyword result so the semantic fallback can take over.
            totalHits: 0,
            truncated: false,
            hits: [],
            filesScanned: 0,
            durationMs: 0,
            mode: 'token-and',
            tokens: [],
        };
    // Shared keyword-only result for every "semantic unavailable" path.
    // The pool was collected with poolMaxHits, so `keyword.truncated` does
    // not reflect the slice down to userMaxHits; recompute it here so the
    // caller is told when hits were dropped.
    const keywordOnlyResult = () => {
        const hits = keyword.hits.slice(0, userMaxHits);
        return {
            ...keyword,
            hits,
            truncated: keyword.truncated || keyword.totalHits > hits.length,
            mode: 'hybrid',
            semanticUsed: false,
            durationMs: Date.now() - t0,
            tokens: keywordTokens.slice(0, 31),
        };
    };
    // 2. Load (or build) the semantic index. Failure is not fatal.
    let index = null;
    if (internal.docsSha) {
        try {
            index = await getOrBuild(cacheDir, internal.docsSha);
        }
        catch {
            // Deliberate best-effort: fall back to keyword-only results.
            index = null;
        }
    }
    if (!index) {
        return keywordOnlyResult();
    }
    // 3. Embed the full original pattern — the keyword filter dropped the
    //    stopwords, but the embedder still gets the natural-language form.
    let qvec;
    try {
        qvec = await encodeOne(pattern);
    }
    catch {
        // Deliberate best-effort: fall back to keyword-only results.
        return keywordOnlyResult();
    }
    // 4. Empty keyword pool → pure semantic top-K from the chunk index.
    //    Those hits carry no matchedTokens (no keyword guarantee).
    if (keyword.hits.length === 0) {
        return semanticOnlyFallback(cacheDir, pattern, allTokens, keywordTokens, qvec, index, options, userMaxHits, keyword.filesScanned, t0);
    }
    // 5. Build a path → chunks lookup so the hit → chunk join is
    //    O(hits + chunks) instead of O(hits × chunks).
    const chunksByPath = new Map();
    const dim = index.meta.dim;
    for (let i = 0; i < index.chunks.length; i++) {
        const c = index.chunks[i];
        const entry = {
            startLine: c.startLine,
            endLine: c.endLine,
            vec: index.vectors.subarray(i * dim, (i + 1) * dim),
        };
        const bucket = chunksByPath.get(c.path);
        if (bucket) {
            bucket.push(entry);
        }
        else {
            chunksByPath.set(c.path, [entry]);
        }
    }
    // Heuristic: a token that starts with an upper-case letter and is
    // otherwise alphanumeric/underscore (e.g. "Motor6D", "TweenService")
    // marks the query as an API-name lookup, which boosts reference/engine
    // paths below.
    const looksApiLike = allTokens.some((t) => /^[A-Z][a-zA-Z0-9_]*$/.test(t));
    // 6. Score every keyword hit.
    const ALPHA = 0.7;
    const BETA = 0.2;
    const GAMMA = 0.1;
    const tokenCount = Math.max(1, keywordTokens.length);
    const scored = [];
    for (const hit of keyword.hits) {
        const chunksForFile = chunksByPath.get(hit.path);
        let semanticScore = 0;
        if (chunksForFile) {
            // Best similarity among the chunks that contain the hit line.
            for (const ch of chunksForFile) {
                if (hit.line < ch.startLine || hit.line > ch.endLine) {
                    continue;
                }
                const sim = dot(qvec, ch.vec);
                if (sim > semanticScore) {
                    semanticScore = sim;
                }
            }
            // Still 0 (no containing chunk, or none with positive
            // similarity): use the best similarity of any chunk in the file.
            if (semanticScore === 0) {
                for (const ch of chunksForFile) {
                    const sim = dot(qvec, ch.vec);
                    if (sim > semanticScore) {
                        semanticScore = sim;
                    }
                }
            }
        }
        const keywordDensity = (hit.matchedTokens?.length ?? 0) / tokenCount;
        const pathBoost = looksApiLike && hit.path.includes('reference/engine') ? 1 : 0;
        const finalScore = ALPHA * semanticScore + BETA * keywordDensity + GAMMA * pathBoost;
        scored.push({ ...hit, score: finalScore, semanticScore });
    }
    // 7. Sort (highest score first) and trim to the user-facing cap.
    scored.sort((a, b) => b.score - a.score);
    const finalHits = scored.slice(0, userMaxHits);
    return {
        totalHits: keyword.totalHits,
        truncated: keyword.totalHits > finalHits.length,
        hits: finalHits,
        filesScanned: keyword.filesScanned,
        durationMs: Date.now() - t0,
        mode: 'hybrid',
        tokens: keywordTokens.slice(0, 31),
        semanticUsed: true,
        keywordFiltered: true,
    };
}
|
|
526
|
+
/**
 * Pure-semantic search over the chunk index. Used as a fallback when
 * keyword AND-filtering would have killed recall (e.g. natural-language
 * queries where no single line contains every meaningful term).
 *
 * Every chunk whose path starts with `options.scope` (or every chunk when
 * no scope is set) is scored as `0.9 * sem + 0.1 * pathBoost`, where `sem`
 * is `dot(qvec, chunkVector)` and `pathBoost` is 1 for `reference/engine`
 * paths when the query contains a PascalCase token. The best-scoring
 * chunks are returned, at most two per file and at most `userMaxHits`
 * overall.
 *
 * Returns one SearchHit per picked chunk:
 *   - `path`: chunk path
 *   - `line`: line number of the anchor line (chunk startLine + offset)
 *   - `text`: the anchor line — the first non-empty, non-heading line of
 *     the chunk; a heading is used only when the chunk has no body line
 *   - `context`: up to one line before and three lines after the anchor
 *     (blank lines skipped) so the model can verify topical relevance
 *   - `score` / `semanticScore`: blended score and raw similarity
 *     (presumably cosine similarity — that holds only if the index stores
 *     unit-length vectors, which is not visible from this function)
 *   - `matchedTokens`: omitted (no keyword guarantee)
 *
 * No keyword guarantee means a hit's `text` may not contain any token
 * from the user's query. That's intentional and correct for queries
 * like "how do I rotate a body part smoothly" — the relevant doc
 * (`AlignOrientation`) doesn't use the word "rotate" in every line.
 *
 * `keywordFiltered: false` in the response signals this to the caller.
 *
 * @param {string} cacheDir - Unused here; kept for the existing call signature.
 * @param {string} pattern - Unused here; kept for the existing call signature.
 * @param {string[]} allTokens - Query tokens, used for the API-name heuristic.
 * @param {string[]} keywordTokens - Tokens echoed back in the response (first 31).
 * @param {ArrayLike<number>} qvec - Query embedding.
 * @param {object} index - Chunk index with `meta.dim`, `chunks` and a flat `vectors` typed array.
 * @param {object} options - Search options; only `scope` is read.
 * @param {number} userMaxHits - Maximum number of hits to return.
 * @param {number} filesScanned - Passed through to the response.
 * @param {number} t0 - Start timestamp (ms) used to compute `durationMs`.
 * @returns {Promise<object>} Search response with `mode: 'hybrid'`.
 */
async function semanticOnlyFallback(cacheDir, pattern, allTokens, keywordTokens, qvec, index, options, userMaxHits, filesScanned, t0) {
    const SEMANTIC_WEIGHT = 0.9;
    const PATH_BOOST_WEIGHT = 0.1;
    // Diversify: don't let a single file monopolise the result list.
    const PER_PATH_CAP = 2;
    const dim = index.meta.dim;
    const scope = options.scope;
    const looksApiLike = allTokens.some((t) => /^[A-Z][a-zA-Z0-9_]*$/.test(t));

    // 1. Score every in-scope chunk.
    const scored = [];
    for (let i = 0; i < index.chunks.length; i++) {
        const c = index.chunks[i];
        if (scope && !c.path.startsWith(scope)) {
            continue;
        }
        const vec = index.vectors.subarray(i * dim, (i + 1) * dim);
        const sem = dot(qvec, vec);
        // Small path boost for API-ish queries — same idea as hybrid.
        const pathBoost = looksApiLike && c.path.includes('reference/engine') ? 1 : 0;
        scored.push({
            idx: i,
            score: SEMANTIC_WEIGHT * sem + PATH_BOOST_WEIGHT * pathBoost,
            semanticScore: sem,
        });
    }
    scored.sort((a, b) => b.score - a.score);

    // 2. Pick the best chunks. The limit is checked BEFORE pushing so a
    //    limit of 0 yields no hits instead of one.
    const usedPerPath = new Map();
    const picked = [];
    for (const s of scored) {
        if (picked.length >= userMaxHits) {
            break;
        }
        const p = index.chunks[s.idx].path;
        const used = usedPerPath.get(p) ?? 0;
        if (used >= PER_PATH_CAP) {
            continue;
        }
        picked.push(s);
        usedPerPath.set(p, used + 1);
    }

    // 3. Turn each picked chunk into a SearchHit.
    const hits = picked.map((s) => {
        const c = index.chunks[s.idx];
        const chunkLines = c.text.split(/\r?\n/);
        // Anchor: first non-empty, non-heading line (the body is more
        // informative than a heading). A heading on the chunk's last line
        // is accepted because nothing can follow it.
        let firstNonEmpty = -1;
        let anchorOffset = -1;
        for (let j = 0; j < chunkLines.length; j++) {
            const t = chunkLines[j].trim();
            if (!t) {
                continue;
            }
            if (firstNonEmpty === -1) {
                firstNonEmpty = j;
            }
            if (j < chunkLines.length - 1 && /^#+\s/.test(t)) {
                continue;
            }
            anchorOffset = j;
            break;
        }
        if (anchorOffset === -1) {
            // No body line: fall back to the first non-empty line (a
            // heading) rather than blindly to line 0, which may be blank.
            anchorOffset = Math.max(0, firstNonEmpty);
        }
        const anchorText = chunkLines[anchorOffset] ?? '';

        // A small slice of surrounding lines as context.
        const ctxLines = [];
        const ctxStart = Math.max(0, anchorOffset - 1);
        const ctxEnd = Math.min(chunkLines.length, anchorOffset + 4);
        for (let j = ctxStart; j < ctxEnd; j++) {
            if (j === anchorOffset) {
                continue;
            }
            const lineText = chunkLines[j].replace(/\s+$/, '');
            if (!lineText) {
                continue;
            }
            ctxLines.push({ line: c.startLine + j, text: lineText });
        }
        return {
            path: c.path,
            line: c.startLine + anchorOffset,
            text: anchorText.replace(/\s+$/, ''),
            context: ctxLines,
            score: s.score,
            semanticScore: s.semanticScore,
        };
    });

    // An empty `scored` list naturally yields totalHits 0 / truncated false.
    return {
        totalHits: scored.length,
        truncated: scored.length > hits.length,
        hits,
        filesScanned,
        durationMs: Date.now() - t0,
        mode: 'hybrid',
        tokens: keywordTokens.slice(0, 31),
        semanticUsed: true,
        keywordFiltered: false,
    };
}
|
|
642
|
+
/**
 * Read a single doc file by its content-relative path.
 *
 * Two flavors of input are accepted to make the LLM-facing tool more
 * forgiving:
 *   • "en-us/reference/engine/classes/Part.yaml"  (canonical)
 *   • "Part" / "Part.yaml" — best-effort lookup under reference/engine
 *     (handled by `resolveReferenceDoc` in reference.ts, not here).
 *
 * @param {string} cacheDir - Cache directory; passed to `contentRoot` to
 *   locate the content root.
 * @param {string} relPath - Path relative to the content root. Leading
 *   slashes/backslashes are stripped after normalization.
 * @returns {Promise<{path: string, bytes: number, content: string} | null>}
 *   The normalized relative path, the size reported by `stat`, and the
 *   UTF-8 content — or `null` when `relPath` is not a string, points
 *   outside the content root, is not a regular file, or cannot be read.
 */
export async function readDocFile(cacheDir, relPath) {
    // Tool input is untrusted: a non-string would make path.normalize throw.
    if (typeof relPath !== 'string') {
        return null;
    }
    const root = contentRoot(cacheDir);
    // Normalize and reject path-traversal attempts. The comparison is
    // segment-aware so a file literally named "..notes.md" is still allowed.
    const safe = path.normalize(relPath).replace(/^[/\\]+/, '');
    if (safe === '..' || safe.startsWith(`..${path.sep}`)) {
        return null;
    }
    const full = path.join(root, safe);
    // Defense in depth: the joined path must resolve inside the root. A raw
    // string-prefix test would also accept siblings such as "<root>-other".
    const rel = path.relative(root, full);
    if (rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
        return null;
    }
    try {
        const stat = await fs.stat(full);
        if (!stat.isFile()) {
            return null;
        }
        const content = await fs.readFile(full, 'utf8');
        return { path: safe, bytes: stat.size, content };
    }
    catch {
        // Missing or unreadable files are reported as "not found".
        return null;
    }
}
|
|
671
|
+
/**
 * List one directory of the docs content tree, with pagination.
 *
 * Entries are ordered directories first, then files, alphabetically
 * (`localeCompare`) within each group. Only regular files and directories
 * are listed. File sizes are looked up only for the returned page.
 *
 * @param {string} cacheDir - Cache directory; passed to `contentRoot` to
 *   locate the content root.
 * @param {string} [relPath=''] - Directory path relative to the content
 *   root. Leading slashes/backslashes are stripped after normalization.
 * @param {{offset?: number, limit?: number}} [options] - Pagination:
 *   `offset` defaults to 0 (negative values clamp to 0); `limit` defaults
 *   to 100 and is clamped to 1..1000. Non-numeric values use the defaults.
 * @returns {Promise<{path: string, totalEntries: number, offset: number,
 *   limit: number, truncated: boolean, entries: object[]} | null>}
 *   The page of entries (`{name, type}` plus `size` for files whose stat
 *   succeeded), or `null` when `relPath` is not a string, points outside
 *   the content root, or the directory cannot be read.
 */
export async function listDocs(cacheDir, relPath = '', options = {}) {
    const DEFAULT_LIMIT = 100;
    const MAX_LIMIT = 1000;
    // Tool input is untrusted: a non-string would make path.normalize throw.
    if (typeof relPath !== 'string') {
        return null;
    }
    const root = contentRoot(cacheDir);
    const safe = path.normalize(relPath).replace(/^[/\\]+/, '');
    // Segment-aware traversal check ("..name" is a legal entry name).
    if (safe === '..' || safe.startsWith(`..${path.sep}`)) {
        return null;
    }
    const full = safe ? path.join(root, safe) : root;
    // Defense in depth: the target must resolve inside the root. A raw
    // string-prefix test would also accept siblings such as "<root>-other".
    const rel = path.relative(root, full);
    if (rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
        return null;
    }
    let entries;
    try {
        entries = await fs.readdir(full, { withFileTypes: true });
    }
    catch {
        // Missing directory, not a directory, or unreadable.
        return null;
    }
    // Build the full sorted list first (cheap — just names + types).
    // Pagination then slices into it; sizes are resolved per page below.
    const all = [];
    for (const e of entries) {
        if (e.isDirectory()) {
            all.push({ name: e.name, type: 'dir' });
        }
        else if (e.isFile()) {
            all.push({ name: e.name, type: 'file' });
        }
    }
    // Stable order: directories first, then files, alphabetic within each.
    all.sort((a, b) => {
        if (a.type !== b.type) {
            return a.type === 'dir' ? -1 : 1;
        }
        return a.name.localeCompare(b.name);
    });
    const totalEntries = all.length;
    // Guard against NaN (e.g. a non-numeric string from the caller), which
    // would otherwise produce an empty page and NaN in the response.
    const opts = options ?? {};
    const requestedOffset = Math.floor(opts.offset ?? 0);
    const offset = Number.isNaN(requestedOffset) ? 0 : Math.max(0, requestedOffset);
    const requestedLimit = Math.floor(opts.limit ?? DEFAULT_LIMIT);
    const limit = Number.isNaN(requestedLimit)
        ? DEFAULT_LIMIT
        : Math.max(1, Math.min(requestedLimit, MAX_LIMIT));
    const pageRaw = all.slice(offset, offset + limit);
    // Stat only the page we're returning; the lookups are independent, so
    // run them concurrently. Promise.all preserves the page order.
    const page = await Promise.all(pageRaw.map(async (entry) => {
        if (entry.type !== 'file') {
            return entry;
        }
        try {
            const stat = await fs.stat(path.join(full, entry.name));
            return { ...entry, size: stat.size };
        }
        catch {
            // File vanished or is unreadable: list it without a size.
            return entry;
        }
    }));
    return {
        path: safe,
        totalEntries,
        offset,
        limit,
        truncated: offset + page.length < totalEntries,
        entries: page,
    };
}
|
|
733
|
+
//# sourceMappingURL=search.js.map
|