rbxstudio-mcp 2.3.2 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175):
  1. package/README.md +67 -14
  2. package/dist/__tests__/bridge-service.test.js +25 -13
  3. package/dist/__tests__/bridge-service.test.js.map +1 -1
  4. package/dist/__tests__/bridge-session.test.d.ts +2 -0
  5. package/dist/__tests__/bridge-session.test.d.ts.map +1 -0
  6. package/dist/__tests__/bridge-session.test.js +171 -0
  7. package/dist/__tests__/bridge-session.test.js.map +1 -0
  8. package/dist/__tests__/chunker.test.d.ts +2 -0
  9. package/dist/__tests__/chunker.test.d.ts.map +1 -0
  10. package/dist/__tests__/chunker.test.js +201 -0
  11. package/dist/__tests__/chunker.test.js.map +1 -0
  12. package/dist/__tests__/docs-core.test.d.ts +2 -0
  13. package/dist/__tests__/docs-core.test.d.ts.map +1 -0
  14. package/dist/__tests__/docs-core.test.js +137 -0
  15. package/dist/__tests__/docs-core.test.js.map +1 -0
  16. package/dist/__tests__/docs-fetcher.test.d.ts +2 -0
  17. package/dist/__tests__/docs-fetcher.test.d.ts.map +1 -0
  18. package/dist/__tests__/docs-fetcher.test.js +173 -0
  19. package/dist/__tests__/docs-fetcher.test.js.map +1 -0
  20. package/dist/__tests__/helpers.d.ts +8 -0
  21. package/dist/__tests__/helpers.d.ts.map +1 -0
  22. package/dist/__tests__/helpers.js +23 -0
  23. package/dist/__tests__/helpers.js.map +1 -0
  24. package/dist/__tests__/http-routes.test.d.ts +2 -0
  25. package/dist/__tests__/http-routes.test.d.ts.map +1 -0
  26. package/dist/__tests__/http-routes.test.js +233 -0
  27. package/dist/__tests__/http-routes.test.js.map +1 -0
  28. package/dist/__tests__/http-server.test.js +13 -6
  29. package/dist/__tests__/http-server.test.js.map +1 -1
  30. package/dist/__tests__/integration.test.js +9 -4
  31. package/dist/__tests__/integration.test.js.map +1 -1
  32. package/dist/__tests__/semantic-search.test.d.ts +2 -0
  33. package/dist/__tests__/semantic-search.test.d.ts.map +1 -0
  34. package/dist/__tests__/semantic-search.test.js +202 -0
  35. package/dist/__tests__/semantic-search.test.js.map +1 -0
  36. package/dist/__tests__/smoke.test.js +7 -3
  37. package/dist/__tests__/smoke.test.js.map +1 -1
  38. package/dist/__tests__/studio-client.test.d.ts +2 -0
  39. package/dist/__tests__/studio-client.test.d.ts.map +1 -0
  40. package/dist/__tests__/studio-client.test.js +25 -0
  41. package/dist/__tests__/studio-client.test.js.map +1 -0
  42. package/dist/__tests__/tool-nudges.test.d.ts +2 -0
  43. package/dist/__tests__/tool-nudges.test.d.ts.map +1 -0
  44. package/dist/__tests__/tool-nudges.test.js +60 -0
  45. package/dist/__tests__/tool-nudges.test.js.map +1 -0
  46. package/dist/__tests__/tool-registry.test.d.ts +2 -0
  47. package/dist/__tests__/tool-registry.test.d.ts.map +1 -0
  48. package/dist/__tests__/tool-registry.test.js +365 -0
  49. package/dist/__tests__/tool-registry.test.js.map +1 -0
  50. package/dist/__tests__/tools-bridge.test.d.ts +2 -0
  51. package/dist/__tests__/tools-bridge.test.d.ts.map +1 -0
  52. package/dist/__tests__/tools-bridge.test.js +396 -0
  53. package/dist/__tests__/tools-bridge.test.js.map +1 -0
  54. package/dist/__tests__/tools-docs.test.d.ts +2 -0
  55. package/dist/__tests__/tools-docs.test.d.ts.map +1 -0
  56. package/dist/__tests__/tools-docs.test.js +112 -0
  57. package/dist/__tests__/tools-docs.test.js.map +1 -0
  58. package/dist/__tests__/tools-guards.test.d.ts +2 -0
  59. package/dist/__tests__/tools-guards.test.d.ts.map +1 -0
  60. package/dist/__tests__/tools-guards.test.js +131 -0
  61. package/dist/__tests__/tools-guards.test.js.map +1 -0
  62. package/dist/__tests__/tools-runtime.test.d.ts +2 -0
  63. package/dist/__tests__/tools-runtime.test.d.ts.map +1 -0
  64. package/dist/__tests__/tools-runtime.test.js +214 -0
  65. package/dist/__tests__/tools-runtime.test.js.map +1 -0
  66. package/dist/__tests__/tools-visual.test.d.ts +2 -0
  67. package/dist/__tests__/tools-visual.test.d.ts.map +1 -0
  68. package/dist/__tests__/tools-visual.test.js +149 -0
  69. package/dist/__tests__/tools-visual.test.js.map +1 -0
  70. package/dist/bridge-service.d.ts +99 -12
  71. package/dist/bridge-service.d.ts.map +1 -1
  72. package/dist/bridge-service.js +238 -21
  73. package/dist/bridge-service.js.map +1 -1
  74. package/dist/docs/cache.d.ts +50 -0
  75. package/dist/docs/cache.d.ts.map +1 -0
  76. package/dist/docs/cache.js +123 -0
  77. package/dist/docs/cache.js.map +1 -0
  78. package/dist/docs/embeddings/chunker.d.ts +120 -0
  79. package/dist/docs/embeddings/chunker.d.ts.map +1 -0
  80. package/dist/docs/embeddings/chunker.js +395 -0
  81. package/dist/docs/embeddings/chunker.js.map +1 -0
  82. package/dist/docs/embeddings/embedder.d.ts +41 -0
  83. package/dist/docs/embeddings/embedder.d.ts.map +1 -0
  84. package/dist/docs/embeddings/embedder.js +113 -0
  85. package/dist/docs/embeddings/embedder.js.map +1 -0
  86. package/dist/docs/embeddings/index.d.ts +102 -0
  87. package/dist/docs/embeddings/index.d.ts.map +1 -0
  88. package/dist/docs/embeddings/index.js +250 -0
  89. package/dist/docs/embeddings/index.js.map +1 -0
  90. package/dist/docs/embeddings/manager.d.ts +68 -0
  91. package/dist/docs/embeddings/manager.d.ts.map +1 -0
  92. package/dist/docs/embeddings/manager.js +97 -0
  93. package/dist/docs/embeddings/manager.js.map +1 -0
  94. package/dist/docs/fetcher.d.ts +29 -0
  95. package/dist/docs/fetcher.d.ts.map +1 -0
  96. package/dist/docs/fetcher.js +244 -0
  97. package/dist/docs/fetcher.js.map +1 -0
  98. package/dist/docs/reference.d.ts +37 -0
  99. package/dist/docs/reference.d.ts.map +1 -0
  100. package/dist/docs/reference.js +108 -0
  101. package/dist/docs/reference.js.map +1 -0
  102. package/dist/docs/search.d.ts +194 -0
  103. package/dist/docs/search.d.ts.map +1 -0
  104. package/dist/docs/search.js +733 -0
  105. package/dist/docs/search.js.map +1 -0
  106. package/dist/http-server.d.ts.map +1 -1
  107. package/dist/http-server.js +52 -5
  108. package/dist/http-server.js.map +1 -1
  109. package/dist/index.d.ts +8 -9
  110. package/dist/index.d.ts.map +1 -1
  111. package/dist/index.js +35 -1035
  112. package/dist/index.js.map +1 -1
  113. package/dist/instructions.d.ts +15 -0
  114. package/dist/instructions.d.ts.map +1 -0
  115. package/dist/instructions.js +26 -0
  116. package/dist/instructions.js.map +1 -0
  117. package/dist/tools/defs/attributes.d.ts +6 -0
  118. package/dist/tools/defs/attributes.d.ts.map +1 -0
  119. package/dist/tools/defs/attributes.js +85 -0
  120. package/dist/tools/defs/attributes.js.map +1 -0
  121. package/dist/tools/defs/docs.d.ts +17 -0
  122. package/dist/tools/defs/docs.d.ts.map +1 -0
  123. package/dist/tools/defs/docs.js +151 -0
  124. package/dist/tools/defs/docs.js.map +1 -0
  125. package/dist/tools/defs/execute.d.ts +6 -0
  126. package/dist/tools/defs/execute.d.ts.map +1 -0
  127. package/dist/tools/defs/execute.js +21 -0
  128. package/dist/tools/defs/execute.js.map +1 -0
  129. package/dist/tools/defs/inspection.d.ts +7 -0
  130. package/dist/tools/defs/inspection.d.ts.map +1 -0
  131. package/dist/tools/defs/inspection.js +202 -0
  132. package/dist/tools/defs/inspection.js.map +1 -0
  133. package/dist/tools/defs/objects.d.ts +6 -0
  134. package/dist/tools/defs/objects.d.ts.map +1 -0
  135. package/dist/tools/defs/objects.js +111 -0
  136. package/dist/tools/defs/objects.js.map +1 -0
  137. package/dist/tools/defs/properties.d.ts +6 -0
  138. package/dist/tools/defs/properties.d.ts.map +1 -0
  139. package/dist/tools/defs/properties.js +71 -0
  140. package/dist/tools/defs/properties.js.map +1 -0
  141. package/dist/tools/defs/runtime.d.ts +6 -0
  142. package/dist/tools/defs/runtime.d.ts.map +1 -0
  143. package/dist/tools/defs/runtime.js +145 -0
  144. package/dist/tools/defs/runtime.js.map +1 -0
  145. package/dist/tools/defs/scripts.d.ts +18 -0
  146. package/dist/tools/defs/scripts.d.ts.map +1 -0
  147. package/dist/tools/defs/scripts.js +163 -0
  148. package/dist/tools/defs/scripts.js.map +1 -0
  149. package/dist/tools/defs/tags.d.ts +6 -0
  150. package/dist/tools/defs/tags.d.ts.map +1 -0
  151. package/dist/tools/defs/tags.js +74 -0
  152. package/dist/tools/defs/tags.js.map +1 -0
  153. package/dist/tools/defs/visual.d.ts +7 -0
  154. package/dist/tools/defs/visual.d.ts.map +1 -0
  155. package/dist/tools/defs/visual.js +208 -0
  156. package/dist/tools/defs/visual.js.map +1 -0
  157. package/dist/tools/index.d.ts +101 -25
  158. package/dist/tools/index.d.ts.map +1 -1
  159. package/dist/tools/index.js +580 -63
  160. package/dist/tools/index.js.map +1 -1
  161. package/dist/tools/nudges.d.ts +25 -0
  162. package/dist/tools/nudges.d.ts.map +1 -0
  163. package/dist/tools/nudges.js +34 -0
  164. package/dist/tools/nudges.js.map +1 -0
  165. package/dist/tools/registry.d.ts +20 -0
  166. package/dist/tools/registry.d.ts.map +1 -0
  167. package/dist/tools/registry.js +65 -0
  168. package/dist/tools/registry.js.map +1 -0
  169. package/dist/tools/types.d.ts +24 -0
  170. package/dist/tools/types.d.ts.map +1 -0
  171. package/dist/tools/types.js +2 -0
  172. package/dist/tools/types.js.map +1 -0
  173. package/package.json +7 -6
  174. package/studio-plugin/MCPPlugin.rbxmx +3 -238
  175. package/studio-plugin/plugin.luau +2041 -365
@@ -0,0 +1,733 @@
1
+ import { promises as fs } from 'fs';
2
+ import * as path from 'path';
3
+ import { contentRoot } from './cache.js';
4
+ import { getOrBuild } from './embeddings/manager.js';
5
+ import { encodeOne, dot } from './embeddings/embedder.js';
6
// File extensions (lower-case, no leading dot) searched when the caller does
// not pass `options.extensions`. Frozen because the array is shared by every
// search call; an accidental mutation would silently change later searches.
const DEFAULT_EXTENSIONS = Object.freeze(['md', 'yaml', 'yml']);
7
/**
 * Escape every regular-expression metacharacter in `literal` so the result
 * can be used as a `RegExp` source that matches the text verbatim.
 *
 * @param {string} literal - Text to be matched literally.
 * @returns {string} `literal` with each metacharacter backslash-escaped.
 */
function escapeRegex(literal) {
    const metaChars = /[.*+?^${}()|[\]\\]/g;
    // `$&` re-inserts the matched metacharacter right after the backslash.
    return literal.replace(metaChars, '\\$&');
}
10
/**
 * Compile a search pattern into a global `RegExp`.
 *
 * @param {string} pattern - Regex source when `opts.useRegex` is truthy,
 *   otherwise literal text that is escaped before compilation.
 * @param {{useRegex?: boolean, caseSensitive?: boolean}} opts
 * @returns {RegExp} Always carries the `g` flag (plus `i` unless
 *   `opts.caseSensitive` is truthy), so `.test()` advances `lastIndex` and
 *   callers must reset it between independent inputs.
 */
function buildPattern(pattern, opts) {
    const flags = opts.caseSensitive ? 'g' : 'gi';
    if (opts.useRegex) {
        // Caller-supplied regex source is compiled as-is.
        return new RegExp(pattern, flags);
    }
    return new RegExp(escapeRegex(pattern), flags);
}
15
/**
 * Shell-style tokenizer. Splits `query` on whitespace, except that a
 * double-quoted run (`"foo bar"`) is kept as a single token with the quotes
 * removed. This lets callers force a phrase match, e.g. `"Class.Motor6D"`.
 *
 * A quote cannot be escaped inside a phrase: the phrase ends at the next
 * `"`. An unterminated quote is not treated as a phrase; its text falls
 * through to the bare-word rule and keeps the `"` character.
 *
 * @param {string} query
 * @returns {string[]} Tokens in order of appearance; empty for an empty or
 *   whitespace-only query. Empty phrases (`""`) are dropped.
 */
function tokenize(query) {
    // Alternative 1 captures the inside of a quoted phrase,
    // alternative 2 captures a bare run of non-whitespace.
    const tokenRe = /"([^"]*)"|(\S+)/g;
    const tokens = [];
    for (let match = tokenRe.exec(query); match !== null; match = tokenRe.exec(query)) {
        const [, quoted, bare] = match;
        const tok = quoted ?? bare;
        if (tok) {
            tokens.push(tok);
        }
    }
    return tokens;
}
36
/**
 * Lower-case English stopwords, grouped by role.
 *
 * Natural-language queries such as "how do I rotate a body part smoothly"
 * contain words ("how", "do", "I", "a") that rarely sit next to the
 * meaningful terms in API docs, so requiring them in an AND filter would
 * eliminate every hit. The list is deliberately conservative: words that
 * could be domain terms (`set`, `get`, `play`, ...) are not included.
 */
const STOPWORDS = new Set([
    // Articles.
    ['a', 'an', 'the'],
    // Pronouns.
    ['i', 'me', 'my', 'you', 'your', 'we', 'our', 'us', 'they', 'them', 'their', 'it', 'its'],
    // Question words.
    ['how', 'what', 'when', 'where', 'why', 'who', 'which', 'whose'],
    // Auxiliary verbs.
    ['do', 'does', 'did', 'doing'],
    ['is', 'am', 'are', 'was', 'were', 'be', 'been', 'being'],
    ['have', 'has', 'had', 'having'],
    // Modals.
    ['can', 'could', 'should', 'would', 'may', 'might', 'will', 'shall', 'must'],
    // Prepositions.
    ['to', 'of', 'for', 'in', 'on', 'at', 'by', 'with', 'from', 'as', 'into', 'onto', 'about'],
    // Conjunctions.
    ['and', 'or', 'but', 'if', 'so', 'than', 'then', 'because'],
    // Demonstratives and locatives.
    ['that', 'this', 'these', 'those'],
    ['there', 'here'],
    // Negation.
    ['not', 'no'],
    // Quantifiers.
    ['some', 'any', 'all', 'each', 'every'],
    // Fillers.
    ['just', 'only', 'also', 'very', 'really', 'still'],
    ['one', 'two'],
    ['something'],
].flat());
70
/**
 * Reduce a token list to the tokens worth AND-ing on.
 *
 * Drops tokens shorter than two characters and tokens whose lower-cased
 * form is in `STOPWORDS`. If that removes everything (e.g. "what is the"),
 * falls back to the single longest input token (the first one on a length
 * tie) so the keyword filter still has something to work with.
 *
 * @param {string[]} tokens
 * @returns {string[]} Kept tokens in input order; a one-element array in
 *   the all-stopword case; empty only when there is no non-empty token.
 */
function meaningfulTokens(tokens) {
    const kept = [];
    for (const tok of tokens) {
        if (tok.length >= 2 && !STOPWORDS.has(tok.toLowerCase())) {
            kept.push(tok);
        }
    }
    if (kept.length > 0) {
        return kept;
    }
    // Degenerate query: pick the first token of maximal length.
    let longest;
    for (const tok of tokens) {
        if (longest === undefined || tok.length > longest.length) {
            longest = tok;
        }
    }
    return longest ? [longest] : [];
}
85
/**
 * Count the set bits of `n`, interpreted as a 32-bit integer.
 *
 * @param {number} n - Bitmask.
 * @returns {number} Number of 1 bits.
 */
function popcount(n) {
    let count = 0;
    // `bits & (bits - 1)` clears the lowest set bit, so the loop runs once
    // per set bit.
    for (let bits = n; bits; bits &= bits - 1) {
        count++;
    }
    return count;
}
93
/**
 * Lazily yield the paths of all regular files under `root` (optionally
 * narrowed to the `scope` subdirectory) whose extension is in `exts`.
 *
 * Traversal uses an explicit stack rather than recursion, so deep trees do
 * not grow the call stack and the full file list is never held in memory.
 * Directories that cannot be read (missing, not a directory, permission
 * denied) are skipped silently; the walk is best-effort.
 *
 * `scope` is caller-supplied, so it is confined to `root`: a scope that
 * resolves outside of `root` (e.g. `../..`) yields nothing instead of
 * walking arbitrary parts of the filesystem.
 *
 * @param {string} root - Directory that bounds the walk.
 * @param {string|undefined} scope - Optional path relative to `root`.
 * @param {Set<string>} exts - Lower-case extensions without the leading dot.
 * @yields {string} Path of each matching file (`root`-joined).
 */
async function* walkFiles(root, scope, exts) {
    const start = scope ? path.join(root, scope) : root;
    // Path-traversal guard: `start` must be `root` itself or a descendant.
    const relToRoot = path.relative(root, start);
    const escapesRoot = relToRoot === '..'
        || relToRoot.startsWith(`..${path.sep}`)
        || path.isAbsolute(relToRoot);
    if (escapesRoot) {
        return;
    }
    const pending = [start];
    while (pending.length > 0) {
        const dir = pending.pop();
        let entries;
        try {
            entries = await fs.readdir(dir, { withFileTypes: true });
        }
        catch {
            // Best-effort walk: skip anything we cannot list.
            continue;
        }
        for (const entry of entries) {
            const entryPath = path.join(dir, entry.name);
            if (entry.isDirectory()) {
                pending.push(entryPath);
            }
            else if (entry.isFile()) {
                const ext = path.extname(entry.name).slice(1).toLowerCase();
                if (exts.has(ext)) {
                    yield entryPath;
                }
            }
        }
    }
}
121
/**
 * Top-level search entry point. Picks one of three strategies:
 *
 *  - Literal: `options.useRegex` is truthy, or the tokenized query has
 *    fewer than two tokens. A query that tokenizes to exactly one token
 *    (e.g. a fully double-quoted phrase) is searched as that token, so the
 *    surrounding quotes are not part of the literal.
 *  - Hybrid (default for 2+ tokens): stopword-stripped tokens feed the
 *    keyword AND filter while the full original query is passed along for
 *    the semantic stage.
 *  - Token-AND (2+ tokens with `options.semantic === false`): windowed
 *    keyword search on the stopword-stripped tokens.
 *
 * Why token-AND instead of a plain substring for multi-word queries: the
 * docs rarely contain `Motor6D C0` verbatim (they use dotted forms such as
 * `Class.Motor6D.C0|C0`), whereas "Motor6D and C0 close together" is what
 * the query means.
 *
 * @param {string} cacheDir
 * @param {string} pattern - Query text, or regex source when `useRegex`.
 * @param {object} [options] - Search options (`useRegex`, `semantic`, and
 *   whatever the selected strategy reads).
 * @param {object} [internal] - Forwarded to the hybrid strategy.
 * @returns {Promise<object>} Result of the selected strategy.
 */
export async function searchDocs(cacheDir, pattern, options = {}, internal = {}) {
    if (options.useRegex) {
        // Regex source is never tokenized; search it as given.
        return searchDocsLiteral(cacheDir, pattern, options);
    }
    const tokens = tokenize(pattern);
    if (tokens.length < 2) {
        // One token: use it (quotes already stripped). Zero tokens: fall
        // back to the raw pattern.
        const literalPattern = tokens.length === 1 ? tokens[0] : pattern;
        return searchDocsLiteral(cacheDir, literalPattern, options);
    }
    // Both multi-token strategies AND on stopword-stripped tokens so that
    // "how do I ..." boilerplate does not wipe out every hit.
    const keywordTokens = meaningfulTokens(tokens);
    const useSemantic = options.semantic ?? true;
    if (useSemantic) {
        return searchDocsHybrid(cacheDir, pattern, tokens, keywordTokens, options, internal);
    }
    return searchDocsTokenAnd(cacheDir, keywordTokens.length >= 1 ? keywordTokens : tokens, options);
}
163
/**
 * Line-by-line literal or regex search over the cached docs tree.
 *
 * Every line of every matching file is tested against the compiled
 * pattern; each matching line becomes one hit.
 *
 * Options read here:
 *  - `extensions`: file extensions to scan (default `DEFAULT_EXTENSIONS`).
 *  - `contextLines`: lines of context on each side, clamped to 0..5.
 *  - `maxHits`: hit cap, clamped to 1..1000 (default 200).
 *  - `scope`: forwarded to `walkFiles`.
 *  - `useRegex` / `caseSensitive`: consumed by `buildPattern`.
 *
 * The scan stops at the first match beyond `maxHits`, so when `truncated`
 * is true `totalHits` is a lower bound (cap + 1), not a full count.
 *
 * @param {string} cacheDir
 * @param {string} pattern
 * @param {object} options
 * @returns {Promise<object>} `{ totalHits, truncated, hits, filesScanned,
 *   durationMs, mode: 'literal' }`.
 */
async function searchDocsLiteral(cacheDir, pattern, options) {
    const startedAt = Date.now();
    const root = contentRoot(cacheDir);
    const matcher = buildPattern(pattern, options);
    const exts = new Set((options.extensions ?? DEFAULT_EXTENSIONS).map((e) => e.toLowerCase()));
    const contextRadius = Math.max(0, Math.min(options.contextLines ?? 0, 5));
    const hitLimit = Math.max(1, Math.min(options.maxHits ?? 200, 1000));
    const stripTrailing = (s) => s.replace(/\s+$/, '');
    const hits = [];
    let totalHits = 0;
    let filesScanned = 0;
    let truncated = false;
    scan: for await (const filePath of walkFiles(root, options.scope, exts)) {
        filesScanned++;
        let raw;
        try {
            raw = await fs.readFile(filePath, 'utf8');
        }
        catch {
            // Unreadable file: skip it, keep scanning.
            continue;
        }
        // Binary guard: a NUL anywhere in the decoded text skips the file.
        if (raw.includes('\0')) {
            continue;
        }
        const rel = path.relative(root, filePath);
        const lines = raw.split(/\r?\n/);
        for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
            // The matcher is global, so `.test()` moves `lastIndex`; rewind
            // before each line.
            matcher.lastIndex = 0;
            if (!matcher.test(lines[lineIdx])) {
                continue;
            }
            totalHits++;
            if (hits.length >= hitLimit) {
                truncated = true;
                break scan;
            }
            const hit = {
                path: rel,
                line: lineIdx + 1,
                text: stripTrailing(lines[lineIdx]),
            };
            if (contextRadius > 0) {
                const from = Math.max(0, lineIdx - contextRadius);
                const to = Math.min(lines.length, lineIdx + contextRadius + 1);
                const surround = [];
                for (let k = from; k < to; k++) {
                    // The hit line itself is reported in `text`, not here.
                    if (k !== lineIdx) {
                        surround.push({ line: k + 1, text: stripTrailing(lines[k]) });
                    }
                }
                hit.context = surround;
            }
            hits.push(hit);
        }
    }
    return {
        totalHits,
        truncated,
        hits,
        filesScanned,
        durationMs: Date.now() - startedAt,
        mode: 'literal',
    };
}
228
/**
 * Multi-token AND search.
 *
 * For each file:
 *   1. Quick reject: skip the file unless every token occurs somewhere in
 *      it (plain substring scan).
 *   2. Compute a per-line bitmask of which tokens match that line.
 *   3. Slide a window of `±windowLines` over the file. At each centre `i`,
 *      OR the masks in `[i-w, i+w]`. If the union covers all tokens, that
 *      window is a hit.
 *   4. The hit's anchor (`line` / `text`) is the line in the window with
 *      the most tokens (ties → lowest line number), so `text` is the most
 *      informative line rather than an arbitrary middle.
 *   5. After a hit, continue from `anchor + windowLines + 1` to limit the
 *      pile-up of overlapping windows reporting the same paragraph.
 *
 * Notes:
 *   - Tokens are tracked in a 32-bit bitmask, so at most 31 are used; any
 *     extras are dropped and the returned `tokens` shows what was used.
 *   - `context` always spans the whole matched window AND at least
 *     `±max(windowLines, contextLines)` around the anchor. The anchor can
 *     sit at the edge of the window, so centring the context on the anchor
 *     alone would hide the lines that carry the remaining tokens.
 *   - `matchedTokens` lists only the tokens found on the anchor line.
 *   - The scan stops at the first hit beyond `maxHits`, so when `truncated`
 *     is true `totalHits` is a lower bound.
 *
 * Options read here: `extensions`, `maxHits` (1..1000, default 200),
 * `windowLines` (1..10, default 3), `contextLines` (capped at 10),
 * `caseSensitive`, `scope`.
 *
 * @param {string} cacheDir
 * @param {string[]} tokensIn - Literal tokens that must all occur nearby.
 * @param {object} options
 * @returns {Promise<object>} `{ totalHits, truncated, hits, filesScanned,
 *   durationMs, mode: 'token-and', tokens }`.
 */
async function searchDocsTokenAnd(cacheDir, tokensIn, options) {
    const t0 = Date.now();
    const root = contentRoot(cacheDir);
    const exts = new Set((options.extensions ?? DEFAULT_EXTENSIONS).map((e) => e.toLowerCase()));
    const maxHits = Math.max(1, Math.min(options.maxHits ?? 200, 1000));
    const w = Math.max(1, Math.min(options.windowLines ?? 3, 10));
    const ctx = Math.max(w, Math.min(options.contextLines ?? 0, 10));
    // Bitmask cap: bit 31 is the sign bit, so only 31 tokens fit.
    const tokens = tokensIn.slice(0, 31);
    // No `g`/`y` flag, so these regexes are stateless and `.test()` never
    // needs a `lastIndex` reset.
    const tokenRes = tokens.map((t) => new RegExp(escapeRegex(t), options.caseSensitive ? '' : 'i'));
    // Needles for the quick-reject substring scan; computed once, not per file.
    const needles = options.caseSensitive ? tokens : tokens.map((t) => t.toLowerCase());
    // `1 << 31` would overflow into the sign bit, hence the explicit constant.
    const allMask = tokens.length === 31 ? 0x7fffffff : (1 << tokens.length) - 1;
    const stripTrailing = (s) => s.replace(/\s+$/, '');
    const hits = [];
    let totalHits = 0;
    let filesScanned = 0;
    let truncated = false;
    outer: for await (const filePath of walkFiles(root, options.scope, exts)) {
        filesScanned++;
        let raw;
        try {
            raw = await fs.readFile(filePath, 'utf8');
        }
        catch {
            // Unreadable file: skip it, keep scanning.
            continue;
        }
        // Binary guard: a NUL anywhere in the decoded text skips the file.
        if (raw.includes('\0')) {
            continue;
        }
        // Quick reject: the file must contain ALL tokens at least once.
        const haystack = options.caseSensitive ? raw : raw.toLowerCase();
        if (!needles.every((needle) => haystack.includes(needle))) {
            continue;
        }
        const rel = path.relative(root, filePath);
        const lines = raw.split(/\r?\n/);
        // Pass 1: per-line token bitmask.
        const tokenMask = new Array(lines.length).fill(0);
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            for (let t = 0; t < tokens.length; t++) {
                if (tokenRes[t].test(line)) {
                    tokenMask[i] |= 1 << t;
                }
            }
        }
        // Pass 2: sliding-window union. Skip ahead after a hit.
        let i = 0;
        while (i < lines.length) {
            let combined = 0;
            let bestLine = i;
            let bestCount = -1;
            const lo = Math.max(0, i - w);
            const hi = Math.min(lines.length - 1, i + w);
            for (let j = lo; j <= hi; j++) {
                combined |= tokenMask[j];
                const cnt = popcount(tokenMask[j]);
                if (cnt > bestCount || (cnt === bestCount && j < bestLine)) {
                    bestCount = cnt;
                    bestLine = j;
                }
            }
            const windowCoversAll = (combined & allMask) === allMask && bestCount > 0;
            if (!windowCoversAll) {
                i++;
                continue;
            }
            totalHits++;
            if (hits.length >= maxHits) {
                truncated = true;
                break outer;
            }
            // Context must include the full matched window [lo, hi] as well
            // as ±ctx around the anchor; the anchor may be at the window edge.
            const ctxLo = Math.max(0, Math.min(lo, bestLine - ctx));
            const ctxHi = Math.min(lines.length - 1, Math.max(hi, bestLine + ctx));
            const surround = [];
            for (let j = ctxLo; j <= ctxHi; j++) {
                if (j === bestLine) {
                    continue;
                }
                surround.push({ line: j + 1, text: stripTrailing(lines[j]) });
            }
            const matched = [];
            for (let t = 0; t < tokens.length; t++) {
                if (tokenMask[bestLine] & (1 << t)) {
                    matched.push(tokens[t]);
                }
            }
            hits.push({
                path: rel,
                line: bestLine + 1,
                text: stripTrailing(lines[bestLine]),
                context: surround,
                matchedTokens: matched,
            });
            // Advance past the anchor's neighbourhood so one dense paragraph
            // is not reported once per overlapping window.
            i = bestLine + w + 1;
        }
    }
    return {
        totalHits,
        truncated,
        hits,
        filesScanned,
        durationMs: Date.now() - t0,
        mode: 'token-and',
        tokens,
    };
}
363
/**
 * Hybrid search: token-AND keyword filter followed by a semantic rerank.
 *
 * Pipeline:
 *   1. Run the token-AND keyword search on `keywordTokens` with an enlarged
 *      hit cap (3× the user cap, bounded to 50..300) so the rerank has a
 *      pool to choose from. With no keyword tokens, an empty keyword result
 *      is used instead.
 *   2. Load (or build) the semantic index. This needs `internal.docsSha`.
 *   3. Embed the full original `pattern` (stopwords included).
 *   4. If the keyword pass found nothing, delegate to `semanticOnlyFallback`.
 *   5. Otherwise score each keyword hit:
 *        score = 0.7 · similarity(query, chunk)
 *              + 0.2 · (tokens on the hit line / keyword token count)
 *              + 0.1 · pathBoost
 *      where `chunk` is the best-scoring chunk containing the hit line (or,
 *      if that yields 0, the best chunk of the same file), and `pathBoost`
 *      is 1 for `reference/engine` paths when any query token looks like a
 *      PascalCase API name.
 *   6. Sort by score descending and keep the first `maxHits`.
 *
 * Fallbacks that keep search usable (both return keyword hits with
 * `semanticUsed: false`):
 *   - no `docsSha`, or the index fails to load/build;
 *   - embedding the query throws.
 *
 * @param {string} cacheDir
 * @param {string} pattern - Full original query; used for the embedding.
 * @param {string[]} allTokens - Every token of the query.
 * @param {string[]} keywordTokens - Stopword-stripped tokens for the AND filter.
 * @param {object} options - Search options; `maxHits` is clamped to 1..1000.
 * @param {{docsSha?: string}} internal
 * @returns {Promise<object>} Result with `mode: 'hybrid'`.
 */
async function searchDocsHybrid(cacheDir, pattern, allTokens, keywordTokens, options, internal) {
    const t0 = Date.now();
    const userMaxHits = Math.max(1, Math.min(options.maxHits ?? 200, 1000));
    // Rerank pool: larger than the user cap, but bounded so a huge cap does
    // not mean scoring a huge pool.
    const poolMaxHits = Math.min(Math.max(userMaxHits * 3, 50), 300);
    // 1. Keyword pass on the stopword-stripped tokens.
    let keyword;
    if (keywordTokens.length >= 1) {
        keyword = await searchDocsTokenAnd(cacheDir, keywordTokens, {
            ...options,
            maxHits: poolMaxHits,
        });
    }
    else {
        // Degenerate query: pretend the keyword pass ran and found nothing.
        keyword = {
            totalHits: 0,
            truncated: false,
            hits: [],
            filesScanned: 0,
            durationMs: 0,
            mode: 'token-and',
            tokens: [],
        };
    }
    // Shared shape for both "semantic stage unavailable" exits.
    const keywordOnlyResult = () => ({
        ...keyword,
        hits: keyword.hits.slice(0, userMaxHits),
        mode: 'hybrid',
        semanticUsed: false,
        durationMs: Date.now() - t0,
        tokens: keywordTokens.slice(0, 31),
    });
    // 2. Semantic index; any failure degrades to keyword-only.
    let index = null;
    if (internal.docsSha) {
        try {
            index = await getOrBuild(cacheDir, internal.docsSha);
        }
        catch {
            index = null;
        }
    }
    if (!index) {
        return keywordOnlyResult();
    }
    // 3. Query embedding, from the full original pattern.
    let qvec;
    try {
        qvec = await encodeOne(pattern);
    }
    catch {
        return keywordOnlyResult();
    }
    // 4. Nothing lexical to rerank → pure semantic top-K.
    if (keyword.hits.length === 0) {
        return semanticOnlyFallback(cacheDir, pattern, allTokens, keywordTokens, qvec, index, options, userMaxHits, keyword.filesScanned, t0);
    }
    // 5. Group chunks by path once so each hit only looks at its own file.
    const dim = index.meta.dim;
    const chunksByPath = new Map();
    for (let i = 0; i < index.chunks.length; i++) {
        const chunk = index.chunks[i];
        const entry = {
            startLine: chunk.startLine,
            endLine: chunk.endLine,
            vec: index.vectors.subarray(i * dim, (i + 1) * dim),
        };
        const bucket = chunksByPath.get(chunk.path);
        if (bucket) {
            bucket.push(entry);
        }
        else {
            chunksByPath.set(chunk.path, [entry]);
        }
    }
    // Any PascalCase-looking token (e.g. "Motor6D", "TweenService") marks
    // the query as an API-name lookup.
    const looksApiLike = allTokens.some((t) => /^[A-Z][a-zA-Z0-9_]*$/.test(t));
    // Highest similarity among `chunks`, floored at 0.
    const bestSimilarity = (chunks) => {
        let best = 0;
        for (const ch of chunks) {
            const sim = dot(qvec, ch.vec);
            if (sim > best) {
                best = sim;
            }
        }
        return best;
    };
    // 6. Score every keyword hit.
    const ALPHA = 0.7;
    const BETA = 0.2;
    const GAMMA = 0.1;
    const tokenCount = Math.max(1, keywordTokens.length);
    const scored = keyword.hits.map((hit) => {
        const fileChunks = chunksByPath.get(hit.path);
        let semanticScore = 0;
        if (fileChunks) {
            const containing = fileChunks.filter((ch) => !(hit.line < ch.startLine || hit.line > ch.endLine));
            semanticScore = bestSimilarity(containing);
            if (semanticScore === 0) {
                // No containing chunk scored above 0: use the file's best chunk.
                semanticScore = bestSimilarity(fileChunks);
            }
        }
        const keywordDensity = (hit.matchedTokens?.length ?? 0) / tokenCount;
        // NOTE(review): matches forward slashes only; `hit.path` comes from
        // `path.relative` in the keyword pass — confirm on Windows.
        const pathBoost = looksApiLike && hit.path.includes('reference/engine') ? 1 : 0;
        const score = ALPHA * semanticScore + BETA * keywordDensity + GAMMA * pathBoost;
        return { ...hit, score, semanticScore };
    });
    // 7. Best first, then trim to the user's cap.
    scored.sort((a, b) => b.score - a.score);
    const finalHits = scored.slice(0, userMaxHits);
    return {
        totalHits: keyword.totalHits,
        truncated: keyword.totalHits > finalHits.length,
        hits: finalHits,
        filesScanned: keyword.filesScanned,
        durationMs: Date.now() - t0,
        mode: 'hybrid',
        tokens: keywordTokens.slice(0, 31),
        semanticUsed: true,
        keywordFiltered: true,
    };
}
526
+ /**
527
+ * Pure-semantic search over the chunk index. Used as a fallback when
528
+ * keyword AND-filtering would have killed recall (e.g. natural-language
529
+ * queries where no single line contains every meaningful term).
530
+ *
531
+ * Returns one SearchHit per top-K chunk:
532
+ * - `path`: chunk path
533
+ * - `line`: chunk startLine
534
+ * - `text`: first non-empty line of the chunk (~best summary anchor)
535
+ * - `context`: a few subsequent lines of the chunk so the model can
536
+ * verify topical relevance
537
+ * - `score` / `semanticScore`: cosine similarity (also written into
538
+ * `score` because there's no keyword density component here)
539
+ * - `matchedTokens`: omitted (no keyword guarantee)
540
+ *
541
+ * No keyword guarantee means a hit's `text` may not contain any token
542
+ * from the user's query. That's intentional and correct for queries
543
+ * like "how do I rotate a body part smoothly" — the relevant doc
544
+ * (`AlignOrientation`) doesn't use the word "rotate" in every line.
545
+ *
546
+ * `keywordFiltered: false` in the response signals this to the caller.
547
+ */
548
async function semanticOnlyFallback(cacheDir, pattern, allTokens, keywordTokens, qvec, index, options, userMaxHits, filesScanned, t0) {
    // Parameters read by this function:
    //   allTokens      - query tokens; used only to detect API-like (capitalized) queries
    //   keywordTokens  - echoed back (first 31) in the `tokens` field of the response
    //   qvec           - query embedding, compared against every chunk vector via `dot`
    //   index          - { meta: { dim }, chunks: [{ path, startLine, text }], vectors }
    //   options.scope  - optional path prefix; chunks outside it are ignored
    //   userMaxHits    - maximum number of hits to return
    //   filesScanned   - passed through unchanged into the response
    //   t0             - start timestamp (ms) used to compute `durationMs`
    // `cacheDir` and `pattern` are not read here; they stay in the signature
    // so existing call sites keep working.
    const dim = index.meta.dim;
    const scope = options.scope;
    const looksApiLike = allTokens.some((t) => /^[A-Z][a-zA-Z0-9_]*$/.test(t));
    const scored = [];
    for (let i = 0; i < index.chunks.length; i++) {
        const c = index.chunks[i];
        if (scope && !c.path.startsWith(scope)) {
            continue;
        }
        const vec = index.vectors.subarray(i * dim, (i + 1) * dim);
        const sem = dot(qvec, vec);
        // Add a small path boost for API-ish queries — same idea as hybrid.
        const pathBoost = looksApiLike && c.path.includes('reference/engine') ? 1 : 0;
        scored.push({ idx: i, score: 0.9 * sem + 0.1 * pathBoost, semanticScore: sem });
    }
    scored.sort((a, b) => b.score - a.score);
    // Diversify: don't return 5 chunks from the same file.
    const perPathCap = 2;
    const seenPath = new Map();
    const picked = [];
    for (const s of scored) {
        // Checked before pushing so that a non-positive `userMaxHits`
        // yields zero hits instead of one.
        if (picked.length >= userMaxHits) {
            break;
        }
        const p = index.chunks[s.idx].path;
        const used = seenPath.get(p) ?? 0;
        if (used >= perPathCap) {
            continue;
        }
        picked.push(s);
        seenPath.set(p, used + 1);
    }
    const hits = picked.map((s) => semanticHitFromChunk(index.chunks[s.idx], s));
    // When nothing was in scope this naturally produces the empty result
    // (totalHits 0, truncated false, hits []).
    return {
        totalHits: scored.length,
        truncated: scored.length > hits.length,
        hits,
        filesScanned,
        durationMs: Date.now() - t0,
        mode: 'hybrid',
        tokens: keywordTokens.slice(0, 31),
        semanticUsed: true,
        keywordFiltered: false,
    };
}
/**
 * Build one search hit from a chunk and its scored entry: choose an anchor
 * line and collect a few neighbouring non-empty lines as context.
 */
function semanticHitFromChunk(chunk, entry) {
    const chunkLines = chunk.text.split(/\r?\n/);
    // Anchor: first non-empty line that is not a markdown heading. A heading
    // is only accepted when it is the very last line of the chunk. If no line
    // qualifies, line 0 is used as-is.
    let anchorOffset = 0;
    let anchorText = chunkLines[0] ?? '';
    for (let j = 0; j < chunkLines.length; j++) {
        const trimmed = chunkLines[j].trim();
        if (!trimmed) {
            continue;
        }
        // Skip pure heading lines as anchors — the body is more informative.
        if (j < chunkLines.length - 1 && /^#+\s/.test(trimmed)) {
            continue;
        }
        anchorOffset = j;
        anchorText = chunkLines[j];
        break;
    }
    // Context: one line before and up to three lines after the anchor,
    // excluding the anchor itself and blank lines.
    const context = [];
    const ctxStart = Math.max(0, anchorOffset - 1);
    const ctxEnd = Math.min(chunkLines.length, anchorOffset + 4);
    for (let j = ctxStart; j < ctxEnd; j++) {
        if (j === anchorOffset) {
            continue;
        }
        const lineText = chunkLines[j].replace(/\s+$/, '');
        if (!lineText) {
            continue;
        }
        context.push({ line: chunk.startLine + j, text: lineText });
    }
    return {
        path: chunk.path,
        line: chunk.startLine + anchorOffset,
        text: anchorText.replace(/\s+$/, ''),
        context,
        score: entry.score,
        semanticScore: entry.semanticScore,
    };
}
642
/**
 * Read a single doc file by its content-relative path.
 *
 * Two flavors of input are accepted to make the LLM-facing tool more
 * forgiving:
 *   • "en-us/reference/engine/classes/Part.yaml" (canonical)
 *   • "Part" / "Part.yaml" — best-effort lookup under reference/engine
 *     (handled by `resolveReferenceDoc` in reference.ts, not here).
 *
 * Leading slashes are stripped, so "/en-us/x.md" is treated as relative to
 * the content root.
 *
 * @param cacheDir Cache directory; passed to `contentRoot` to find the root.
 * @param relPath Path of the file relative to the content root.
 * @returns `{ path, bytes, content }` (normalized path, size from stat, UTF-8
 *   text), or `null` when the input is not a string, the path escapes the
 *   content root, the target is not a regular file, or it cannot be read.
 */
export async function readDocFile(cacheDir, relPath) {
    // `path.normalize` throws on non-strings; this tool reports bad input as null.
    if (typeof relPath !== 'string') {
        return null;
    }
    const root = contentRoot(cacheDir);
    // Normalize and reject path-traversal attempts.
    const safe = path.normalize(relPath).replace(/^[/\\]+/, '');
    const full = path.join(root, safe);
    // Segment-aware containment check: compare the relative path from root
    // rather than raw string prefixes, so "..notes.md" is allowed while
    // "../x" (or a path on another drive) is rejected.
    const rel = path.relative(root, full);
    if (rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
        return null;
    }
    try {
        const stat = await fs.stat(full);
        if (!stat.isFile()) {
            return null;
        }
        const content = await fs.readFile(full, 'utf8');
        return { path: safe, bytes: stat.size, content };
    }
    catch {
        // Deliberate best-effort: missing/unreadable files are reported as null.
        return null;
    }
}
671
/**
 * List one directory of the docs content tree, with pagination.
 *
 * Only directories and regular files are listed. Order is directories
 * first, then files, each sorted by name with `localeCompare`.
 *
 * @param cacheDir Cache directory; passed to `contentRoot` to find the root.
 * @param relPath Directory path relative to the content root (default: root).
 * @param options Optional `{ offset, limit }`. `offset` defaults to 0 and is
 *   floored at 0; `limit` defaults to 100 and is clamped to 1..1000.
 *   Values that are not numbers (NaN) fall back to the defaults.
 * @returns `{ path, totalEntries, offset, limit, truncated, entries }`, where
 *   file entries carry a `size` when stat succeeds; or `null` when the input
 *   is not a string, the path escapes the content root, or the directory
 *   cannot be read.
 */
export async function listDocs(cacheDir, relPath = '', options = {}) {
    // `path.normalize` throws on non-strings; report bad input as null.
    if (typeof relPath !== 'string') {
        return null;
    }
    const root = contentRoot(cacheDir);
    const safe = path.normalize(relPath).replace(/^[/\\]+/, '');
    const full = safe ? path.join(root, safe) : root;
    // Segment-aware containment check (not a raw string-prefix comparison).
    const rel = path.relative(root, full);
    if (rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
        return null;
    }
    let dirents;
    try {
        dirents = await fs.readdir(full, { withFileTypes: true });
    }
    catch {
        // Deliberate best-effort: missing/unreadable directories yield null.
        return null;
    }
    // Build the full sorted list first (cheap — just names + types).
    // File sizes are stat'ed later, only for the page being returned.
    const all = [];
    for (const e of dirents) {
        if (e.isDirectory()) {
            all.push({ name: e.name, type: 'dir' });
        }
        else if (e.isFile()) {
            all.push({ name: e.name, type: 'file' });
        }
    }
    // Stable order: directories first, then files, alphabetic within each.
    all.sort((a, b) => {
        if (a.type !== b.type) {
            return a.type === 'dir' ? -1 : 1;
        }
        return a.name.localeCompare(b.name);
    });
    const totalEntries = all.length;
    // NaN would otherwise propagate through Math.max/Math.min and produce an
    // empty page plus NaN in the response; fall back to the defaults instead.
    const rawOffset = Math.floor(options?.offset ?? 0);
    const rawLimit = Math.floor(options?.limit ?? 100);
    const offset = Math.max(0, Number.isNaN(rawOffset) ? 0 : rawOffset);
    const limit = Math.max(1, Math.min(Number.isNaN(rawLimit) ? 100 : rawLimit, 1000));
    const pageRaw = all.slice(offset, offset + limit);
    // Stat only the page we're returning. The stats are independent, so run
    // them concurrently; Promise.all preserves the page order.
    const page = await Promise.all(pageRaw.map(async (entry) => {
        if (entry.type !== 'file') {
            return entry;
        }
        try {
            const stat = await fs.stat(path.join(full, entry.name));
            return { ...entry, size: stat.size };
        }
        catch {
            // File vanished or is unreadable: list it without a size.
            return entry;
        }
    }));
    return {
        path: safe,
        totalEntries,
        offset,
        limit,
        truncated: offset + page.length < totalEntries,
        entries: page,
    };
}
733
+ //# sourceMappingURL=search.js.map