@sammysnake/fast-context-mcp 1.3.0-beta.1
- package/LICENSE +21 -0
- package/README.md +274 -0
- package/package.json +47 -0
- package/src/core.mjs +1906 -0
- package/src/directory-scorer.mjs +1059 -0
- package/src/executor.mjs +597 -0
- package/src/extract-key.mjs +93 -0
- package/src/protobuf.mjs +235 -0
- package/src/server.mjs +320 -0

package/src/directory-scorer.mjs
@@ -0,0 +1,1059 @@

/**
 * Directory Scorer - BM25F + Probe + RRF
 *
 * Based on IR research:
 * - BM25F for multi-field structured documents (Robertson & Zaragoza)
 * - Probe grep signal with IDF weighting
 * - RRF fusion for combining multiple rankers (Cormack et al.)
 *
 * Directory Profile Fields:
 * - dir_name: Top-level directory name (weight: 1.0)
 * - path_tokens: All file paths under the directory (weight: 4.0) <- MAIN SIGNAL
 * - metadata: package.json, go.mod, Cargo.toml info (weight: 3.0)
 * - headers: First N lines / markdown headers (weight: 2.0)
 */

import { readdirSync, readFileSync, existsSync } from "fs";
import { join, relative, extname } from "path";
import { spawnSync } from "child_process";

// ─── Constants ───────────────────────────────────────────────

const BM25_K1 = 1.2;
const BM25_B = 0.75;
const RRF_K = 60;

// Field weights for BM25F (from research recommendations)
const FIELD_WEIGHTS = {
  dir_name: 1.0,
  path_tokens: 4.0, // Main signal
  metadata: 3.0,
  headers: 2.0,
};

// Default exclude patterns
const DEFAULT_EXCLUDES = new Set([
  "node_modules", ".git", "dist", "build", "coverage", ".venv", "venv",
  "target", "out", ".cache", "__pycache__", "vendor", "deps", "third_party",
  "logs", "data", ".next", ".nuxt", "bundle", "bundled", "fixtures",
]);

// Stopwords for tokenization
const STOPWORDS = new Set([
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should", "may", "might", "must", "shall", "can", "need", "dare",
  "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
  "into", "through", "during", "before", "after", "above", "below",
  "and", "but", "or", "nor", "so", "yet", "both", "either", "neither",
  "not", "only", "own", "same", "than", "too", "very", "just", "also",
  "this", "that", "these", "those", "here", "there", "all", "any",
  "some", "no", "none", "each", "every", "other", "another", "such",
  "get", "set", "use", "used", "using", "make", "made", "if", "then",
  "else", "return", "new", "like", "well", "where", "which", "who",
  "what", "when", "why", "how", "it", "its", "we", "you", "your",
]);

// ─── Tokenization ─────────────────────────────────────────────

// Stem patterns hoisted to module scope — avoids re-allocating 18 RegExp per call
const STEM_PATTERNS = [
  [/^(.+)(ies)$/, "$1y"],
  [/^(.+)([^aeiou])(es)$/, "$1$2"],
  [/^(.+)([^aeiou])(s)$/, "$1$2"],
  [/^(.+)(ing)$/, "$1"],
  [/^(.+)(edly)$/, "$1"],
  [/^(.+)(ly)$/, "$1"],
  [/^(.+)(ed)$/, "$1"],
  [/^(.+)(ation)$/, "$1ate"],
  [/^(.+)(tion)$/, "$1t"],
  [/^(.+)(ment)$/, "$1"],
  [/^(.+)(ness)$/, "$1"],
  [/^(.+)(ful)$/, "$1"],
  [/^(.+)(less)$/, "$1"],
  [/^(.+)(able)$/, "$1"],
  [/^(.+)(ible)$/, "$1"],
  [/^(.+)(ally)$/, "$1al"],
  [/^(.+)(ity)$/, "$1"],
  [/^(.+)(ive)$/, "$1"],
];

/**
 * Basic Porter-like stemming (simplified)
 */
function stem(word) {
  if (!word || word.length < 3) return word;
  const w = word.toLowerCase();

  for (const [pattern, replacement] of STEM_PATTERNS) {
    if (pattern.test(w)) {
      return w.replace(pattern, replacement);
    }
  }
  return w;
}

/**
 * Tokenize text with stemming and stopword removal
 */
function tokenize(text, options = {}) {
  if (!text) return [];
  const { minLen = 2 } = options;

  return text
    .toLowerCase()
    .replace(/[^\w\s\-./\\@]/g, " ")
    .split(/[\s\-./\\]+/)
    .filter(t => t.length >= minLen && !STOPWORDS.has(t))
    .map(stem);
}

/**
 * Tokenize file path (handles code paths better)
 */
function tokenizePath(pathStr) {
  if (!pathStr) return [];
  return pathStr
    .toLowerCase()
    .replace(/[\/\\]/g, " ")
    .replace(/[._-]/g, " ")
    .split(/\s+/)
    .filter(t => t.length >= 2)
    .map(stem);
}
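
// Example (hedged; outputs follow the simplified stemmer above):
//   tokenize("Parsing user-authentication tokens")
//     → ["pars", "user", "authenticate", "token"]
// ("parsing" loses "ing", "authentication" hits the (ation) → "$1ate" rule,
//  "tokens" loses its trailing "s"; stopwords and short tokens drop out.)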

// ─── Directory Profile Builder ────────────────────────────────

/**
 * Extract metadata from common config files
 */
function extractMetadata(dirPath) {
  const metadata = [];

  // package.json
  const pkgPath = join(dirPath, "package.json");
  if (existsSync(pkgPath)) {
    try {
      const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
      if (pkg.name) metadata.push(pkg.name);
      if (pkg.description) metadata.push(...tokenize(pkg.description));
      if (pkg.keywords) metadata.push(...pkg.keywords.flatMap(k => tokenize(k)));
      if (pkg.dependencies) metadata.push(...Object.keys(pkg.dependencies).flatMap(k => tokenize(k)));
    } catch {}
  }

  // go.mod
  const goModPath = join(dirPath, "go.mod");
  if (existsSync(goModPath)) {
    try {
      const content = readFileSync(goModPath, "utf-8");
      const moduleMatch = content.match(/module\s+(\S+)/);
      if (moduleMatch) metadata.push(...tokenizePath(moduleMatch[1]));
    } catch {}
  }

  // Cargo.toml
  const cargoPath = join(dirPath, "Cargo.toml");
  if (existsSync(cargoPath)) {
    try {
      const content = readFileSync(cargoPath, "utf-8");
      const nameMatch = content.match(/name\s*=\s*"([^"]+)"/);
      if (nameMatch) metadata.push(...tokenizePath(nameMatch[1]));
    } catch {}
  }

  // pyproject.toml
  const pyprojectPath = join(dirPath, "pyproject.toml");
  if (existsSync(pyprojectPath)) {
    try {
      const content = readFileSync(pyprojectPath, "utf-8");
      const nameMatch = content.match(/name\s*=\s*"([^"]+)"/);
      if (nameMatch) metadata.push(...tokenizePath(nameMatch[1]));
    } catch {}
  }

  return metadata.join(" ");
}

/**
 * Extract headers from a file (markdown headers, code comments, etc.)
 */
function extractFileHeaders(filePath) {
  try {
    const content = readFileSync(filePath, "utf-8").slice(0, 2000); // First 2KB
    const headers = [];

    // Markdown headers
    const mdHeaders = content.match(/^#+\s+.+$/gm) || [];
    headers.push(...mdHeaders.map(h => h.replace(/^#+\s+/, "")));

    // Code comments (first 10 lines)
    const lines = content.split("\n").slice(0, 10);
    for (const line of lines) {
      const comment = line.match(/^\s*(?:(?:\/\/|#|;|\*)\s*)(.+)$/);
      if (comment) headers.push(comment[1]);
    }

    return headers.join(" ");
  } catch {
    return "";
  }
}

// ─── Profile Cache (process-level TTL) ───────────────────────
//
// MCP server is long-running: repeated search() calls for the same project
// would re-walk every directory each time (~200ms+ per large dir due to
// readFileSync for headers). This cache avoids that.
//
// Key: projectRoot + "|" + dirName + "|" + sortedExcludes
// TTL: 120s (configurable via FC_PROFILE_CACHE_TTL env)
// Scope: process lifetime only, not persisted

const _profileCache = new Map();
const PROFILE_CACHE_TTL_MS = (parseInt(process.env.FC_PROFILE_CACHE_TTL, 10) || 120) * 1000;

/**
 * Invalidate all cached profiles for a project root.
 * Call this if you know files have changed (optional — TTL handles normal staleness).
 */
export function invalidateProfileCache(projectRoot) {
  for (const key of _profileCache.keys()) {
    if (key.startsWith(projectRoot + "|")) {
      _profileCache.delete(key);
    }
  }
}

/**
 * Build a profile for a top-level directory (with TTL cache)
 */
export function buildDirectoryProfile(projectRoot, dirName, excludePaths = [], maxDepth = 3) {
  // Cache lookup
  const cacheKey = `${projectRoot}|${dirName}|${[...excludePaths].sort().join(",")}`;
  const cached = _profileCache.get(cacheKey);
  if (cached && (Date.now() - cached.cachedAt) < PROFILE_CACHE_TTL_MS) {
    return cached.profile;
  }

  const dirPath = join(projectRoot, dirName);
  const profile = {
    dir_name: dirName,
    path_tokens: [],
    metadata: "",
    headers: [],
    file_count: 0,
    file_paths: [], // Store actual file paths for path spines
  };

  const excludeSet = new Set(excludePaths);

  function walk(currentPath, depth) {
    if (depth > maxDepth) return;
    try {
      const entries = readdirSync(currentPath, { withFileTypes: true });
      for (const entry of entries) {
        const name = entry.name;

        // Skip excluded and noise
        if (DEFAULT_EXCLUDES.has(name) || excludeSet.has(name)) continue;
        if (name.startsWith(".") && name !== ".github") continue;

        const fullPath = join(currentPath, name);
        const relPath = relative(projectRoot, fullPath);

        if (entry.isDirectory()) {
          profile.path_tokens.push(relPath);
          walk(fullPath, depth + 1);
        } else if (entry.isFile()) {
          profile.path_tokens.push(relPath);
          profile.file_paths.push(relPath);
          profile.file_count++;

          // Extract headers from relevant files
          const ext = extname(name);
          if ([".md", ".mdx", ".ts", ".tsx", ".js", ".jsx", ".py", ".go"].includes(ext)) {
            const headers = extractFileHeaders(fullPath);
            if (headers) profile.headers.push(headers);
          }
        }
      }
    } catch { /* ignore walk errors */ }
  }

  walk(dirPath, 1);

  // Extract metadata from config files
  profile.metadata = extractMetadata(dirPath);

  // Convert arrays to text
  profile.path_tokens_text = profile.path_tokens.join(" ");
  profile.headers_text = profile.headers.join(" ");

  // Store in cache
  _profileCache.set(cacheKey, { profile, cachedAt: Date.now() });

  return profile;
}
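
// Usage sketch (hypothetical root and directory names):
//   const profile = buildDirectoryProfile("/repo", "server");
//   profile.path_tokens_text → space-joined relative paths under server/
//   profile.file_count       → files seen within maxDepth, after excludes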

// ─── BM25/BM25F Implementation ────────────────────────────────

/**
 * Compute IDF for terms across documents
 */
function computeIDF(documents) {
  const docCount = documents.length;
  const termDocCount = {};
  const idf = {};

  for (const doc of documents) {
    const uniqueTerms = new Set(doc);
    for (const term of uniqueTerms) {
      termDocCount[term] = (termDocCount[term] || 0) + 1;
    }
  }

  for (const [term, count] of Object.entries(termDocCount)) {
    // Standard IDF formula
    idf[term] = Math.log((docCount - count + 0.5) / (count + 0.5) + 1);
  }

  return idf;
}

/**
 * BM25 score for a single field
 */
function bm25FieldScore(queryTerms, fieldTerms, avgLen, fieldLen, idf) {
  const termFreqs = {};
  fieldTerms.forEach(t => { termFreqs[t] = (termFreqs[t] || 0) + 1; });

  let score = 0;
  for (const term of queryTerms) {
    const tf = termFreqs[term] || 0;
    if (tf === 0) continue;

    const termIDF = idf[term] || Math.log(2); // Default IDF for unseen terms
    const numerator = tf * (BM25_K1 + 1);
    const denominator = tf + BM25_K1 * (1 - BM25_B + BM25_B * (fieldLen / avgLen));
    score += termIDF * (numerator / denominator);
  }

  return score;
}

/**
 * BM25F score across all fields (uses pre-cached tokenized fields when available)
 */
function bm25fScore(queryTerms, profile, avgFieldLens, idf) {
  const tok = profile._tok;
  const fields = [
    { name: "dir_name", terms: tok ? tok.dir_name : tokenize(profile.dir_name || ""), weight: FIELD_WEIGHTS.dir_name },
    { name: "path_tokens", terms: tok ? tok.path_tokens : tokenize(profile.path_tokens_text || ""), weight: FIELD_WEIGHTS.path_tokens },
    { name: "metadata", terms: tok ? tok.metadata : tokenize(profile.metadata || ""), weight: FIELD_WEIGHTS.metadata },
    { name: "headers", terms: tok ? tok.headers : tokenize(profile.headers_text || ""), weight: FIELD_WEIGHTS.headers },
  ];

  let totalScore = 0;
  for (const field of fields) {
    const avgLen = avgFieldLens[field.name] || 50;
    const fieldLen = field.terms.length || 1;
    const fieldScore = bm25FieldScore(queryTerms, field.terms, avgLen, fieldLen, idf);
    totalScore += field.weight * fieldScore;
  }

  return totalScore;
}
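
// Worked example (hypothetical numbers): a query term with tf = 3 in a field
// of length 40, with avgLen = 50, idf = 1.5, k1 = 1.2, b = 0.75, contributes
//   1.5 * (3 * 2.2) / (3 + 1.2 * (1 - 0.75 + 0.75 * 40/50)) = 1.5 * 6.6 / 4.02 ≈ 2.46
// to that field's score; the field total is then scaled by its FIELD_WEIGHTS entry.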

// ─── Probe Grep Signal ────────────────────────────────────────

/**
 * Select probe terms from query (prioritize high IDF, but include diverse terms)
 */
function selectProbeTerms(queryTerms, idf, maxTerms = 6) {
  // Sort by IDF (descending) and select top terms
  const sorted = queryTerms
    .map(t => ({ term: t, idf: idf[t] || 0 }))
    .sort((a, b) => b.idf - a.idf);

  // Return unique terms (top-N by IDF)
  const unique = [...new Set(sorted.map(t => t.term))];
  return unique.slice(0, maxTerms);
}

/**
 * Execute probe grep to count matches per directory.
 * Uses a single rg call with regex alternation (term1|term2|...) instead of
 * N sequential calls — saves (N-1) process spawns (~2-5s).
 *
 * Scoring: each matching file contributes 1 hit to its directory.
 * RRF only cares about rank order, which is robust to this simplification.
 */
function probeGrep(projectRoot, topDirs, probeTerms, excludePaths = []) {
  if (probeTerms.length === 0) return {};

  const dirHits = {};
  const excludeSet = new Set([...excludePaths, ...DEFAULT_EXCLUDES]);

  for (const dir of topDirs) {
    dirHits[dir] = 0;
  }

  // Build single regex alternation: escape each term for regex safety
  const pattern = probeTerms
    .map(t => t.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|");

  try {
    const result = spawnSync("rg", [
      "-l", // List matching files only
      "--hidden",
      "-g", `!{${[...excludeSet].join(",")}}`,
      "-g", "*.ts", "-g", "*.tsx", "-g", "*.js", "-g", "*.jsx",
      "-g", "*.py", "-g", "*.go", "-g", "*.rs", "-g", "*.java",
      "-g", "*.md", "-g", "*.mdx", "-g", "*.json",
      pattern,
      projectRoot,
    ], {
      encoding: "utf-8",
      timeout: 8000, // Slightly longer for combined search
      maxBuffer: 2 * 1024 * 1024,
    });

    if (result.stdout) {
      const files = result.stdout.trim().split("\n").filter(Boolean);
      for (const file of files) {
        const relPath = relative(projectRoot, file);
        const topDir = relPath.split(/[\/\\]/)[0];
        if (dirHits.hasOwnProperty(topDir)) {
          dirHits[topDir]++;
        }
      }
    }
  } catch { /* rg not found or error - skip */ }

  return dirHits;
}

/**
 * Compute probe score with normalization
 */
function computeProbeScore(hits, fileCount) {
  if (hits === 0) return 0;
  // Normalize by log(hits) / sqrt(fileCount) to prevent large dirs from dominating
  return Math.log(1 + hits) / Math.sqrt(1 + fileCount);
}
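
// Worked example (hypothetical counts): 5 matching files in an 80-file directory
// score ln(6) / sqrt(81) ≈ 0.20, while the same 5 hits in a 15-file directory
// score ln(6) / sqrt(16) ≈ 0.45, so match density beats directory bulk.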

// ─── RRF Fusion ────────────────────────────────────────────────

/**
 * Reciprocal Rank Fusion
 */
function rrfFusion(rankings, weights = null) {
  const finalScores = {};
  const w = weights || rankings.map(() => 1);

  for (let r = 0; r < rankings.length; r++) {
    const ranking = rankings[r];
    for (let pos = 0; pos < ranking.length; pos++) {
      const { dir } = ranking[pos];
      const rrfScore = w[r] / (RRF_K + pos + 1);
      finalScores[dir] = (finalScores[dir] || 0) + rrfScore;
    }
  }

  return Object.entries(finalScores)
    .map(([dir, score]) => ({ dir, score }))
    .sort((a, b) => b.score - a.score);
}
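
// Worked example: with RRF_K = 60, a directory ranked 1st by BM25F and 3rd by
// the probe signal scores 1/(60+1) + 1/(60+3) ≈ 0.0164 + 0.0159 ≈ 0.0323.
// Since only rank positions matter, signals with incomparable raw score
// scales can be fused without normalization.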

// ─── Adaptive TopK (Literature-backed) ───────────────────────────
//
// References:
// [1] Taguchi et al. (2025) "Adaptive-k" — max-gap on sorted scores
// [2] Xu et al. (2025) "CAR" — entropy-based cluster cutoff
// [3] CMU Selective Search — shard cutoff via distribution features
// [4] Kratzwald et al. (2018) — cumulative threshold for query-dependent k
//
// Three signals combined:
//   K_base: N-proportional safety floor (handles degenerate cases)
//   K_knee: Kneedle gap detection (query-sensitive, finds natural breakpoint)
//   H_norm: Entropy scaling (flat distributions → expand K)
// Tail inclusion uses an adaptive threshold based on score decay rate.

const K_MIN = 3;
const K_MAX = 10;
const ENTROPY_GAMMA = 0.5;  // Entropy scaling factor
const SOFTMAX_TEMP = 1.0;   // Temperature for softmax normalization
const TAIL_SCAN_WINDOW = 6; // Max dirs to scan beyond cutoff

/**
 * Adaptive topK selection based on RRF score distribution.
 *
 * @param {Array<{dir: string, score: number}>} fused - RRF-fused sorted rankings
 * @param {number} userTopK - User-specified topK (default 4)
 * @param {number} N - Total number of top-level directories
 * @returns {string[]} Selected hotDirs
 */
function _adaptiveTopK(fused, userTopK, N) {
  if (fused.length <= K_MIN) return fused.map(r => r.dir);

  const scores = fused.map(r => r.score);

  // ── Signal 1: K_base (N-proportional safety floor) ──
  const kBase = Math.max(userTopK, Math.min(K_MAX, Math.ceil(N * 0.15)));

  // ── Signal 2: K_knee (Kneedle max-gap detection) ──
  // Find the position with the largest score drop (Taguchi Adaptive-k).
  // This is where the "relevance cliff" occurs.
  // Only search within [K_MIN-1, min(K_MAX, scores.length-1)] to stay bounded.
  let maxGap = 0;
  let kKnee = kBase;
  const searchEnd = Math.min(K_MAX, scores.length - 1);
  for (let i = K_MIN - 1; i < searchEnd; i++) {
    const gap = scores[i] - scores[i + 1];
    if (gap > maxGap) {
      maxGap = gap;
      kKnee = i + 1; // Include everything up to and including position i
    }
  }

  // ── Signal 3: Entropy scaling (distribution flatness) ──
  // Softmax-normalized entropy: H_norm ∈ [0, 1].
  // High H_norm (flat distribution) → relevance is dispersed → expand K.
  // Low H_norm (peaked distribution) → relevance is concentrated → keep K tight.
  const maxScore = scores[0];
  const expScores = scores.map(s => Math.exp((s - maxScore) / SOFTMAX_TEMP)); // shifted for numerical stability
  const expSum = expScores.reduce((a, b) => a + b, 0);
  const probs = expScores.map(e => e / expSum);
  const entropy = -probs.reduce((h, p) => h + (p > 0 ? p * Math.log(p) : 0), 0);
  const hNorm = scores.length > 1 ? entropy / Math.log(scores.length) : 0;

  // Entropy-adjusted K: scale kBase by distribution flatness
  const kEntropy = Math.ceil(kBase * (1 + ENTROPY_GAMMA * hNorm));

  // ── Combine: take the max of all signals, clamp to [K_MIN, K_MAX] ──
  const primaryK = Math.max(K_MIN, Math.min(K_MAX, Math.max(kBase, kKnee, kEntropy)));
  const hotDirs = fused.slice(0, primaryK).map(r => r.dir);

  // ── Adaptive tail inclusion ──
  // Instead of a fixed 0.6 threshold, use the score decay rate to set the tail cutoff.
  // If scores are still decaying slowly (flat tail), include more;
  // if there's a sharp drop, stop.
  if (fused.length > primaryK) {
    const cutoffScore = scores[primaryK - 1];
    // Adaptive threshold based on the average decay rate in the head:
    // if the head decays slowly (flat), the threshold is lenient; if steep, strict.
    const headDecayRate = primaryK > 1 ? (scores[0] - cutoffScore) / (primaryK - 1) : 0;
    // Threshold = cutoffScore minus one "average step" worth of decay.
    // This is more lenient when the head is flat (small headDecayRate).
    const tailThreshold = Math.max(cutoffScore - headDecayRate, cutoffScore * 0.4);

    for (let i = primaryK; i < fused.length && i < primaryK + TAIL_SCAN_WINDOW; i++) {
      if (scores[i] >= tailThreshold) {
        hotDirs.push(fused[i].dir);
      } else {
        break; // Stop at first dir below threshold (scores are sorted)
      }
    }
  }

  return hotDirs;
}
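
// Worked example (hypothetical RRF scores): given sorted scores
//   [0.050, 0.048, 0.047, 0.020, 0.019, ...]
// the largest in-window gap (0.027) falls after the 3rd score, so kKnee = 3;
// a flatter distribution instead pushes H_norm toward 1, expanding K via kEntropy.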

// ─── Path Spine Extraction ──────────────────────────────────────

/**
 * Extract path spines from matched files.
 *
 * Previous approach: first-match iteration with a topN break — this caused
 * files from later-iterated directories to be missed even when relevant
 * (e.g., prompt.go in server/, shape.ts in packages/element/).
 *
 * New approach: score ALL candidate files, sort by relevance, take topN.
 * Includes path-quality signals:
 * - Source-code paths (src/, core/, lib/, internal/) get a bonus
 * - Noise paths (migrations/, test/, fixtures/) get a penalty
 * - Filename-level term matches get a bonus
 */
// Paths that indicate core source code (bonus)
const SOURCE_PATH_PATTERNS = ["/src/", "/core/", "/lib/", "/internal/", "/pkg/", "/cmd/"];
// Paths that indicate non-essential files (penalty)
const NOISE_PATH_PATTERNS = ["/migrations/", "/test/", "/__tests__/", "/fixtures/", "/examples/", "/vendor/", "/mock/", "/mocks/", "/i18n/", "/locales/", "/versions/"];

function extractPathSpines(profiles, queryTerms, keywords, topN = 30) {
  const allTerms = [...new Set([...queryTerms, ...keywords])];
  if (allTerms.length === 0) return [];

  // Score all candidate files across all directories
  const candidates = [];

  for (const profile of Object.values(profiles)) {
    for (const filePath of profile.file_paths || []) {
      const pathTokens = tokenizePath(filePath);
      const pathText = filePath.toLowerCase();
      // Extract bare filename without extension for filename-level matching
      const parts = filePath.split("/");
      const fileName = parts[parts.length - 1].replace(/\.[^.]+$/, "").toLowerCase();
      const fileNameTokens = tokenizePath(fileName);

      let score = 0;
      for (const term of allTerms) {
        // Filename match (highest signal — file is specifically about this concept)
        if (fileName.includes(term) || fileNameTokens.some(ft => ft === term)) {
          score += 4;
        }
        // Direct path text match
        else if (pathText.includes(term)) {
          score += 2;
        }
        // Token-level match (partial overlap)
        else if (pathTokens.some(pt => pt.includes(term) || term.includes(pt))) {
          score += 1;
        }
      }

      if (score > 0) {
        // Path quality adjustments
        const lowerPath = "/" + pathText;
        if (SOURCE_PATH_PATTERNS.some(p => lowerPath.includes(p))) {
          score *= 1.5; // Bonus for source code paths
        }
        if (NOISE_PATH_PATTERNS.some(p => lowerPath.includes(p))) {
          score *= 0.3; // Heavy penalty for noise paths
        }

        candidates.push({ path: filePath, score });
      }
    }
  }

  // Sort by score descending, take topN
  candidates.sort((a, b) => b.score - a.score);
  return candidates.slice(0, topN).map(c => c.path);
}
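
// Worked example (hypothetical paths, query term "auth"):
//   "server/src/auth/token.ts":  path text matches (+2), "/src/" bonus → 2 * 1.5 = 3
//   "server/test/auth.spec.ts":  filename matches (+4), "/test/" penalty → 4 * 0.3 = 1.2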

// ─── Git History RFM Signal ──────────────────────────────────────

/**
 * Compute evolutionary activity scores for directories based on Git history.
 *
 * Uses the RFM (Recency-Frequency-Modification) model:
 * - R: Time since last commit (exponential decay, half-life 30 days)
 * - F: Commit frequency relative to total commits
 * - M: Code churn volume (log2 scale)
 *
 * @param {string} projectRoot
 * @param {string[]} topDirs
 * @param {object} options
 * @returns {Array<{dir: string, score: number, recency: number, frequency: number, modification: number}>}
 */
// Git RFM cache: git log is expensive (~1-10s for large repos).
// TTL is shorter than the profile cache since commits happen more frequently.
const _gitRFMCache = new Map();
const GIT_RFM_CACHE_TTL_MS = (parseInt(process.env.FC_GIT_CACHE_TTL, 10) || 300) * 1000;

function computeGitRFM(projectRoot, topDirs, options = {}) {
  // Cache lookup — key includes sorted topDirs to handle different dir sets
  const cacheKey = `${projectRoot}|${[...topDirs].sort().join(",")}`;
  const cached = _gitRFMCache.get(cacheKey);
  if (cached && (Date.now() - cached.cachedAt) < GIT_RFM_CACHE_TTL_MS) {
    return cached.ranking;
  }

  const {
    windowDays = 180,  // 6 month lookback
    halfLifeDays = 30, // Recency decay half-life
    wr = 0.4,          // Recency weight
    wf = 0.35,         // Frequency weight
    wm = 0.25,         // Modification weight
  } = options;

  const lambda = Math.LN2 / halfLifeDays;
  const nowSec = Math.floor(Date.now() / 1000);
  const sinceDate = new Date(Date.now() - windowDays * 86400000).toISOString().slice(0, 10);

  // Gather per-directory stats in a single git log pass
  const dirStats = {};
  for (const d of topDirs) {
    dirStats[d] = { lastCommitSec: 0, commits: 0, linesChanged: 0 };
  }

  try {
    // Single git log: author-date + numstat, limited to window
    const result = spawnSync("git", [
      "log",
      "--format=%at", // author timestamp (epoch)
      "--numstat",
      `--since=${sinceDate}`,
      "--no-merges",
    ], {
      cwd: projectRoot,
      encoding: "utf-8",
      timeout: 10000,
      maxBuffer: 4 * 1024 * 1024,
    });

    if (result.stdout) {
      let currentTimestamp = 0;
      const seenDirsForCommit = new Set();

      for (const line of result.stdout.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) {
          // Empty line = commit boundary
          seenDirsForCommit.clear();
          continue;
        }

        // Timestamp line
        if (/^\d+$/.test(trimmed)) {
          currentTimestamp = parseInt(trimmed, 10);
          seenDirsForCommit.clear();
          continue;
        }

        // Numstat line: added \t deleted \t filepath
        const parts = trimmed.split("\t");
        if (parts.length >= 3) {
          const added = parseInt(parts[0], 10) || 0;
          const deleted = parseInt(parts[1], 10) || 0;
          const filePath = parts[2];
          const topDir = filePath.split(/[\/\\]/)[0];

          if (dirStats[topDir]) {
            dirStats[topDir].linesChanged += added + deleted;
            if (currentTimestamp > dirStats[topDir].lastCommitSec) {
              dirStats[topDir].lastCommitSec = currentTimestamp;
            }
            // Count unique commits per dir (not per file)
            if (!seenDirsForCommit.has(topDir)) {
              seenDirsForCommit.add(topDir);
              dirStats[topDir].commits++;
            }
          }
        }
      }
    }
  } catch { /* git not available or not a git repo - return empty */ }

  // Compute total commits for frequency normalization
  const totalCommits = Object.values(dirStats).reduce((s, d) => s + d.commits, 0) || 1;

  // Compute RFM scores
  const ranking = [];
  for (const dir of topDirs) {
    const stats = dirStats[dir];

    // R: Recency (exponential decay)
    let recency = 0;
    if (stats.lastCommitSec > 0) {
      const daysSince = (nowSec - stats.lastCommitSec) / 86400;
      recency = Math.exp(-lambda * daysSince);
    }

    // F: Frequency (relative to total)
    const frequency = stats.commits / totalCommits;

    // M: Modification (log2 scale)
    const modification = Math.log2(1 + stats.linesChanged);
    // Normalize M to [0,1] range approximately
    const mNorm = modification / (modification + 10);

    const score = wr * recency + wf * frequency + wm * mNorm;
    ranking.push({ dir, score, recency, frequency, modification: stats.linesChanged, commits: stats.commits });
  }

  ranking.sort((a, b) => b.score - a.score);

  // Store in cache
  _gitRFMCache.set(cacheKey, { ranking, cachedAt: Date.now() });

  return ranking;
}
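
// Worked example (hypothetical stats): last commit 15 days ago, 12 of 60
// windowed commits, 500 lines churned:
//   R = exp(-(ln 2 / 30) * 15) ≈ 0.71, F = 12/60 = 0.20,
//   M = log2(501) ≈ 8.97, mNorm ≈ 8.97 / 18.97 ≈ 0.47
//   score ≈ 0.4 * 0.71 + 0.35 * 0.20 + 0.25 * 0.47 ≈ 0.47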

// ─── File-level Log-Sum Aggregation ─────────────────────────────

/**
 * Compute file-level scores, then aggregate to the directory using Log-Sum.
 *
 * Instead of treating the entire directory as one flat text blob,
 * score individual files independently (by path match), then aggregate:
 *   Score(D) = max(file_scores) + α * log(1 + Σ(s_i - τ) for s_i > τ)
 *
 * @param {string[]} queryTerms
 * @param {object} profile - Directory profile with file_paths
 * @param {string} projectRoot
 * @param {object} options
 * @returns {number}
 */
function fileAggregateScore(queryTerms, profile, projectRoot, options = {}) {
  const {
    alpha = 0.5,     // Density bonus weight (α)
    threshold = 0.3, // Noise filter threshold (τ)
    maxFiles = 200,  // Cap files to score for performance
  } = options;

  if (!profile.file_paths || profile.file_paths.length === 0) return 0;

  // Score each file by matching query terms against its path
  const fileScores = [];
  const filesToScore = profile.file_paths.slice(0, maxFiles);

  for (const relPath of filesToScore) {
    const pathTokens = tokenizePath(relPath);
    let score = 0;

    // Path-based matching (fast, no I/O)
    for (const qt of queryTerms) {
      // Exact token match in path
      if (pathTokens.some(pt => pt === qt)) {
        score += 2.0;
      }
      // Partial match in path
      else if (pathTokens.some(pt => pt.includes(qt) || qt.includes(pt))) {
        score += 1.0;
      }
      // Raw path string match (catches compound names like "RLSManagement")
      else if (relPath.toLowerCase().includes(qt)) {
        score += 0.5;
      }
    }

    if (score > 0) {
      fileScores.push(score);
    }
  }

  if (fileScores.length === 0) return 0;

  fileScores.sort((a, b) => b - a);

  // Log-Sum aggregation
  const maxScore = fileScores[0];
  const aboveThreshold = fileScores.filter(s => s > threshold);
  const densitySum = aboveThreshold.reduce((sum, s) => sum + (s - threshold), 0);
  const densityBonus = Math.log(1 + densitySum);

  return maxScore + alpha * densityBonus;
}
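
// Worked example (hypothetical file scores [3, 2, 1, 0.5], alpha = 0.5, tau = 0.3):
//   densitySum = 2.7 + 1.7 + 0.7 + 0.2 = 5.3
//   Score(D) = 3 + 0.5 * ln(6.3) ≈ 3.92
// One strong file sets the floor; many moderate files add only a damped bonus.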

// ─── Main API ────────────────────────────────────────────────────

/**
 * Score directories using BM25F + Probe + Git RFM + File Aggregation + RRF
 *
 * @param {string} query - User query
 * @param {string} projectRoot - Project root path
 * @param {string[]} topDirs - List of top-level directories
 * @param {string[]} excludePaths - Paths to exclude
 * @param {object} options - Configuration options
 * @returns {{ hotDirs: string[], pathSpines: string[], signals: object }}
 */
export function scoreDirectories(query, projectRoot, topDirs, excludePaths = [], options = {}) {
  const {
    topK = 4,
    useProbe = true,
    useGitRFM = true,
    useFileAgg = true,
    keywords = [], // From bootstrap phase
    minReturn = 2,
  } = options;

  const queryTerms = tokenize(query);

  // Step 1: Build profiles for all directories + pre-tokenize fields (once)
  const profiles = {};
  for (const dir of topDirs) {
    const profile = buildDirectoryProfile(projectRoot, dir, excludePaths);
    // Cache tokenized fields on profile — avoids triple tokenization in IDF/avgLen/BM25F
    profile._tok = {
      dir_name: tokenize(profile.dir_name),
      path_tokens: tokenize(profile.path_tokens_text),
      metadata: tokenize(profile.metadata),
      headers: tokenize(profile.headers_text),
    };
    profiles[dir] = profile;
  }

  // Step 2: Compute IDF across all profiles (uses cached tokens)
  const allFieldTerms = [];
  for (const profile of Object.values(profiles)) {
    const t = profile._tok;
    allFieldTerms.push([...t.dir_name, ...t.path_tokens, ...t.metadata, ...t.headers]);
  }
  const idf = computeIDF(allFieldTerms);

  // Step 3: Compute average field lengths (uses cached tokens)
  const avgFieldLens = { dir_name: 0, path_tokens: 0, metadata: 0, headers: 0 };
  const counts = { dir_name: 0, path_tokens: 0, metadata: 0, headers: 0 };

  for (const profile of Object.values(profiles)) {
    const t = profile._tok;
    avgFieldLens.dir_name += t.dir_name.length;
    counts.dir_name++;
    avgFieldLens.path_tokens += t.path_tokens.length;
    counts.path_tokens++;
    avgFieldLens.metadata += t.metadata.length;
    counts.metadata++;
    avgFieldLens.headers += t.headers.length;
    counts.headers++;
  }

  for (const field of Object.keys(avgFieldLens)) {
    avgFieldLens[field] = counts[field] > 0 ? avgFieldLens[field] / counts[field] : 10;
  }

  // Step 4: Signal 1 - BM25F scores
  const bm25fRanking = [];
  for (const dir of topDirs) {
    const score = bm25fScore(queryTerms, profiles[dir], avgFieldLens, idf);
    bm25fRanking.push({ dir, score });
  }
  bm25fRanking.sort((a, b) => b.score - a.score);

  const rankings = [bm25fRanking];
  const signals = { bm25f: bm25fRanking.map(r => r.dir) };

  // Step 5: Signal 2 - Probe grep (if enabled)
  if (useProbe && queryTerms.length > 0) {
    // Fuse query terms with bootstrap keywords for probe selection
    const keywordTerms = keywords && keywords.length > 0
      ? keywords.flatMap(k => tokenize(k))
      : [];
    const allProbeCandidates = [...new Set([...queryTerms, ...keywordTerms])];

    const probeTerms = selectProbeTerms(allProbeCandidates, idf);
    if (probeTerms.length > 0) {
      const dirHits = probeGrep(projectRoot, topDirs, probeTerms, excludePaths);

      const probeRanking = [];
      for (const dir of topDirs) {
        const hits = dirHits[dir] || 0;
        const fileCount = profiles[dir].file_count || 1;
        const score = computeProbeScore(hits, fileCount);
        probeRanking.push({ dir, score, hits, fileCount });
      }
      probeRanking.sort((a, b) => b.score - a.score);
      rankings.push(probeRanking);
      signals.probe = probeRanking.map(r => `${r.dir}:${r.hits}`);
    }
  }

  // Step 6: Signal 3 - Keywords from bootstrap (if provided)
  if (keywords && keywords.length > 0) {
    const keywordTerms = keywords.flatMap(k => tokenize(k));
    const keywordRanking = [];

    for (const dir of topDirs) {
      let score = 0;
      const profile = profiles[dir];

      // Check if keywords match in paths
      for (const term of keywordTerms) {
        if (profile.path_tokens_text.toLowerCase().includes(term)) {
          score += 1;
        }
      }

      keywordRanking.push({ dir, score });
    }
    keywordRanking.sort((a, b) => b.score - a.score);
    rankings.push(keywordRanking);
    signals.keywords = keywordRanking.map(r => r.dir);
  }

  // Step 7: Signal 4 - Git History RFM (evolutionary activity)
  if (useGitRFM) {
    try {
      const gitRanking = computeGitRFM(projectRoot, topDirs);
      if (gitRanking.some(r => r.score > 0)) {
        rankings.push(gitRanking);
        signals.gitRFM = gitRanking.slice(0, 6).map(r =>
          `${r.dir}:R=${r.recency.toFixed(2)},C=${r.commits}`
        );
      }
    } catch { /* git not available */ }
  }

  // Step 8: Signal 5 - File-level Log-Sum aggregation
  if (useFileAgg) {
    const fileAggRanking = [];
    for (const dir of topDirs) {
      const score = fileAggregateScore(queryTerms, profiles[dir], projectRoot);
      fileAggRanking.push({ dir, score });
    }
    fileAggRanking.sort((a, b) => b.score - a.score);
    if (fileAggRanking.some(r => r.score > 0)) {
      rankings.push(fileAggRanking);
      signals.fileAgg = fileAggRanking.slice(0, 6).map(r =>
        `${r.dir}:${r.score.toFixed(2)}`
      );
    }
  }

  // Step 9: RRF Fusion
  const fused = rrfFusion(rankings);

  // Step 10: Ensure minimum return
  while (fused.length < minReturn && fused.length < topDirs.length) {
    const missing = topDirs.find(d => !fused.some(f => f.dir === d));
    if (missing) {
      fused.push({ dir: missing, score: 0.001 });
    } else {
      break;
    }
  }

  // Step 11: Extract path spines from matched files
  const pathSpines = extractPathSpines(profiles, queryTerms, keywords, 30);

  // Step 12: Adaptive topK via score distribution analysis
  // (hybrid K_base + K_knee + entropy scaling + adaptive tail threshold;
  // see the Adaptive TopK section above for the literature references)
  const hotDirs = _adaptiveTopK(fused, topK, topDirs.length);

  return {
    hotDirs,
    pathSpines,
    signals,
    rawRankings: {
      bm25f: bm25fRanking,
      fused,
    },
  };
}
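
// Usage sketch (hypothetical query and paths; adapt to your repo):
//   const { hotDirs, pathSpines, signals } = scoreDirectories(
//     "jwt auth middleware", "/repo", ["server", "web", "docs"], [], { topK: 4 }
//   );
//   hotDirs    → adaptively sized list, e.g. ["server", "web"]
//   pathSpines → up to 30 file paths ranked by term match and path quality
//   signals    → per-ranker debug views (bm25f, probe, keywords, gitRFM, fileAgg)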

/**
 * Quick scoring for when profiles are already built
 */
export function quickScore(query, topDirs, profiles) {
  const queryTerms = tokenize(query);
  const scored = [];

  for (const dir of topDirs) {
    const profile = profiles[dir] || { path_tokens_text: "", dir_name: dir };
    const dirTerms = [...tokenize(profile.dir_name), ...tokenize(profile.path_tokens_text)];

    let score = 0;
    for (const qt of queryTerms) {
      if (dirTerms.some(dt => dt.includes(qt) || qt.includes(dt))) {
        score += 1;
      }
    }

    scored.push({ dir, score });
  }

  scored.sort((a, b) => b.score - a.score);
  return scored;
}

export { tokenize, tokenizePath, stem, computeIDF };