@sammysnake/fast-context-mcp 1.3.0-beta.1

/**
 * Directory Scorer - BM25F + Probe + RRF
 *
 * Based on IR research:
 * - BM25F for multi-field structured documents (Robertson & Zaragoza)
 * - Probe grep signal with IDF weighting
 * - RRF fusion for combining multiple rankers (Cormack et al.)
 *
 * Directory Profile Fields:
 * - dir_name: Top-level directory name (weight: 1.0)
 * - path_tokens: All file paths under the directory (weight: 4.0) <- MAIN SIGNAL
 * - metadata: package.json, go.mod, Cargo.toml info (weight: 3.0)
 * - headers: First N lines / markdown headers (weight: 2.0)
 */

import { readdirSync, readFileSync, existsSync } from "fs";
import { join, relative, extname } from "path";
import { spawnSync } from "child_process";

// ─── Constants ───────────────────────────────────────────────

const BM25_K1 = 1.2;
const BM25_B = 0.75;
const RRF_K = 60;

// Field weights for BM25F (from research recommendations)
const FIELD_WEIGHTS = {
  dir_name: 1.0,
  path_tokens: 4.0, // Main signal
  metadata: 3.0,
  headers: 2.0,
};

// Default exclude patterns
const DEFAULT_EXCLUDES = new Set([
  "node_modules", ".git", "dist", "build", "coverage", ".venv", "venv",
  "target", "out", ".cache", "__pycache__", "vendor", "deps", "third_party",
  "logs", "data", ".next", ".nuxt", "bundle", "bundled", "fixtures",
]);

// Stopwords for tokenization
const STOPWORDS = new Set([
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should", "may", "might", "must", "shall", "can", "need", "dare",
  "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
  "into", "through", "during", "before", "after", "above", "below",
  "and", "but", "or", "nor", "so", "yet", "both", "either", "neither",
  "not", "only", "own", "same", "than", "too", "very", "just", "also",
  "this", "that", "these", "those", "here", "there", "all", "any",
  "some", "no", "none", "each", "every", "other", "another", "such",
  "get", "set", "use", "used", "using", "make", "made", "if", "then",
  "else", "return", "new", "like", "well", "where", "which", "who",
  "what", "when", "why", "how", "it", "its", "we", "you", "your",
]);

// ─── Tokenization ─────────────────────────────────────────────

// Stem patterns hoisted to module scope — avoids re-allocating 18 RegExp per call
const STEM_PATTERNS = [
  [/^(.+)(ies)$/, "$1y"],
  [/^(.+)([^aeiou])(es)$/, "$1$2"],
  [/^(.+)([^aeiou])(s)$/, "$1$2"],
  [/^(.+)(ing)$/, "$1"],
  [/^(.+)(edly)$/, "$1"],
  [/^(.+)(ly)$/, "$1"],
  [/^(.+)(ed)$/, "$1"],
  [/^(.+)(ation)$/, "$1ate"],
  [/^(.+)(tion)$/, "$1t"],
  [/^(.+)(ment)$/, "$1"],
  [/^(.+)(ness)$/, "$1"],
  [/^(.+)(ful)$/, "$1"],
  [/^(.+)(less)$/, "$1"],
  [/^(.+)(able)$/, "$1"],
  [/^(.+)(ible)$/, "$1"],
  [/^(.+)(ally)$/, "$1al"],
  [/^(.+)(ity)$/, "$1"],
  [/^(.+)(ive)$/, "$1"],
];

/**
 * Basic Porter-like stemming (simplified)
 */
function stem(word) {
  if (!word || word.length < 3) return word;
  const w = word.toLowerCase();

  for (const [pattern, replacement] of STEM_PATTERNS) {
    if (pattern.test(w)) {
      return w.replace(pattern, replacement);
    }
  }
  return w;
}

/**
 * Tokenize text with stemming and stopword removal.
 * Note: input is lowercased up front, so tokens are always lowercase.
 * (A former keepCase option was a no-op for that reason and has been dropped.)
 */
function tokenize(text, options = {}) {
  if (!text) return [];
  const { minLen = 2 } = options;

  return text
    .toLowerCase()
    .replace(/[^\w\s\-./\\@]/g, " ")
    .split(/[\s\-./\\]+/)
    .filter(t => t.length >= minLen && !STOPWORDS.has(t))
    .map(stem);
}
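
// Illustrative example (not executed), assuming the simplified stemmer above:
//   tokenize("User authentication and session handling")
//   → ["user", "authenticate", "session", "handl"]
// ("and" is dropped as a stopword; "authentication" hits the (ation)→"$1ate"
//  rule; "handling" hits the (ing) rule; the first matching pattern wins.)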

/**
 * Tokenize file path (handles code paths better)
 */
function tokenizePath(pathStr) {
  if (!pathStr) return [];
  return pathStr
    .toLowerCase()
    .replace(/[\/\\]/g, " ")
    .replace(/[._-]/g, " ")
    .split(/\s+/)
    .filter(t => t.length >= 2)
    .map(stem);
}

// ─── Directory Profile Builder ────────────────────────────────

/**
 * Extract metadata from common config files
 */
function extractMetadata(dirPath) {
  const metadata = [];

  // package.json
  const pkgPath = join(dirPath, "package.json");
  if (existsSync(pkgPath)) {
    try {
      const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
      if (pkg.name) metadata.push(pkg.name);
      if (pkg.description) metadata.push(...tokenize(pkg.description));
      // Guard: keywords must be an array, or flatMap would throw and abort
      // the dependency extraction below.
      if (Array.isArray(pkg.keywords)) metadata.push(...pkg.keywords.flatMap(k => tokenize(k)));
      if (pkg.dependencies) metadata.push(...Object.keys(pkg.dependencies).flatMap(k => tokenize(k)));
    } catch {}
  }

  // go.mod
  const goModPath = join(dirPath, "go.mod");
  if (existsSync(goModPath)) {
    try {
      const content = readFileSync(goModPath, "utf-8");
      const moduleMatch = content.match(/module\s+(\S+)/);
      if (moduleMatch) metadata.push(...tokenizePath(moduleMatch[1]));
    } catch {}
  }

  // Cargo.toml
  const cargoPath = join(dirPath, "Cargo.toml");
  if (existsSync(cargoPath)) {
    try {
      const content = readFileSync(cargoPath, "utf-8");
      const nameMatch = content.match(/name\s*=\s*"([^"]+)"/);
      if (nameMatch) metadata.push(...tokenizePath(nameMatch[1]));
    } catch {}
  }

  // pyproject.toml
  const pyprojectPath = join(dirPath, "pyproject.toml");
  if (existsSync(pyprojectPath)) {
    try {
      const content = readFileSync(pyprojectPath, "utf-8");
      const nameMatch = content.match(/name\s*=\s*"([^"]+)"/);
      if (nameMatch) metadata.push(...tokenizePath(nameMatch[1]));
    } catch {}
  }

  return metadata.join(" ");
}

/**
 * Extract headers from a file (markdown headers, code comments, etc.)
 */
function extractFileHeaders(filePath) {
  try {
    const content = readFileSync(filePath, "utf-8").slice(0, 2000); // First 2KB
    const headers = [];

    // Markdown headers
    const mdHeaders = content.match(/^#+\s+.+$/gm) || [];
    headers.push(...mdHeaders.map(h => h.replace(/^#+\s+/, "")));

    // Code comments (first 10 lines)
    const lines = content.split("\n").slice(0, 10);
    for (const line of lines) {
      const comment = line.match(/^\s*(?:(?:\/\/|#|;|\*)\s*)(.+)$/);
      if (comment) headers.push(comment[1]);
    }

    return headers.join(" ");
  } catch {
    return "";
  }
}

// ─── Profile Cache (process-level TTL) ───────────────────────
//
// MCP server is long-running: repeated search() calls for the same project
// would re-walk every directory each time (~200ms+ per large dir due to
// readFileSync for headers). This cache avoids that.
//
// Key: projectRoot + "|" + dirName + "|" + sortedExcludes
// TTL: 120s (configurable via FC_PROFILE_CACHE_TTL env)
// Scope: process lifetime only, not persisted

const _profileCache = new Map();
const PROFILE_CACHE_TTL_MS = (parseInt(process.env.FC_PROFILE_CACHE_TTL, 10) || 120) * 1000;

/**
 * Invalidate all cached profiles for a project root.
 * Call this if you know files have changed (optional — TTL handles normal staleness).
 */
export function invalidateProfileCache(projectRoot) {
  for (const key of _profileCache.keys()) {
    if (key.startsWith(projectRoot + "|")) {
      _profileCache.delete(key);
    }
  }
}

/**
 * Build a profile for a top-level directory (with TTL cache)
 */
export function buildDirectoryProfile(projectRoot, dirName, excludePaths = [], maxDepth = 3) {
  // Cache lookup
  const cacheKey = `${projectRoot}|${dirName}|${[...excludePaths].sort().join(",")}`;
  const cached = _profileCache.get(cacheKey);
  if (cached && (Date.now() - cached.cachedAt) < PROFILE_CACHE_TTL_MS) {
    return cached.profile;
  }
  const dirPath = join(projectRoot, dirName);
  const profile = {
    dir_name: dirName,
    path_tokens: [],
    metadata: "",
    headers: [],
    file_count: 0,
    file_paths: [], // Store actual file paths for path spines
  };

  const excludeSet = new Set(excludePaths);

  function walk(currentPath, depth) {
    if (depth > maxDepth) return;
    try {
      const entries = readdirSync(currentPath, { withFileTypes: true });
      for (const entry of entries) {
        const name = entry.name;

        // Skip excluded and noise
        if (DEFAULT_EXCLUDES.has(name) || excludeSet.has(name)) continue;
        if (name.startsWith(".") && name !== ".github") continue;

        const fullPath = join(currentPath, name);
        const relPath = relative(projectRoot, fullPath);

        if (entry.isDirectory()) {
          profile.path_tokens.push(relPath);
          walk(fullPath, depth + 1);
        } else if (entry.isFile()) {
          profile.path_tokens.push(relPath);
          profile.file_paths.push(relPath);
          profile.file_count++;

          // Extract headers from relevant files
          const ext = extname(name);
          if ([".md", ".mdx", ".ts", ".tsx", ".js", ".jsx", ".py", ".go"].includes(ext)) {
            const headers = extractFileHeaders(fullPath);
            if (headers) profile.headers.push(headers);
          }
        }
      }
    } catch { /* ignore walk errors */ }
  }

  walk(dirPath, 1);

  // Extract metadata from config files
  profile.metadata = extractMetadata(dirPath);

  // Convert arrays to text
  profile.path_tokens_text = profile.path_tokens.join(" ");
  profile.headers_text = profile.headers.join(" ");

  // Store in cache
  _profileCache.set(cacheKey, { profile, cachedAt: Date.now() });

  return profile;
}
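
// Illustrative profile shape (hypothetical repo and values, not executed):
//   buildDirectoryProfile("/repo", "server")
//   → {
//       dir_name: "server",
//       path_tokens: ["server/api", "server/api/auth.ts", ...],
//       metadata: "server express jwt ...",           // from package.json etc.
//       headers: ["Auth middleware entrypoint", ...], // from file headers
//       file_count: 42,
//       file_paths: ["server/api/auth.ts", ...],
//       path_tokens_text: "server/api server/api/auth.ts ...",
//       headers_text: "Auth middleware entrypoint ...",
//     }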

// ─── BM25/BM25F Implementation ────────────────────────────────

/**
 * Compute IDF for terms across documents
 */
function computeIDF(documents) {
  const docCount = documents.length;
  const termDocCount = {};
  const idf = {};

  for (const doc of documents) {
    const uniqueTerms = new Set(doc);
    for (const term of uniqueTerms) {
      termDocCount[term] = (termDocCount[term] || 0) + 1;
    }
  }

  for (const [term, count] of Object.entries(termDocCount)) {
    // Standard IDF formula
    idf[term] = Math.log((docCount - count + 0.5) / (count + 0.5) + 1);
  }

  return idf;
}
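
// Worked example (illustrative): with 10 profiles and a term that appears in 2,
//   idf = ln((10 - 2 + 0.5) / (2 + 0.5) + 1) = ln(4.4) ≈ 1.48
// A term present in all 10 profiles gets ln(0.5 / 10.5 + 1) ≈ 0.05, so rare
// terms dominate the scoring.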

/**
 * BM25 score for a single field
 */
function bm25FieldScore(queryTerms, fieldTerms, avgLen, fieldLen, idf) {
  const termFreqs = {};
  fieldTerms.forEach(t => { termFreqs[t] = (termFreqs[t] || 0) + 1; });

  let score = 0;
  for (const term of queryTerms) {
    const tf = termFreqs[term] || 0;
    if (tf === 0) continue;

    const termIDF = idf[term] || Math.log(2); // Default IDF for unseen terms
    const numerator = tf * (BM25_K1 + 1);
    const denominator = tf + BM25_K1 * (1 - BM25_B + BM25_B * (fieldLen / avgLen));
    score += termIDF * (numerator / denominator);
  }

  return score;
}
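
// Worked example (illustrative): one query term with tf = 2 in a field of
// length 40 (average field length 50), idf ≈ 1.48:
//   numerator   = 2 * (1.2 + 1) = 4.4
//   denominator = 2 + 1.2 * (1 - 0.75 + 0.75 * (40 / 50)) = 3.02
//   score       ≈ 1.48 * (4.4 / 3.02) ≈ 2.16
// Longer-than-average fields shrink the contribution via the length penalty.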

/**
 * BM25F score across all fields (uses pre-cached tokenized fields when available)
 */
function bm25fScore(queryTerms, profile, avgFieldLens, idf) {
  const tok = profile._tok;
  const fields = [
    { name: "dir_name", terms: tok ? tok.dir_name : tokenize(profile.dir_name || ""), weight: FIELD_WEIGHTS.dir_name },
    { name: "path_tokens", terms: tok ? tok.path_tokens : tokenize(profile.path_tokens_text || ""), weight: FIELD_WEIGHTS.path_tokens },
    { name: "metadata", terms: tok ? tok.metadata : tokenize(profile.metadata || ""), weight: FIELD_WEIGHTS.metadata },
    { name: "headers", terms: tok ? tok.headers : tokenize(profile.headers_text || ""), weight: FIELD_WEIGHTS.headers },
  ];

  let totalScore = 0;
  for (const field of fields) {
    const avgLen = avgFieldLens[field.name] || 50;
    const fieldLen = field.terms.length || 1;
    const fieldScore = bm25FieldScore(queryTerms, field.terms, avgLen, fieldLen, idf);
    totalScore += field.weight * fieldScore;
  }

  return totalScore;
}

// ─── Probe Grep Signal ────────────────────────────────────────

/**
 * Select probe terms from query (prioritize high IDF, but include diverse terms)
 */
function selectProbeTerms(queryTerms, idf, maxTerms = 6) {
  // Sort by IDF (descending) and select top terms
  const sorted = queryTerms
    .map(t => ({ term: t, idf: idf[t] || 0 }))
    .sort((a, b) => b.idf - a.idf);

  // Return unique terms (top-N by IDF)
  const unique = [...new Set(sorted.map(t => t.term))];
  return unique.slice(0, maxTerms);
}

/**
 * Execute probe grep to count matches per directory.
 * Uses a single rg call with regex alternation (term1|term2|...) instead of
 * N sequential calls — saves (N-1) process spawns (~2-5s).
 *
 * Scoring: each matching file contributes 1 hit to its directory.
 * RRF only cares about rank order, which is robust to this simplification.
 */
function probeGrep(projectRoot, topDirs, probeTerms, excludePaths = []) {
  if (probeTerms.length === 0) return {};

  const dirHits = {};
  const excludeSet = new Set([...excludePaths, ...DEFAULT_EXCLUDES]);

  for (const dir of topDirs) {
    dirHits[dir] = 0;
  }

  // Build single regex alternation: escape each term for regex safety
  const pattern = probeTerms
    .map(t => t.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|");

  try {
    const result = spawnSync("rg", [
      "-l", // List matching files only
      "--hidden",
      "-g", `!{${[...excludeSet].join(",")}}`,
      "-g", "*.ts", "-g", "*.tsx", "-g", "*.js", "-g", "*.jsx",
      "-g", "*.py", "-g", "*.go", "-g", "*.rs", "-g", "*.java",
      "-g", "*.md", "-g", "*.mdx", "-g", "*.json",
      pattern,
      projectRoot,
    ], {
      encoding: "utf-8",
      timeout: 8000, // Slightly longer for combined search
      maxBuffer: 2 * 1024 * 1024,
    });

    if (result.stdout) {
      const files = result.stdout.trim().split("\n").filter(Boolean);
      for (const file of files) {
        const relPath = relative(projectRoot, file);
        const topDir = relPath.split(/[\/\\]/)[0];
        if (dirHits.hasOwnProperty(topDir)) {
          dirHits[topDir]++;
        }
      }
    }
  } catch { /* rg not found or error - skip */ }

  return dirHits;
}
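
// Illustrative invocation (hypothetical terms, not executed): probeTerms
// ["auth", "session"] build the pattern "auth|session", so the single call is
// roughly:
//   rg -l --hidden -g '!{node_modules,.git,...}' -g '*.ts' ... 'auth|session' <projectRoot>
// Each file rg lists counts as one hit for its top-level directory.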

/**
 * Compute probe score with normalization
 */
function computeProbeScore(hits, fileCount) {
  if (hits === 0) return 0;
  // Normalize by log(hits) / sqrt(fileCount) to prevent large dirs from dominating
  return Math.log(1 + hits) / Math.sqrt(1 + fileCount);
}
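
// Worked example (illustrative): 9 matching files in a 99-file directory score
//   log(1 + 9) / sqrt(1 + 99) = 2.30 / 10 ≈ 0.23
// while 3 matches in a 3-file directory score log(4) / sqrt(4) ≈ 0.69, so
// small, dense directories outrank large ones with more raw hits.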

// ─── RRF Fusion ────────────────────────────────────────────────

/**
 * Reciprocal Rank Fusion
 */
function rrfFusion(rankings, weights = null) {
  const finalScores = {};
  const w = weights || rankings.map(() => 1);

  for (let r = 0; r < rankings.length; r++) {
    const ranking = rankings[r];
    for (let pos = 0; pos < ranking.length; pos++) {
      const { dir } = ranking[pos];
      const rrfScore = w[r] / (RRF_K + pos + 1);
      finalScores[dir] = (finalScores[dir] || 0) + rrfScore;
    }
  }

  return Object.entries(finalScores)
    .map(([dir, score]) => ({ dir, score }))
    .sort((a, b) => b.score - a.score);
}
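
// Worked example (illustrative): with RRF_K = 60, a directory ranked 1st by
// BM25F and 3rd by the probe signal accumulates
//   1/(60 + 1) + 1/(60 + 3) ≈ 0.0164 + 0.0159 = 0.0323
// Only rank positions matter, so rankers with incompatible score scales fuse cleanly.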

// ─── Adaptive TopK (Literature-backed) ───────────────────────────
//
// References:
// [1] Taguchi et al. (2025) "Adaptive-k" — max-gap on sorted scores
// [2] Xu et al. (2025) "CAR" — entropy-based cluster cutoff
// [3] CMU Selective Search — shard cutoff via distribution features
// [4] Kratzwald et al. (2018) — cumulative threshold for query-dependent k
//
// Three signals combined:
//   K_base: N-proportional safety floor (handles degenerate cases)
//   K_knee: Kneedle gap detection (query-sensitive, finds natural breakpoint)
//   H_norm: Entropy scaling (flat distributions → expand K)
// Tail inclusion uses an adaptive threshold based on the score decay rate.

const K_MIN = 3;
const K_MAX = 10;
const ENTROPY_GAMMA = 0.5; // Entropy scaling factor
const SOFTMAX_TEMP = 1.0; // Temperature for softmax normalization
const TAIL_SCAN_WINDOW = 6; // Max dirs to scan beyond cutoff

/**
 * Adaptive topK selection based on RRF score distribution.
 *
 * @param {Array<{dir: string, score: number}>} fused - RRF-fused sorted rankings
 * @param {number} userTopK - User-specified topK (default 4)
 * @param {number} N - Total number of top-level directories
 * @returns {string[]} Selected hotDirs
 */
function _adaptiveTopK(fused, userTopK, N) {
  if (fused.length <= K_MIN) return fused.map(r => r.dir);

  const scores = fused.map(r => r.score);

  // ── Signal 1: K_base (N-proportional safety floor) ──
  const kBase = Math.max(userTopK, Math.min(K_MAX, Math.ceil(N * 0.15)));

  // ── Signal 2: K_knee (Kneedle max-gap detection) ──
  // Find the position with the largest score drop (Taguchi Adaptive-k).
  // This is where the "relevance cliff" occurs.
  // Only search within [K_MIN-1, min(K_MAX, scores.length-1)] to stay bounded.
  let maxGap = 0;
  let kKnee = kBase;
  const searchEnd = Math.min(K_MAX, scores.length - 1);
  for (let i = K_MIN - 1; i < searchEnd; i++) {
    const gap = scores[i] - scores[i + 1];
    if (gap > maxGap) {
      maxGap = gap;
      kKnee = i + 1; // Include everything up to and including position i
    }
  }

  // ── Signal 3: Entropy scaling (distribution flatness) ──
  // Softmax-normalized entropy: H_norm ∈ [0, 1].
  // High H_norm (flat distribution) → relevance is dispersed → expand K.
  // Low H_norm (peaked distribution) → relevance is concentrated → keep K tight.
  const maxScore = scores[0];
  const expScores = scores.map(s => Math.exp((s - maxScore) / SOFTMAX_TEMP)); // shifted for numerical stability
  const expSum = expScores.reduce((a, b) => a + b, 0);
  const probs = expScores.map(e => e / expSum);
  const entropy = -probs.reduce((h, p) => h + (p > 0 ? p * Math.log(p) : 0), 0);
  const hNorm = scores.length > 1 ? entropy / Math.log(scores.length) : 0;

  // Entropy-adjusted K: scale kBase by distribution flatness
  const kEntropy = Math.ceil(kBase * (1 + ENTROPY_GAMMA * hNorm));

  // ── Combine: take the max of all signals, clamp to [K_MIN, K_MAX] ──
  const primaryK = Math.max(K_MIN, Math.min(K_MAX, Math.max(kBase, kKnee, kEntropy)));
  const hotDirs = fused.slice(0, primaryK).map(r => r.dir);

  // ── Adaptive tail inclusion ──
  // Instead of a fixed 0.6 threshold, use the score decay rate to determine the tail cutoff.
  // If scores are still decaying slowly (flat tail), include more;
  // if there's a sharp drop, stop.
  if (fused.length > primaryK) {
    const cutoffScore = scores[primaryK - 1];
    // Adaptive threshold based on the average decay rate in the head:
    // if the head decays slowly (flat), the threshold is lenient; if steep, strict.
    const headDecayRate = primaryK > 1 ? (scores[0] - cutoffScore) / (primaryK - 1) : 0;
    // Threshold = cutoffScore minus one "average step" worth of decay.
    // This is more lenient when the head is flat (small headDecayRate).
    const tailThreshold = Math.max(cutoffScore - headDecayRate, cutoffScore * 0.4);

    for (let i = primaryK; i < fused.length && i < primaryK + TAIL_SCAN_WINDOW; i++) {
      if (scores[i] >= tailThreshold) {
        hotDirs.push(fused[i].dir);
      } else {
        break; // Stop at first dir below threshold (scores are sorted)
      }
    }
  }

  return hotDirs;
}
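
// Worked example (illustrative): RRF scores [0.050, 0.048, 0.046, 0.030, 0.010]
// have their largest gap (0.016) between positions 3 and 4, so kKnee = 3.
// With N = 20 and userTopK = 4, kBase = max(4, ceil(20 * 0.15)) = 4, and a
// flat head (hNorm near 1) can raise kEntropy up to ceil(4 * 1.5) = 6; the
// final K is the max of the three signals, clamped to [K_MIN, K_MAX].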

// ─── Path Spine Extraction ──────────────────────────────────────

// Paths that indicate core source code (bonus)
const SOURCE_PATH_PATTERNS = ["/src/", "/core/", "/lib/", "/internal/", "/pkg/", "/cmd/"];
// Paths that indicate non-essential files (penalty)
const NOISE_PATH_PATTERNS = ["/migrations/", "/test/", "/__tests__/", "/fixtures/", "/examples/", "/vendor/", "/mock/", "/mocks/", "/i18n/", "/locales/", "/versions/"];

/**
 * Extract path spines from matched files.
 *
 * Previous approach: first-match iteration with topN break — caused files from
 * later-iterated directories to be missed even when relevant (e.g., prompt.go
 * in server/, shape.ts in packages/element/).
 *
 * New approach: score ALL candidate files, sort by relevance, take topN.
 * Includes path-quality signals:
 * - Source-code paths (src/, core/, lib/, internal/) get a bonus
 * - Noise paths (migrations/, test/, fixtures/) get a penalty
 * - Filename-level term matches get a bonus
 */
function extractPathSpines(profiles, queryTerms, keywords, topN = 30) {
  const allTerms = [...new Set([...queryTerms, ...keywords])];
  if (allTerms.length === 0) return [];

  // Score all candidate files across all directories
  const candidates = [];

  for (const [dir, profile] of Object.entries(profiles)) {
    for (const filePath of profile.file_paths || []) {
      const pathTokens = tokenizePath(filePath);
      const pathText = filePath.toLowerCase();
      // Extract bare filename without extension for filename-level matching
      const parts = filePath.split("/");
      const fileName = parts[parts.length - 1].replace(/\.[^.]+$/, "").toLowerCase();
      const fileNameTokens = tokenizePath(fileName);

      let score = 0;
      for (const term of allTerms) {
        // Filename match (highest signal — file is specifically about this concept)
        if (fileName.includes(term) || fileNameTokens.some(ft => ft === term)) {
          score += 4;
        }
        // Direct path text match
        else if (pathText.includes(term)) {
          score += 2;
        }
        // Token-level match (partial overlap)
        else if (pathTokens.some(pt => pt.includes(term) || term.includes(pt))) {
          score += 1;
        }
      }

      if (score > 0) {
        // Path quality adjustments
        const lowerPath = "/" + pathText;
        if (SOURCE_PATH_PATTERNS.some(p => lowerPath.includes(p))) {
          score *= 1.5; // Bonus for source code paths
        }
        if (NOISE_PATH_PATTERNS.some(p => lowerPath.includes(p))) {
          score *= 0.3; // Heavy penalty for noise paths
        }

        candidates.push({ path: filePath, score });
      }
    }
  }

  // Sort by score descending, take topN
  candidates.sort((a, b) => b.score - a.score);
  return candidates.slice(0, topN).map(c => c.path);
}

// ─── Git History RFM Signal ──────────────────────────────────────

// Git RFM cache: git log is expensive (~1-10s for large repos).
// TTL shorter than profile cache since commits happen more frequently.
const _gitRFMCache = new Map();
const GIT_RFM_CACHE_TTL_MS = (parseInt(process.env.FC_GIT_CACHE_TTL, 10) || 300) * 1000;

/**
 * Compute evolutionary activity scores for directories based on Git history.
 *
 * Uses the RFM (Recency-Frequency-Modification) model:
 * - R: Time since last commit (exponential decay, half-life 30 days)
 * - F: Commit frequency relative to total commits
 * - M: Code churn volume (log2 scale)
 *
 * @param {string} projectRoot
 * @param {string[]} topDirs
 * @param {object} options
 * @returns {Array<{dir: string, score: number, recency: number, frequency: number, modification: number}>}
 */
function computeGitRFM(projectRoot, topDirs, options = {}) {
  // Cache lookup — key includes sorted topDirs so different orderings of the
  // same dir set share one entry
  const cacheKey = `${projectRoot}|${[...topDirs].sort().join(",")}`;
  const cached = _gitRFMCache.get(cacheKey);
  if (cached && (Date.now() - cached.cachedAt) < GIT_RFM_CACHE_TTL_MS) {
    return cached.ranking;
  }

  const {
    windowDays = 180, // 6 month lookback
    halfLifeDays = 30, // Recency decay half-life
    wr = 0.4, // Recency weight
    wf = 0.35, // Frequency weight
    wm = 0.25, // Modification weight
  } = options;

  const lambda = Math.LN2 / halfLifeDays;
  const nowSec = Math.floor(Date.now() / 1000);
  const sinceDate = new Date(Date.now() - windowDays * 86400000).toISOString().slice(0, 10);

  // Gather per-directory stats in a single git log pass
  const dirStats = {};
  for (const d of topDirs) {
    dirStats[d] = { lastCommitSec: 0, commits: 0, linesChanged: 0 };
  }

  try {
    // Single git log: author-date + numstat, limited to window
    const result = spawnSync("git", [
      "log",
      "--format=%at", // author timestamp (epoch)
      "--numstat",
      `--since=${sinceDate}`,
      "--no-merges",
    ], {
      cwd: projectRoot,
      encoding: "utf-8",
      timeout: 10000,
      maxBuffer: 4 * 1024 * 1024,
    });

    if (result.stdout) {
      let currentTimestamp = 0;
      const seenDirsForCommit = new Set();

      for (const line of result.stdout.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) {
          // Empty line = commit boundary
          seenDirsForCommit.clear();
          continue;
        }

        // Timestamp line
        if (/^\d+$/.test(trimmed)) {
          currentTimestamp = parseInt(trimmed, 10);
          seenDirsForCommit.clear();
          continue;
        }

        // Numstat line: added \t deleted \t filepath
        const parts = trimmed.split("\t");
        if (parts.length >= 3) {
          const added = parseInt(parts[0], 10) || 0;
          const deleted = parseInt(parts[1], 10) || 0;
          const filePath = parts[2];
          const topDir = filePath.split(/[\/\\]/)[0];

          if (dirStats[topDir]) {
            dirStats[topDir].linesChanged += added + deleted;
            if (currentTimestamp > dirStats[topDir].lastCommitSec) {
              dirStats[topDir].lastCommitSec = currentTimestamp;
            }
            // Count unique commits per dir (not per file)
            if (!seenDirsForCommit.has(topDir)) {
              seenDirsForCommit.add(topDir);
              dirStats[topDir].commits++;
            }
          }
        }
      }
    }
  } catch { /* git not available or not a git repo - return empty */ }

  // Compute total commits for frequency normalization
  const totalCommits = Object.values(dirStats).reduce((s, d) => s + d.commits, 0) || 1;

  // Compute RFM scores
  const ranking = [];
  for (const dir of topDirs) {
    const stats = dirStats[dir];

    // R: Recency (exponential decay)
    let recency = 0;
    if (stats.lastCommitSec > 0) {
      const daysSince = (nowSec - stats.lastCommitSec) / 86400;
      recency = Math.exp(-lambda * daysSince);
    }

    // F: Frequency (relative to total)
    const frequency = stats.commits / totalCommits;

    // M: Modification (log2 scale)
    const modification = Math.log2(1 + stats.linesChanged);
    // Normalize M to [0,1] range approximately
    const mNorm = modification / (modification + 10);

    const score = wr * recency + wf * frequency + wm * mNorm;
    ranking.push({ dir, score, recency, frequency, modification: stats.linesChanged, commits: stats.commits });
  }

  ranking.sort((a, b) => b.score - a.score);

  // Store in cache
  _gitRFMCache.set(cacheKey, { ranking, cachedAt: Date.now() });

  return ranking;
}
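
// Worked example (illustrative): with halfLifeDays = 30, lambda = ln(2) / 30,
// so a directory last touched 30 days ago has recency exp(-lambda * 30) = 0.5
// and one touched 60 days ago has 0.25. With the default weights, recency
// contributes at most wr = 0.4 to the combined RFM score.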

// ─── File-level Log-Sum Aggregation ─────────────────────────────

/**
 * Compute lightweight per-file path-match scores, then aggregate to the
 * directory using Log-Sum.
 *
 * Instead of treating the entire directory as one flat text blob,
 * score individual files independently then aggregate:
 *   Score(D) = max(file_scores) + α * log(1 + Σ(s_i - τ) for s_i > τ)
 *
 * @param {string[]} queryTerms
 * @param {object} profile - Directory profile with file_paths
 * @param {string} projectRoot - (currently unused; scoring is path-only)
 * @param {object} options
 * @returns {number}
 */
function fileAggregateScore(queryTerms, profile, projectRoot, options = {}) {
  const {
    alpha = 0.5, // Density bonus weight
    threshold = 0.3, // Noise filter threshold (τ)
    maxFiles = 200, // Cap files to score for performance
  } = options;

  if (!profile.file_paths || profile.file_paths.length === 0) return 0;

  // Score each file by matching query terms against its path.
  // Path-only by design: no file I/O happens here, so scoring stays fast.
  const fileScores = [];
  const filesToScore = profile.file_paths.slice(0, maxFiles);

  for (const relPath of filesToScore) {
    const pathTokens = tokenizePath(relPath);
    let score = 0;

    for (const qt of queryTerms) {
      // Exact token match in path
      if (pathTokens.some(pt => pt === qt)) {
        score += 2.0;
      }
      // Partial match in path
      else if (pathTokens.some(pt => pt.includes(qt) || qt.includes(pt))) {
        score += 1.0;
      }
      // Raw path string match (catches compound names like "RLSManagement")
      else if (relPath.toLowerCase().includes(qt)) {
        score += 0.5;
      }
    }

    if (score > 0) {
      fileScores.push(score);
    }
  }

  if (fileScores.length === 0) return 0;

  fileScores.sort((a, b) => b - a);

  // Log-Sum aggregation
  const maxScore = fileScores[0];
  const aboveThreshold = fileScores.filter(s => s > threshold);
  const densitySum = aboveThreshold.reduce((sum, s) => sum + (s - threshold), 0);
  const densityBonus = Math.log(1 + densitySum);

  return maxScore + alpha * densityBonus;
}
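
// Worked example (illustrative): fileScores [3.0, 1.0, 0.5] with τ = 0.3, α = 0.5:
//   densitySum = (3.0 - 0.3) + (1.0 - 0.3) + (0.5 - 0.3) = 3.6
//   score      = 3.0 + 0.5 * ln(1 + 3.6) ≈ 3.0 + 0.76 = 3.76
// One strong file dominates; many weak matches add only a logarithmic bonus.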

// ─── Main API ────────────────────────────────────────────────────

/**
 * Score directories using BM25F + Probe + Git RFM + File Aggregation + RRF
 *
 * @param {string} query - User query
 * @param {string} projectRoot - Project root path
 * @param {string[]} topDirs - List of top-level directories
 * @param {string[]} excludePaths - Paths to exclude
 * @param {object} options - Configuration options
 * @returns {{ hotDirs: string[], pathSpines: string[], signals: object, rawRankings: object }}
 */
export function scoreDirectories(query, projectRoot, topDirs, excludePaths = [], options = {}) {
  const {
    topK = 4,
    useProbe = true,
    useGitRFM = true,
    useFileAgg = true,
    keywords = [], // From bootstrap phase
    minReturn = 2,
  } = options;

  const queryTerms = tokenize(query);

  // Step 1: Build profiles for all directories + pre-tokenize fields (once)
  const profiles = {};
  for (const dir of topDirs) {
    const profile = buildDirectoryProfile(projectRoot, dir, excludePaths);
    // Cache tokenized fields on profile — avoids triple tokenization in IDF/avgLen/BM25F
    profile._tok = {
      dir_name: tokenize(profile.dir_name),
      path_tokens: tokenize(profile.path_tokens_text),
      metadata: tokenize(profile.metadata),
      headers: tokenize(profile.headers_text),
    };
    profiles[dir] = profile;
  }

  // Step 2: Compute IDF across all profiles (uses cached tokens)
  const allFieldTerms = [];
  for (const profile of Object.values(profiles)) {
    const t = profile._tok;
    allFieldTerms.push([...t.dir_name, ...t.path_tokens, ...t.metadata, ...t.headers]);
  }
  const idf = computeIDF(allFieldTerms);

  // Step 3: Compute average field lengths (uses cached tokens)
  const avgFieldLens = { dir_name: 0, path_tokens: 0, metadata: 0, headers: 0 };
  const counts = { dir_name: 0, path_tokens: 0, metadata: 0, headers: 0 };

  for (const profile of Object.values(profiles)) {
    const t = profile._tok;
    avgFieldLens.dir_name += t.dir_name.length;
    counts.dir_name++;
    avgFieldLens.path_tokens += t.path_tokens.length;
    counts.path_tokens++;
    avgFieldLens.metadata += t.metadata.length;
    counts.metadata++;
    avgFieldLens.headers += t.headers.length;
    counts.headers++;
  }

  for (const field of Object.keys(avgFieldLens)) {
    avgFieldLens[field] = counts[field] > 0 ? avgFieldLens[field] / counts[field] : 10;
  }

  // Step 4: Signal 1 - BM25F scores
  const bm25fRanking = [];
  for (const dir of topDirs) {
    const score = bm25fScore(queryTerms, profiles[dir], avgFieldLens, idf);
    bm25fRanking.push({ dir, score });
  }
  bm25fRanking.sort((a, b) => b.score - a.score);

  const rankings = [bm25fRanking];
  const signals = { bm25f: bm25fRanking.map(r => r.dir) };

  // Step 5: Signal 2 - Probe grep (if enabled)
  if (useProbe && queryTerms.length > 0) {
    // Fuse query terms with bootstrap keywords for probe selection
    const keywordTerms = keywords && keywords.length > 0
      ? keywords.flatMap(k => tokenize(k))
      : [];
    const allProbeCandidates = [...new Set([...queryTerms, ...keywordTerms])];

    const probeTerms = selectProbeTerms(allProbeCandidates, idf);
    if (probeTerms.length > 0) {
      const dirHits = probeGrep(projectRoot, topDirs, probeTerms, excludePaths);

      const probeRanking = [];
      for (const dir of topDirs) {
        const hits = dirHits[dir] || 0;
        const fileCount = profiles[dir].file_count || 1;
        const score = computeProbeScore(hits, fileCount);
        probeRanking.push({ dir, score, hits, fileCount });
      }
      probeRanking.sort((a, b) => b.score - a.score);
      rankings.push(probeRanking);
      signals.probe = probeRanking.map(r => `${r.dir}:${r.hits}`);
    }
  }

  // Step 6: Signal 3 - Keywords from bootstrap (if provided)
  if (keywords && keywords.length > 0) {
    const keywordTerms = keywords.flatMap(k => tokenize(k));
    const keywordRanking = [];

    for (const dir of topDirs) {
      let score = 0;
      const profile = profiles[dir];

      // Check if keywords match in paths
      for (const term of keywordTerms) {
        if (profile.path_tokens_text.toLowerCase().includes(term)) {
          score += 1;
        }
      }

      keywordRanking.push({ dir, score });
    }
    keywordRanking.sort((a, b) => b.score - a.score);
    rankings.push(keywordRanking);
    signals.keywords = keywordRanking.map(r => r.dir);
  }

  // Step 7: Signal 4 - Git History RFM (evolutionary activity)
  if (useGitRFM) {
    try {
      const gitRanking = computeGitRFM(projectRoot, topDirs);
      if (gitRanking.some(r => r.score > 0)) {
        rankings.push(gitRanking);
        signals.gitRFM = gitRanking.slice(0, 6).map(r =>
          `${r.dir}:R=${r.recency.toFixed(2)},C=${r.commits}`
        );
      }
    } catch { /* git not available */ }
  }

  // Step 8: Signal 5 - File-level Log-Sum aggregation
  if (useFileAgg) {
    const fileAggRanking = [];
    for (const dir of topDirs) {
      const score = fileAggregateScore(queryTerms, profiles[dir], projectRoot);
      fileAggRanking.push({ dir, score });
    }
    fileAggRanking.sort((a, b) => b.score - a.score);
    if (fileAggRanking.some(r => r.score > 0)) {
      rankings.push(fileAggRanking);
      signals.fileAgg = fileAggRanking.slice(0, 6).map(r =>
        `${r.dir}:${r.score.toFixed(2)}`
      );
    }
  }

  // Step 9: RRF Fusion
  const fused = rrfFusion(rankings);

  // Step 10: Ensure minimum return
  while (fused.length < minReturn && fused.length < topDirs.length) {
    const missing = topDirs.find(d => !fused.some(f => f.dir === d));
    if (missing) {
      fused.push({ dir: missing, score: 0.001 });
    } else {
      break;
    }
  }

  // Step 11: Extract path spines from matched files
  const pathSpines = extractPathSpines(profiles, queryTerms, keywords, 30);

  // Step 12: Adaptive topK via score distribution analysis
  //
  // Based on IR literature:
  // - Taguchi et al. (2025) "Adaptive-k": max-gap detection on sorted scores
  // - Xu et al. (2025) "CAR": entropy-based distribution analysis for cutoff
  // - Kratzwald et al. (2018): cumulative score threshold for query-dependent k
  // - CMU Selective Search: shard cutoff via distribution skewness/entropy
  //
  // Hybrid approach: K_base (safety floor) + K_knee (gap detection) + entropy scaling
  // + adaptive tail threshold (replaces fixed 0.6)
  const hotDirs = _adaptiveTopK(fused, topK, topDirs.length);

  return {
    hotDirs,
    pathSpines,
    signals,
    rawRankings: {
      bm25f: bm25fRanking,
      fused,
    },
  };
}
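
// Illustrative usage (hypothetical project and outputs, not executed):
//   const { hotDirs, pathSpines, signals } = scoreDirectories(
//     "where is jwt auth handled",
//     "/repo",
//     ["server", "web", "docs", "scripts"],
//   );
//   // hotDirs    → e.g. ["server", "web", "docs"]
//   // pathSpines → e.g. ["server/api/auth.ts", ...]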

/**
 * Quick scoring for when profiles are already built
 */
export function quickScore(query, topDirs, profiles) {
  const queryTerms = tokenize(query);
  const scored = [];

  for (const dir of topDirs) {
    const profile = profiles[dir] || { path_tokens_text: "", dir_name: dir };
    const dirTerms = [...tokenize(profile.dir_name), ...tokenize(profile.path_tokens_text)];

    let score = 0;
    for (const qt of queryTerms) {
      if (dirTerms.some(dt => dt.includes(qt) || qt.includes(dt))) {
        score += 1;
      }
    }

    scored.push({ dir, score });
  }

  scored.sort((a, b) => b.score - a.score);
  return scored;
}

export { tokenize, tokenizePath, stem, computeIDF };