fast-context-skill 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1086 @@
1
+ /**
2
+ * Directory Scorer - BM25F + Probe + RRF
3
+ *
4
+ * Based on IR research:
5
+ * - BM25F for multi-field structured documents (Robertson & Zaragoza)
6
+ * - Probe grep signal with IDF weighting
7
+ * - RRF fusion for combining multiple rankers (Cormack et al.)
8
+ *
9
+ * Directory Profile Fields:
10
+ * - dir_name: Top-level directory name (weight: 1.0)
11
+ * - path_tokens: All file paths under the directory (weight: 4.0) <- MAIN SIGNAL
12
+ * - metadata: package.json, go.mod, Cargo.toml, pyproject.toml info (weight: 3.0)
13
+ * - headers: First N lines / markdown headers (weight: 2.0)
14
+ */
15
+
16
+ import { readdirSync, readFileSync, existsSync, statSync } from "fs";
17
+ import { join, resolve, relative, extname, basename, dirname } from "path";
18
+ import { spawnSync } from "child_process";
19
+ import { splitByCase } from "scule";
20
+ import { rgPath } from "@vscode/ripgrep";
21
+
22
+ // ─── Constants ───────────────────────────────────────────────
23
+
24
+ const BM25_K1 = 1.2; // k1: term-frequency saturation parameter used in bm25FieldScore
25
+ const BM25_B = 0.75; // b: strength of the fieldLen/avgLen length normalization in bm25FieldScore
26
+ const RRF_K = 60; // constant added to the 1-based rank in the rrfFusion denominator
27
+
28
+ // Field weights for BM25F (from research recommendations); bm25fScore multiplies each field's BM25 score by its weight
29
+ const FIELD_WEIGHTS = {
30
+ dir_name: 1.0, // the top-level directory's own name
31
+ path_tokens: 4.0, // Main signal
32
+ metadata: 3.0, // text gathered by extractMetadata from manifest files
33
+ headers: 2.0, // text gathered by extractFileHeaders
34
+ };
35
+
36
+ // Default exclude patterns: entry names skipped by the buildDirectoryProfile walk and added to the probeGrep exclude glob
37
+ const DEFAULT_EXCLUDES = new Set([
38
+ "node_modules", ".git", "dist", "build", "coverage", ".venv", "venv",
39
+ "target", "out", ".cache", "__pycache__", "vendor", "deps", "third_party",
40
+ "logs", "data", ".next", ".nuxt", "bundle", "bundled", "fixtures",
41
+ ]);
42
+
43
+ // Stopwords for tokenization: lowercased words dropped by tokenize(); tokenizePath() does not consult this set
44
+ const STOPWORDS = new Set([
45
+ "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
46
+ "have", "has", "had", "do", "does", "did", "will", "would", "could",
47
+ "should", "may", "might", "must", "shall", "can", "need", "dare",
48
+ "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
49
+ "into", "through", "during", "before", "after", "above", "below",
50
+ "and", "but", "or", "nor", "so", "yet", "both", "either", "neither",
51
+ "not", "only", "own", "same", "than", "too", "very", "just", "also",
52
+ "this", "that", "these", "those", "here", "there", "all", "any",
53
+ "some", "no", "none", "each", "every", "other", "another", "such",
54
+ "get", "set", "use", "used", "using", "make", "made", "if", "then",
55
+ "else", "return", "new", "like", "well", "where", "which", "who",
56
+ "what", "when", "why", "how", "it", "its", "we", "you", "your",
57
+ ]);
58
+
59
+ // ─── Tokenization ─────────────────────────────────────────────
60
+
61
// Suffix rules for stem(), tried top to bottom; the first rule that matches wins.
// Hoisted to module scope so the RegExp objects are created once, not per call.
const STEM_PATTERNS = [
  [/^(.+)(ies)$/, "$1y"],
  [/^(.+)([^aeiou])(es)$/, "$1$2"],
  // The bare-"s" rule must not fire on a trailing "ss": that is not a plural
  // ("class", "process"), and stripping it made "class" -> "clas" disagree with
  // "classes" -> "class". It also shadowed the "ness" and "less" rules below.
  [/^(.+)([^aeious])(s)$/, "$1$2"],
  [/^(.+)(ing)$/, "$1"],
  [/^(.+)(edly)$/, "$1"],
  [/^(.+)(ly)$/, "$1"],
  [/^(.+)(ed)$/, "$1"],
  [/^(.+)(ation)$/, "$1ate"],
  [/^(.+)(tion)$/, "$1t"],
  [/^(.+)(ment)$/, "$1"],
  [/^(.+)(ness)$/, "$1"],
  [/^(.+)(ful)$/, "$1"],
  [/^(.+)(less)$/, "$1"],
  [/^(.+)(able)$/, "$1"],
  [/^(.+)(ible)$/, "$1"],
  // Never reached for "-ally" words ("ly" above matches first) but yields the same stem.
  [/^(.+)(ally)$/, "$1al"],
  [/^(.+)(ity)$/, "$1"],
  [/^(.+)(ive)$/, "$1"],
];

/**
 * Basic Porter-like stemming (simplified): applies at most one suffix rule.
 *
 * @param {string} word - Word to stem.
 * @returns {string} The input unchanged when it is falsy or shorter than three
 *   characters; otherwise the lowercased word with the first matching suffix
 *   rule from STEM_PATTERNS applied (or just lowercased when no rule matches).
 */
function stem(word) {
  if (!word || word.length < 3) return word;
  const lower = word.toLowerCase();

  for (const [pattern, replacement] of STEM_PATTERNS) {
    if (pattern.test(lower)) {
      return lower.replace(pattern, replacement);
    }
  }
  return lower;
}
97
+
98
/**
 * Turn free text into stemmed, lowercased search tokens.
 *
 * The text is cut on whitespace, "-", ".", "/" and "\", each piece is split
 * further with scule's splitByCase (CamelCase / snake_case / SCREAMING_SNAKE),
 * and every resulting word is lowercased, filtered and stemmed.
 *
 * @param {string} text - Input text; a falsy value yields [].
 * @param {object} [options]
 * @param {number} [options.minLen=2] - Minimum length for both the raw pieces
 *   and the individual words.
 * @returns {string[]} Tokens in order of appearance (duplicates kept).
 */
function tokenize(text, options = {}) {
  if (!text) return [];
  const { minLen = 2 } = options;

  return text
    .split(/[\s\-./\\]+/)
    // Drop empty pieces and pieces shorter than minLen before case-splitting.
    .filter((segment) => segment && !(segment.length < minLen))
    .flatMap((segment) => splitByCase(segment))
    // splitByCase keeps the original casing, so lowercase afterwards.
    .map((part) => part.toLowerCase())
    .filter((part) => part.length >= minLen && !STOPWORDS.has(part))
    .map((part) => stem(part));
}
126
+
127
/**
 * Turn a file path into stemmed, lowercased tokens.
 *
 * Unlike tokenize(), only path separators and whitespace delimit segments and
 * no stopword filtering is applied. Each segment is split with scule's
 * splitByCase (CamelCase, snake_case, SCREAMING_SNAKE).
 *
 * @param {string} pathStr - Path to tokenize; a falsy value yields [].
 * @returns {string[]} Tokens of at least two characters, in path order.
 */
function tokenizePath(pathStr) {
  if (!pathStr) return [];

  return pathStr
    .replace(/[\/\\]/g, " ")
    .split(/\s+/)
    .filter((segment) => segment.length >= 2)
    .flatMap((segment) => splitByCase(segment))
    .map((part) => part.toLowerCase())
    .filter((part) => part.length >= 2)
    .map((part) => stem(part));
}
153
+
154
+ // ─── Directory Profile Builder ────────────────────────────────
155
+
156
/**
 * Extract metadata text from common manifest files located directly in `dirPath`.
 *
 * Sources, in output order:
 *   - package.json: raw `name`, tokenized `description`, `keywords` and
 *     `dependencies` keys
 *   - go.mod: tokenized module path
 *   - Cargo.toml / pyproject.toml: tokenized first `name = "..."` value
 *
 * The function is best-effort: a missing, unreadable or malformed file simply
 * contributes nothing. Each package.json field is validated on its own, so one
 * unexpected value (for example `keywords` given as a string) no longer
 * discards the fields that follow it.
 *
 * @param {string} dirPath - Directory to inspect.
 * @returns {string} Space-joined metadata terms ("" when nothing was found).
 */
function extractMetadata(dirPath) {
  const metadata = [];

  // Returns the file's text, or null when it is absent or cannot be read.
  const readManifest = (fileName) => {
    const manifestPath = join(dirPath, fileName);
    if (!existsSync(manifestPath)) return null;
    try {
      return readFileSync(manifestPath, "utf-8");
    } catch {
      return null;
    }
  };

  const isText = (value) => typeof value === "string" && value.length > 0;

  // package.json
  const pkgText = readManifest("package.json");
  if (pkgText !== null) {
    let pkg = null;
    try {
      pkg = JSON.parse(pkgText);
    } catch {
      pkg = null; // malformed JSON: skip this manifest
    }
    if (pkg !== null && typeof pkg === "object") {
      if (isText(pkg.name)) metadata.push(pkg.name);
      if (isText(pkg.description)) metadata.push(...tokenize(pkg.description));

      // `keywords` should be an array of strings, but tolerate a single string.
      const keywords = Array.isArray(pkg.keywords) ? pkg.keywords : [pkg.keywords];
      metadata.push(...keywords.filter(isText).flatMap((k) => tokenize(k)));

      const deps = pkg.dependencies;
      if (deps !== null && typeof deps === "object" && !Array.isArray(deps)) {
        metadata.push(...Object.keys(deps).flatMap((k) => tokenize(k)));
      }
    }
  }

  // go.mod
  const goMod = readManifest("go.mod");
  const moduleMatch = goMod === null ? null : goMod.match(/module\s+(\S+)/);
  if (moduleMatch) metadata.push(...tokenizePath(moduleMatch[1]));

  // Cargo.toml and pyproject.toml share the same `name = "..."` lookup.
  for (const fileName of ["Cargo.toml", "pyproject.toml"]) {
    const content = readManifest(fileName);
    const nameMatch = content === null ? null : content.match(/name\s*=\s*"([^"]+)"/);
    if (nameMatch) metadata.push(...tokenizePath(nameMatch[1]));
  }

  return metadata.join(" ");
}
206
+
207
/**
 * Extract header-like text from the start of a file.
 *
 * Only the first 2000 characters are inspected (the file itself is still read
 * in full). Two kinds of text are collected and joined with spaces:
 *   1. every markdown-style header line ("# ...") in that prefix, without the
 *      leading "#" marks;
 *   2. from the first 10 lines, the text following a leading "//", "#", ";"
 *      or "*" comment marker.
 * A line can match both rules and is then included twice.
 *
 * @param {string} filePath - File to read.
 * @returns {string} Collected text, or "" when the file cannot be read.
 */
function extractFileHeaders(filePath) {
  try {
    const content = readFileSync(filePath, "utf-8").slice(0, 2000);
    const headers = [];

    // Markdown headers
    const mdHeaders = content.match(/^#+\s+.+$/gm) || [];
    headers.push(...mdHeaders.map((h) => h.replace(/^#+\s+/, "")));

    // Code comments (first 10 lines). Split on any line ending: with a plain
    // "\n" split, CRLF files keep a trailing "\r" that "." cannot match, so
    // the comment pattern below would never match.
    const lines = content.split(/\r\n|\r|\n/).slice(0, 10);
    for (const line of lines) {
      const comment = line.match(/^\s*(?:(?:\/\/|#|;|\*)\s*)(.+)$/);
      if (comment) headers.push(comment[1]);
    }

    return headers.join(" ");
  } catch {
    // Best-effort: unreadable files contribute no header text.
    return "";
  }
}
231
+
232
+ // ─── Profile Cache (process-level TTL) ───────────────────────
233
+ //
234
+ // MCP server is long-running: repeated search() calls for the same project
235
+ // would re-walk every directory each time (~200ms+ per large dir due to
236
+ // readFileSync for headers). This cache avoids that.
237
+ //
238
+ // Key: projectRoot + "|" + dirName + "|" + sortedExcludes
239
+ // TTL: 120s (configurable via FC_PROFILE_CACHE_TTL env)
240
+ // Scope: process lifetime only, not persisted
241
+
242
+ const _profileCache = new Map(); // cacheKey -> { profile, cachedAt }, written by buildDirectoryProfile
243
+ const PROFILE_CACHE_TTL_MS = (parseInt(process.env.FC_PROFILE_CACHE_TTL, 10) || 120) * 1000; // env value is in seconds; unset, non-numeric or 0 falls back to 120
244
+
245
/**
 * Drop every cached directory profile that belongs to `projectRoot`.
 *
 * Optional: entries also expire on their own through the cache TTL. Use this
 * when files are known to have changed.
 *
 * @param {string} projectRoot - Project root whose cache entries are removed.
 */
export function invalidateProfileCache(projectRoot) {
  // Cache keys start with "<projectRoot>|".
  const prefix = projectRoot + "|";
  const staleKeys = [..._profileCache.keys()].filter((key) => key.startsWith(prefix));
  staleKeys.forEach((key) => _profileCache.delete(key));
}
256
+
257
/**
 * Build a profile for a top-level directory (with TTL cache).
 *
 * Walks `projectRoot/dirName` up to `maxDepth` levels, skipping DEFAULT_EXCLUDES,
 * `excludePaths` entries (matched by entry name) and dot-entries other than
 * ".github". Collects relative paths, header text of selected file types and
 * manifest metadata.
 *
 * Cache key: projectRoot | dirName | sorted excludes | depth. `maxDepth` is part
 * of the key because it changes which files are walked; without it a call with
 * a different depth would receive a profile built for another depth.
 *
 * @param {string} projectRoot - Project root path.
 * @param {string} dirName - Directory name relative to projectRoot.
 * @param {string[]} [excludePaths=[]] - Extra entry names to skip.
 * @param {number} [maxDepth=3] - Maximum walk depth (dirName itself is depth 1).
 * @returns {object} Profile with dir_name, path_tokens, metadata, headers,
 *   file_count, file_paths, path_tokens_text and headers_text. A cache hit
 *   returns the same object instance that was stored, not a copy.
 */
export function buildDirectoryProfile(projectRoot, dirName, excludePaths = [], maxDepth = 3) {
  // Cache lookup
  const excludeKey = [...excludePaths].sort().join(",");
  const cacheKey = `${projectRoot}|${dirName}|${excludeKey}|depth=${maxDepth}`;
  const cached = _profileCache.get(cacheKey);
  if (cached && (Date.now() - cached.cachedAt) < PROFILE_CACHE_TTL_MS) {
    return cached.profile;
  }

  const dirPath = join(projectRoot, dirName);
  const profile = {
    dir_name: dirName,
    path_tokens: [],
    metadata: "",
    headers: [],
    file_count: 0,
    file_paths: [], // Store actual file paths for path spines
  };

  const excludeSet = new Set(excludePaths);
  // File types whose leading text is sampled for the headers field.
  const headerExtensions = new Set([".md", ".mdx", ".ts", ".tsx", ".js", ".jsx", ".py", ".go"]);

  function walk(currentPath, depth) {
    if (depth > maxDepth) return;
    try {
      const entries = readdirSync(currentPath, { withFileTypes: true });
      for (const entry of entries) {
        const name = entry.name;

        // Skip excluded and noise
        if (DEFAULT_EXCLUDES.has(name) || excludeSet.has(name)) continue;
        if (name.startsWith(".") && name !== ".github") continue;

        const fullPath = join(currentPath, name);
        const relPath = relative(projectRoot, fullPath);

        if (entry.isDirectory()) {
          profile.path_tokens.push(relPath);
          walk(fullPath, depth + 1);
        } else if (entry.isFile()) {
          profile.path_tokens.push(relPath);
          profile.file_paths.push(relPath);
          profile.file_count++;

          // Extract headers from relevant files
          if (headerExtensions.has(extname(name))) {
            const headers = extractFileHeaders(fullPath);
            if (headers) profile.headers.push(headers);
          }
        }
      }
    } catch { /* ignore walk errors (unreadable directory etc.) */ }
  }

  walk(dirPath, 1);

  // Extract metadata from config files
  profile.metadata = extractMetadata(dirPath);

  // Convert arrays to text
  profile.path_tokens_text = profile.path_tokens.join(" ");
  profile.headers_text = profile.headers.join(" ");

  // Store in cache
  _profileCache.set(cacheKey, { profile, cachedAt: Date.now() });

  return profile;
}
326
+
327
+ // ─── BM25/BM25F Implementation ────────────────────────────────
328
+
329
/**
 * Compute inverse document frequency for every term in a corpus.
 *
 * Uses idf(t) = ln((N - n_t + 0.5) / (n_t + 0.5) + 1), where N is the number of
 * documents and n_t the number of documents containing t.
 *
 * Counting goes through a Map and the result has no prototype, so terms that
 * are also Object.prototype member names (e.g. "constructor") are counted like
 * any other term and looking up an unseen term yields undefined.
 *
 * @param {string[][]} documents - One token array per document.
 * @returns {Object<string, number>} Null-prototype lookup table term -> IDF.
 */
function computeIDF(documents) {
  const docCount = documents.length;
  const termDocCount = new Map();

  for (const doc of documents) {
    // A term counts once per document, however often it occurs in it.
    for (const term of new Set(doc)) {
      termDocCount.set(term, (termDocCount.get(term) || 0) + 1);
    }
  }

  const idf = Object.create(null);
  for (const [term, count] of termDocCount) {
    idf[term] = Math.log((docCount - count + 0.5) / (count + 0.5) + 1);
  }

  return idf;
}
351
+
352
/**
 * BM25 score of one field for a set of query terms.
 *
 * For each query term present in the field:
 *   idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * fieldLen / avgLen))
 * Query terms are not de-duplicated; a repeated term contributes repeatedly.
 *
 * Term frequencies are kept in a Map and the IDF lookup only accepts real
 * numbers, so terms such as "constructor" or "toString" cannot pick up
 * inherited Object.prototype members and turn the score into NaN.
 *
 * @param {string[]} queryTerms - Tokenized query.
 * @param {string[]} fieldTerms - Tokenized field content.
 * @param {number} avgLen - Average length of this field across documents.
 * @param {number} fieldLen - Length of this field in the current document.
 * @param {Object<string, number>} idf - Term -> IDF table.
 * @returns {number} Field score (0 when no query term occurs in the field).
 */
function bm25FieldScore(queryTerms, fieldTerms, avgLen, fieldLen, idf) {
  const termFreqs = new Map();
  for (const t of fieldTerms) {
    termFreqs.set(t, (termFreqs.get(t) || 0) + 1);
  }

  // The length-normalization part of the denominator is the same for every term.
  const lengthNorm = BM25_K1 * (1 - BM25_B + BM25_B * (fieldLen / avgLen));

  let score = 0;
  for (const term of queryTerms) {
    const tf = termFreqs.get(term) || 0;
    if (tf === 0) continue;

    const known = idf[term];
    // Default IDF for terms without a usable (non-zero, numeric) entry.
    const termIDF = typeof known === "number" && known ? known : Math.log(2);
    const numerator = tf * (BM25_K1 + 1);
    const denominator = tf + lengthNorm;
    score += termIDF * (numerator / denominator);
  }

  return score;
}
372
+
373
/**
 * Weighted multi-field score of one directory profile.
 *
 * Scores the dir_name, path_tokens, metadata and headers fields separately
 * with bm25FieldScore and returns the FIELD_WEIGHTS-weighted sum. Token arrays
 * cached on `profile._tok` are used when present; otherwise the raw field text
 * is tokenized on the fly.
 *
 * @param {string[]} queryTerms - Tokenized query.
 * @param {object} profile - Directory profile.
 * @param {Object<string, number>} avgFieldLens - Average token count per field
 *   name; a missing or zero entry falls back to 50.
 * @param {Object<string, number>} idf - Term -> IDF table.
 * @returns {number} Combined score.
 */
function bm25fScore(queryTerms, profile, avgFieldLens, idf) {
  const cachedTokens = profile._tok;
  const termsFor = (fieldName, rawText) =>
    (cachedTokens ? cachedTokens[fieldName] : tokenize(rawText || ""));

  // Field order matters only for floating-point summation order.
  const fieldTerms = [
    ["dir_name", termsFor("dir_name", profile.dir_name)],
    ["path_tokens", termsFor("path_tokens", profile.path_tokens_text)],
    ["metadata", termsFor("metadata", profile.metadata)],
    ["headers", termsFor("headers", profile.headers_text)],
  ];

  return fieldTerms.reduce((total, [fieldName, terms]) => {
    const avgLen = avgFieldLens[fieldName] || 50;
    const fieldLen = terms.length || 1; // treat an empty field as length 1
    const fieldScore = bm25FieldScore(queryTerms, terms, avgLen, fieldLen, idf);
    return total + FIELD_WEIGHTS[fieldName] * fieldScore;
  }, 0);
}
395
+
396
+ // ─── Probe Grep Signal ────────────────────────────────────────
397
+
398
/**
 * Select probe terms from the query: the distinct terms with the highest IDF.
 *
 * Terms without a finite numeric IDF entry count as 0. This also covers terms
 * that collide with inherited object members (e.g. "constructor"), which would
 * otherwise make the sort comparator return NaN. Terms with equal IDF keep
 * their first-occurrence order from `queryTerms`.
 *
 * @param {string[]} queryTerms - Tokenized query (may contain duplicates).
 * @param {Object<string, number>} idf - Term -> IDF table.
 * @param {number} [maxTerms=6] - Maximum number of terms to return.
 * @returns {string[]} Up to `maxTerms` distinct terms, highest IDF first.
 */
function selectProbeTerms(queryTerms, idf, maxTerms = 6) {
  const weightOf = (term) => {
    const value = idf[term];
    return typeof value === "number" && Number.isFinite(value) ? value : 0;
  };

  return [...new Set(queryTerms)]
    .map((term) => ({ term, weight: weightOf(term) }))
    .sort((a, b) => b.weight - a.weight)
    .slice(0, maxTerms)
    .map((entry) => entry.term);
}
411
+
412
/**
 * Execute a probe grep to count matching files per top-level directory.
 *
 * Runs ripgrep once for all terms, passing each term as its own fixed-string
 * pattern (`-F -e term`), instead of spawning one process per term.
 *
 * Scoring: each matching file contributes 1 hit to its top-level directory.
 * Files whose first path segment is not one of `topDirs` are ignored.
 *
 * @param {string} projectRoot - Directory passed to ripgrep as the search root.
 * @param {string[]} topDirs - Top-level directory names to count hits for.
 * @param {string[]} probeTerms - Literal terms to search for.
 * @param {string[]} [excludePaths=[]] - Extra names excluded via a glob, in
 *   addition to DEFAULT_EXCLUDES.
 * @returns {Object<string, number>} Hits per directory. `{}` when `probeTerms`
 *   is empty; otherwise every entry of `topDirs` is present (0 when ripgrep
 *   fails or finds nothing).
 */
function probeGrep(projectRoot, topDirs, probeTerms, excludePaths = []) {
  if (probeTerms.length === 0) return {};

  const dirHits = {};
  for (const dir of topDirs) {
    dirHits[dir] = 0;
  }
  const excludeSet = new Set([...excludePaths, ...DEFAULT_EXCLUDES]);

  // Keys come from caller-supplied directory names, so never call
  // hasOwnProperty through the object itself (a key could shadow it).
  const isTracked = (name) => Object.prototype.hasOwnProperty.call(dirHits, name);

  try {
    const result = spawnSync(rgPath, [
      "-l",                 // List matching files only
      "--hidden",
      "-S",                 // Smart-case: auto ignore-case when pattern is lowercase
      "-F",                 // Fixed strings (no regex interpretation)
      ...probeTerms.flatMap((t) => ["-e", t]), // Multiple -e "term"
      "-g", `!{${[...excludeSet].join(",")}}`,
      // No extension filter - let ripgrep search all text files
      projectRoot,
    ], {
      encoding: "utf-8",
      timeout: 8000, // Slightly longer for combined search
      maxBuffer: 2 * 1024 * 1024,
      env: { ...process.env, RIPGREP_CONFIG_PATH: "" },
    });

    if (result.stdout) {
      const files = result.stdout.trim().split("\n").filter(Boolean);
      for (const file of files) {
        const topDir = relative(projectRoot, file).split(/[\/\\]/)[0];
        if (isTracked(topDir)) {
          dirHits[topDir]++;
        }
      }
    }
  } catch { /* rg not found or error - skip */ }

  return dirHits;
}
463
+
464
/**
 * Compute the normalized probe score of a directory.
 *
 * score = ln(1 + hits) / sqrt(1 + fileCount), which damps raw hit counts and
 * keeps large directories from dominating.
 *
 * Missing, non-numeric or non-positive `hits` score 0 instead of NaN or a
 * negative value; a missing or non-positive `fileCount` is treated as 0.
 *
 * @param {number} hits - Number of matching files in the directory.
 * @param {number} fileCount - Total number of files in the directory.
 * @returns {number} Score >= 0.
 */
function computeProbeScore(hits, fileCount) {
  const hitCount = Number(hits);
  if (!(hitCount > 0)) return 0;

  const files = Number(fileCount);
  const safeFiles = files > 0 ? files : 0;
  return Math.log(1 + hitCount) / Math.sqrt(1 + safeFiles);
}
472
+
473
+ // ─── RRF Fusion ────────────────────────────────────────────────
474
+
475
/**
 * Reciprocal Rank Fusion.
 *
 * Every ranking contributes weight / (RRF_K + rank) to each directory it
 * lists, where rank is the 1-based position in that ranking.
 *
 * Scores accumulate in a Map, so directory names that match inherited object
 * members (e.g. "constructor") are summed correctly, and directories with
 * equal fused scores keep the order in which they were first encountered.
 *
 * @param {Array<Array<{dir: string}>>} rankings - Rankings, best entry first.
 * @param {number[]|null} [weights=null] - One weight per ranking; all 1 when omitted.
 * @returns {Array<{dir: string, score: number}>} Fused ranking, highest score first.
 */
function rrfFusion(rankings, weights = null) {
  const w = weights || rankings.map(() => 1);
  const fused = new Map();

  for (let r = 0; r < rankings.length; r++) {
    const ranking = rankings[r];
    for (let pos = 0; pos < ranking.length; pos++) {
      const { dir } = ranking[pos];
      const rrfScore = w[r] / (RRF_K + pos + 1);
      fused.set(dir, (fused.get(dir) || 0) + rrfScore);
    }
  }

  return [...fused]
    .map(([dir, score]) => ({ dir, score }))
    .sort((a, b) => b.score - a.score);
}
495
+
496
+ // ─── Adaptive TopK (Literature-backed) ───────────────────────────
497
+ //
498
+ // References:
499
+ // [1] Taguchi et al. (2025) "Adaptive-k" — max-gap on sorted scores
500
+ // [2] Xu et al. (2025) "CAR" — entropy-based cluster cutoff
501
+ // [3] CMU Selective Search — shard cutoff via distribution features
502
+ // [4] Kratzwald et al. (2018) — cumulative threshold for query-dependent k
503
+ //
504
+ // Three signals combined:
505
+ // K_base: N-proportional safety floor (handles degenerate cases)
506
+ // K_knee: Kneedle gap detection (query-sensitive, finds natural breakpoint)
507
+ // H_norm: Entropy scaling (flat distributions → expand K)
508
+ // Tail inclusion uses adaptive threshold based on score decay rate.
509
+
510
+ const K_MIN = 3; // lower clamp for the primary K; inputs with at most K_MIN entries are returned whole
511
+ const K_MAX = 10; // upper clamp for the primary K (tail inclusion can still append entries beyond it)
512
+ const ENTROPY_GAMMA = 0.5; // Entropy scaling factor
513
+ const SOFTMAX_TEMP = 1.0; // Temperature for softmax normalization
514
+ const TAIL_SCAN_WINDOW = 6; // Max dirs to scan beyond cutoff
515
+
516
+ /**
517
+ * Adaptive topK selection based on RRF score distribution.
+ *
+ * Inputs with at most K_MIN entries are returned whole. Otherwise a primary K is
+ * taken as the largest of three signals (kBase, kKnee, kEntropy), clamped to
+ * [K_MIN, K_MAX], and up to TAIL_SCAN_WINDOW further entries are appended while
+ * their score stays at or above the tail threshold. The result can therefore be
+ * longer than K_MAX, and a userTopK above K_MAX is capped by the clamp.
+ *
+ * `fused` is assumed to be sorted by score descending: scores[0] is used as the
+ * maximum and the tail scan stops at the first entry below the threshold.
518
+ *
519
+ * @param {Array<{dir: string, score: number}>} fused - RRF-fused sorted rankings
520
+ * @param {number} userTopK - User-specified topK (default 4)
521
+ * @param {number} N - Total number of top-level directories
522
+ * @returns {string[]} Selected hotDirs
523
+ */
524
+ function _adaptiveTopK(fused, userTopK, N) {
525
+ if (fused.length <= K_MIN) return fused.map(r => r.dir);
526
+
527
+ const scores = fused.map(r => r.score);
528
+
529
+ // ── Signal 1: K_base (N-proportional safety floor) ──
530
+ const kBase = Math.max(userTopK, Math.min(K_MAX, Math.ceil(N * 0.15))); // 15% of N capped at K_MAX, but never below userTopK
531
+
532
+ // ── Signal 2: K_knee (Kneedle max-gap detection) ──
533
+ // Find the position with the largest score drop (Taguchi Adaptive-k).
534
+ // This is where the "relevance cliff" occurs.
535
+ // Only search within [K_MIN-1, min(K_MAX, scores.length-1)] to stay bounded.
536
+ let maxGap = 0;
537
+ let kKnee = kBase; // stays at kBase when no positive gap is found in the search window
538
+ const searchEnd = Math.min(K_MAX, scores.length - 1);
539
+ for (let i = K_MIN - 1; i < searchEnd; i++) {
540
+ const gap = scores[i] - scores[i + 1];
541
+ if (gap > maxGap) {
542
+ maxGap = gap;
543
+ kKnee = i + 1; // Include everything up to and including position i
544
+ }
545
+ }
546
+
547
+ // ── Signal 3: Entropy scaling (distribution flatness) ──
548
+ // Softmax-normalized entropy: H_norm ∈ [0, 1].
549
+ // High H_norm (flat distribution) → relevance is dispersed → expand K.
550
+ // Low H_norm (peaked distribution) → relevance is concentrated → keep K tight.
551
+ const maxScore = scores[0];
552
+ const expScores = scores.map(s => Math.exp((s - maxScore) / SOFTMAX_TEMP)); // shifted for numerical stability
553
+ const expSum = expScores.reduce((a, b) => a + b, 0);
554
+ const probs = expScores.map(e => e / expSum);
555
+ const entropy = -probs.reduce((h, p) => h + (p > 0 ? p * Math.log(p) : 0), 0);
556
+ const hNorm = scores.length > 1 ? entropy / Math.log(scores.length) : 0; // NOTE(review): if the inputs are raw rrfFusion scores (each far below 1), a softmax at temperature 1.0 is nearly uniform and hNorm stays close to 1 — confirm this is intended
557
+
558
+ // Entropy-adjusted K: scale kBase by distribution flatness
559
+ const kEntropy = Math.ceil(kBase * (1 + ENTROPY_GAMMA * hNorm));
560
+
561
+ // ── Combine: take the max of all signals, clamp to [K_MIN, K_MAX] ──
562
+ const primaryK = Math.max(K_MIN, Math.min(K_MAX, Math.max(kBase, kKnee, kEntropy)));
563
+ let hotDirs = fused.slice(0, primaryK).map(r => r.dir); // never reassigned below, only appended to
564
+
565
+ // ── Adaptive tail inclusion ──
566
+ // Instead of fixed 0.6 threshold, use score decay rate to determine tail cutoff.
567
+ // If scores are still decaying slowly (flat tail), include more;
568
+ // if there's a sharp drop, stop.
569
+ if (fused.length > primaryK) {
570
+ const cutoffScore = scores[primaryK - 1];
571
+ // Adaptive threshold: based on the average decay rate in the head
572
+ // If head decays slowly (flat), threshold is lenient; if steep, threshold is strict.
573
+ const headDecayRate = primaryK > 1 ? (scores[0] - cutoffScore) / (primaryK - 1) : 0;
574
+ // Threshold = cutoffScore minus one "average step" worth of decay
575
+ // This is more lenient when the head is flat (small headDecayRate)
576
+ const tailThreshold = Math.max(cutoffScore - headDecayRate, cutoffScore * 0.4); // never lower than 40% of the cutoff score
577
+
578
+ for (let i = primaryK; i < fused.length && i < primaryK + TAIL_SCAN_WINDOW; i++) {
579
+ if (scores[i] >= tailThreshold) {
580
+ hotDirs.push(fused[i].dir);
581
+ } else {
582
+ break; // Stop at first dir below threshold (scores are sorted)
583
+ }
584
+ }
585
+ }
586
+
587
+ return hotDirs;
588
+ }
589
+
590
+ // ─── Path Spine Extraction ──────────────────────────────────────
591
+
592
// Path fragments that mark core source code; a match multiplies the file score by 1.5.
const SOURCE_PATH_PATTERNS = [
  "/src/",
  "/core/",
  "/lib/",
  "/internal/",
  "/pkg/",
  "/cmd/",
];

// Path fragments that mark non-essential files; a match multiplies the file score by 0.3.
const NOISE_PATH_PATTERNS = [
  "/migrations/",
  "/test/",
  "/__tests__/",
  "/fixtures/",
  "/examples/",
  "/vendor/",
  "/mock/",
  "/mocks/",
  "/i18n/",
  "/locales/",
  "/versions/",
];

/**
 * Extract path spines: the file paths that best match the query.
 *
 * Every file of every profile is scored, the candidates are sorted by score
 * and the best `topN` are returned. (An earlier first-match approach with a
 * topN break missed relevant files from directories iterated late.)
 *
 * Per term, the first rule that applies adds to the score:
 *   +4  the bare filename (no extension) contains the term or has it as a token
 *   +2  the lowercased path contains the term
 *   +1  a path token contains the term, or the term contains a path token
 * Files with a positive score are then adjusted by the path-quality patterns
 * (SOURCE_PATH_PATTERNS bonus, NOISE_PATH_PATTERNS penalty; both can apply).
 *
 * @param {Object<string, object>} profiles - Directory profiles keyed by directory.
 * @param {string[]} queryTerms - Tokenized query.
 * @param {string[]} keywords - Extra terms, merged with queryTerms and de-duplicated.
 * @param {number} [topN=30] - Maximum number of paths to return.
 * @returns {string[]} File paths, best match first.
 */
function extractPathSpines(profiles, queryTerms, keywords, topN = 30) {
  const terms = [...new Set([...queryTerms, ...keywords])];
  if (terms.length === 0) return [];

  const ranked = [];

  for (const profile of Object.values(profiles)) {
    for (const filePath of profile.file_paths || []) {
      const pathTokens = tokenizePath(filePath);
      const loweredPath = filePath.toLowerCase();
      // Bare filename without extension, for filename-level matching.
      const pieces = filePath.split(/[\/\\]/);
      const bareName = pieces[pieces.length - 1].replace(/\.[^.]+$/, "").toLowerCase();
      const nameTokens = tokenizePath(bareName);

      let score = 0;
      for (const term of terms) {
        if (bareName.includes(term) || nameTokens.includes(term)) {
          score += 4;
          continue;
        }
        if (loweredPath.includes(term)) {
          score += 2;
          continue;
        }
        const overlapsToken = pathTokens.some((pt) => pt.includes(term) || term.includes(pt));
        if (overlapsToken) score += 1;
      }

      if (!(score > 0)) continue;

      // Leading "/" lets the patterns also match the first path segment.
      const anchoredPath = "/" + loweredPath;
      const isSourcePath = SOURCE_PATH_PATTERNS.some((p) => anchoredPath.includes(p));
      if (isSourcePath) score *= 1.5;
      const isNoisePath = NOISE_PATH_PATTERNS.some((p) => anchoredPath.includes(p));
      if (isNoisePath) score *= 0.3;

      ranked.push({ path: filePath, score });
    }
  }

  ranked.sort((a, b) => b.score - a.score);
  return ranked.slice(0, topN).map((candidate) => candidate.path);
}
661
+
662
+ // ─── Git History RFM Signal ──────────────────────────────────────
663
+
664
// Git RFM cache: `git log` is expensive on large repositories, so rankings are
// memoized per (projectRoot, topDirs, options). Entries expire after
// FC_GIT_CACHE_TTL seconds (default 300); unset, non-numeric or 0 uses the default.
const _gitRFMCache = new Map();
const GIT_RFM_CACHE_TTL_MS = (parseInt(process.env.FC_GIT_CACHE_TTL, 10) || 300) * 1000;

/**
 * Compute evolutionary activity scores for directories based on Git history.
 *
 * Uses the RFM (Recency-Frequency-Modification) model:
 *   - R: time since the last commit touching the directory (exponential decay)
 *   - F: commits touching the directory relative to the sum over all directories
 *   - M: lines added + deleted, as log2(1 + lines) squashed to [0, 1)
 *
 * One `git log --numstat` run over the lookback window feeds all directories.
 * When git is unavailable or `projectRoot` is not a repository, every
 * directory scores 0.
 *
 * The cache key contains the resolved option values, so calls with different
 * windows or weights never share a result. `topDirs` is used in the given
 * order (not sorted): directories with equal scores keep that order in the
 * returned ranking.
 *
 * NOTE(review): numstat paths are relative to the repository root, so this
 * presumably expects `projectRoot` to be the repository root — confirm for
 * monorepo sub-directories.
 *
 * @param {string} projectRoot - Working directory for git.
 * @param {string[]} topDirs - Top-level directory names to score.
 * @param {object} [options]
 * @param {number} [options.windowDays=180] - Lookback window in days.
 * @param {number} [options.halfLifeDays=30] - Recency decay half-life in days.
 * @param {number} [options.wr=0.4] - Recency weight.
 * @param {number} [options.wf=0.35] - Frequency weight.
 * @param {number} [options.wm=0.25] - Modification weight.
 * @returns {Array<{dir: string, score: number, recency: number, frequency: number, modification: number, commits: number}>}
 *   Ranking sorted by score descending; `modification` is the raw changed-line
 *   count. A cache hit returns the stored array instance, not a copy.
 */
function computeGitRFM(projectRoot, topDirs, options = {}) {
  const {
    windowDays = 180,   // 6 month lookback
    halfLifeDays = 30,  // Recency decay half-life
    wr = 0.4,           // Recency weight
    wf = 0.35,          // Frequency weight
    wm = 0.25,          // Modification weight
  } = options;

  // Cache lookup: every input that influences the ranking is part of the key.
  const cacheKey = [projectRoot, topDirs.join(","), windowDays, halfLifeDays, wr, wf, wm].join("|");
  const cached = _gitRFMCache.get(cacheKey);
  if (cached && (Date.now() - cached.cachedAt) < GIT_RFM_CACHE_TTL_MS) {
    return cached.ranking;
  }

  const lambda = Math.LN2 / halfLifeDays;
  const nowSec = Math.floor(Date.now() / 1000);
  const sinceDate = new Date(Date.now() - windowDays * 86400000).toISOString().slice(0, 10);

  // Per-directory stats, filled in a single git log pass. A Map keeps paths
  // such as "constructor/..." from resolving to inherited object members.
  const dirStats = new Map();
  for (const d of topDirs) {
    dirStats.set(d, { lastCommitSec: 0, commits: 0, linesChanged: 0 });
  }

  try {
    // Single git log: author-date + numstat, limited to window
    const result = spawnSync("git", [
      "log",
      "--format=%at",      // author timestamp (epoch)
      "--numstat",
      `--since=${sinceDate}`,
      "--no-merges",
    ], {
      cwd: projectRoot,
      encoding: "utf-8",
      timeout: 10000,
      maxBuffer: 4 * 1024 * 1024,
    });

    if (result.stdout) {
      let currentTimestamp = 0;
      const seenDirsForCommit = new Set();

      for (const line of result.stdout.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) {
          // Empty line = commit boundary
          seenDirsForCommit.clear();
          continue;
        }

        // Timestamp line starts a new commit
        if (/^\d+$/.test(trimmed)) {
          currentTimestamp = parseInt(trimmed, 10);
          seenDirsForCommit.clear();
          continue;
        }

        // Numstat line: added \t deleted \t filepath ("-" for binary files counts as 0)
        const parts = trimmed.split("\t");
        if (parts.length < 3) continue;

        const added = parseInt(parts[0], 10) || 0;
        const deleted = parseInt(parts[1], 10) || 0;
        const topDir = parts[2].split(/[\/\\]/)[0];

        const stats = dirStats.get(topDir);
        if (!stats) continue;

        stats.linesChanged += added + deleted;
        if (currentTimestamp > stats.lastCommitSec) {
          stats.lastCommitSec = currentTimestamp;
        }
        // Count unique commits per dir (not per file)
        if (!seenDirsForCommit.has(topDir)) {
          seenDirsForCommit.add(topDir);
          stats.commits++;
        }
      }
    }
  } catch { /* git not available or not a git repo - all stats stay 0 */ }

  // Total commits for frequency normalization (at least 1 to avoid 0/0)
  const totalCommits = [...dirStats.values()].reduce((s, d) => s + d.commits, 0) || 1;

  // Compute RFM scores
  const ranking = [];
  for (const dir of topDirs) {
    const stats = dirStats.get(dir);

    // R: Recency (exponential decay)
    let recency = 0;
    if (stats.lastCommitSec > 0) {
      const daysSince = (nowSec - stats.lastCommitSec) / 86400;
      recency = Math.exp(-lambda * daysSince);
    }

    // F: Frequency (relative to total)
    const frequency = stats.commits / totalCommits;

    // M: Modification (log2 scale), squashed to [0, 1)
    const modification = Math.log2(1 + stats.linesChanged);
    const mNorm = modification / (modification + 10);

    const score = wr * recency + wf * frequency + wm * mNorm;
    ranking.push({ dir, score, recency, frequency, modification: stats.linesChanged, commits: stats.commits });
  }

  ranking.sort((a, b) => b.score - a.score);

  // Store in cache
  _gitRFMCache.set(cacheKey, { ranking, cachedAt: Date.now() });

  return ranking;
}
800
+
801
+ // ─── File-level Log-Sum Aggregation ─────────────────────────────
802
+
803
/**
 * Score a directory by scoring its files individually and aggregating.
 *
 * Each of the first `maxFiles` paths is matched against the query terms using
 * its path only (no file content is read). Per term, the first rule that
 * applies adds to the file score:
 *   +2.0  a path token equals the term
 *   +1.0  a path token contains the term, or the term contains a path token
 *   +0.5  the lowercased raw path contains the term
 *
 * Aggregation (Log-Sum):
 *   Score(D) = max(file_scores) + α * log(1 + Σ(s_i - τ) for s_i > τ)
 *
 * @param {string[]} queryTerms - Tokenized query.
 * @param {object} profile - Directory profile with file_paths.
 * @param {string} projectRoot - Not used by the current implementation.
 * @param {object} [options]
 * @param {number} [options.alpha=0.5] - Density bonus weight (α).
 * @param {number} [options.threshold=0.3] - Noise filter threshold (τ).
 * @param {number} [options.maxFiles=200] - Cap on the number of files scored.
 * @returns {number} Aggregated score; 0 when no file matches.
 */
function fileAggregateScore(queryTerms, profile, projectRoot, options = {}) {
  const { alpha = 0.5, threshold = 0.3, maxFiles = 200 } = options;

  const allPaths = profile.file_paths;
  if (!allPaths || allPaths.length === 0) return 0;

  // Path-based matching only (fast, no I/O).
  const scorePath = (relPath) => {
    const pathTokens = tokenizePath(relPath);
    return queryTerms.reduce((sum, qt) => {
      if (pathTokens.includes(qt)) return sum + 2.0;
      if (pathTokens.some((pt) => pt.includes(qt) || qt.includes(pt))) return sum + 1.0;
      // Raw path string match (catches compound names like "RLSManagement")
      if (relPath.toLowerCase().includes(qt)) return sum + 0.5;
      return sum;
    }, 0);
  };

  const matchedScores = allPaths
    .slice(0, maxFiles)
    .map(scorePath)
    .filter((fileScore) => fileScore > 0);
  if (matchedScores.length === 0) return 0;

  // Descending order: the first entry is the maximum, and the density sum
  // below is accumulated from the largest score down.
  matchedScores.sort((a, b) => b - a);
  const bestScore = matchedScores[0];

  const densitySum = matchedScores.reduce(
    (sum, fileScore) => (fileScore > threshold ? sum + (fileScore - threshold) : sum),
    0,
  );

  return bestScore + alpha * Math.log(1 + densitySum);
}
867
+
868
+ // ─── Main API ────────────────────────────────────────────────────
869
+
870
/**
 * Score directories using BM25F + Probe + Keywords + Git RFM + File Aggregation,
 * fused with RRF.
 *
 * Pipeline:
 *   1. Build a profile per directory and cache its tokenized fields on `profile._tok`.
 *   2. Compute IDF and average field lengths over those cached tokens.
 *   3. Produce one ranking per signal (BM25F always; the others when enabled/available).
 *   4. Fuse the rankings with rrfFusion, pad the result up to `minReturn` entries,
 *      and let _adaptiveTopK choose the final hot directories.
 *
 * @param {string} query - User query
 * @param {string} projectRoot - Project root path
 * @param {string[]} topDirs - List of top-level directories
 * @param {string[]} excludePaths - Paths to exclude
 * @param {object} options - Configuration options
 * @param {number} [options.topK=4] - Passed to _adaptiveTopK
 * @param {boolean} [options.useProbe=true] - Enable the probe-grep signal
 * @param {boolean} [options.useGitRFM=true] - Enable the git history signal
 * @param {boolean} [options.useFileAgg=true] - Enable the file-level aggregation signal
 * @param {string[]} [options.keywords=[]] - Keywords from the bootstrap phase
 * @param {number} [options.minReturn=2] - Minimum number of fused entries (bounded by topDirs.length)
 * @returns {{ hotDirs: string[], pathSpines: string[], signals: object, rawRankings: { bm25f: object[], fused: object[] } }}
 */
export function scoreDirectories(query, projectRoot, topDirs, excludePaths = [], options = {}) {
  const {
    topK = 4,
    useProbe = true,
    useGitRFM = true,
    useFileAgg = true,
    keywords = [], // From bootstrap phase
    minReturn = 2,
  } = options;

  const queryTerms = tokenize(query);

  // Bootstrap keywords feed both the probe and the keyword signal; tokenize them once.
  // `keywords` may be explicitly null (defaults only cover undefined), hence the guard.
  const hasKeywords = Boolean(keywords && keywords.length > 0);
  const keywordTerms = hasKeywords ? keywords.flatMap((k) => tokenize(k)) : [];

  // Step 1: Build profiles for all directories + pre-tokenize fields (once)
  const profiles = {};
  for (const dir of topDirs) {
    const profile = buildDirectoryProfile(projectRoot, dir, excludePaths);
    // Cache tokenized fields on profile — avoids triple tokenization in IDF/avgLen/BM25F
    profile._tok = {
      dir_name: tokenize(profile.dir_name),
      path_tokens: tokenize(profile.path_tokens_text),
      metadata: tokenize(profile.metadata),
      headers: tokenize(profile.headers_text),
    };
    profiles[dir] = profile;
  }
  const profileList = Object.values(profiles);

  // Step 2: Compute IDF across all profiles (uses cached tokens)
  const allFieldTerms = profileList.map(({ _tok: t }) => [
    ...t.dir_name,
    ...t.path_tokens,
    ...t.metadata,
    ...t.headers,
  ]);
  const idf = computeIDF(allFieldTerms);

  // Step 3: Compute average field lengths (uses cached tokens).
  // Falls back to 10 when there are no profiles at all.
  const avgFieldLens = {};
  for (const field of ["dir_name", "path_tokens", "metadata", "headers"]) {
    const totalLen = profileList.reduce((sum, p) => sum + p._tok[field].length, 0);
    avgFieldLens[field] = profileList.length > 0 ? totalLen / profileList.length : 10;
  }

  // Step 4: Signal 1 - BM25F scores
  const bm25fRanking = topDirs.map((dir) => ({
    dir,
    score: bm25fScore(queryTerms, profiles[dir], avgFieldLens, idf),
  }));
  bm25fRanking.sort((a, b) => b.score - a.score);

  const rankings = [bm25fRanking];
  const signals = { bm25f: bm25fRanking.map((r) => r.dir) };

  // NOTE(review): unlike the git and file-aggregation signals below, the probe and
  // keyword rankings are fused even when every score is 0 — confirm this is intended.

  // Step 5: Signal 2 - Probe grep (if enabled)
  if (useProbe && queryTerms.length > 0) {
    // Fuse query terms with bootstrap keywords for probe selection
    const allProbeCandidates = [...new Set([...queryTerms, ...keywordTerms])];
    const probeTerms = selectProbeTerms(allProbeCandidates, idf);

    if (probeTerms.length > 0) {
      const dirHits = probeGrep(projectRoot, topDirs, probeTerms, excludePaths);

      const probeRanking = topDirs.map((dir) => {
        const hits = dirHits[dir] || 0;
        const fileCount = profiles[dir].file_count || 1;
        return { dir, score: computeProbeScore(hits, fileCount), hits, fileCount };
      });
      probeRanking.sort((a, b) => b.score - a.score);
      rankings.push(probeRanking);
      signals.probe = probeRanking.map((r) => `${r.dir}:${r.hits}`);
    }
  }

  // Step 6: Signal 3 - Keywords from bootstrap (if provided)
  if (hasKeywords) {
    const keywordRanking = topDirs.map((dir) => {
      // Lowercase the (potentially large) path text once per directory,
      // and only when there is at least one term to look for.
      const lowerPaths = keywordTerms.length > 0
        ? profiles[dir].path_tokens_text.toLowerCase()
        : "";

      // One point per keyword term found anywhere in the directory's path text.
      let score = 0;
      for (const term of keywordTerms) {
        if (lowerPaths.includes(term)) {
          score += 1;
        }
      }
      return { dir, score };
    });
    keywordRanking.sort((a, b) => b.score - a.score);
    rankings.push(keywordRanking);
    signals.keywords = keywordRanking.map((r) => r.dir);
  }

  // Step 7: Signal 4 - Git History RFM (evolutionary activity)
  if (useGitRFM) {
    try {
      const gitRanking = computeGitRFM(projectRoot, topDirs);
      if (gitRanking.some((r) => r.score > 0)) {
        rankings.push(gitRanking);
        signals.gitRFM = gitRanking.slice(0, 6).map((r) =>
          `${r.dir}:R=${r.recency.toFixed(2)},C=${r.commits}`
        );
      }
    } catch {
      // Best-effort signal: git not available — continue without it.
    }
  }

  // Step 8: Signal 5 - File-level Log-Sum aggregation
  if (useFileAgg) {
    const fileAggRanking = topDirs.map((dir) => ({
      dir,
      score: fileAggregateScore(queryTerms, profiles[dir], projectRoot),
    }));
    fileAggRanking.sort((a, b) => b.score - a.score);
    if (fileAggRanking.some((r) => r.score > 0)) {
      rankings.push(fileAggRanking);
      signals.fileAgg = fileAggRanking.slice(0, 6).map((r) =>
        `${r.dir}:${r.score.toFixed(2)}`
      );
    }
  }

  // Step 9: RRF Fusion
  const fused = rrfFusion(rankings);

  // Step 10: Ensure minimum return — pad with not-yet-fused directories (in topDirs
  // order) at a token score until minReturn is reached or topDirs is exhausted.
  const fusedDirs = new Set(fused.map((f) => f.dir));
  for (const dir of topDirs) {
    if (fused.length >= minReturn || fused.length >= topDirs.length) {
      break;
    }
    if (!fusedDirs.has(dir)) {
      fused.push({ dir, score: 0.001 });
      fusedDirs.add(dir);
    }
  }

  // Step 11: Extract path spines from matched files
  const pathSpines = extractPathSpines(profiles, queryTerms, keywords, 30);

  // Step 12: Adaptive topK via score distribution analysis
  //
  // Based on IR literature:
  // - Taguchi et al. (2025) "Adaptive-k": max-gap detection on sorted scores
  // - Xu et al. (2025) "CAR": entropy-based distribution analysis for cutoff
  // - Kratzwald et al. (2018): cumulative score threshold for query-dependent k
  // - CMU Selective Search: shard cutoff via distribution skewness/entropy
  //
  // Hybrid approach: K_base (safety floor) + K_knee (gap detection) + entropy scaling
  // + adaptive tail threshold (replaces fixed 0.6)
  const hotDirs = _adaptiveTopK(fused, topK, topDirs.length);

  return {
    hotDirs,
    pathSpines,
    signals,
    rawRankings: {
      bm25f: bm25fRanking,
      fused,
    },
  };
}
1060
+
1061
/**
 * Quick scoring for when profiles are already built.
 *
 * Each query term adds 1 to a directory's score when any token taken from the
 * directory name or its path text contains the term, or is contained in it.
 * Directories without a profile (or with a partial one) are scored on whatever
 * is available, falling back to the directory name itself.
 *
 * @param {string} query - User query
 * @param {string[]} topDirs - Directories to score
 * @param {object} profiles - Map of directory -> profile; `dir_name` and
 *   `path_tokens_text` are read when present
 * @returns {{ dir: string, score: number }[]} Entries sorted by descending score
 */
export function quickScore(query, topDirs, profiles) {
  // Empty tokens are dropped: "" is a substring of everything and would match every term.
  const queryTerms = tokenize(query).filter((qt) => qt.length > 0);
  const knownProfiles = profiles || {};

  const scored = topDirs.map((dir) => {
    const profile = knownProfiles[dir] || {};
    const dirName = typeof profile.dir_name === "string" ? profile.dir_name : dir;
    const pathText = typeof profile.path_tokens_text === "string" ? profile.path_tokens_text : "";
    const dirTerms = [...tokenize(dirName), ...tokenize(pathText)].filter((dt) => dt.length > 0);

    let score = 0;
    for (const qt of queryTerms) {
      if (dirTerms.some((dt) => dt.includes(qt) || qt.includes(dt))) {
        score += 1;
      }
    }
    return { dir, score };
  });

  scored.sort((a, b) => b.score - a.score);
  return scored;
}
1085
+
1086
+ export { tokenize, tokenizePath, stem, computeIDF };