coderev-cli 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,700 @@
1
+ /**
2
+ * RAG (Retrieval-Augmented Generation) Codebase Indexer
3
+ *
4
+ * Phase 1: Lightweight local codebase indexing with text-based symbol extraction
5
+ * and cosine-similarity retrieval using simple TF-IDF vectors with optional LLM embeddings.
6
+ *
7
+ * Design:
8
+ * - No native deps (tree-sitter, sqlite-vec) — pure JS for Phase 1
9
+ * - Index stored as JSON in `.coderev/index/`
10
+ * - Two modes:
11
+ * 1. Fast: TF-IDF on extracted symbols/functions (no LLM call, instant)
12
+ * 2. Embedded: Uses LLM embeddings API for semantic search (needs API key, more accurate)
13
+ * - Indexed content: function signatures, class definitions, import statements, type defs
14
+ * - Diff context retrieval: given a changed file, find related symbols from the same
15
+ * file + cross-file references (imports/exports)
16
+ */
17
+
18
+ const fs = require('fs');
19
+ const path = require('path');
20
+ const crypto = require('crypto');
21
+
22
// Location of the persisted index, relative to the repository root.
const INDEX_DIR = '.coderev/index';
// Full index payload (symbols + TF-IDF data).
const INDEX_FILE = 'codebase-index.json';
// Small metadata file consulted by the staleness check.
const META_FILE = 'index-meta.json';

// File extensions to index.
// '.pyw' is listed because EXT_LANG_MAP maps it to python; without it here the
// directory walk would never hand such files to the extractor.
const INDEXABLE_EXTS = new Set([
  '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',
  '.py', '.pyw', '.rs', '.go', '.java', '.rb', '.php',
  '.swift', '.kt', '.kts', '.c', '.cpp', '.h', '.hpp',
  '.cs', '.sql', '.yaml', '.yml', '.toml',
  '.vue', '.svelte', '.astro',
]);

// Directory names that are never descended into while walking the repository.
const SKIP_DIRS = new Set([
  'node_modules', '.git', '__pycache__', '.venv', 'venv',
  'dist', 'build', '.next', '.nuxt', '.output',
  'target', 'bin', 'obj', '.gradle', '.idea',
  'vendor', 'coverage', '.coderev',
]);
42
+
43
// Regex patterns for symbol extraction.
//
// Each entry has:
//   lang - language group the pattern applies to (compared with the group
//          derived from the file extension)
//   re   - global regex; capture group 1 is the symbol name (for imports: the
//          module path, which may also sit in groups 2-4), group 2 is an
//          optional signature / base class
//   type - kind of symbol recorded for each match
//
// These are text heuristics, not parsers: they intentionally over-match and the
// extractor filters obvious keyword noise afterwards.
const SYMBOL_PATTERNS = [
  // JavaScript/TypeScript: function declarations
  {
    lang: 'js/ts',
    re: /(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/g,
    type: 'function',
  },
  // JavaScript/TypeScript: arrow functions bound to a variable
  {
    lang: 'js/ts',
    re: /(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?(?:\([^)]*\)|[\w$]+)\s*=>/g,
    type: 'function',
  },
  // JavaScript/TypeScript: class declarations (group 2 = base class)
  {
    lang: 'js/ts',
    re: /(?:export\s+)?(?:abstract\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?/g,
    type: 'class',
  },
  // JavaScript/TypeScript: method definitions in classes/objects
  {
    lang: 'js/ts',
    re: /(?:(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{)/g,
    type: 'method',
  },
  // JavaScript/TypeScript: imports (ES `import` in group 1, `require()` in group 2)
  {
    lang: 'js/ts',
    re: /(?:import\s+(?:(?:\{[^}]*\}|\*\s+as\s+\w+|\w+)(?:\s*,\s*(?:\{[^}]*\}|\*\s+as\s+\w+|\w+))*\s+from\s+)?['"]([^'"]+)['"])|(?:require\s*\(\s*['"]([^'"]+)['"]\s*\))/g,
    type: 'import',
  },
  // JavaScript/TypeScript: exports.
  // The declaration keyword (including `async function`, generators and the
  // TypeScript-only `interface` / `type` / `enum` / `abstract class`) is
  // consumed before the capture so that group 1 is the exported identifier
  // rather than the keyword itself.
  {
    lang: 'js/ts',
    re: /export\s+(?:default\s+)?(?:(?:async\s+)?function\b\s*\*?\s*|(?:(?:abstract\s+)?class|const|let|var|interface|type|enum)\s+)?(\w+)/g,
    type: 'export',
  },
  // Python: function definitions
  {
    lang: 'python',
    re: /(?:async\s+)?def\s+(\w+)\s*\(([^)]*)\)/g,
    type: 'function',
  },
  // Python: class definitions (group 2 = base classes)
  {
    lang: 'python',
    re: /class\s+(\w+)(?:\s*\(([^)]*)\))?:/g,
    type: 'class',
  },
  // Python: imports (`from X import Y` -> group 1 = X; `import X` -> group 3)
  {
    lang: 'python',
    re: /(?:from\s+(\S+)\s+import\s+(\S+))|(?:import\s+(\S+))/g,
    type: 'import',
  },
  // Go: function declarations (optional receiver is skipped)
  {
    lang: 'go',
    re: /func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(([^)]*)\)/g,
    type: 'function',
  },
  // Go: type/struct definitions
  {
    lang: 'go',
    re: /type\s+(\w+)\s+(?:struct|interface)\s*\{/g,
    type: 'type',
  },
  // Go: imports.
  // Matches only lines that consist of an optional `import` keyword, an
  // optional alias (`name`, `_` or `.`) and a single quoted path, optionally
  // followed by a `//` comment. That covers both `import "fmt"` and the
  // entries of an `import ( ... )` block, while ordinary string literals inside
  // expressions (and `return "..."`) are no longer reported as imports.
  {
    lang: 'go',
    re: /^[ \t]*(?:import[ \t]+)?(?:(?!return\b)[\w.]+[ \t]+)?"([^"\n]+)"[ \t]*(?:\/\/.*)?\r?$/gm,
    type: 'import',
  },
  // Rust: function definitions
  {
    lang: 'rust',
    re: /(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*(?:<[^>]*>)?\s*\(([^)]*)\)/g,
    type: 'function',
  },
  // Rust: struct/enum/trait
  {
    lang: 'rust',
    re: /(?:pub\s+)?(?:struct|enum|trait)\s+(\w+)/g,
    type: 'type',
  },
  // Java/Kotlin: method declarations
  {
    lang: 'java/kotlin',
    re: /(?:(?:public|private|protected)\s+)?(?:static\s+)?\w+\s+(\w+)\s*\(([^)]*)\)/g,
    type: 'method',
  },
  // Java/Kotlin: class
  {
    lang: 'java/kotlin',
    re: /(?:public\s+)?class\s+(\w+)/g,
    type: 'class',
  },
];
142
+
143
// Language detection by extension.
// Written as [language group, extensions] rows and flattened into a plain
// { extension: group } lookup object; rows are in the order the keys should
// appear in the resulting object.
const EXT_LANG_MAP = Object.fromEntries(
  [
    ['js/ts', ['.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs']],
    ['python', ['.py', '.pyw']],
    ['rust', ['.rs']],
    ['go', ['.go']],
    ['java/kotlin', ['.java', '.kt', '.kts']],
    ['ruby', ['.rb']],
    ['php', ['.php']],
    ['swift', ['.swift']],
    ['c', ['.c']],
    ['c/cpp', ['.cpp']],
    ['c', ['.h']],
    ['c/cpp', ['.hpp']],
    ['csharp', ['.cs']],
    ['sql', ['.sql']],
    ['js/ts', ['.vue', '.svelte', '.astro']],
  ].flatMap(([group, extensions]) => extensions.map((extension) => [extension, group])),
);
159
+
160
/**
 * Look up the language group registered for a file extension.
 *
 * @param {string} ext - Extension including the leading dot (e.g. '.ts')
 * @returns {string} The mapped language group, or 'generic' when the
 *   extension has no entry in EXT_LANG_MAP
 */
function langForExt(ext) {
  const mappedGroup = EXT_LANG_MAP[ext];
  if (mappedGroup) {
    return mappedGroup;
  }
  return 'generic';
}
166
+
167
/**
 * Extract symbols from source code text by running every SYMBOL_PATTERNS entry
 * whose language group matches the file's extension.
 *
 * @param {string} source - Full text of the file
 * @param {string} filename - Path of the file; its extension selects the
 *   language group and the value is stored verbatim as `file` on each symbol
 * @returns {Array<{name: string, type: string, signature: string, lang: string,
 *   file: string, line: number, snippet: string}>} One entry per regex match,
 *   grouped by pattern (all matches of the first applicable pattern, then the
 *   next, ...). `line` is 1-based; `snippet` is the line before the match, the
 *   matched line and the three lines after it.
 */
function extractSymbols(source, filename) {
  const ext = path.extname(filename).toLowerCase();
  const lang = langForExt(ext);
  const lines = source.split('\n');
  const symbols = [];

  // Keywords that the looser patterns (e.g. "identifier(...) {") capture as if
  // they were symbol names. Built once per call instead of once per match.
  const noise = new Set([
    'if', 'for', 'while', 'switch', 'catch', 'return', 'throw',
    'typeof', 'instanceof', 'delete', 'void', 'else', 'case', 'default',
    'break', 'continue', 'try', 'finally', 'debugger', 'with',
  ]);

  for (const pattern of SYMBOL_PATTERNS) {
    if (!pattern.lang.includes(lang) && pattern.lang !== 'generic') continue;

    // Work on a private copy so the shared pattern's lastIndex is never
    // touched, and force the global flag: without it exec() would keep
    // returning the first match forever.
    const flags = pattern.re.flags.includes('g') ? pattern.re.flags : `${pattern.re.flags}g`;
    const re = new RegExp(pattern.re.source, flags);

    // exec() yields matches in increasing index order, so the line number can
    // be advanced incrementally instead of re-splitting the prefix each time.
    let scannedTo = 0;
    let currentLine = 1;

    let match;
    while ((match = re.exec(source)) !== null) {
      // Guard against zero-length matches stalling the loop.
      if (match[0].length === 0) re.lastIndex++;

      let name;
      let signature = '';
      if (pattern.type === 'import') {
        // Import patterns: the module path is in whichever group participated.
        name = match[1] || match[2] || match[3] || match[4] || '';
      } else {
        // Function/class/method: first capture group is the name.
        name = match[1];
        signature = match[2] || '';
      }

      if (!name) continue;

      // Skip keyword noise (not applied to Rust patterns).
      if (pattern.lang !== 'rust' && noise.has(name)) continue;
      // An anonymous `function (...) {` expression would otherwise be recorded
      // as a symbol literally named "function".
      if (pattern.lang === 'js/ts' && name === 'function') continue;

      for (let i = scannedTo; i < match.index; i++) {
        if (source.charCodeAt(i) === 10) currentLine++;
      }
      scannedTo = match.index;
      const line = currentLine;

      symbols.push({
        name,
        type: pattern.type,
        signature,
        lang,
        file: filename,
        line,
        // Context snippet for retrieval
        snippet: lines.slice(Math.max(0, line - 2), Math.min(lines.length, line + 3)).join('\n'),
      });
    }
  }

  return symbols;
}
227
+
228
/**
 * Break text into lower-case tokens for the TF-IDF index.
 *
 * Every character outside [a-z0-9_$] acts as a separator; tokens of a single
 * character and a small set of stop words are discarded.
 *
 * @param {string} text - Text to tokenize
 * @returns {string[]} Tokens in their order of appearance
 */
function tokenize(text) {
  const stopWords = ['the', 'and', 'for', 'with', 'from', 'this'];
  const candidates = text.toLowerCase().replace(/[^a-z0-9_$]/g, ' ').split(/\s+/);

  const kept = [];
  for (const word of candidates) {
    if (word.length <= 1) continue;
    if (stopWords.includes(word)) continue;
    kept.push(word);
  }
  return kept;
}
238
+
239
/**
 * Count token occurrences into a dense bag-of-words vector.
 *
 * @param {string[]} tokens - Tokens of the document
 * @param {Map<string, number>} vocabulary - Token -> vector position
 * @returns {number[]} Vector of length `vocabulary.size`; tokens missing from
 *   the vocabulary are ignored
 */
function bowVector(tokens, vocabulary) {
  const counts = new Array(vocabulary.size).fill(0);
  tokens.forEach((word) => {
    const slot = vocabulary.get(word);
    if (slot === undefined) return;
    counts[slot] += 1;
  });
  return counts;
}
250
+
251
/**
 * Cosine similarity between two numeric vectors.
 *
 * Vectors of different lengths are compared as if the shorter one were padded
 * with zeros, so a length mismatch can no longer produce NaN (which callers
 * filtering on `score > 0` would silently drop) or an under-counted norm.
 *
 * @param {number[]} a - First vector
 * @param {number[]} b - Second vector
 * @returns {number} Similarity, or 0 when either vector has zero magnitude
 */
function cosineSimilarity(a, b) {
  const length = Math.max(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < length; i++) {
    // Missing components count as 0.
    const x = a[i] || 0;
    const y = b[i] || 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  if (normA === 0 || normB === 0) return 0;
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
264
+
265
/**
 * Build a TF-IDF index from extracted symbols.
 *
 * Each symbol becomes one document whose text is its name, signature, type,
 * language and file path. Term frequency uses augmented normalisation
 * (0.5 + 0.5 * tf / maxTf) for terms that occur in the document; terms that do
 * not occur keep weight 0, so documents sharing no term with a query score 0.
 *
 * NOTE: vectors are dense (one slot per vocabulary term per symbol), so memory
 * grows with symbols x vocabulary.
 *
 * @param {Array<object>} symbols - Symbols as produced by extractSymbols
 * @returns {{vocabulary: Map<string, number>, idf: number[], vectors: number[][],
 *   docs: Array<{tokens: string[], symbol: object}>}} `vectors[i]` and
 *   `docs[i]` describe `symbols[i]`
 */
function buildTfIdfIndex(symbols) {
  const docs = symbols.map((s) => ({
    tokens: tokenize(`${s.name} ${s.signature} ${s.type} ${s.lang} ${s.file}`),
    symbol: s,
  }));

  // Vocabulary: every distinct token gets the next free position, in order of
  // first appearance.
  const vocabulary = new Map();
  for (const doc of docs) {
    for (const token of doc.tokens) {
      if (!vocabulary.has(token)) vocabulary.set(token, vocabulary.size);
    }
  }

  // Document frequency: number of documents containing each term.
  const df = new Array(vocabulary.size).fill(0);
  for (const doc of docs) {
    for (const token of new Set(doc.tokens)) {
      df[vocabulary.get(token)]++;
    }
  }

  // Smoothed inverse document frequency.
  const N = docs.length;
  const idf = df.map((d) => (d === 0 ? 0 : Math.log((N + 1) / (d + 1)) + 1));

  // TF-IDF vector per document.
  const vectors = docs.map((doc) => {
    const tf = new Array(vocabulary.size).fill(0);
    // Tracked in the counting loop; spreading a vocabulary-sized array into
    // Math.max can exceed the engine's argument limit.
    let maxTf = 0;
    for (const token of doc.tokens) {
      const idx = vocabulary.get(token);
      tf[idx]++;
      if (tf[idx] > maxTf) maxTf = tf[idx];
    }
    // Only terms present in the document are weighted; absent terms stay 0.
    return tf.map((count, i) => (count > 0 ? (0.5 + 0.5 * (count / maxTf)) * idf[i] : 0));
  });

  return { vocabulary, idf, vectors, docs };
}
318
+
319
/**
 * Rank indexed symbols against a free-text query.
 *
 * The query is tokenized, turned into an IDF-weighted bag-of-words vector and
 * compared with every document vector by cosine similarity.
 *
 * @param {{vocabulary: Map<string, number>, idf: number[], vectors: number[][],
 *   docs: Array<{symbol: object}>}} index - TF-IDF index
 * @param {string} query - Query text
 * @param {number} [topK=10] - Maximum number of hits to return
 * @returns {Array<{score: number, symbol: object}>} Hits with a positive score,
 *   best first; empty when the query yields no tokens
 */
function searchIndex(index, query, topK = 10) {
  const terms = tokenize(query);
  if (terms.length === 0) {
    return [];
  }

  // Raw term counts scaled by IDF (factor 1 where no IDF value is stored).
  const weightedQuery = bowVector(terms, index.vocabulary)
    .map((count, i) => count * (index.idf[i] || 1));

  // Score every document, keeping only positive scores.
  const hits = [];
  index.vectors.forEach((docVector, i) => {
    const score = cosineSimilarity(weightedQuery, docVector);
    if (score > 0) {
      hits.push({ score, symbol: index.docs[i].symbol });
    }
  });

  hits.sort((left, right) => right.score - left.score);
  return hits.slice(0, topK);
}
345
+
346
/**
 * Build the codebase index by walking the repository, extracting symbols from
 * every indexable file and computing TF-IDF vectors for them. The result is
 * also written to `<repoRoot>/.coderev/index/` on a best-effort basis.
 *
 * @param {string} repoRoot - Root directory of the repository
 * @param {object} [options]
 * @param {string[]} [options.includePatterns] - Glob patterns for files to
 *   include (accepted for API compatibility; not applied by this implementation)
 * @param {string[]} [options.excludePatterns] - Glob patterns for files to
 *   exclude (accepted for API compatibility; not applied by this implementation)
 * @param {number} [options.maxFiles=500] - Maximum number of files to index
 * @returns {object} Index object: `{ version, createdAt, repoRoot, stats,
 *   symbols, tfidf }`, where `tfidf.vocabulary` is a plain array of terms
 *   (position = vector index)
 */
function buildIndex(repoRoot, options = {}) {
  const maxFiles = options.maxFiles || 500;
  // Files whose text is longer than this are skipped.
  const maxFileChars = 500 * 1024;
  const startTime = Date.now();
  const allSymbols = [];
  let filesProcessed = 0;

  const indexDir = path.join(repoRoot, INDEX_DIR);

  // Depth-first walk; stops as soon as maxFiles files have been indexed.
  function walk(dir, relativePath) {
    if (filesProcessed >= maxFiles) return;

    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Skip unreadable directories
    }

    for (const entry of entries) {
      if (filesProcessed >= maxFiles) return;

      const fullPath = path.join(dir, entry.name);
      // Always '/'-separated so paths line up with those in git diffs.
      const relPath = relativePath ? `${relativePath}/${entry.name}` : entry.name;

      if (entry.isDirectory()) {
        if (!SKIP_DIRS.has(entry.name) && !entry.name.startsWith('.')) {
          walk(fullPath, relPath);
        }
        continue;
      }
      if (!entry.isFile()) continue;

      const ext = path.extname(entry.name).toLowerCase();
      if (!INDEXABLE_EXTS.has(ext)) continue;

      try {
        const source = fs.readFileSync(fullPath, 'utf-8');
        // Skip very large files, but keep going with the remaining entries of
        // this directory (a `return` here would drop all of them).
        if (source.length > maxFileChars) continue;

        const symbols = extractSymbols(source, relPath);
        for (const symbol of symbols) allSymbols.push(symbol);
        filesProcessed++;
      } catch {
        // Skip unreadable files
      }
    }
  }

  walk(repoRoot, '');

  const tfidfIndex = buildTfIdfIndex(allSymbols);

  const stats = {
    filesScanned: filesProcessed,
    symbolsExtracted: allSymbols.length,
    timeMs: Date.now() - startTime,
    languageBreakdown: {},
  };

  // Symbols per language group.
  for (const s of allSymbols) {
    stats.languageBreakdown[s.lang] = (stats.languageBreakdown[s.lang] || 0) + 1;
  }

  const index = {
    version: 1,
    createdAt: new Date().toISOString(),
    repoRoot,
    stats,
    symbols: allSymbols,
    tfidf: {
      // Store just what we need for search. The vocabulary Map is flattened to
      // an array because a Map does not survive JSON serialisation.
      vocabulary: [...tfidfIndex.vocabulary.keys()],
      idf: tfidfIndex.idf,
      vectors: tfidfIndex.vectors,
      docs: tfidfIndex.docs.map((d) => ({ symbol: d.symbol })),
    },
  };

  // Persist to disk. Creating the directory is part of the same best-effort
  // step, so a read-only checkout still gets an in-memory index.
  try {
    fs.mkdirSync(indexDir, { recursive: true });
    fs.writeFileSync(path.join(indexDir, INDEX_FILE), JSON.stringify(index, null, 2), 'utf-8');
    fs.writeFileSync(path.join(indexDir, META_FILE), JSON.stringify({
      lastBuilt: new Date().toISOString(),
      filesScanned: filesProcessed,
      symbolsExtracted: allSymbols.length,
    }, null, 2), 'utf-8');
  } catch {
    // Non-fatal: index persists in memory even if write fails
  }

  return index;
}
454
+
455
/**
 * Load an existing index from disk.
 *
 * The persisted `tfidf.vocabulary` is an array of terms; it is converted back
 * into a Map (term -> position) so the result can be passed to the search
 * functions directly.
 *
 * @param {string} repoRoot - Root directory of the repository
 * @returns {object|null} Index object, or null if the file does not exist or
 *   cannot be read/parsed
 */
function loadIndex(repoRoot) {
  const indexPath = path.join(repoRoot, INDEX_DIR, INDEX_FILE);
  try {
    if (!fs.existsSync(indexPath)) return null;
    const data = JSON.parse(fs.readFileSync(indexPath, 'utf-8'));
    // Rebuild vocabulary Map from stored array
    if (data.tfidf && Array.isArray(data.tfidf.vocabulary)) {
      data.tfidf.vocabulary = new Map(data.tfidf.vocabulary.map((w, i) => [w, i]));
    }
    return data;
  } catch {
    // Unreadable or corrupt index: treat it as absent.
    return null;
  }
}
477
+
478
/**
 * Retrieve relevant context for a given diff.
 *
 * Given a git diff (which tells us which files changed), this function finds
 * related symbols from the codebase index:
 * 1. Same-file context: symbols defined in the changed files
 * 2. Cross-file context: exports of files whose path contains a module path
 *    imported by a changed file
 * 3. Semantic context: top-K similar symbols across the codebase
 *
 * Results are de-duplicated on `file:name` and truncated to `maxContext`, in
 * the order same-file, cross-file, semantic.
 *
 * Side effect: if the index's TF-IDF vocabulary is still an array (as produced
 * by buildIndex), it is replaced in place by a Map before searching.
 *
 * @param {object} index - The codebase index (from buildIndex or loadIndex)
 * @param {string} diff - Git diff text; a non-string is treated as empty
 * @param {object} [options]
 * @param {number} [options.maxContext=15] - Max symbols to return
 * @param {boolean} [options.includeSemantic=true] - Include semantic search results
 * @returns {object} `{ symbols, grouped, changedFiles, totalIndexed, summary }`,
 *   or `{ symbols: [], summary }` when the index is missing or empty
 */
function retrieveContext(index, diff, options = {}) {
  if (!index || !index.symbols || index.symbols.length === 0) {
    return { symbols: [], summary: 'No codebase index available. Run `coderev index` first.' };
  }

  const maxContext = options.maxContext || 15;
  const includeSemantic = options.includeSemantic !== false;

  // Extract changed file paths from the "+++ b/<path>" headers of the diff.
  const diffLines = typeof diff === 'string' ? diff.split(/\r?\n/) : [];
  const changedFiles = new Set();
  for (const line of diffLines) {
    const match = line.match(/^\+\+\+ b\/(.+)/);
    if (match) changedFiles.add(match[1]);
  }

  const results = new Map(); // file+name -> symbol dedup

  // 1. Same-file: symbols defined in changed files
  const changedSymbols = index.symbols.filter((sym) => changedFiles.has(sym.file));
  for (const sym of changedSymbols) {
    const key = `${sym.file}:${sym.name}`;
    if (!results.has(key)) results.set(key, { ...sym, relevance: 'same_file' });
  }

  // 2. Cross-file: exports of modules imported by changed files.
  // The export list is computed once instead of rescanning all symbols for
  // every import.
  const exportedSymbols = index.symbols.filter((sym) => sym.type === 'export');
  for (const sym of changedSymbols) {
    if (sym.type !== 'import' || !sym.name) continue;

    // Drop every leading "/", "./" or "../" segment so relative specifiers
    // can be matched as a substring of repository-relative file paths.
    const target = sym.name.replace(/^(?:\.{0,2}\/)+/, '');
    // A bare "." / ".." / "./" carries no path information; matching on it
    // would pull in the exports of (nearly) every file.
    if (target === '' || target === '.' || target === '..') continue;

    for (const other of exportedSymbols) {
      if (!other.file.includes(target)) continue;
      const key = `${other.file}:${other.name}`;
      if (!results.has(key)) results.set(key, { ...other, relevance: 'cross_file' });
    }
  }

  // 3. Semantic search: find similar symbols across codebase
  if (includeSemantic && results.size < maxContext) {
    // Query = names and types of the symbols in the changed files; if there are
    // none, fall back to the first 20 lines of the diff.
    const queryParts = [];
    for (const sym of changedSymbols) {
      queryParts.push(sym.name);
      queryParts.push(sym.type);
    }
    const query = queryParts.join(' ') || diffLines.slice(0, 20).join(' ');

    // searchIndex expects the flat TF-IDF structure with Map vocabulary
    const tfidf = index.tfidf || index;
    // Ensure vocabulary is a Map (may be stored as array in JSON)
    if (Array.isArray(tfidf.vocabulary)) {
      tfidf.vocabulary = new Map(tfidf.vocabulary.map((w, i) => [w, i]));
    }
    const semanticResults = searchIndex(tfidf, query, maxContext);
    for (const r of semanticResults) {
      if (changedFiles.has(r.symbol.file)) continue; // Already have same-file
      const key = `${r.symbol.file}:${r.symbol.name}`;
      if (!results.has(key)) {
        results.set(key, { ...r.symbol, relevance: 'semantic', score: r.score.toFixed(3) });
      }
    }
  }

  // Convert to array, limit
  const contextSymbols = [...results.values()].slice(0, maxContext);

  // Group by relevance
  const grouped = {
    sameFile: contextSymbols.filter((s) => s.relevance === 'same_file'),
    crossFile: contextSymbols.filter((s) => s.relevance === 'cross_file'),
    semantic: contextSymbols.filter((s) => s.relevance === 'semantic'),
  };

  const changedFileList = [...changedFiles];

  return {
    symbols: contextSymbols,
    grouped,
    changedFiles: changedFileList,
    totalIndexed: index.symbols.length,
    // Pass an array: the formatter reads `.length` and calls `.join()`.
    summary: formatContextSummary(grouped, changedFileList),
  };
}
580
+
581
/**
 * Format context as a compact text block for prompt injection.
 *
 * @param {{sameFile: object[], crossFile: object[], semantic: object[]}} grouped -
 *   Context symbols grouped by relevance
 * @param {Iterable<string>} changedFiles - Paths of the changed files; an array
 *   or a Set are both accepted
 * @returns {string} Newline-joined summary; empty string when there is nothing
 *   to report
 */
function formatContextSummary(grouped, changedFiles) {
  // Normalise to an array: a Set has neither `.length` nor `.join()`, which
  // previously made the "Changed files" line silently disappear.
  const files = changedFiles ? [...changedFiles] : [];
  const parts = [];

  if (files.length > 0) {
    parts.push(`Changed files: ${files.join(', ')}`);
  }

  if (grouped.sameFile.length > 0) {
    parts.push(`\n📄 Same-file context (${grouped.sameFile.length} symbols):`);
    // At most 10 entries are listed even if more were retrieved.
    for (const s of grouped.sameFile.slice(0, 10)) {
      parts.push(`  ${s.type}: ${s.name}${s.signature ? `(${s.signature})` : ''} (${s.file}:${s.line})`);
    }
  }

  if (grouped.crossFile.length > 0) {
    parts.push(`\n🔗 Related symbols (${grouped.crossFile.length}):`);
    for (const s of grouped.crossFile.slice(0, 5)) {
      parts.push(`  ${s.type}: ${s.name} in ${s.file}`);
    }
  }

  if (grouped.semantic.length > 0) {
    parts.push(`\n🔍 Similar symbols across codebase (${grouped.semantic.length}):`);
    for (const s of grouped.semantic.slice(0, 5)) {
      parts.push(`  ${s.type}: ${s.name} in ${s.file}`);
    }
  }

  return parts.join('\n');
}
614
+
615
/**
 * Build a context string for injection into review prompts.
 * This is the main integration point with reviewer.js
 *
 * The result is a Markdown block with up to three sections: same-file symbols
 * (max 8, each with its snippet when shorter than 300 characters), cross-file
 * references (max 5) and semantically similar symbols (max 5).
 *
 * @param {object} index - The codebase index
 * @param {string} diff - Git diff text
 * @param {object} [options] - Forwarded unchanged to retrieveContext
 * @returns {string} Context string for prompt injection; empty string when
 *   there is no index or no relevant symbol was found
 */
function buildReviewContext(index, diff, options = {}) {
  if (!index) return '';

  const ctx = retrieveContext(index, diff, options);
  if (!ctx.symbols || ctx.symbols.length === 0) return '';

  // Heading and intro; the block starts with a blank line so it can be
  // appended directly to an existing prompt. The heading emoji is written as
  // the intended character rather than its mis-decoded byte sequence.
  let contextBlock = [
    '',
    '## 📚 Codebase Context (Retrieved via RAG)',
    '',
    'The following symbols were found in the codebase that may be relevant to this change:',
    '',
    '',
  ].join('\n');

  // Same-file symbols (most important)
  if (ctx.grouped.sameFile.length > 0) {
    contextBlock += `### Same File Symbols\n`;
    for (const s of ctx.grouped.sameFile.slice(0, 8)) {
      contextBlock += `- \`${s.type}\` **${s.name}**${s.signature ? `(${s.signature})` : ''} at line ${s.line}\n`;
      // Long snippets are left out to keep the prompt compact.
      if (s.snippet && s.snippet.length < 300) {
        contextBlock += `  \`\`\`\n${s.snippet}\n  \`\`\`\n`;
      }
    }
  }

  // Cross-file references
  if (ctx.grouped.crossFile.length > 0) {
    contextBlock += `### Cross-File References\n`;
    for (const s of ctx.grouped.crossFile.slice(0, 5)) {
      contextBlock += `- \`${s.type}\` **${s.name}** in \`${s.file}\`\n`;
    }
  }

  // Semantic matches
  if (ctx.grouped.semantic.length > 0) {
    contextBlock += `### Semantically Similar\n`;
    for (const s of ctx.grouped.semantic.slice(0, 5)) {
      contextBlock += `- \`${s.type}\` **${s.name}** in \`${s.file}\`\n`;
    }
  }

  contextBlock += `\nUse this context to understand call chains, type relationships, and coding patterns.`;

  return contextBlock;
}
668
+
669
/**
 * Check if an index needs rebuilding (stale or non-existent).
 *
 * The decision is based solely on the `lastBuilt` timestamp stored in the
 * index metadata file.
 *
 * @param {string} repoRoot - Repository root
 * @param {number} [maxAgeHours=24] - Max age in hours before considered stale
 * @returns {boolean} true when the metadata file is missing, unreadable, has no
 *   valid `lastBuilt` timestamp, or is older than `maxAgeHours`
 */
function isIndexStale(repoRoot, maxAgeHours = 24) {
  const metaPath = path.join(repoRoot, INDEX_DIR, META_FILE);
  try {
    if (!fs.existsSync(metaPath)) return true;
    const meta = JSON.parse(fs.readFileSync(metaPath, 'utf-8'));
    const builtTime = new Date(meta.lastBuilt).getTime();
    // A missing or unparseable timestamp yields NaN, and every comparison with
    // NaN is false - which would wrongly report a corrupt index as fresh.
    if (!Number.isFinite(builtTime)) return true;
    return (Date.now() - builtTime) > maxAgeHours * 60 * 60 * 1000;
  } catch {
    // Unreadable or malformed metadata: rebuild to be safe.
    return true;
  }
}
687
+
688
// Public surface of the indexer: the functions first, then the configuration
// constants. Key order of the exported object is functions followed by
// constants, each in the order listed here.
const indexerFunctions = {
  buildIndex,
  loadIndex,
  retrieveContext,
  buildReviewContext,
  extractSymbols,
  searchIndex,
  buildTfIdfIndex,
  isIndexStale,
};

const indexerConstants = {
  INDEX_DIR,
  INDEXABLE_EXTS,
  SKIP_DIRS,
};

module.exports = { ...indexerFunctions, ...indexerConstants };