coderev-cli 1.0.26 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +776 -0
- package/package.json +1 -1
- package/src/cli.js +162 -1
- package/src/issue-validator.js +499 -0
- package/src/issue-validator.test.js +404 -0
- package/src/models.js +59 -0
- package/src/models.test.js +139 -2
- package/src/rag-indexer.js +700 -0
- package/src/rag-indexer.test.js +385 -0
- package/src/reviewer.js +36 -6
|
@@ -0,0 +1,700 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RAG (Retrieval-Augmented Generation) Codebase Indexer
|
|
3
|
+
*
|
|
4
|
+
* Phase 1: Lightweight local codebase indexing with text-based symbol extraction
|
|
5
|
+
* and cosine-similarity retrieval using simple TF-IDF vectors with optional LLM embeddings.
|
|
6
|
+
*
|
|
7
|
+
* Design:
|
|
8
|
+
* - No native deps (tree-sitter, sqlite-vec) — pure JS for Phase 1
|
|
9
|
+
* - Index stored as JSON in `.coderev/index/`
|
|
10
|
+
* - Two modes:
|
|
11
|
+
* 1. Fast: TF-IDF on extracted symbols/functions (no LLM call, instant)
|
|
12
|
+
* 2. Embedded: Uses LLM embeddings API for semantic search (needs API key, more accurate)
|
|
13
|
+
* - Indexed content: function signatures, class definitions, import statements, type defs
|
|
14
|
+
* - Diff context retrieval: given a changed file, find related symbols from the same
|
|
15
|
+
* file + cross-file references (imports/exports)
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
const fs = require('fs');
|
|
19
|
+
const path = require('path');
|
|
20
|
+
const crypto = require('crypto');
|
|
21
|
+
|
|
22
|
+
// Location of the persisted index, relative to the repository root.
const INDEX_DIR = '.coderev/index';

// File inside INDEX_DIR holding the full index payload.
const INDEX_FILE = 'codebase-index.json';

// File inside INDEX_DIR holding the small build-metadata record.
const META_FILE = 'index-meta.json';
|
|
25
|
+
|
|
26
|
+
// Lower-case file extensions (with leading dot) that the indexer will read.
// Grouped by ecosystem purely for readability; the Set is flat.
const INDEXABLE_EXTS = new Set(
  [
    ['.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'],
    ['.py', '.rs', '.go', '.java', '.rb', '.php'],
    ['.swift', '.kt', '.kts', '.c', '.cpp', '.h', '.hpp'],
    ['.cs', '.sql', '.yaml', '.yml', '.toml'],
    ['.vue', '.svelte', '.astro'],
  ].flat(),
);
|
|
34
|
+
|
|
35
|
+
// Directory names that are never descended into while walking the repository.
const SKIP_DIRS = new Set(
  [
    ['node_modules', '.git', '__pycache__', '.venv', 'venv'],
    ['dist', 'build', '.next', '.nuxt', '.output'],
    ['target', 'bin', 'obj', '.gradle', '.idea'],
    ['vendor', 'coverage', '.coderev'],
  ].flat(),
);
|
|
42
|
+
|
|
43
|
+
// Regex table driving symbol extraction.
// Each row is [language group, symbol type, global regex]. For non-import
// rows capture group 1 is the symbol name and group 2 (when present) is the
// parameter list / base class; for import rows the capture groups hold the
// module path.
const SYMBOL_PATTERNS = [
  // --- JavaScript / TypeScript ---
  // function declarations
  ['js/ts', 'function', /(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/g],
  // arrow functions bound to const/let/var
  ['js/ts', 'function', /(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?(?:\([^)]*\)|[\w$]+)\s*=>/g],
  // class declarations
  ['js/ts', 'class', /(?:export\s+)?(?:abstract\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?/g],
  // method definitions in classes/objects
  ['js/ts', 'method', /(?:(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{)/g],
  // import statements and require() calls
  ['js/ts', 'import', /(?:import\s+(?:(?:\{[^}]*\}|\*\s+as\s+\w+|\w+)(?:\s*,\s*(?:\{[^}]*\}|\*\s+as\s+\w+|\w+))*\s+from\s+)?['"]([^'"]+)['"])|(?:require\s*\(\s*['"]([^'"]+)['"]\s*\))/g],
  // exports
  ['js/ts', 'export', /export\s+(?:default\s+)?(?:(?:function|class|const|let|var)\s+)?(\w+)/g],

  // --- Python ---
  // function definitions
  ['python', 'function', /(?:async\s+)?def\s+(\w+)\s*\(([^)]*)\)/g],
  // class definitions
  ['python', 'class', /class\s+(\w+)(?:\s*\(([^)]*)\))?:/g],
  // imports
  ['python', 'import', /(?:from\s+(\S+)\s+import\s+(\S+))|(?:import\s+(\S+))/g],

  // --- Go ---
  // function declarations (optionally with a receiver)
  ['go', 'function', /func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(([^)]*)\)/g],
  // struct / interface type definitions
  ['go', 'type', /type\s+(\w+)\s+(?:struct|interface)\s*\{/g],
  // imports (matches any double-quoted string)
  ['go', 'import', /"([^"]+)"/g],

  // --- Rust ---
  // function definitions
  ['rust', 'function', /(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*(?:<[^>]*>)?\s*\(([^)]*)\)/g],
  // struct / enum / trait
  ['rust', 'type', /(?:pub\s+)?(?:struct|enum|trait)\s+(\w+)/g],

  // --- Java / Kotlin ---
  // method declarations
  ['java/kotlin', 'method', /(?:(?:public|private|protected)\s+)?(?:static\s+)?\w+\s+(\w+)\s*\(([^)]*)\)/g],
  // class declarations
  ['java/kotlin', 'class', /(?:public\s+)?class\s+(\w+)/g],
].map(([lang, type, re]) => ({ lang, re, type }));
|
|
142
|
+
|
|
143
|
+
// Maps a lower-case file extension to the language group used to select
// symbol patterns. Extensions missing from this table fall back to 'generic'
// in langForExt.
const EXT_LANG_MAP = Object.fromEntries([
  ['.js', 'js/ts'],
  ['.jsx', 'js/ts'],
  ['.ts', 'js/ts'],
  ['.tsx', 'js/ts'],
  ['.mjs', 'js/ts'],
  ['.cjs', 'js/ts'],
  ['.py', 'python'],
  ['.pyw', 'python'],
  ['.rs', 'rust'],
  ['.go', 'go'],
  ['.java', 'java/kotlin'],
  ['.kt', 'java/kotlin'],
  ['.kts', 'java/kotlin'],
  ['.rb', 'ruby'],
  ['.php', 'php'],
  ['.swift', 'swift'],
  ['.c', 'c'],
  ['.cpp', 'c/cpp'],
  ['.h', 'c'],
  ['.hpp', 'c/cpp'],
  ['.cs', 'csharp'],
  ['.sql', 'sql'],
  ['.vue', 'js/ts'],
  ['.svelte', 'js/ts'],
  ['.astro', 'js/ts'],
]);
|
|
159
|
+
|
|
160
|
+
/**
 * Resolve a file extension to its language group.
 *
 * @param {string} ext - Extension including the leading dot (e.g. '.ts').
 * @returns {string} The mapped language group, or 'generic' when the
 *   extension has no entry in EXT_LANG_MAP.
 */
function langForExt(ext) {
  const language = EXT_LANG_MAP[ext];
  return language ? language : 'generic';
}
|
|
166
|
+
|
|
167
|
+
/**
 * Extract symbols from source code text.
 *
 * The extension of `filename` selects a language group via `langForExt`.
 * Every `SYMBOL_PATTERNS` entry whose `lang` equals that group (or is
 * 'generic') is run over the whole source, and each match yields one record.
 *
 * @param {string} source - Full text of the file.
 * @param {string} filename - Path used for language detection; stored
 *   verbatim as `file` on every returned symbol.
 * @returns {Array<object>} Records shaped
 *   `{ name, type, signature, lang, file, line, snippet }`. `line` is the
 *   1-based line on which the match starts; `snippet` is a small window of
 *   source lines around it.
 */
function extractSymbols(source, filename) {
  const lang = langForExt(path.extname(filename).toLowerCase());
  const lines = source.split('\n');
  const symbols = [];

  // Offset of the first character of every line. Built once so that the line
  // number of a match is a binary search instead of re-splitting the source
  // prefix for every single match.
  const lineStarts = [];
  let offset = 0;
  for (const text of lines) {
    lineStarts.push(offset);
    offset += text.length + 1; // +1 for the '\n' removed by split
  }

  // 1-based line number containing character offset `pos`.
  const lineNumberAt = (pos) => {
    let lo = 0;
    let hi = lineStarts.length - 1;
    while (lo < hi) {
      const mid = (lo + hi + 1) >> 1;
      if (lineStarts[mid] <= pos) {
        lo = mid;
      } else {
        hi = mid - 1;
      }
    }
    return lo + 1;
  };

  // Keywords that the looser patterns (notably the JS method pattern) capture
  // as if they were identifiers, e.g. `if (x) {`.
  const noise = new Set([
    'if', 'for', 'while', 'switch', 'catch', 'return', 'throw',
    'typeof', 'instanceof', 'delete', 'void', 'else', 'case', 'default',
    'break', 'continue', 'try', 'finally', 'debugger', 'with',
  ]);

  for (const pattern of SYMBOL_PATTERNS) {
    // Exact match on the language group: a substring test would let a short
    // group name accidentally select another group's patterns.
    if (pattern.lang !== lang && pattern.lang !== 'generic') continue;

    // Fresh RegExp so the shared /g pattern's lastIndex is never reused.
    const re = new RegExp(pattern.re.source, pattern.re.flags);

    let match;
    while ((match = re.exec(source)) !== null) {
      // Guard against a pattern that can match the empty string, which would
      // otherwise spin forever at the same position.
      if (match[0].length === 0) re.lastIndex += 1;

      let name;
      let signature = '';
      if (pattern.type === 'import') {
        // Import patterns: whichever alternative matched holds the module path.
        name = match[1] || match[2] || match[3] || match[4] || '';
      } else {
        // Function/class/method: group 1 is the name, group 2 the signature.
        name = match[1];
        signature = match[2] || '';
      }

      if (!name) continue;

      // Rust patterns are exempt from the noise filter, as in the original
      // implementation. NOTE(review): the reason is not visible here.
      if (pattern.lang !== 'rust' && noise.has(name)) continue;

      const line = lineNumberAt(match.index);

      symbols.push({
        name,
        type: pattern.type,
        signature,
        lang,
        file: filename,
        line,
        // Context snippet for retrieval: from the line before the match
        // through three lines after it, clamped to the file.
        snippet: lines.slice(Math.max(0, line - 2), Math.min(lines.length, line + 3)).join('\n'),
      });
    }
  }

  return symbols;
}
|
|
227
|
+
|
|
228
|
+
/**
 * Simple tokenizer used for both indexed documents and queries.
 *
 * Lower-cases the text, treats every character outside `[a-z0-9_$]` as a
 * separator, and drops single-character tokens and a handful of English
 * stop words.
 *
 * @param {string} text - Text to tokenize. `null`/`undefined` yield an empty
 *   list; other non-string values are converted with `String()`.
 * @returns {string[]} Tokens in order of appearance (duplicates preserved).
 */
function tokenize(text) {
  if (text === null || text === undefined) return [];

  // Built once per call rather than once per token inside the filter.
  const stopWords = new Set(['the', 'and', 'for', 'with', 'from', 'this']);

  return String(text)
    .toLowerCase()
    .replace(/[^a-z0-9_$]/g, ' ')
    .split(/\s+/)
    .filter((token) => token.length > 1 && !stopWords.has(token));
}
|
|
238
|
+
|
|
239
|
+
/**
 * Turn a token list into a term-count vector over a fixed vocabulary.
 *
 * @param {Iterable<string>} tokens - Tokens to count.
 * @param {Map<string, number>} vocabulary - Token -> vector slot.
 * @returns {number[]} Array of length `vocabulary.size`; slot `i` holds how
 *   many times the token mapped to `i` occurred. Tokens missing from the
 *   vocabulary are ignored.
 */
function bowVector(tokens, vocabulary) {
  const counts = new Array(vocabulary.size).fill(0);

  for (const word of tokens) {
    const slot = vocabulary.get(word);
    if (slot === undefined) {
      continue; // out-of-vocabulary token
    }
    counts[slot] += 1;
  }

  return counts;
}
|
|
250
|
+
|
|
251
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * Iterates over the indices of `a`; `b` is read at the same indices.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has
 *   zero magnitude.
 */
function cosineSimilarity(a, b) {
  let dotProduct = 0;
  let squaredA = 0;
  let squaredB = 0;

  for (let k = 0; k < a.length; k += 1) {
    const x = a[k];
    const y = b[k];
    dotProduct += x * y;
    squaredA += x * x;
    squaredB += y * y;
  }

  // A zero-magnitude vector has no direction; report "no similarity"
  // instead of dividing by zero.
  const hasZeroVector = squaredA === 0 || squaredB === 0;
  return hasZeroVector ? 0 : dotProduct / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}
|
|
264
|
+
|
|
265
|
+
/**
 * Build a TF-IDF index from extracted symbols.
 *
 * Each symbol becomes one document whose text is its name, signature, type,
 * language and file path. Term frequency uses augmented ("maximum tf")
 * normalisation, `0.5 + 0.5 * tf / maxTf`, applied only to terms that
 * actually occur in the document; absent terms keep weight 0 so that a
 * document sharing no term with a query scores exactly 0.
 *
 * @param {Array<object>} symbols - Symbol records with `name`, `signature`,
 *   `type`, `lang` and `file` fields.
 * @returns {{vocabulary: Map<string, number>, idf: number[], vectors: number[][], docs: Array<{tokens: string[], symbol: object}>}}
 *   `vocabulary` maps token -> slot (in first-seen order), `idf[i]` is the
 *   smoothed inverse document frequency of slot `i`, and `vectors[d]` is the
 *   dense TF-IDF vector of `docs[d]`.
 */
function buildTfIdfIndex(symbols) {
  const docs = symbols.map((s) => ({
    tokens: tokenize(`${s.name} ${s.signature} ${s.type} ${s.lang} ${s.file}`),
    symbol: s,
  }));

  // Assign vocabulary slots in first-seen order and count term occurrences
  // per document in the same pass.
  const vocabulary = new Map();
  const termCounts = docs.map((doc) => {
    const counts = new Map(); // slot -> occurrences in this document
    for (const token of doc.tokens) {
      let idx = vocabulary.get(token);
      if (idx === undefined) {
        idx = vocabulary.size;
        vocabulary.set(token, idx);
      }
      counts.set(idx, (counts.get(idx) || 0) + 1);
    }
    return counts;
  });

  // Document frequency: number of documents containing each term.
  const df = new Array(vocabulary.size).fill(0);
  for (const counts of termCounts) {
    for (const idx of counts.keys()) df[idx] += 1;
  }

  // Smoothed IDF; the +1 terms keep the weight finite and positive.
  const N = docs.length;
  const idf = df.map((d) => (d === 0 ? 0 : Math.log((N + 1) / (d + 1)) + 1));

  const vectors = termCounts.map((counts) => {
    const vec = new Array(vocabulary.size).fill(0);

    // Plain loop instead of Math.max(...array): spreading a vocabulary-sized
    // array can exceed the engine's argument limit.
    let maxTf = 0;
    for (const count of counts.values()) {
      if (count > maxTf) maxTf = count;
    }

    // Only terms present in the document receive a weight.
    for (const [idx, count] of counts) {
      vec[idx] = (0.5 + 0.5 * (count / maxTf)) * idf[idx];
    }
    return vec;
  });

  return { vocabulary, idf, vectors, docs };
}
|
|
318
|
+
|
|
319
|
+
/**
 * Rank indexed symbols against a free-text query.
 *
 * @param {object} index - TF-IDF structure with `vocabulary` (a Map),
 *   `idf`, `vectors` and `docs`.
 * @param {string} query - Free-text query; tokenized with `tokenize`.
 * @param {number} [topK=10] - Maximum number of hits returned.
 * @returns {Array<{score: number, symbol: object}>} Hits with a positive
 *   cosine score, best first. Empty when the query produces no tokens.
 */
function searchIndex(index, query, topK = 10) {
  const terms = tokenize(query);
  if (terms.length === 0) {
    return [];
  }

  // Term counts weighted by IDF; a missing/zero IDF falls back to 1.
  const weightedQuery = bowVector(terms, index.vocabulary).map(
    (count, slot) => count * (index.idf[slot] || 1),
  );

  const scored = index.vectors.map((docVector, position) => {
    const { symbol } = index.docs[position];
    return { score: cosineSimilarity(weightedQuery, docVector), symbol };
  });

  const positives = scored.filter((hit) => hit.score > 0);
  positives.sort((left, right) => right.score - left.score);
  return positives.slice(0, topK);
}
|
|
345
|
+
|
|
346
|
+
/**
 * Build the codebase index.
 *
 * Walks `repoRoot` depth-first, skipping directories listed in `SKIP_DIRS`
 * and any directory whose name starts with '.', reads files whose extension
 * is in `INDEXABLE_EXTS`, extracts symbols from each and builds a TF-IDF
 * index over them. The result is also written, best-effort, to
 * `<repoRoot>/.coderev/index/`.
 *
 * @param {string} repoRoot - Root directory of the repository
 * @param {object} [options]
 * @param {string[]} [options.includePatterns] - Glob patterns for files to
 *   include. NOTE: not read by this implementation.
 * @param {string[]} [options.excludePatterns] - Glob patterns for files to
 *   exclude. NOTE: not read by this implementation.
 * @param {number} [options.maxFiles=500] - Maximum number of files to index
 * @returns {object} Index object `{ version, createdAt, repoRoot, stats,
 *   symbols, tfidf }`. `tfidf.vocabulary` is an array of tokens (slot =
 *   array position), the form that is serialised to disk.
 */
function buildIndex(repoRoot, options = {}) {
  const maxFiles = options.maxFiles || 500;
  const maxFileBytes = 500 * 1024; // files larger than this are skipped
  const startTime = Date.now();
  const allSymbols = [];
  let filesProcessed = 0;

  const indexDir = path.join(repoRoot, INDEX_DIR);

  // Walk the directory tree, stopping once maxFiles files have been indexed.
  function walk(dir, relativePath) {
    if (filesProcessed >= maxFiles) return;

    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Skip unreadable directories
    }

    for (const entry of entries) {
      if (filesProcessed >= maxFiles) return;

      const fullPath = path.join(dir, entry.name);
      // Stored paths always use '/' so they compare equal to paths in a diff.
      const relPath = relativePath ? `${relativePath}/${entry.name}` : entry.name;

      if (entry.isDirectory()) {
        if (!SKIP_DIRS.has(entry.name) && !entry.name.startsWith('.')) {
          walk(fullPath, relPath);
        }
        continue;
      }

      if (!entry.isFile()) continue;
      if (!INDEXABLE_EXTS.has(path.extname(entry.name).toLowerCase())) continue;

      try {
        // Check the size before reading, and skip only THIS file: the
        // previous `return` here abandoned every remaining entry in the
        // directory as soon as one oversized file was met.
        if (fs.statSync(fullPath).size > maxFileBytes) continue;

        const source = fs.readFileSync(fullPath, 'utf-8');
        for (const symbol of extractSymbols(source, relPath)) {
          allSymbols.push(symbol);
        }
        filesProcessed++;
      } catch {
        // Skip unreadable files
      }
    }
  }

  walk(repoRoot, '');

  const tfidfIndex = buildTfIdfIndex(allSymbols);

  const stats = {
    filesScanned: filesProcessed,
    symbolsExtracted: allSymbols.length,
    timeMs: Date.now() - startTime,
    languageBreakdown: {},
  };

  // Symbols per language group.
  for (const s of allSymbols) {
    stats.languageBreakdown[s.lang] = (stats.languageBreakdown[s.lang] || 0) + 1;
  }

  const index = {
    version: 1,
    createdAt: new Date().toISOString(),
    repoRoot,
    stats,
    symbols: allSymbols,
    tfidf: {
      // Store just what we need for search. The vocabulary Map is flattened
      // to its key list because a Map does not survive JSON.stringify.
      vocabulary: [...tfidfIndex.vocabulary.keys()],
      idf: tfidfIndex.idf,
      vectors: tfidfIndex.vectors,
      docs: tfidfIndex.docs.map((d) => ({ symbol: d.symbol })),
    },
  };

  // Persist to disk. Directory creation is part of the same best-effort
  // step, so a read-only checkout still gets an in-memory index back.
  try {
    fs.mkdirSync(indexDir, { recursive: true });
    fs.writeFileSync(path.join(indexDir, INDEX_FILE), JSON.stringify(index, null, 2), 'utf-8');
    fs.writeFileSync(path.join(indexDir, META_FILE), JSON.stringify({
      lastBuilt: new Date().toISOString(),
      filesScanned: filesProcessed,
      symbolsExtracted: allSymbols.length,
    }, null, 2), 'utf-8');
  } catch {
    // Non-fatal: index persists in memory even if write fails
  }

  return index;
}
|
|
454
|
+
|
|
455
|
+
/**
 * Load an existing index from disk.
 *
 * Reads `<repoRoot>/.coderev/index/codebase-index.json` and, when the stored
 * `tfidf.vocabulary` is an array of tokens, converts it to a
 * `Map<token, slot>` as `searchIndex` requires.
 *
 * @param {string} repoRoot - Root directory of the repository
 * @returns {object|null} Index object, or null if the file is missing,
 *   unreadable, not valid JSON, or does not contain a JSON object
 */
function loadIndex(repoRoot) {
  const indexPath = path.join(repoRoot, INDEX_DIR, INDEX_FILE);
  try {
    if (!fs.existsSync(indexPath)) return null;

    const data = JSON.parse(fs.readFileSync(indexPath, 'utf-8'));
    // A bare JSON scalar (or null) is not a usable index.
    if (data === null || typeof data !== 'object') return null;

    // Rebuild vocabulary Map from stored array (slot = array position).
    if (data.tfidf && Array.isArray(data.tfidf.vocabulary)) {
      data.tfidf.vocabulary = new Map(data.tfidf.vocabulary.map((w, i) => [w, i]));
    }
    return data;
  } catch {
    // Unreadable or corrupt index: behave as if none exists.
    return null;
  }
}
|
|
477
|
+
|
|
478
|
+
/**
 * Retrieve relevant context for a given diff.
 *
 * Changed files are read from the diff's `+++ b/<path>` header lines. Related
 * symbols are then collected from the index in priority order:
 *   1. Same-file context: symbols defined in the changed files
 *   2. Cross-file context: `export` symbols whose file path contains the
 *      module path of an import found in a changed file (substring heuristic)
 *   3. Semantic context: top-K TF-IDF matches from other files
 * Symbols are de-duplicated on `file:name`; the first relevance assigned wins.
 *
 * Side effect: if the index's TF-IDF vocabulary is still an array (as
 * returned by buildIndex) it is converted to a Map in place, so later calls
 * on the same index skip the conversion.
 *
 * @param {object} index - The codebase index (from buildIndex or loadIndex)
 * @param {string} diff - Git diff text
 * @param {object} [options]
 * @param {number} [options.maxContext=15] - Max symbols to return
 * @param {boolean} [options.includeSemantic=true] - Include semantic search results
 * @returns {object} `{ symbols, grouped, changedFiles, totalIndexed, summary }`;
 *   when the index is missing or empty only `{ symbols: [], summary }` is
 *   returned
 */
function retrieveContext(index, diff, options = {}) {
  if (!index || !index.symbols || index.symbols.length === 0) {
    return { symbols: [], summary: 'No codebase index available. Run `coderev index` first.' };
  }

  const maxContext = options.maxContext || 15;
  const includeSemantic = options.includeSemantic !== false;

  // Extract changed file paths from diff. Splitting on \r?\n keeps a stray
  // carriage return out of the captured path for CRLF diffs.
  const diffText = diff === null || diff === undefined ? '' : String(diff);
  const diffLines = diffText.split(/\r?\n/);
  const changedFiles = new Set();
  for (const line of diffLines) {
    const match = line.match(/^\+\+\+ b\/(.+)/);
    if (match) changedFiles.add(match[1]);
  }

  const results = new Map(); // file+name -> symbol dedup
  const addResult = (sym, extra) => {
    const key = `${sym.file}:${sym.name}`;
    if (!results.has(key)) results.set(key, { ...sym, ...extra });
  };

  // 1. Same-file: symbols defined in changed files
  const changedSymbols = index.symbols.filter((sym) => changedFiles.has(sym.file));
  for (const sym of changedSymbols) {
    addResult(sym, { relevance: 'same_file' });
  }

  // 2. Cross-file: exports of modules imported by changed files.
  // Exports are collected once instead of re-scanning every symbol for
  // every import.
  const exportSymbols = index.symbols.filter((sym) => sym.type === 'export');
  for (const sym of changedSymbols) {
    if (sym.type !== 'import' || !sym.name) continue;

    // Drop leading './', '../', '/' and Python-style leading dots so that
    // '../utils' and '.models' can match 'src/utils.js' / 'app/models.py'.
    const target = sym.name.replace(/^[./]+/, '');
    // An empty target would be a substring of every path.
    if (!target) continue;

    for (const other of exportSymbols) {
      if (other.file.includes(target)) {
        addResult(other, { relevance: 'cross_file' });
      }
    }
  }

  // 3. Semantic search: find similar symbols across codebase
  if (includeSemantic && results.size < maxContext) {
    // Query is the names and types of symbols in the changed files; if there
    // are none, fall back to the first lines of the diff itself.
    const queryParts = changedSymbols.flatMap((sym) => [sym.name, sym.type]);
    const query = queryParts.join(' ') || diffLines.slice(0, 20).join(' ');

    // searchIndex expects the flat TF-IDF structure with Map vocabulary
    const tfidf = index.tfidf || index;
    // Ensure vocabulary is a Map (may be stored as array in JSON)
    if (Array.isArray(tfidf.vocabulary)) {
      tfidf.vocabulary = new Map(tfidf.vocabulary.map((w, i) => [w, i]));
    }

    for (const r of searchIndex(tfidf, query, maxContext)) {
      if (changedFiles.has(r.symbol.file)) continue; // Already have same-file
      addResult(r.symbol, { relevance: 'semantic', score: r.score.toFixed(3) });
    }
  }

  // Convert to array, limit. Map iteration order is insertion order, so
  // same-file entries come first, then cross-file, then semantic.
  const contextSymbols = [...results.values()].slice(0, maxContext);

  // Group by relevance
  const grouped = {
    sameFile: contextSymbols.filter((s) => s.relevance === 'same_file'),
    crossFile: contextSymbols.filter((s) => s.relevance === 'cross_file'),
    semantic: contextSymbols.filter((s) => s.relevance === 'semantic'),
  };

  const changedFileList = [...changedFiles];

  return {
    symbols: contextSymbols,
    grouped,
    changedFiles: changedFileList,
    totalIndexed: index.symbols.length,
    // Pass the array, not the Set: the formatter relies on .length / .join.
    summary: formatContextSummary(grouped, changedFileList),
  };
}
|
|
580
|
+
|
|
581
|
+
/**
 * Format context as a compact text block for prompt injection.
 *
 * @param {{sameFile: object[], crossFile: object[], semantic: object[]}} grouped
 *   Context symbols grouped by relevance.
 * @param {Iterable<string>} changedFiles - Paths of changed files. Accepts an
 *   array or a Set; a missing value is treated as empty.
 * @returns {string} Newline-joined summary; empty string when there is
 *   nothing to report. At most 10 same-file, 5 cross-file and 5 semantic
 *   symbols are listed, while the headings show the full group sizes.
 */
function formatContextSummary(grouped, changedFiles) {
  const parts = [];

  // Normalise to an array: a Set has neither `.length` nor `.join`, so
  // passing one used to drop the "Changed files" line silently.
  const files = changedFiles ? Array.from(changedFiles) : [];
  if (files.length > 0) {
    parts.push(`Changed files: ${files.join(', ')}`);
  }

  if (grouped.sameFile.length > 0) {
    parts.push(`\nš Same-file context (${grouped.sameFile.length} symbols):`);
    for (const s of grouped.sameFile.slice(0, 10)) {
      parts.push(` ${s.type}: ${s.name}${s.signature ? `(${s.signature})` : ''} (${s.file}:${s.line})`);
    }
  }

  if (grouped.crossFile.length > 0) {
    parts.push(`\nš Related symbols (${grouped.crossFile.length}):`);
    for (const s of grouped.crossFile.slice(0, 5)) {
      parts.push(` ${s.type}: ${s.name} in ${s.file}`);
    }
  }

  if (grouped.semantic.length > 0) {
    parts.push(`\nš Similar symbols across codebase (${grouped.semantic.length}):`);
    for (const s of grouped.semantic.slice(0, 5)) {
      parts.push(` ${s.type}: ${s.name} in ${s.file}`);
    }
  }

  return parts.join('\n');
}
|
|
614
|
+
|
|
615
|
+
/**
 * Produce the Markdown context section that is injected into review prompts.
 * This is the main integration point with reviewer.js.
 *
 * @param {object} index - The codebase index
 * @param {string} diff - Git diff text
 * @param {object} [options] - Forwarded unchanged to retrieveContext
 * @returns {string} Markdown block, or '' when there is no index or no
 *   symbols were retrieved. Lists at most 8 same-file symbols (with their
 *   snippet when it is shorter than 300 characters), 5 cross-file
 *   references and 5 semantic matches.
 */
function buildReviewContext(index, diff, options = {}) {
  if (!index) {
    return '';
  }

  const retrieved = retrieveContext(index, diff, options);
  if (!retrieved.symbols || retrieved.symbols.length === 0) {
    return '';
  }

  // Pieces are collected in order and concatenated once at the end.
  const pieces = [`
## š Codebase Context (Retrieved via RAG)

The following symbols were found in the codebase that may be relevant to this change:

`];

  // Same-file symbols (most important)
  const sameFile = retrieved.grouped.sameFile;
  if (sameFile.length > 0) {
    pieces.push(`### Same File Symbols\n`);
    sameFile.slice(0, 8).forEach((s) => {
      const params = s.signature ? `(${s.signature})` : '';
      pieces.push(`- \`${s.type}\` **${s.name}**${params} at line ${s.line}\n`);
      if (s.snippet && s.snippet.length < 300) {
        pieces.push(` \`\`\`\n${s.snippet}\n \`\`\`\n`);
      }
    });
  }

  // Cross-file references
  const crossFile = retrieved.grouped.crossFile;
  if (crossFile.length > 0) {
    pieces.push(`### Cross-File References\n`);
    crossFile.slice(0, 5).forEach((s) => {
      pieces.push(`- \`${s.type}\` **${s.name}** in \`${s.file}\`\n`);
    });
  }

  // Semantic matches
  const semantic = retrieved.grouped.semantic;
  if (semantic.length > 0) {
    pieces.push(`### Semantically Similar\n`);
    semantic.slice(0, 5).forEach((s) => {
      pieces.push(`- \`${s.type}\` **${s.name}** in \`${s.file}\`\n`);
    });
  }

  pieces.push(`\nUse this context to understand call chains, type relationships, and coding patterns.`);

  return pieces.join('');
}
|
|
668
|
+
|
|
669
|
+
/**
 * Check if an index needs rebuilding (stale or non-existent).
 *
 * Reads `lastBuilt` from `<repoRoot>/.coderev/index/index-meta.json` and
 * compares its age against `maxAgeHours`.
 *
 * @param {string} repoRoot - Repository root
 * @param {number} [maxAgeHours=24] - Max age in hours before considered stale
 * @returns {boolean} true when the meta file is missing, unreadable, not
 *   valid JSON, has no parseable `lastBuilt` timestamp, or is older than
 *   `maxAgeHours`
 */
function isIndexStale(repoRoot, maxAgeHours = 24) {
  const metaPath = path.join(repoRoot, INDEX_DIR, META_FILE);
  try {
    if (!fs.existsSync(metaPath)) return true;

    const meta = JSON.parse(fs.readFileSync(metaPath, 'utf-8'));
    const builtTime = new Date(meta.lastBuilt).getTime();

    // A missing or malformed timestamp parses to NaN, and every comparison
    // with NaN is false - which used to report a corrupt meta file as fresh.
    if (Number.isNaN(builtTime)) return true;

    return (Date.now() - builtTime) > maxAgeHours * 60 * 60 * 1000;
  } catch {
    // Unreadable or invalid metadata: rebuild to be safe.
    return true;
  }
}
|
|
687
|
+
|
|
688
|
+
module.exports = {
|
|
689
|
+
buildIndex,
|
|
690
|
+
loadIndex,
|
|
691
|
+
retrieveContext,
|
|
692
|
+
buildReviewContext,
|
|
693
|
+
extractSymbols,
|
|
694
|
+
searchIndex,
|
|
695
|
+
buildTfIdfIndex,
|
|
696
|
+
isIndexStale,
|
|
697
|
+
INDEX_DIR,
|
|
698
|
+
INDEXABLE_EXTS,
|
|
699
|
+
SKIP_DIRS,
|
|
700
|
+
};
|