prjct-cli 1.16.0 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -1
- package/core/__tests__/domain/bm25.test.ts +225 -0
- package/core/__tests__/domain/file-ranker.test.ts +169 -0
- package/core/__tests__/domain/git-cochange.test.ts +121 -0
- package/core/__tests__/domain/import-graph.test.ts +156 -0
- package/core/agentic/smart-context.ts +33 -2
- package/core/domain/bm25.ts +525 -0
- package/core/domain/file-ranker.ts +151 -0
- package/core/domain/git-cochange.ts +250 -0
- package/core/domain/import-graph.ts +315 -0
- package/core/services/sync-service.ts +17 -0
- package/dist/bin/prjct.mjs +890 -366
- package/package.json +1 -1
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BM25 Text Search Index
|
|
3
|
+
*
|
|
4
|
+
* Implements the Okapi BM25 ranking algorithm for file relevance scoring.
|
|
5
|
+
* Indexes files by extracting tokens from:
|
|
6
|
+
* - Function names, class names, export names
|
|
7
|
+
* - Import paths
|
|
8
|
+
* - Comments and JSDoc
|
|
9
|
+
* - File path segments
|
|
10
|
+
*
|
|
11
|
+
* Zero API calls — pure math on filesystem data.
|
|
12
|
+
*
|
|
13
|
+
* @module domain/bm25
|
|
14
|
+
* @version 1.0.0
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import fs from 'node:fs/promises'
|
|
18
|
+
import path from 'node:path'
|
|
19
|
+
import prjctDb from '../storage/database'
|
|
20
|
+
|
|
21
|
+
// =============================================================================
|
|
22
|
+
// Types
|
|
23
|
+
// =============================================================================
|
|
24
|
+
|
|
25
|
+
/** A tokenized file: its project-relative path plus the tokens extracted from it. */
export interface BM25Document {
  // Project-relative file path; used as the document identifier everywhere.
  path: string
  // All tokens extracted from the file (path segments, identifiers, comments).
  tokens: string[]
  // Token count — cached so scoring can length-normalize without re-counting.
  length: number
}

/** Pre-built searchable index over a project's files (see buildIndex). */
export interface BM25Index {
  /** Map of file path → token list and document length */
  documents: Record<string, { tokens: string[]; length: number }>
  /** Inverted index: token → list of (path, term frequency) */
  invertedIndex: Record<string, Array<{ path: string; tf: number }>>
  /** Average document length across all documents */
  avgDocLength: number
  /** Total number of indexed documents */
  totalDocs: number
  /** Build timestamp */
  builtAt: string
}

/** One scored search result: file path plus its raw (unnormalized) BM25 score. */
export interface BM25Score {
  path: string
  score: number
}
|
|
48
|
+
|
|
49
|
+
// =============================================================================
|
|
50
|
+
// Constants
|
|
51
|
+
// =============================================================================
|
|
52
|
+
|
|
53
|
+
/** BM25 tuning: term frequency saturation */
|
|
54
|
+
const K1 = 1.2
|
|
55
|
+
/** BM25 tuning: document length normalization */
|
|
56
|
+
const B = 0.75
|
|
57
|
+
|
|
58
|
+
/** File extensions to index */
|
|
59
|
+
const INDEXABLE_EXTENSIONS = new Set([
|
|
60
|
+
'.ts',
|
|
61
|
+
'.tsx',
|
|
62
|
+
'.js',
|
|
63
|
+
'.jsx',
|
|
64
|
+
'.mjs',
|
|
65
|
+
'.cjs',
|
|
66
|
+
'.py',
|
|
67
|
+
'.go',
|
|
68
|
+
'.rs',
|
|
69
|
+
'.java',
|
|
70
|
+
'.cs',
|
|
71
|
+
'.rb',
|
|
72
|
+
'.php',
|
|
73
|
+
'.vue',
|
|
74
|
+
'.svelte',
|
|
75
|
+
])
|
|
76
|
+
|
|
77
|
+
/** Common stop words to exclude from indexing */
|
|
78
|
+
const STOP_WORDS = new Set([
|
|
79
|
+
'the',
|
|
80
|
+
'a',
|
|
81
|
+
'an',
|
|
82
|
+
'is',
|
|
83
|
+
'are',
|
|
84
|
+
'was',
|
|
85
|
+
'were',
|
|
86
|
+
'be',
|
|
87
|
+
'been',
|
|
88
|
+
'being',
|
|
89
|
+
'have',
|
|
90
|
+
'has',
|
|
91
|
+
'had',
|
|
92
|
+
'do',
|
|
93
|
+
'does',
|
|
94
|
+
'did',
|
|
95
|
+
'will',
|
|
96
|
+
'would',
|
|
97
|
+
'could',
|
|
98
|
+
'should',
|
|
99
|
+
'may',
|
|
100
|
+
'might',
|
|
101
|
+
'shall',
|
|
102
|
+
'can',
|
|
103
|
+
'of',
|
|
104
|
+
'in',
|
|
105
|
+
'to',
|
|
106
|
+
'for',
|
|
107
|
+
'with',
|
|
108
|
+
'on',
|
|
109
|
+
'at',
|
|
110
|
+
'from',
|
|
111
|
+
'by',
|
|
112
|
+
'as',
|
|
113
|
+
'or',
|
|
114
|
+
'and',
|
|
115
|
+
'but',
|
|
116
|
+
'if',
|
|
117
|
+
'not',
|
|
118
|
+
'no',
|
|
119
|
+
'so',
|
|
120
|
+
'up',
|
|
121
|
+
'out',
|
|
122
|
+
'this',
|
|
123
|
+
'that',
|
|
124
|
+
'it',
|
|
125
|
+
'its',
|
|
126
|
+
'all',
|
|
127
|
+
'any',
|
|
128
|
+
// Code noise
|
|
129
|
+
'import',
|
|
130
|
+
'export',
|
|
131
|
+
'default',
|
|
132
|
+
'const',
|
|
133
|
+
'let',
|
|
134
|
+
'var',
|
|
135
|
+
'function',
|
|
136
|
+
'class',
|
|
137
|
+
'interface',
|
|
138
|
+
'type',
|
|
139
|
+
'return',
|
|
140
|
+
'new',
|
|
141
|
+
'true',
|
|
142
|
+
'false',
|
|
143
|
+
'null',
|
|
144
|
+
'undefined',
|
|
145
|
+
'void',
|
|
146
|
+
'async',
|
|
147
|
+
'await',
|
|
148
|
+
'static',
|
|
149
|
+
'public',
|
|
150
|
+
'private',
|
|
151
|
+
'protected',
|
|
152
|
+
'readonly',
|
|
153
|
+
'string',
|
|
154
|
+
'number',
|
|
155
|
+
'boolean',
|
|
156
|
+
'object',
|
|
157
|
+
'array',
|
|
158
|
+
])
|
|
159
|
+
|
|
160
|
+
/** Directories to skip during indexing */
|
|
161
|
+
const SKIP_DIRS = new Set([
|
|
162
|
+
'node_modules',
|
|
163
|
+
'.git',
|
|
164
|
+
'dist',
|
|
165
|
+
'build',
|
|
166
|
+
'out',
|
|
167
|
+
'.next',
|
|
168
|
+
'coverage',
|
|
169
|
+
'.cache',
|
|
170
|
+
'.turbo',
|
|
171
|
+
'.vercel',
|
|
172
|
+
'__pycache__',
|
|
173
|
+
'vendor',
|
|
174
|
+
'target',
|
|
175
|
+
])
|
|
176
|
+
|
|
177
|
+
// =============================================================================
|
|
178
|
+
// Tokenization
|
|
179
|
+
// =============================================================================
|
|
180
|
+
|
|
181
|
+
/**
|
|
182
|
+
* Split camelCase/PascalCase identifiers into words.
|
|
183
|
+
* e.g., "getUserById" → ["get", "user", "by", "id"]
|
|
184
|
+
*/
|
|
185
|
+
function splitIdentifier(identifier: string): string[] {
|
|
186
|
+
return identifier
|
|
187
|
+
.replace(/([a-z])([A-Z])/g, '$1 $2')
|
|
188
|
+
.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
|
|
189
|
+
.replace(/[-_./]/g, ' ')
|
|
190
|
+
.toLowerCase()
|
|
191
|
+
.split(/\s+/)
|
|
192
|
+
.filter((w) => w.length > 1)
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/**
|
|
196
|
+
* Extract tokens from a file's content and path.
|
|
197
|
+
*
|
|
198
|
+
* Extracts:
|
|
199
|
+
* - Path segments (e.g., "core/domain/bm25.ts" → ["core", "domain", "bm25"])
|
|
200
|
+
* - Function/class/interface/type names (split camelCase)
|
|
201
|
+
* - Import sources (split on / and -)
|
|
202
|
+
* - Single-line and multi-line comments
|
|
203
|
+
* - JSDoc content
|
|
204
|
+
*/
|
|
205
|
+
export function tokenizeFile(content: string, filePath: string): string[] {
|
|
206
|
+
const tokens: string[] = []
|
|
207
|
+
|
|
208
|
+
// 1. Path segments (weighted: appear in every query match)
|
|
209
|
+
const pathParts = filePath
|
|
210
|
+
.replace(/\.[^.]+$/, '') // remove extension
|
|
211
|
+
.split(/[/\\]/)
|
|
212
|
+
.filter(Boolean)
|
|
213
|
+
for (const part of pathParts) {
|
|
214
|
+
tokens.push(...splitIdentifier(part))
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// 2. Export names: export function/class/interface/type/const
|
|
218
|
+
const exportPatterns = [
|
|
219
|
+
/export\s+(?:async\s+)?function\s+(\w+)/g,
|
|
220
|
+
/export\s+class\s+(\w+)/g,
|
|
221
|
+
/export\s+interface\s+(\w+)/g,
|
|
222
|
+
/export\s+type\s+(\w+)/g,
|
|
223
|
+
/export\s+(?:const|let|var)\s+(\w+)/g,
|
|
224
|
+
/export\s+default\s+(?:class|function)\s+(\w+)/g,
|
|
225
|
+
]
|
|
226
|
+
|
|
227
|
+
for (const pattern of exportPatterns) {
|
|
228
|
+
let match: RegExpExecArray | null
|
|
229
|
+
while ((match = pattern.exec(content)) !== null) {
|
|
230
|
+
if (match[1]) {
|
|
231
|
+
tokens.push(...splitIdentifier(match[1]))
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
// 3. Non-exported function/class/interface names
|
|
237
|
+
const declPatterns = [
|
|
238
|
+
/(?:async\s+)?function\s+(\w+)/g,
|
|
239
|
+
/class\s+(\w+)/g,
|
|
240
|
+
/interface\s+(\w+)/g,
|
|
241
|
+
/type\s+(\w+)\s*=/g,
|
|
242
|
+
]
|
|
243
|
+
|
|
244
|
+
for (const pattern of declPatterns) {
|
|
245
|
+
let match: RegExpExecArray | null
|
|
246
|
+
while ((match = pattern.exec(content)) !== null) {
|
|
247
|
+
if (match[1]) {
|
|
248
|
+
tokens.push(...splitIdentifier(match[1]))
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// 4. Import sources
|
|
254
|
+
const importPattern = /(?:from|import)\s+['"]([^'"]+)['"]/g
|
|
255
|
+
let importMatch: RegExpExecArray | null
|
|
256
|
+
while ((importMatch = importPattern.exec(content)) !== null) {
|
|
257
|
+
const source = importMatch[1]
|
|
258
|
+
if (source.startsWith('.') || source.startsWith('@/')) {
|
|
259
|
+
// Internal import — extract path tokens
|
|
260
|
+
tokens.push(...splitIdentifier(source))
|
|
261
|
+
} else {
|
|
262
|
+
// External package — use package name
|
|
263
|
+
const pkgName = source.startsWith('@')
|
|
264
|
+
? source.split('/').slice(0, 2).join('/')
|
|
265
|
+
: source.split('/')[0]
|
|
266
|
+
tokens.push(...splitIdentifier(pkgName))
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
// 5. Comments (single-line)
|
|
271
|
+
const singleLineComments = /\/\/\s*(.+)/g
|
|
272
|
+
let commentMatch: RegExpExecArray | null
|
|
273
|
+
while ((commentMatch = singleLineComments.exec(content)) !== null) {
|
|
274
|
+
const words = commentMatch[1]
|
|
275
|
+
.toLowerCase()
|
|
276
|
+
.split(/\s+/)
|
|
277
|
+
.filter((w) => w.length > 2)
|
|
278
|
+
tokens.push(...words)
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
// 6. JSDoc / multi-line comments — extract meaningful words
|
|
282
|
+
const multiLineComments = /\/\*\*?([\s\S]*?)\*\//g
|
|
283
|
+
let multiMatch: RegExpExecArray | null
|
|
284
|
+
while ((multiMatch = multiLineComments.exec(content)) !== null) {
|
|
285
|
+
const words = multiMatch[1]
|
|
286
|
+
.replace(/@\w+/g, '') // strip JSDoc tags
|
|
287
|
+
.replace(/\*/g, '')
|
|
288
|
+
.toLowerCase()
|
|
289
|
+
.split(/\s+/)
|
|
290
|
+
.filter((w) => w.length > 2 && /^[a-z]+$/.test(w))
|
|
291
|
+
tokens.push(...words)
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// Filter: remove stop words, short tokens, non-alpha
|
|
295
|
+
return tokens.filter((t) => t.length > 1 && !STOP_WORDS.has(t) && /^[a-z][a-z0-9]*$/.test(t))
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
/**
|
|
299
|
+
* Tokenize a query string (task description).
|
|
300
|
+
*/
|
|
301
|
+
export function tokenizeQuery(query: string): string[] {
|
|
302
|
+
return query
|
|
303
|
+
.split(/\s+/)
|
|
304
|
+
.flatMap((word) => splitIdentifier(word))
|
|
305
|
+
.filter((t) => t.length > 1 && !STOP_WORDS.has(t) && /^[a-z][a-z0-9]*$/.test(t))
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// =============================================================================
|
|
309
|
+
// Index Building
|
|
310
|
+
// =============================================================================
|
|
311
|
+
|
|
312
|
+
/**
|
|
313
|
+
* Recursively list all indexable files in a project.
|
|
314
|
+
*/
|
|
315
|
+
async function listFiles(dir: string, projectPath: string): Promise<string[]> {
|
|
316
|
+
const files: string[] = []
|
|
317
|
+
const entries = await fs.readdir(dir, { withFileTypes: true })
|
|
318
|
+
|
|
319
|
+
for (const entry of entries) {
|
|
320
|
+
if (SKIP_DIRS.has(entry.name)) continue
|
|
321
|
+
|
|
322
|
+
const fullPath = path.join(dir, entry.name)
|
|
323
|
+
if (entry.isDirectory()) {
|
|
324
|
+
files.push(...(await listFiles(fullPath, projectPath)))
|
|
325
|
+
} else if (entry.isFile()) {
|
|
326
|
+
const ext = path.extname(entry.name).toLowerCase()
|
|
327
|
+
if (INDEXABLE_EXTENSIONS.has(ext)) {
|
|
328
|
+
files.push(path.relative(projectPath, fullPath))
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
return files
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
/**
|
|
337
|
+
* Build a BM25 index for all files in a project.
|
|
338
|
+
*
|
|
339
|
+
* Performance target: <5 seconds for 500-file project.
|
|
340
|
+
*/
|
|
341
|
+
export async function buildIndex(projectPath: string): Promise<BM25Index> {
|
|
342
|
+
const files = await listFiles(projectPath, projectPath)
|
|
343
|
+
|
|
344
|
+
const documents: BM25Index['documents'] = {}
|
|
345
|
+
const invertedIndex: BM25Index['invertedIndex'] = {}
|
|
346
|
+
let totalLength = 0
|
|
347
|
+
|
|
348
|
+
// Process files in parallel batches of 50
|
|
349
|
+
const BATCH_SIZE = 50
|
|
350
|
+
for (let i = 0; i < files.length; i += BATCH_SIZE) {
|
|
351
|
+
const batch = files.slice(i, i + BATCH_SIZE)
|
|
352
|
+
const results = await Promise.all(
|
|
353
|
+
batch.map(async (filePath) => {
|
|
354
|
+
try {
|
|
355
|
+
const content = await fs.readFile(path.join(projectPath, filePath), 'utf-8')
|
|
356
|
+
const tokens = tokenizeFile(content, filePath)
|
|
357
|
+
return { filePath, tokens }
|
|
358
|
+
} catch {
|
|
359
|
+
return { filePath, tokens: [] as string[] }
|
|
360
|
+
}
|
|
361
|
+
})
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
for (const { filePath, tokens } of results) {
|
|
365
|
+
if (tokens.length === 0) continue
|
|
366
|
+
|
|
367
|
+
documents[filePath] = { tokens, length: tokens.length }
|
|
368
|
+
totalLength += tokens.length
|
|
369
|
+
|
|
370
|
+
// Build term frequency map for this document
|
|
371
|
+
const tfMap = new Map<string, number>()
|
|
372
|
+
for (const token of tokens) {
|
|
373
|
+
tfMap.set(token, (tfMap.get(token) || 0) + 1)
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
// Add to inverted index
|
|
377
|
+
for (const [token, tf] of tfMap) {
|
|
378
|
+
if (!invertedIndex[token]) {
|
|
379
|
+
invertedIndex[token] = []
|
|
380
|
+
}
|
|
381
|
+
invertedIndex[token].push({ path: filePath, tf })
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
const totalDocs = Object.keys(documents).length
|
|
387
|
+
|
|
388
|
+
return {
|
|
389
|
+
documents,
|
|
390
|
+
invertedIndex,
|
|
391
|
+
avgDocLength: totalDocs > 0 ? totalLength / totalDocs : 0,
|
|
392
|
+
totalDocs,
|
|
393
|
+
builtAt: new Date().toISOString(),
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// =============================================================================
|
|
398
|
+
// BM25 Scoring
|
|
399
|
+
// =============================================================================
|
|
400
|
+
|
|
401
|
+
/**
|
|
402
|
+
* Calculate IDF (Inverse Document Frequency) for a term.
|
|
403
|
+
*/
|
|
404
|
+
function idf(docFrequency: number, totalDocs: number): number {
|
|
405
|
+
return Math.log((totalDocs - docFrequency + 0.5) / (docFrequency + 0.5) + 1)
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
/**
|
|
409
|
+
* Score all documents against a query using BM25.
|
|
410
|
+
*
|
|
411
|
+
* Performance target: <50ms per query.
|
|
412
|
+
*
|
|
413
|
+
* @returns Sorted array of (path, score) tuples, highest score first.
|
|
414
|
+
*/
|
|
415
|
+
export function score(query: string, index: BM25Index): BM25Score[] {
|
|
416
|
+
const queryTokens = tokenizeQuery(query)
|
|
417
|
+
if (queryTokens.length === 0) return []
|
|
418
|
+
|
|
419
|
+
const scores = new Map<string, number>()
|
|
420
|
+
|
|
421
|
+
for (const token of queryTokens) {
|
|
422
|
+
const postings = index.invertedIndex[token]
|
|
423
|
+
if (!postings) continue
|
|
424
|
+
|
|
425
|
+
const tokenIdf = idf(postings.length, index.totalDocs)
|
|
426
|
+
|
|
427
|
+
for (const { path: docPath, tf } of postings) {
|
|
428
|
+
const doc = index.documents[docPath]
|
|
429
|
+
if (!doc) continue
|
|
430
|
+
|
|
431
|
+
// BM25 term score
|
|
432
|
+
const numerator = tf * (K1 + 1)
|
|
433
|
+
const denominator = tf + K1 * (1 - B + B * (doc.length / index.avgDocLength))
|
|
434
|
+
const termScore = tokenIdf * (numerator / denominator)
|
|
435
|
+
|
|
436
|
+
scores.set(docPath, (scores.get(docPath) || 0) + termScore)
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
// Sort by score descending
|
|
441
|
+
return Array.from(scores.entries())
|
|
442
|
+
.map(([p, s]) => ({ path: p, score: s }))
|
|
443
|
+
.sort((a, b) => b.score - a.score)
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// =============================================================================
|
|
447
|
+
// SQLite Persistence
|
|
448
|
+
// =============================================================================
|
|
449
|
+
|
|
450
|
+
/** SQLite document key under which each project's BM25 index is stored. */
const INDEX_KEY = 'bm25-index'
|
|
451
|
+
|
|
452
|
+
/**
|
|
453
|
+
* Save a BM25 index to SQLite.
|
|
454
|
+
*/
|
|
455
|
+
export function saveIndex(projectId: string, index: BM25Index): void {
|
|
456
|
+
// Store only the inverted index + metadata (not raw tokens, to save space)
|
|
457
|
+
const storable = {
|
|
458
|
+
invertedIndex: index.invertedIndex,
|
|
459
|
+
avgDocLength: index.avgDocLength,
|
|
460
|
+
totalDocs: index.totalDocs,
|
|
461
|
+
builtAt: index.builtAt,
|
|
462
|
+
// Store document lengths (needed for scoring) but not full token lists
|
|
463
|
+
docLengths: Object.fromEntries(Object.entries(index.documents).map(([p, d]) => [p, d.length])),
|
|
464
|
+
}
|
|
465
|
+
prjctDb.setDoc(projectId, INDEX_KEY, storable)
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
/**
|
|
469
|
+
* Load a BM25 index from SQLite.
|
|
470
|
+
* Returns null if no index exists.
|
|
471
|
+
*/
|
|
472
|
+
export function loadIndex(projectId: string): BM25Index | null {
|
|
473
|
+
const stored = prjctDb.getDoc<{
|
|
474
|
+
invertedIndex: BM25Index['invertedIndex']
|
|
475
|
+
avgDocLength: number
|
|
476
|
+
totalDocs: number
|
|
477
|
+
builtAt: string
|
|
478
|
+
docLengths: Record<string, number>
|
|
479
|
+
}>(projectId, INDEX_KEY)
|
|
480
|
+
|
|
481
|
+
if (!stored) return null
|
|
482
|
+
|
|
483
|
+
// Reconstruct documents map with lengths only (tokens not needed for scoring)
|
|
484
|
+
const documents: BM25Index['documents'] = {}
|
|
485
|
+
for (const [p, length] of Object.entries(stored.docLengths)) {
|
|
486
|
+
documents[p] = { tokens: [], length }
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
return {
|
|
490
|
+
documents,
|
|
491
|
+
invertedIndex: stored.invertedIndex,
|
|
492
|
+
avgDocLength: stored.avgDocLength,
|
|
493
|
+
totalDocs: stored.totalDocs,
|
|
494
|
+
builtAt: stored.builtAt,
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
// =============================================================================
|
|
499
|
+
// High-level API
|
|
500
|
+
// =============================================================================
|
|
501
|
+
|
|
502
|
+
/**
|
|
503
|
+
* Build and persist a BM25 index for a project.
|
|
504
|
+
*/
|
|
505
|
+
export async function indexProject(projectPath: string, projectId: string): Promise<BM25Index> {
|
|
506
|
+
const index = await buildIndex(projectPath)
|
|
507
|
+
saveIndex(projectId, index)
|
|
508
|
+
return index
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
/**
|
|
512
|
+
* Query files by relevance to a task description.
|
|
513
|
+
* Loads index from SQLite, scores against query, returns top N.
|
|
514
|
+
*
|
|
515
|
+
* @param projectId - Project ID for SQLite lookup
|
|
516
|
+
* @param query - Task description or search query
|
|
517
|
+
* @param topN - Maximum number of results (default: 15)
|
|
518
|
+
* @returns Sorted array of (path, score) — empty if no index exists
|
|
519
|
+
*/
|
|
520
|
+
export function queryFiles(projectId: string, query: string, topN = 15): BM25Score[] {
|
|
521
|
+
const index = loadIndex(projectId)
|
|
522
|
+
if (!index) return []
|
|
523
|
+
|
|
524
|
+
return score(query, index).slice(0, topN)
|
|
525
|
+
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File Ranker — Combined Scoring
|
|
3
|
+
*
|
|
4
|
+
* Combines three signals to rank files by relevance to a task:
|
|
5
|
+
* - BM25 text search (0.5 weight)
|
|
6
|
+
* - Import graph proximity (0.3 weight)
|
|
7
|
+
* - Git co-change correlation (0.2 weight)
|
|
8
|
+
*
|
|
9
|
+
* Zero API calls. Pure math on pre-built indexes.
|
|
10
|
+
*
|
|
11
|
+
* @module domain/file-ranker
|
|
12
|
+
* @version 1.0.0
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { loadIndex, queryFiles } from './bm25'
|
|
16
|
+
import { scoreFromSeeds as cochangeScore, loadMatrix } from './git-cochange'
|
|
17
|
+
import { scoreFromSeeds as importScore, loadGraph } from './import-graph'
|
|
18
|
+
|
|
19
|
+
// =============================================================================
|
|
20
|
+
// Types
|
|
21
|
+
// =============================================================================
|
|
22
|
+
|
|
23
|
+
/** One ranked result with its combined score and per-signal breakdown. */
export interface RankedFile {
  // Project-relative file path.
  path: string
  // Weighted sum of the three normalized signals.
  finalScore: number
  // Each signal normalized to [0, 1] before weighting.
  signals: {
    bm25: number
    imports: number
    cochange: number
  }
}

/** Optional tuning knobs for rankFiles; unset fields use DEFAULT_CONFIG. */
export interface RankingConfig {
  /** Weight for BM25 text relevance (default: 0.5) */
  bm25Weight?: number
  /** Weight for import graph proximity (default: 0.3) */
  importWeight?: number
  /** Weight for git co-change correlation (default: 0.2) */
  cochangeWeight?: number
  /** Maximum number of results (default: 15) */
  topN?: number
  /** Maximum depth for import graph traversal (default: 2) */
  importDepth?: number
}

// Defaults applied by rankFiles; the three weights sum to 1.0.
const DEFAULT_CONFIG: Required<RankingConfig> = {
  bm25Weight: 0.5,
  importWeight: 0.3,
  cochangeWeight: 0.2,
  topN: 15,
  importDepth: 2,
}
|
|
53
|
+
|
|
54
|
+
// =============================================================================
|
|
55
|
+
// Combined Ranking
|
|
56
|
+
// =============================================================================
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* Rank files by combined relevance to a task description.
|
|
60
|
+
*
|
|
61
|
+
* Algorithm:
|
|
62
|
+
* 1. BM25: Score all files against the query
|
|
63
|
+
* 2. Import graph: From top BM25 hits, follow imports 2 levels deep
|
|
64
|
+
* 3. Co-change: From top BM25 hits, find co-changed files
|
|
65
|
+
* 4. Normalize each signal to [0, 1]
|
|
66
|
+
* 5. Combine: finalScore = bm25 * 0.5 + imports * 0.3 + cochange * 0.2
|
|
67
|
+
*
|
|
68
|
+
* Performance target: <50ms per query (all indexes pre-loaded from SQLite).
|
|
69
|
+
*/
|
|
70
|
+
export function rankFiles(
|
|
71
|
+
projectId: string,
|
|
72
|
+
query: string,
|
|
73
|
+
config: RankingConfig = {}
|
|
74
|
+
): RankedFile[] {
|
|
75
|
+
const cfg = { ...DEFAULT_CONFIG, ...config }
|
|
76
|
+
|
|
77
|
+
// 1. BM25 scoring — get broad candidate set
|
|
78
|
+
const bm25Results = queryFiles(projectId, query, cfg.topN * 3) // Get more candidates
|
|
79
|
+
if (bm25Results.length === 0) return []
|
|
80
|
+
|
|
81
|
+
// Normalize BM25 scores to [0, 1]
|
|
82
|
+
const maxBm25 = bm25Results[0]?.score || 1
|
|
83
|
+
const bm25Map = new Map<string, number>()
|
|
84
|
+
for (const result of bm25Results) {
|
|
85
|
+
bm25Map.set(result.path, result.score / maxBm25)
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// Seed files: top BM25 hits for graph traversal
|
|
89
|
+
const seedFiles = bm25Results.slice(0, 10).map((r) => r.path)
|
|
90
|
+
|
|
91
|
+
// 2. Import graph scoring
|
|
92
|
+
const importMap = new Map<string, number>()
|
|
93
|
+
const graph = loadGraph(projectId)
|
|
94
|
+
if (graph) {
|
|
95
|
+
const importResults = importScore(seedFiles, graph, cfg.importDepth)
|
|
96
|
+
const maxImport = importResults[0]?.score || 1
|
|
97
|
+
for (const result of importResults) {
|
|
98
|
+
importMap.set(result.path, result.score / maxImport)
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
// 3. Co-change scoring
|
|
103
|
+
const cochangeMap = new Map<string, number>()
|
|
104
|
+
const cochangeIndex = loadMatrix(projectId)
|
|
105
|
+
if (cochangeIndex) {
|
|
106
|
+
const cochangeResults = cochangeScore(seedFiles, cochangeIndex)
|
|
107
|
+
const maxCochange = cochangeResults[0]?.score || 1
|
|
108
|
+
for (const result of cochangeResults) {
|
|
109
|
+
cochangeMap.set(result.path, result.score / maxCochange)
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// 4. Collect all candidate files
|
|
114
|
+
const allFiles = new Set([...bm25Map.keys(), ...importMap.keys(), ...cochangeMap.keys()])
|
|
115
|
+
|
|
116
|
+
// 5. Combined scoring
|
|
117
|
+
const ranked: RankedFile[] = []
|
|
118
|
+
for (const filePath of allFiles) {
|
|
119
|
+
const bm25 = bm25Map.get(filePath) || 0
|
|
120
|
+
const imports = importMap.get(filePath) || 0
|
|
121
|
+
const cochange = cochangeMap.get(filePath) || 0
|
|
122
|
+
|
|
123
|
+
const finalScore =
|
|
124
|
+
bm25 * cfg.bm25Weight + imports * cfg.importWeight + cochange * cfg.cochangeWeight
|
|
125
|
+
|
|
126
|
+
ranked.push({
|
|
127
|
+
path: filePath,
|
|
128
|
+
finalScore,
|
|
129
|
+
signals: { bm25, imports, cochange },
|
|
130
|
+
})
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// Sort by finalScore descending, return top N
|
|
134
|
+
ranked.sort((a, b) => b.finalScore - a.finalScore)
|
|
135
|
+
return ranked.slice(0, cfg.topN)
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* Check if all three indexes exist for a project.
|
|
140
|
+
*/
|
|
141
|
+
export function hasIndexes(projectId: string): {
|
|
142
|
+
bm25: boolean
|
|
143
|
+
imports: boolean
|
|
144
|
+
cochange: boolean
|
|
145
|
+
} {
|
|
146
|
+
return {
|
|
147
|
+
bm25: loadIndex(projectId) !== null,
|
|
148
|
+
imports: loadGraph(projectId) !== null,
|
|
149
|
+
cochange: loadMatrix(projectId) !== null,
|
|
150
|
+
}
|
|
151
|
+
}
|