prjct-cli 1.16.0 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ /**
2
+ * BM25 Text Search Index
3
+ *
4
+ * Implements the Okapi BM25 ranking algorithm for file relevance scoring.
5
+ * Indexes files by extracting tokens from:
6
+ * - Function names, class names, export names
7
+ * - Import paths
8
+ * - Comments and JSDoc
9
+ * - File path segments
10
+ *
11
+ * Zero API calls — pure math on filesystem data.
12
+ *
13
+ * @module domain/bm25
14
+ * @version 1.0.0
15
+ */
16
+
17
+ import fs from 'node:fs/promises'
18
+ import path from 'node:path'
19
+ import prjctDb from '../storage/database'
20
+
21
+ // =============================================================================
22
+ // Types
23
+ // =============================================================================
24
+
25
/** A single indexed file: its path, extracted tokens, and token count. */
export interface BM25Document {
  /** Project-relative file path. */
  path: string
  /** Tokens extracted from the file's content and path. */
  tokens: string[]
  /** Number of tokens — the document length used for BM25 normalization. */
  length: number
}
30
+
31
/**
 * Pre-built BM25 search index for a project.
 * Produced by buildIndex() and persisted/reloaded via saveIndex()/loadIndex().
 */
export interface BM25Index {
  /** Map of file path → token list and document length */
  documents: Record<string, { tokens: string[]; length: number }>
  /** Inverted index: token → list of (path, term frequency) */
  invertedIndex: Record<string, Array<{ path: string; tf: number }>>
  /** Average document length across all documents */
  avgDocLength: number
  /** Total number of indexed documents */
  totalDocs: number
  /** Build timestamp (ISO 8601 string) */
  builtAt: string
}
43
+
44
/** Relevance score for a single file against a query. */
export interface BM25Score {
  /** Project-relative file path. */
  path: string
  /** BM25 relevance score (unbounded; higher = more relevant). */
  score: number
}
48
+
49
+ // =============================================================================
50
+ // Constants
51
+ // =============================================================================
52
+
53
/** BM25 tuning: term frequency saturation (k1). Higher values let repeated terms keep adding score. */
const K1 = 1.2
/** BM25 tuning: document length normalization (b). 0 = none, 1 = full normalization. */
const B = 0.75
57
+
58
/** File extensions to index (compared against path.extname() lowercased). */
const INDEXABLE_EXTENSIONS = new Set([
  '.ts',
  '.tsx',
  '.js',
  '.jsx',
  '.mjs',
  '.cjs',
  '.py',
  '.go',
  '.rs',
  '.java',
  '.cs',
  '.rb',
  '.php',
  '.vue',
  '.svelte',
])
76
+
77
/**
 * Common English stop words plus code-syntax noise, excluded from indexing.
 * Checked after tokens are lowercased, so membership tests are case-insensitive.
 */
const STOP_WORDS = new Set([
  'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'shall', 'can', 'of', 'in', 'to', 'for',
  'with', 'on', 'at', 'from', 'by', 'as', 'or', 'and', 'but', 'if',
  'not', 'no', 'so', 'up', 'out', 'this', 'that', 'it', 'its', 'all', 'any',
  // Code noise
  'import', 'export', 'default', 'const', 'let', 'var', 'function',
  'class', 'interface', 'type', 'return', 'new', 'true', 'false',
  'null', 'undefined', 'void', 'async', 'await', 'static', 'public',
  'private', 'protected', 'readonly', 'string', 'number', 'boolean',
  'object', 'array',
])
159
+
160
/** Directories to skip during indexing (dependencies, build output, caches). */
const SKIP_DIRS = new Set([
  'node_modules',
  '.git',
  'dist',
  'build',
  'out',
  '.next',
  'coverage',
  '.cache',
  '.turbo',
  '.vercel',
  '__pycache__',
  'vendor',
  'target',
])
176
+
177
+ // =============================================================================
178
+ // Tokenization
179
+ // =============================================================================
180
+
181
+ /**
182
+ * Split camelCase/PascalCase identifiers into words.
183
+ * e.g., "getUserById" → ["get", "user", "by", "id"]
184
+ */
185
+ function splitIdentifier(identifier: string): string[] {
186
+ return identifier
187
+ .replace(/([a-z])([A-Z])/g, '$1 $2')
188
+ .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
189
+ .replace(/[-_./]/g, ' ')
190
+ .toLowerCase()
191
+ .split(/\s+/)
192
+ .filter((w) => w.length > 1)
193
+ }
194
+
195
/**
 * Extract tokens from a file's content and path.
 *
 * Extracts:
 * - Path segments (e.g., "core/domain/bm25.ts" → ["core", "domain", "bm25"])
 * - Function/class/interface/type names (split camelCase)
 * - Import sources (split on / and -)
 * - Single-line and multi-line comments
 * - JSDoc content
 *
 * The regex heuristics target JS/TS syntax; for other indexed languages
 * (.py, .go, …) only the patterns that happen to match contribute tokens.
 *
 * @param content - Raw file text
 * @param filePath - Project-relative path, also mined for tokens
 * @returns Lowercase alphanumeric tokens with stop words removed
 */
export function tokenizeFile(content: string, filePath: string): string[] {
  const tokens: string[] = []

  // 1. Path segments (weighted: appear in every query match)
  const pathParts = filePath
    .replace(/\.[^.]+$/, '') // remove extension
    .split(/[/\\]/)
    .filter(Boolean)
  for (const part of pathParts) {
    tokens.push(...splitIdentifier(part))
  }

  // 2. Export names: export function/class/interface/type/const
  const exportPatterns = [
    /export\s+(?:async\s+)?function\s+(\w+)/g,
    /export\s+class\s+(\w+)/g,
    /export\s+interface\s+(\w+)/g,
    /export\s+type\s+(\w+)/g,
    /export\s+(?:const|let|var)\s+(\w+)/g,
    /export\s+default\s+(?:class|function)\s+(\w+)/g,
  ]

  for (const pattern of exportPatterns) {
    let match: RegExpExecArray | null
    while ((match = pattern.exec(content)) !== null) {
      if (match[1]) {
        tokens.push(...splitIdentifier(match[1]))
      }
    }
  }

  // 3. Non-exported function/class/interface names
  // NOTE: these patterns also re-match exported declarations, so exported
  // names are effectively double-weighted in the token stream.
  const declPatterns = [
    /(?:async\s+)?function\s+(\w+)/g,
    /class\s+(\w+)/g,
    /interface\s+(\w+)/g,
    /type\s+(\w+)\s*=/g,
  ]

  for (const pattern of declPatterns) {
    let match: RegExpExecArray | null
    while ((match = pattern.exec(content)) !== null) {
      if (match[1]) {
        tokens.push(...splitIdentifier(match[1]))
      }
    }
  }

  // 4. Import sources
  const importPattern = /(?:from|import)\s+['"]([^'"]+)['"]/g
  let importMatch: RegExpExecArray | null
  while ((importMatch = importPattern.exec(content)) !== null) {
    const source = importMatch[1]
    if (source.startsWith('.') || source.startsWith('@/')) {
      // Internal import — extract path tokens
      tokens.push(...splitIdentifier(source))
    } else {
      // External package — use package name (keeps scope for "@scope/pkg")
      const pkgName = source.startsWith('@')
        ? source.split('/').slice(0, 2).join('/')
        : source.split('/')[0]
      tokens.push(...splitIdentifier(pkgName))
    }
  }

  // 5. Comments (single-line)
  // NOTE(review): this also matches "//" inside strings and URLs (https://…),
  // adding some noise tokens; the final filter drops non-alphanumeric ones.
  const singleLineComments = /\/\/\s*(.+)/g
  let commentMatch: RegExpExecArray | null
  while ((commentMatch = singleLineComments.exec(content)) !== null) {
    const words = commentMatch[1]
      .toLowerCase()
      .split(/\s+/)
      .filter((w) => w.length > 2)
    tokens.push(...words)
  }

  // 6. JSDoc / multi-line comments — extract meaningful words
  const multiLineComments = /\/\*\*?([\s\S]*?)\*\//g
  let multiMatch: RegExpExecArray | null
  while ((multiMatch = multiLineComments.exec(content)) !== null) {
    const words = multiMatch[1]
      .replace(/@\w+/g, '') // strip JSDoc tags
      .replace(/\*/g, '')
      .toLowerCase()
      .split(/\s+/)
      .filter((w) => w.length > 2 && /^[a-z]+$/.test(w))
    tokens.push(...words)
  }

  // Filter: remove stop words, short tokens, non-alpha
  return tokens.filter((t) => t.length > 1 && !STOP_WORDS.has(t) && /^[a-z][a-z0-9]*$/.test(t))
}
297
+
298
+ /**
299
+ * Tokenize a query string (task description).
300
+ */
301
+ export function tokenizeQuery(query: string): string[] {
302
+ return query
303
+ .split(/\s+/)
304
+ .flatMap((word) => splitIdentifier(word))
305
+ .filter((t) => t.length > 1 && !STOP_WORDS.has(t) && /^[a-z][a-z0-9]*$/.test(t))
306
+ }
307
+
308
+ // =============================================================================
309
+ // Index Building
310
+ // =============================================================================
311
+
312
+ /**
313
+ * Recursively list all indexable files in a project.
314
+ */
315
+ async function listFiles(dir: string, projectPath: string): Promise<string[]> {
316
+ const files: string[] = []
317
+ const entries = await fs.readdir(dir, { withFileTypes: true })
318
+
319
+ for (const entry of entries) {
320
+ if (SKIP_DIRS.has(entry.name)) continue
321
+
322
+ const fullPath = path.join(dir, entry.name)
323
+ if (entry.isDirectory()) {
324
+ files.push(...(await listFiles(fullPath, projectPath)))
325
+ } else if (entry.isFile()) {
326
+ const ext = path.extname(entry.name).toLowerCase()
327
+ if (INDEXABLE_EXTENSIONS.has(ext)) {
328
+ files.push(path.relative(projectPath, fullPath))
329
+ }
330
+ }
331
+ }
332
+
333
+ return files
334
+ }
335
+
336
+ /**
337
+ * Build a BM25 index for all files in a project.
338
+ *
339
+ * Performance target: <5 seconds for 500-file project.
340
+ */
341
+ export async function buildIndex(projectPath: string): Promise<BM25Index> {
342
+ const files = await listFiles(projectPath, projectPath)
343
+
344
+ const documents: BM25Index['documents'] = {}
345
+ const invertedIndex: BM25Index['invertedIndex'] = {}
346
+ let totalLength = 0
347
+
348
+ // Process files in parallel batches of 50
349
+ const BATCH_SIZE = 50
350
+ for (let i = 0; i < files.length; i += BATCH_SIZE) {
351
+ const batch = files.slice(i, i + BATCH_SIZE)
352
+ const results = await Promise.all(
353
+ batch.map(async (filePath) => {
354
+ try {
355
+ const content = await fs.readFile(path.join(projectPath, filePath), 'utf-8')
356
+ const tokens = tokenizeFile(content, filePath)
357
+ return { filePath, tokens }
358
+ } catch {
359
+ return { filePath, tokens: [] as string[] }
360
+ }
361
+ })
362
+ )
363
+
364
+ for (const { filePath, tokens } of results) {
365
+ if (tokens.length === 0) continue
366
+
367
+ documents[filePath] = { tokens, length: tokens.length }
368
+ totalLength += tokens.length
369
+
370
+ // Build term frequency map for this document
371
+ const tfMap = new Map<string, number>()
372
+ for (const token of tokens) {
373
+ tfMap.set(token, (tfMap.get(token) || 0) + 1)
374
+ }
375
+
376
+ // Add to inverted index
377
+ for (const [token, tf] of tfMap) {
378
+ if (!invertedIndex[token]) {
379
+ invertedIndex[token] = []
380
+ }
381
+ invertedIndex[token].push({ path: filePath, tf })
382
+ }
383
+ }
384
+ }
385
+
386
+ const totalDocs = Object.keys(documents).length
387
+
388
+ return {
389
+ documents,
390
+ invertedIndex,
391
+ avgDocLength: totalDocs > 0 ? totalLength / totalDocs : 0,
392
+ totalDocs,
393
+ builtAt: new Date().toISOString(),
394
+ }
395
+ }
396
+
397
+ // =============================================================================
398
+ // BM25 Scoring
399
+ // =============================================================================
400
+
401
+ /**
402
+ * Calculate IDF (Inverse Document Frequency) for a term.
403
+ */
404
+ function idf(docFrequency: number, totalDocs: number): number {
405
+ return Math.log((totalDocs - docFrequency + 0.5) / (docFrequency + 0.5) + 1)
406
+ }
407
+
408
+ /**
409
+ * Score all documents against a query using BM25.
410
+ *
411
+ * Performance target: <50ms per query.
412
+ *
413
+ * @returns Sorted array of (path, score) tuples, highest score first.
414
+ */
415
+ export function score(query: string, index: BM25Index): BM25Score[] {
416
+ const queryTokens = tokenizeQuery(query)
417
+ if (queryTokens.length === 0) return []
418
+
419
+ const scores = new Map<string, number>()
420
+
421
+ for (const token of queryTokens) {
422
+ const postings = index.invertedIndex[token]
423
+ if (!postings) continue
424
+
425
+ const tokenIdf = idf(postings.length, index.totalDocs)
426
+
427
+ for (const { path: docPath, tf } of postings) {
428
+ const doc = index.documents[docPath]
429
+ if (!doc) continue
430
+
431
+ // BM25 term score
432
+ const numerator = tf * (K1 + 1)
433
+ const denominator = tf + K1 * (1 - B + B * (doc.length / index.avgDocLength))
434
+ const termScore = tokenIdf * (numerator / denominator)
435
+
436
+ scores.set(docPath, (scores.get(docPath) || 0) + termScore)
437
+ }
438
+ }
439
+
440
+ // Sort by score descending
441
+ return Array.from(scores.entries())
442
+ .map(([p, s]) => ({ path: p, score: s }))
443
+ .sort((a, b) => b.score - a.score)
444
+ }
445
+
446
+ // =============================================================================
447
+ // SQLite Persistence
448
+ // =============================================================================
449
+
450
/** SQLite document key under which the serialized BM25 index is stored. */
const INDEX_KEY = 'bm25-index'
451
+
452
+ /**
453
+ * Save a BM25 index to SQLite.
454
+ */
455
+ export function saveIndex(projectId: string, index: BM25Index): void {
456
+ // Store only the inverted index + metadata (not raw tokens, to save space)
457
+ const storable = {
458
+ invertedIndex: index.invertedIndex,
459
+ avgDocLength: index.avgDocLength,
460
+ totalDocs: index.totalDocs,
461
+ builtAt: index.builtAt,
462
+ // Store document lengths (needed for scoring) but not full token lists
463
+ docLengths: Object.fromEntries(Object.entries(index.documents).map(([p, d]) => [p, d.length])),
464
+ }
465
+ prjctDb.setDoc(projectId, INDEX_KEY, storable)
466
+ }
467
+
468
+ /**
469
+ * Load a BM25 index from SQLite.
470
+ * Returns null if no index exists.
471
+ */
472
+ export function loadIndex(projectId: string): BM25Index | null {
473
+ const stored = prjctDb.getDoc<{
474
+ invertedIndex: BM25Index['invertedIndex']
475
+ avgDocLength: number
476
+ totalDocs: number
477
+ builtAt: string
478
+ docLengths: Record<string, number>
479
+ }>(projectId, INDEX_KEY)
480
+
481
+ if (!stored) return null
482
+
483
+ // Reconstruct documents map with lengths only (tokens not needed for scoring)
484
+ const documents: BM25Index['documents'] = {}
485
+ for (const [p, length] of Object.entries(stored.docLengths)) {
486
+ documents[p] = { tokens: [], length }
487
+ }
488
+
489
+ return {
490
+ documents,
491
+ invertedIndex: stored.invertedIndex,
492
+ avgDocLength: stored.avgDocLength,
493
+ totalDocs: stored.totalDocs,
494
+ builtAt: stored.builtAt,
495
+ }
496
+ }
497
+
498
+ // =============================================================================
499
+ // High-level API
500
+ // =============================================================================
501
+
502
+ /**
503
+ * Build and persist a BM25 index for a project.
504
+ */
505
+ export async function indexProject(projectPath: string, projectId: string): Promise<BM25Index> {
506
+ const index = await buildIndex(projectPath)
507
+ saveIndex(projectId, index)
508
+ return index
509
+ }
510
+
511
+ /**
512
+ * Query files by relevance to a task description.
513
+ * Loads index from SQLite, scores against query, returns top N.
514
+ *
515
+ * @param projectId - Project ID for SQLite lookup
516
+ * @param query - Task description or search query
517
+ * @param topN - Maximum number of results (default: 15)
518
+ * @returns Sorted array of (path, score) — empty if no index exists
519
+ */
520
+ export function queryFiles(projectId: string, query: string, topN = 15): BM25Score[] {
521
+ const index = loadIndex(projectId)
522
+ if (!index) return []
523
+
524
+ return score(query, index).slice(0, topN)
525
+ }
@@ -0,0 +1,151 @@
1
+ /**
2
+ * File Ranker — Combined Scoring
3
+ *
4
+ * Combines three signals to rank files by relevance to a task:
5
+ * - BM25 text search (0.5 weight)
6
+ * - Import graph proximity (0.3 weight)
7
+ * - Git co-change correlation (0.2 weight)
8
+ *
9
+ * Zero API calls. Pure math on pre-built indexes.
10
+ *
11
+ * @module domain/file-ranker
12
+ * @version 1.0.0
13
+ */
14
+
15
+ import { loadIndex, queryFiles } from './bm25'
16
+ import { scoreFromSeeds as cochangeScore, loadMatrix } from './git-cochange'
17
+ import { scoreFromSeeds as importScore, loadGraph } from './import-graph'
18
+
19
+ // =============================================================================
20
+ // Types
21
+ // =============================================================================
22
+
23
+ export interface RankedFile {
24
+ path: string
25
+ finalScore: number
26
+ signals: {
27
+ bm25: number
28
+ imports: number
29
+ cochange: number
30
+ }
31
+ }
32
+
33
/** Tuning knobs for rankFiles(); unset fields fall back to DEFAULT_CONFIG. */
export interface RankingConfig {
  /** Weight for BM25 text relevance (default: 0.5) */
  bm25Weight?: number
  /** Weight for import graph proximity (default: 0.3) */
  importWeight?: number
  /** Weight for git co-change correlation (default: 0.2) */
  cochangeWeight?: number
  /** Maximum number of results (default: 15) */
  topN?: number
  /** Maximum depth for import graph traversal (default: 2) */
  importDepth?: number
}
45
+
46
/** Default weights and limits; the three weights sum to 1.0 so finalScore stays in [0, 1]. */
const DEFAULT_CONFIG: Required<RankingConfig> = {
  bm25Weight: 0.5,
  importWeight: 0.3,
  cochangeWeight: 0.2,
  topN: 15,
  importDepth: 2,
}
53
+
54
+ // =============================================================================
55
+ // Combined Ranking
56
+ // =============================================================================
57
+
58
+ /**
59
+ * Rank files by combined relevance to a task description.
60
+ *
61
+ * Algorithm:
62
+ * 1. BM25: Score all files against the query
63
+ * 2. Import graph: From top BM25 hits, follow imports 2 levels deep
64
+ * 3. Co-change: From top BM25 hits, find co-changed files
65
+ * 4. Normalize each signal to [0, 1]
66
+ * 5. Combine: finalScore = bm25 * 0.5 + imports * 0.3 + cochange * 0.2
67
+ *
68
+ * Performance target: <50ms per query (all indexes pre-loaded from SQLite).
69
+ */
70
+ export function rankFiles(
71
+ projectId: string,
72
+ query: string,
73
+ config: RankingConfig = {}
74
+ ): RankedFile[] {
75
+ const cfg = { ...DEFAULT_CONFIG, ...config }
76
+
77
+ // 1. BM25 scoring — get broad candidate set
78
+ const bm25Results = queryFiles(projectId, query, cfg.topN * 3) // Get more candidates
79
+ if (bm25Results.length === 0) return []
80
+
81
+ // Normalize BM25 scores to [0, 1]
82
+ const maxBm25 = bm25Results[0]?.score || 1
83
+ const bm25Map = new Map<string, number>()
84
+ for (const result of bm25Results) {
85
+ bm25Map.set(result.path, result.score / maxBm25)
86
+ }
87
+
88
+ // Seed files: top BM25 hits for graph traversal
89
+ const seedFiles = bm25Results.slice(0, 10).map((r) => r.path)
90
+
91
+ // 2. Import graph scoring
92
+ const importMap = new Map<string, number>()
93
+ const graph = loadGraph(projectId)
94
+ if (graph) {
95
+ const importResults = importScore(seedFiles, graph, cfg.importDepth)
96
+ const maxImport = importResults[0]?.score || 1
97
+ for (const result of importResults) {
98
+ importMap.set(result.path, result.score / maxImport)
99
+ }
100
+ }
101
+
102
+ // 3. Co-change scoring
103
+ const cochangeMap = new Map<string, number>()
104
+ const cochangeIndex = loadMatrix(projectId)
105
+ if (cochangeIndex) {
106
+ const cochangeResults = cochangeScore(seedFiles, cochangeIndex)
107
+ const maxCochange = cochangeResults[0]?.score || 1
108
+ for (const result of cochangeResults) {
109
+ cochangeMap.set(result.path, result.score / maxCochange)
110
+ }
111
+ }
112
+
113
+ // 4. Collect all candidate files
114
+ const allFiles = new Set([...bm25Map.keys(), ...importMap.keys(), ...cochangeMap.keys()])
115
+
116
+ // 5. Combined scoring
117
+ const ranked: RankedFile[] = []
118
+ for (const filePath of allFiles) {
119
+ const bm25 = bm25Map.get(filePath) || 0
120
+ const imports = importMap.get(filePath) || 0
121
+ const cochange = cochangeMap.get(filePath) || 0
122
+
123
+ const finalScore =
124
+ bm25 * cfg.bm25Weight + imports * cfg.importWeight + cochange * cfg.cochangeWeight
125
+
126
+ ranked.push({
127
+ path: filePath,
128
+ finalScore,
129
+ signals: { bm25, imports, cochange },
130
+ })
131
+ }
132
+
133
+ // Sort by finalScore descending, return top N
134
+ ranked.sort((a, b) => b.finalScore - a.finalScore)
135
+ return ranked.slice(0, cfg.topN)
136
+ }
137
+
138
+ /**
139
+ * Check if all three indexes exist for a project.
140
+ */
141
+ export function hasIndexes(projectId: string): {
142
+ bm25: boolean
143
+ imports: boolean
144
+ cochange: boolean
145
+ } {
146
+ return {
147
+ bm25: loadIndex(projectId) !== null,
148
+ imports: loadGraph(projectId) !== null,
149
+ cochange: loadMatrix(projectId) !== null,
150
+ }
151
+ }