codecritique 1.0.0 → 1.1.0

This diff shows the changes between publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +82 -114
  2. package/package.json +10 -9
  3. package/src/content-retrieval.test.js +775 -0
  4. package/src/custom-documents.test.js +440 -0
  5. package/src/feedback-loader.test.js +529 -0
  6. package/src/llm.test.js +256 -0
  7. package/src/project-analyzer.test.js +747 -0
  8. package/src/rag-analyzer.js +12 -0
  9. package/src/rag-analyzer.test.js +1109 -0
  10. package/src/rag-review.test.js +317 -0
  11. package/src/setupTests.js +131 -0
  12. package/src/zero-shot-classifier-open.test.js +278 -0
  13. package/src/embeddings/cache-manager.js +0 -364
  14. package/src/embeddings/constants.js +0 -40
  15. package/src/embeddings/database.js +0 -921
  16. package/src/embeddings/errors.js +0 -208
  17. package/src/embeddings/factory.js +0 -447
  18. package/src/embeddings/file-processor.js +0 -851
  19. package/src/embeddings/model-manager.js +0 -337
  20. package/src/embeddings/similarity-calculator.js +0 -97
  21. package/src/embeddings/types.js +0 -113
  22. package/src/pr-history/analyzer.js +0 -579
  23. package/src/pr-history/bot-detector.js +0 -123
  24. package/src/pr-history/cli-utils.js +0 -204
  25. package/src/pr-history/comment-processor.js +0 -549
  26. package/src/pr-history/database.js +0 -819
  27. package/src/pr-history/github-client.js +0 -629
  28. package/src/technology-keywords.json +0 -753
  29. package/src/utils/command.js +0 -48
  30. package/src/utils/constants.js +0 -263
  31. package/src/utils/context-inference.js +0 -364
  32. package/src/utils/document-detection.js +0 -105
  33. package/src/utils/file-validation.js +0 -271
  34. package/src/utils/git.js +0 -232
  35. package/src/utils/language-detection.js +0 -170
  36. package/src/utils/logging.js +0 -24
  37. package/src/utils/markdown.js +0 -132
  38. package/src/utils/mobilebert-tokenizer.js +0 -141
  39. package/src/utils/pr-chunking.js +0 -276
  40. package/src/utils/string-utils.js +0 -28
@@ -1,851 +0,0 @@
- /**
-  * File Processor Module
-  *
-  * This module provides centralized file processing capabilities for embeddings.
-  * It handles batch processing, directory structure generation, and progress tracking.
-  *
-  * Features:
-  * - Batch file processing with progress tracking
-  * - Directory structure generation and embedding
-  * - File filtering and exclusion logic
-  * - Document chunk processing
-  * - Vector index creation
-  * - Comprehensive error handling
-  */
-
- import { createHash } from 'node:crypto';
- import fs from 'node:fs';
- import path from 'node:path';
- import chalk from 'chalk';
- import { isDocumentationFile, shouldProcessFile as utilsShouldProcessFile, batchCheckGitignore } from '../utils/file-validation.js';
- import { detectLanguageFromExtension } from '../utils/language-detection.js';
- import { debug } from '../utils/logging.js';
- import { extractMarkdownChunks } from '../utils/markdown.js';
- import { slugify } from '../utils/string-utils.js';
- import { TABLE_NAMES, LANCEDB_DIR_NAME, FASTEMBED_CACHE_DIR_NAME } from './constants.js';
- import { createFileProcessingError } from './errors.js';
-
- // ============================================================================
- // FILE PROCESSOR CLASS
- // ============================================================================
-
- export class FileProcessor {
-   constructor(options = {}) {
-     this.modelManager = options.modelManager || null;
-     this.databaseManager = options.databaseManager || null;
-     this.cacheManager = options.cacheManager || null;
-
-     // Processing state
-     this.processedFiles = new Map();
-     this.cleaningUp = false;
-     this.progressTracker = {
-       totalFiles: 0,
-       processedCount: 0,
-       skippedCount: 0,
-       failedCount: 0,
-       startTime: 0,
-       reset(total) {
-         this.totalFiles = total;
-         this.processedCount = 0;
-         this.skippedCount = 0;
-         this.failedCount = 0;
-         this.startTime = Date.now();
-       },
-       update(type) {
-         if (type === 'processed') this.processedCount++;
-         if (type === 'skipped') this.skippedCount++;
-         if (type === 'failed') this.failedCount++;
-         // Progress logging is now handled by the spinner in index.js via onProgress callback
-       },
-     };
-
-     // Table names
-     this.fileEmbeddingsTable = TABLE_NAMES.FILE_EMBEDDINGS;
-     this.documentChunkTable = TABLE_NAMES.DOCUMENT_CHUNK;
-   }
-
-   // ============================================================================
-   // PROGRESS TRACKING
-   // ============================================================================
-
-   /**
-    * Get progress tracker
-    * @returns {Object} Progress tracker object
-    */
-   getProgressTracker() {
-     return this.progressTracker;
-   }
-
-   /**
-    * Reset progress tracker
-    * @param {number} totalFiles - Total number of files to process
-    */
-   resetProgressTracker(totalFiles = 0) {
-     this.progressTracker.reset(totalFiles);
-   }
-
-   // ============================================================================
-   // DIRECTORY STRUCTURE PROCESSING
-   // ============================================================================
-
-   /**
-    * Generate directory structure string
-    * @param {Object} options - Options for generating directory structure
-    * @returns {string} Directory structure as a string
-    */
-   generateDirectoryStructure(options = {}) {
-     const { rootDir = process.cwd(), maxDepth = 5, ignorePatterns = [], showFiles = true } = options;
-     debug(`Generating directory structure: rootDir=${rootDir}, maxDepth=${maxDepth}, showFiles=${showFiles}`);
-
-     // Use path.sep for platform compatibility
-     const pathSep = path.sep;
-     // More robust ignore pattern matching (handles directory separators)
-     const shouldIgnore = (relPath) =>
-       ignorePatterns.some((pattern) => {
-         const normalizedPattern = pattern.replace(/\//g, pathSep); // Normalize pattern separators
-         const normalizedPath = relPath.replace(/\//g, pathSep);
-         if (normalizedPattern.startsWith(`**${pathSep}`)) {
-           return normalizedPath.includes(normalizedPattern.slice(3));
-         }
-         return normalizedPath.includes(normalizedPattern);
-       });
-
-     const buildStructure = (dir, depth = 0, prefix = '') => {
-       if (depth > maxDepth) return '';
-       let result = '';
-       try {
-         const entries = fs
-           .readdirSync(dir, { withFileTypes: true })
-           .sort((a, b) => (a.isDirectory() === b.isDirectory() ? a.name.localeCompare(b.name) : a.isDirectory() ? -1 : 1));
-         for (let i = 0; i < entries.length; i++) {
-           const entry = entries[i];
-           const isLast = i === entries.length - 1;
-           const entryPath = path.join(dir, entry.name);
-           const relativePath = path.relative(rootDir, entryPath);
-           // Skip if ignored
-           if (shouldIgnore(relativePath) || entry.name === LANCEDB_DIR_NAME || entry.name === FASTEMBED_CACHE_DIR_NAME) continue; // Also ignore DB/cache dirs
-
-           const connector = isLast ? '└── ' : '├── ';
-           const nextPrefix = isLast ? prefix + '    ' : prefix + '│   ';
-           if (entry.isDirectory()) {
-             result += `${prefix}${connector}${entry.name}/\n`;
-             result += buildStructure(entryPath, depth + 1, nextPrefix);
-           } else if (showFiles) {
-             result += `${prefix}${connector}${entry.name}\n`;
-           }
-         }
-       } catch (error) {
-         console.error(`Error reading directory ${dir}:`, error.message);
-       }
-       return result;
-     };
-     return buildStructure(rootDir);
-   }
-
-   /**
-    * Generate and store an embedding for the project directory structure
-    * @param {Object} options - Options for generating the directory structure
-    * @returns {Promise<boolean>} True if successful, false otherwise
-    */
-   async generateDirectoryStructureEmbedding(options = {}) {
-     console.log(chalk.cyan('[generateDirEmb] Starting...')); // Log entry
-
-     if (!this.modelManager) {
-       throw createFileProcessingError('ModelManager is required for directory structure embedding');
-     }
-
-     if (!this.databaseManager) {
-       throw createFileProcessingError('DatabaseManager is required for directory structure embedding');
-     }
-
-     try {
-       await this.databaseManager.getDB();
-       const table = await this.databaseManager.getTable(this.fileEmbeddingsTable);
-       if (!table) {
-         throw new Error(`[generateDirEmb] Table ${this.fileEmbeddingsTable} not found.`);
-       }
-
-       // Create project-specific structure ID based on the root directory
-       const rootDir = options.rootDir || process.cwd();
-       const projectName = path.basename(path.resolve(rootDir));
-       const structureId = `__project_structure__${projectName}`;
-
-       try {
-         await table.delete(`id = '${structureId}'`);
-         debug('[generateDirEmb] Deleted existing project structure embedding');
-       } catch (error) {
-         if (!error.message.includes('Record not found') && !error.message.includes('cannot find')) {
-           debug(`[generateDirEmb] Error deleting existing project structure: ${error.message}`);
-         } else {
-           debug('[generateDirEmb] No existing project structure to delete.');
-         }
-       }
-
-       const directoryStructure = this.generateDirectoryStructure(options);
-       if (!directoryStructure) throw new Error('[generateDirEmb] Failed to generate directory structure string');
-       debug('[generateDirEmb] Directory structure string generated.');
-
-       // *** Calculate embedding explicitly ***
-       const embedding = await this.modelManager.calculateEmbedding(directoryStructure);
-
-       if (!embedding) {
-         console.error(chalk.red('[generateDirEmb] Failed to calculate embedding for directory structure.'));
-         return false; // Indicate failure
-       }
-       debug(`[generateDirEmb] Embedding calculated, length: ${embedding.length}`);
-
-       const record = {
-         vector: embedding, // Include calculated embedding
-         id: structureId,
-         content: directoryStructure,
-         type: 'directory-structure',
-         name: `${projectName} Project Structure`,
-         path: `${projectName} Project Structure`, // Project-specific path
-         project_path: path.resolve(rootDir), // Add project path for consistency with new schema
-         language: 'text',
-         content_hash: createHash('md5').update(directoryStructure).digest('hex').substring(0, 8),
-         last_modified: new Date().toISOString(), // Use current timestamp for directory structure
-       };
-
-       debug(`[generateDirEmb] Prepared record: ID=${record.id}, Vector length=${record.vector?.length}`);
-       if (record.vector?.length !== this.modelManager.embeddingDimensions) {
-         console.error(chalk.red(`[generateDirEmb] !!! Vector dimension mismatch before add !!!`));
-         return false; // Don't add invalid record
-       }
-
-       // *** Add record with specific try/catch ***
-       debug('[generateDirEmb] Attempting table.add...');
-       try {
-         await table.add([record]);
-         console.log(chalk.green('[generateDirEmb] Successfully added directory structure embedding.'));
-         return true; // Indicate success
-       } catch (addError) {
-         console.error(chalk.red(`[generateDirEmb] !!! Error during table.add: ${addError.message}`), addError.stack);
-         return false; // Indicate failure
-       }
-     } catch (error) {
-       console.error(chalk.red(`[generateDirEmb] Overall error: ${error.message}`), error.stack);
-       return false; // Indicate failure
-     }
-   }
-
-   // ============================================================================
-   // BATCH PROCESSING
-   // ============================================================================
-
-   /**
-    * Process embeddings for multiple files in batch
-    * @param {string[]} filePaths - Array of file paths to process
-    * @param {Object} options - Processing options
-    * @returns {Promise<Object>} Processing results
-    */
-   async processBatchEmbeddings(filePaths, options = {}) {
-     const {
-       excludePatterns = [],
-       respectGitignore = true,
-       baseDir: optionBaseDir = process.cwd(),
-       maxLines = 1000,
-       onProgress, // <<< Add onProgress here
-     } = options;
-     const resolvedCanonicalBaseDir = path.resolve(optionBaseDir);
-     debug(`Resolved canonical base directory: ${resolvedCanonicalBaseDir}`);
-
-     if (!this.modelManager) {
-       throw createFileProcessingError('ModelManager is required for batch processing');
-     }
-
-     if (!this.databaseManager) {
-       throw createFileProcessingError('DatabaseManager is required for batch processing');
-     }
-
-     try {
-       await this.modelManager.initialize(); // Ensure model is ready
-     } catch {
-       console.error(chalk.red('Failed to initialize embedding model. Aborting batch process.'));
-       return { processed: 0, failed: filePaths.length, skipped: 0, excluded: 0, files: [], failedFiles: [...filePaths], excludedFiles: [] };
-     }
-
-     console.log(chalk.blue('Ensuring database tables exist before batch processing...'));
-     try {
-       await this.databaseManager.getDB();
-       console.log(chalk.green('Database table check complete.'));
-     } catch (dbError) {
-       console.error(chalk.red(`Failed to initialize database or tables: ${dbError.message}. Aborting batch process.`));
-       return { processed: 0, failed: filePaths.length, skipped: 0, excluded: 0, files: [], failedFiles: [...filePaths], excludedFiles: [] };
-     }
-
-     const results = { processed: 0, failed: 0, skipped: 0, excluded: 0, files: [], failedFiles: [], excludedFiles: [] };
-     const exclusionOptions = { excludePatterns, respectGitignore, baseDir: resolvedCanonicalBaseDir };
-     this.processedFiles.clear();
-     this.progressTracker.reset(filePaths.length);
-     console.log(chalk.blue(`Starting batch processing of ${filePaths.length} files...`));
-
-     // Generate directory structure embedding first
-     try {
-       await this.generateDirectoryStructureEmbedding({
-         rootDir: resolvedCanonicalBaseDir,
-         maxDepth: 5,
-         ignorePatterns: excludePatterns,
-         showFiles: true,
-       });
-     } catch (structureError) {
-       console.warn(chalk.yellow(`Warning: Failed to generate directory structure embedding: ${structureError.message}`));
-     }
-
-     const fileTable = await this.databaseManager.getTable(this.fileEmbeddingsTable);
-     if (!fileTable) {
-       console.error(chalk.red(`Table ${this.fileEmbeddingsTable} not found. Aborting batch file embedding.`));
-       results.failed = filePaths.length;
-       results.failedFiles = [...filePaths];
-       this.progressTracker.failedCount = filePaths.length;
-       this.progressTracker.update('failed');
-       return results;
-     }
-
-     // Process files in batches
-     console.log(chalk.cyan('--- Starting Phase 1: File Embeddings ---'));
-     const BATCH_SIZE = 50; // Process files in smaller batches for better performance
-
-     for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
-       const batch = filePaths.slice(i, i + BATCH_SIZE);
-       const batchResults = await this._processBatch(batch, resolvedCanonicalBaseDir, exclusionOptions, onProgress, maxLines);
-
-       // Merge results
-       results.processed += batchResults.processed;
-       results.failed += batchResults.failed;
-       results.skipped += batchResults.skipped;
-       results.excluded += batchResults.excluded;
-       results.files.push(...batchResults.files);
-       results.failedFiles.push(...batchResults.failedFiles);
-       results.excludedFiles.push(...batchResults.excludedFiles);
-     }
-
-     // Process document chunks
-     await this._processDocumentChunks(filePaths, resolvedCanonicalBaseDir, excludePatterns);
-
-     console.log(chalk.green(`Batch processing complete!`));
-
-     // Update progress tracker counts for internal tracking
-     this.progressTracker.processedCount = results.processed;
-     this.progressTracker.skippedCount = results.excluded + results.skipped;
-     this.progressTracker.failedCount = results.failed;
-
-     return results;
-   }
-
-   // ============================================================================
-   // PRIVATE METHODS
-   // ============================================================================
-
-   /**
-    * Process a batch of files
-    * @param {string[]} filePaths - File paths to process
-    * @param {string} baseDir - Base directory
-    * @param {Object} exclusionOptions - Exclusion options
-    * @param {Function} onProgress - Progress callback
-    * @returns {Promise<Object>} Batch processing results
-    * @private
-    */
-   async _processBatch(filePaths, baseDir, exclusionOptions, onProgress, maxLines = 1000) {
-     const results = { processed: 0, failed: 0, skipped: 0, excluded: 0, files: [], failedFiles: [], excludedFiles: [] };
-
-     // ============================================================================
-     // PHASE 1: BATCH GITIGNORE CHECK
-     // ============================================================================
-     let gitignoreCache = new Map();
-     if (exclusionOptions.respectGitignore !== false) {
-       console.log(chalk.cyan(`Performing batch gitignore check for ${filePaths.length} files...`));
-       const gitStartTime = Date.now();
-       const absoluteFilePaths = filePaths.map((fp) => (path.isAbsolute(fp) ? path.resolve(fp) : path.resolve(baseDir, fp)));
-       gitignoreCache = await batchCheckGitignore(absoluteFilePaths, baseDir);
-       const gitDuration = ((Date.now() - gitStartTime) / 1000).toFixed(2);
-       console.log(chalk.green(`✓ Batch gitignore check completed in ${gitDuration}s`));
-     }
-
-     // ============================================================================
-     // PHASE 2: GET EXISTING EMBEDDINGS (for early filtering)
-     // ============================================================================
-     // Query existing embeddings BEFORE reading any files
-     const fileTable = await this.databaseManager.getTable(this.fileEmbeddingsTable);
-     let existingFilesMap = new Map();
-
-     try {
-       const existingRecords = await fileTable
-         .query()
-         .where(`project_path = '${baseDir.replace(/'/g, "''")}'`)
-         .toArray();
-
-       for (const record of existingRecords) {
-         if (!existingFilesMap.has(record.path)) {
-           existingFilesMap.set(record.path, []);
-         }
-         existingFilesMap.get(record.path).push(record);
-       }
-
-       console.log(chalk.cyan(`Found ${existingRecords.length} existing embeddings for comparison`));
-     } catch (queryError) {
-       console.warn(chalk.yellow(`Warning: Could not query existing embeddings: ${queryError.message}`));
-     }
-
-     // ============================================================================
-     // PHASE 3: FAST PRE-FILTERING (without reading file contents)
-     // ============================================================================
-     // Filter files based on basic checks and file timestamps before reading content
-     const candidateFiles = [];
-
-     for (const filePath of filePaths) {
-       const absoluteFilePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(baseDir, filePath);
-       const consistentRelativePath = path.relative(baseDir, absoluteFilePath);
-
-       try {
-         // Get file stats (size, mtime, etc.)
-         const stats = fs.statSync(absoluteFilePath);
-
-         // Check if file should be processed (using cached gitignore results)
-         if (
-           !utilsShouldProcessFile(absoluteFilePath, '', {
-             ...exclusionOptions,
-             baseDir: baseDir,
-             relativePathToCheck: consistentRelativePath,
-             gitignoreCache, // Pass the pre-computed cache
-             fileStats: stats, // Pass stats to avoid re-reading
-           })
-         ) {
-           results.excluded++;
-           results.excludedFiles.push(filePath);
-           this.progressTracker.update('skipped');
-           if (typeof onProgress === 'function') onProgress('excluded', filePath);
-           this.processedFiles.set(filePath, 'excluded');
-           continue;
-         }
-
-         // Early skip based on modification time if file exists in database
-         const existingRecords = existingFilesMap.get(consistentRelativePath) || [];
-         let potentiallyUnchanged = false;
-
-         if (existingRecords.length > 0) {
-           // Check if any existing record has the same modification time
-           for (const existing of existingRecords) {
-             const existingMtime = new Date(existing.last_modified).getTime();
-             const currentMtime = stats.mtime.getTime();
-
-             // If modification times match (within 1 second to account for filesystem precision)
-             if (Math.abs(existingMtime - currentMtime) < 1000) {
-               potentiallyUnchanged = true;
-               break;
-             }
-           }
-         }
-
-         candidateFiles.push({
-           filePath: absoluteFilePath,
-           originalInputPath: filePath,
-           relativePath: consistentRelativePath,
-           stats,
-           potentiallyUnchanged,
-           existingRecords,
-         });
-       } catch {
-         results.failed++;
-         results.failedFiles.push(filePath);
-         this.progressTracker.update('failed');
-         if (typeof onProgress === 'function') onProgress('failed', filePath);
-         this.processedFiles.set(filePath, 'failed_stat');
-       }
-     }
-
-     console.log(chalk.cyan(`Pre-filtered to ${candidateFiles.length} candidate files (excluded ${results.excluded})`));
-
-     // ============================================================================
-     // PHASE 4: READ FILES AND CONTENT HASH CHECK
-     // ============================================================================
-     // Now read file contents only for candidates that passed initial filtering
-     const filesToProcess = [];
-     const contentsForBatch = [];
-
-     for (const fileData of candidateFiles) {
-       try {
-         // Read file content
-         let content = await fs.promises.readFile(fileData.filePath, 'utf8');
-
-         // Check if empty
-         if (content.trim().length === 0) {
-           results.skipped++;
-           this.progressTracker.update('skipped');
-           if (typeof onProgress === 'function') onProgress('skipped', fileData.originalInputPath);
-           this.processedFiles.set(fileData.originalInputPath, 'skipped_empty');
-           continue;
-         }
-
-         // Truncate content to maximum specified lines for code files only
-         const isDocFile = isDocumentationFile(fileData.filePath);
-         if (!isDocFile) {
-           const lines = content.split('\n');
-           if (lines.length > maxLines) {
-             content = lines.slice(0, maxLines).join('\n') + '\n... (truncated from ' + lines.length + ' lines)';
-             debug(`Truncated code file ${fileData.relativePath} from ${lines.length} lines to ${maxLines} lines`);
-           }
-         }
-
-         // Add content to file data
-         fileData.content = content;
-         filesToProcess.push(fileData);
-         contentsForBatch.push(content);
-       } catch {
-         results.failed++;
-         results.failedFiles.push(fileData.originalInputPath);
-         this.progressTracker.update('failed');
-         if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath);
-         this.processedFiles.set(fileData.originalInputPath, 'failed_read');
-       }
-     }
-
-     // ============================================================================
-     // PHASE 5: CONTENT HASH CHECK AND DEDUPLICATION
-     // ============================================================================
-     // Check each file against existing embeddings using content hash
-     const filesToActuallyProcess = [];
-     const contentsToActuallyProcess = [];
-     const recordsToDelete = [];
-
-     for (let i = 0; i < filesToProcess.length; i++) {
-       const fileData = filesToProcess[i];
-       const contentHash = createHash('md5').update(fileData.content).digest('hex').substring(0, 8);
-
-       const existingRecords = fileData.existingRecords || [];
-       let needsUpdate = true;
-
-       if (existingRecords.length > 0) {
-         // Check if any existing record matches our current file state
-         for (const existing of existingRecords) {
-           if (existing.content_hash === contentHash) {
-             // File content hasn't changed - skip processing (CI-friendly)
-             // Note: We rely on content_hash rather than last_modified because
-             // GitHub Actions checkout changes file timestamps even for unchanged files
-             needsUpdate = false;
-             results.skipped++;
-             this.progressTracker.update('skipped');
-             if (typeof onProgress === 'function') onProgress('skipped', fileData.originalInputPath);
-             this.processedFiles.set(fileData.originalInputPath, 'skipped_unchanged');
-             debug(`Skipping unchanged file: ${fileData.relativePath} (hash: ${contentHash})`);
-             break;
-           } else if (existing.path === fileData.relativePath) {
-             // Same file path but different content - mark old version for deletion
-             recordsToDelete.push(existing);
-           }
-         }
-       }
-
-       if (needsUpdate) {
-         // File needs processing (new or changed)
-         filesToActuallyProcess.push(fileData);
-         contentsToActuallyProcess.push(fileData.content);
-       }
-     }
-
-     // Batch delete old versions if any
-     if (recordsToDelete.length > 0) {
-       for (const recordToDelete of recordsToDelete) {
-         try {
-           await fileTable.delete(`id = '${recordToDelete.id.replace(/'/g, "''")}'`);
-           debug(`Deleted old version: ${recordToDelete.path} (old hash: ${recordToDelete.content_hash})`);
-         } catch (deleteError) {
-           console.warn(chalk.yellow(`Warning: Could not delete old version of ${recordToDelete.path}: ${deleteError.message}`));
-         }
-       }
-     }
-
-     // Generate embeddings only for files that need processing
-     if (filesToActuallyProcess.length > 0) {
-       console.log(
-         chalk.cyan(
-           `Processing ${filesToActuallyProcess.length} new/changed files (skipped ${filesToProcess.length - filesToActuallyProcess.length} unchanged)`
-         )
-       );
-
-       try {
-         const embeddings = await this.modelManager.calculateEmbeddingBatch(contentsToActuallyProcess);
-         const recordsToAdd = [];
-
-         for (let i = 0; i < embeddings.length; i++) {
-           const fileData = filesToActuallyProcess[i];
-           const embeddingVector = embeddings[i];
-
-           if (embeddingVector) {
-             const contentHash = createHash('md5').update(fileData.content).digest('hex').substring(0, 8);
-             const fileId = `${fileData.relativePath}#${contentHash}`;
-
-             const record = {
-               vector: embeddingVector,
-               id: fileId,
-               content: fileData.content,
-               type: 'file',
-               name: path.basename(fileData.filePath),
-               path: fileData.relativePath,
-               project_path: baseDir,
-               language: detectLanguageFromExtension(path.extname(fileData.filePath)),
-               content_hash: contentHash,
-               last_modified: fileData.stats.mtime.toISOString(),
-             };
-             recordsToAdd.push(record);
-           } else {
-             results.failed++;
-             results.failedFiles.push(fileData.originalInputPath);
-             this.progressTracker.update('failed');
-             if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath);
-             this.processedFiles.set(fileData.originalInputPath, 'failed_embedding');
-           }
-         }
-
-         // Add new/updated records to database
-         if (recordsToAdd.length > 0) {
-           await fileTable.add(recordsToAdd);
-
-           // Optimize table to sync indices with data and prevent TakeExec panics
-           try {
-             await fileTable.optimize();
-           } catch (optimizeError) {
-             if (optimizeError.message && optimizeError.message.includes('legacy format')) {
-               console.log(
-                 chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`)
-               );
-             } else {
-               console.warn(
-                 chalk.yellow(`Warning: Failed to optimize file embeddings table after adding records: ${optimizeError.message}`)
-               );
-             }
-           }
-
-           recordsToAdd.forEach((record, index) => {
-             const fileData = filesToActuallyProcess[index];
-             if (embeddings[index]) {
-               results.processed++;
-               results.files.push(fileData.originalInputPath);
-               this.progressTracker.update('processed');
-               if (typeof onProgress === 'function') onProgress('processed', fileData.originalInputPath);
-               this.processedFiles.set(fileData.originalInputPath, 'processed');
-             }
-           });
-         }
-       } catch (error) {
-         console.error(chalk.red(`Error processing batch: ${error.message}`));
-         filesToProcess.forEach((fileData) => {
-           results.failed++;
-           results.failedFiles.push(fileData.originalInputPath);
-           this.progressTracker.update('failed');
-           if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath);
-           this.processedFiles.set(fileData.originalInputPath, 'failed_batch');
-         });
-       }
-     }
-
-     return results;
-   }
-
-   /**
-    * Process document chunks
-    * @param {string[]} filePaths - File paths to process
-    * @param {string} baseDir - Base directory
-    * @param {string[]} excludePatterns - Exclude patterns
-    * @returns {Promise<void>}
-    * @private
-    */
-   async _processDocumentChunks(filePaths, baseDir) {
-     console.log(chalk.cyan('--- Starting Phase 2: Document Chunk Embeddings ---'));
-     const documentChunkTable = await this.databaseManager.getTable(this.documentChunkTable);
-     if (!documentChunkTable) {
-       console.warn(chalk.yellow(`Skipping Phase 2: Document Chunk Embeddings because table ${this.documentChunkTable} was not found.`));
-       return;
-     }
-
-     // Efficient batch check: Get all existing document chunks for this project
-     let existingDocChunksMap = new Map();
-     try {
-       const existingChunks = await documentChunkTable
-         .query()
-         .where(`project_path = '${baseDir.replace(/'/g, "''")}'`)
-         .toArray();
-
-       // Build a map for fast lookup: original_document_path -> [chunks]
-       for (const chunk of existingChunks) {
-         if (!existingDocChunksMap.has(chunk.original_document_path)) {
-           existingDocChunksMap.set(chunk.original_document_path, []);
-         }
-         existingDocChunksMap.get(chunk.original_document_path).push(chunk);
-       }
-
-       console.log(chalk.cyan(`Found ${existingChunks.length} existing document chunks for comparison`));
-     } catch (queryError) {
-       console.warn(chalk.yellow(`Warning: Could not query existing document chunks, will process all docs: ${queryError.message}`));
-       existingDocChunksMap = new Map();
-     }
-
-     const allDocChunksToEmbed = [];
-     const allDocChunkRecordsToAdd = [];
-     const processedDocPathsForDeletion = new Set();
-     let skippedDocCount = 0;
-
-     for (const filePath of filePaths) {
-       const absoluteFilePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(baseDir, filePath);
-       const consistentRelativePath = path.relative(baseDir, absoluteFilePath);
-       const language = detectLanguageFromExtension(path.extname(absoluteFilePath));
-
-       if (isDocumentationFile(absoluteFilePath, language)) {
-         try {
-           const stats = fs.statSync(absoluteFilePath);
-           if (stats.size > 5 * 1024 * 1024) {
-             // 5MB limit for docs
-             continue;
-           }
-
-           const content = await fs.promises.readFile(absoluteFilePath, 'utf8');
-           if (content.trim().length === 0) {
-             continue;
-           }
-
-           // Check if document has changed by comparing chunk content hashes
-           const existingChunks = existingDocChunksMap.get(consistentRelativePath) || [];
-
-           // Extract chunks to compare with existing ones
-           const { chunks: currentChunks, documentH1 } = extractMarkdownChunks(absoluteFilePath, content, consistentRelativePath);
-           let hasUnchangedDocument = false;
-
-           if (existingChunks.length > 0 && currentChunks.length === existingChunks.length) {
-             // Create a signature of the document by combining all chunk content hashes
-             const currentChunkHashes = currentChunks
-               .map((chunk) => createHash('md5').update(chunk.content).digest('hex').substring(0, 8))
-               .sort()
-               .join('|');
-
-             const existingChunkHashes = existingChunks
-               .map((chunk) => chunk.content_hash)
-               .sort()
-               .join('|');
-
-             hasUnchangedDocument = currentChunkHashes === existingChunkHashes;
-           }
-
-           if (hasUnchangedDocument) {
-             // Document hasn't changed - skip processing
-             skippedDocCount++;
-             debug(`Skipping unchanged document: ${consistentRelativePath} (${currentChunks.length} chunks match)`);
-             continue;
-           }
-
-           // Document has changed or is new - process it
-           if (!processedDocPathsForDeletion.has(consistentRelativePath)) {
-             processedDocPathsForDeletion.add(consistentRelativePath);
-           }
-
-           if (currentChunks.length > 0) {
-             currentChunks.forEach((chunk) => {
-               const chunkWithTitle = {
-                 ...chunk,
-                 documentTitle: documentH1 || path.basename(absoluteFilePath, path.extname(absoluteFilePath)),
-                 fileStats: stats,
-               };
-               allDocChunksToEmbed.push(chunkWithTitle);
-             });
-           }
-         } catch (docError) {
-           console.warn(chalk.yellow(`Error processing document ${consistentRelativePath} for chunking: ${docError.message}`));
-         }
-       }
-     }
-
-     if (skippedDocCount > 0) {
-       console.log(chalk.cyan(`Skipped ${skippedDocCount} unchanged documentation files`));
-     }
-
-     if (allDocChunksToEmbed.length > 0) {
-       console.log(chalk.blue(`Extracted ${allDocChunksToEmbed.length} total document chunks to process for embeddings.`));
-       const chunkContentsForBatching = allDocChunksToEmbed.map((chunk) => chunk.content);
-       const chunkEmbeddings = await this.modelManager.calculateEmbeddingBatch(chunkContentsForBatching);
-
-       for (let i = 0; i < chunkEmbeddings.length; i++) {
-         const chunkData = allDocChunksToEmbed[i];
-         const chunkEmbeddingVector = chunkEmbeddings[i];
-
-         if (chunkEmbeddingVector) {
-           const chunkContentHash = createHash('md5').update(chunkData.content).digest('hex').substring(0, 8);
-           const chunkId = `${chunkData.original_document_path}#${slugify(chunkData.heading || 'section')}_${chunkData.start_line_in_doc}`;
-
-           const record = {
-             id: chunkId,
-             content: chunkData.content,
-             original_document_path: chunkData.original_document_path,
-             project_path: baseDir,
-             heading_text: chunkData.heading || '',
-             document_title: chunkData.documentTitle,
-             language: chunkData.language || 'markdown',
-             vector: chunkEmbeddingVector,
-             content_hash: chunkContentHash,
-             last_modified: chunkData.fileStats ? chunkData.fileStats.mtime.toISOString() : new Date().toISOString(),
-           };
-           allDocChunkRecordsToAdd.push(record);
-         }
-       }
-     }
-
-     // Delete old chunks and add new ones
-     if (processedDocPathsForDeletion.size > 0) {
-       for (const docPathToDelete of processedDocPathsForDeletion) {
-         try {
-           await documentChunkTable.delete(`original_document_path = '${docPathToDelete.replace(/'/g, "''")}'`);
-         } catch (deleteError) {
-           console.warn(chalk.yellow(`Error deleting chunks for document ${docPathToDelete}: ${deleteError.message}`));
-         }
-       }
-     }
-
-     if (allDocChunkRecordsToAdd.length > 0) {
-       try {
-         await documentChunkTable.add(allDocChunkRecordsToAdd);
-
-         // Optimize table to sync indices with data and prevent TakeExec panics
-         try {
-           await documentChunkTable.optimize();
-         } catch (optimizeError) {
-           if (optimizeError.message && optimizeError.message.includes('legacy format')) {
-             console.log(chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`));
-           } else {
-             console.warn(chalk.yellow(`Warning: Failed to optimize document chunk table after adding records: ${optimizeError.message}`));
-           }
-         }
-
-         console.log(
-           chalk.green(`Successfully added ${allDocChunkRecordsToAdd.length} document chunk embeddings to ${this.documentChunkTable}.`)
-         );
-       } catch (addError) {
-         console.error(chalk.red(`Error batch adding document chunk embeddings to DB: ${addError.message}`), addError.stack);
-       }
-     }
-
-     console.log(chalk.green('--- Finished Phase 2: Document Chunk Embeddings ---'));
-   }
-
-   // ============================================================================
-   // CLEANUP
-   // ============================================================================
-
-   /**
-    * Clean up file processor resources
-    */
-   async cleanup() {
-     if (this.cleaningUp) {
-       return; // Already cleaning up, prevent duplicate calls
-     }
-
-     this.cleaningUp = true;
-
-     try {
-       this.processedFiles.clear();
-       this.progressTracker.reset(0);
-       console.log(chalk.green('[FileProcessor] Resources cleaned up.'));
-     } catch (error) {
-       console.error(chalk.red(`[FileProcessor] Error during cleanup: ${error.message}`));
-     } finally {
-       this.cleaningUp = false;
-     }
-   }
- }