codecritique 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1145 -0
- package/package.json +98 -0
- package/src/content-retrieval.js +747 -0
- package/src/custom-documents.js +597 -0
- package/src/embeddings/cache-manager.js +364 -0
- package/src/embeddings/constants.js +40 -0
- package/src/embeddings/database.js +921 -0
- package/src/embeddings/errors.js +208 -0
- package/src/embeddings/factory.js +447 -0
- package/src/embeddings/file-processor.js +851 -0
- package/src/embeddings/model-manager.js +337 -0
- package/src/embeddings/similarity-calculator.js +97 -0
- package/src/embeddings/types.js +113 -0
- package/src/feedback-loader.js +384 -0
- package/src/index.js +1418 -0
- package/src/llm.js +123 -0
- package/src/pr-history/analyzer.js +579 -0
- package/src/pr-history/bot-detector.js +123 -0
- package/src/pr-history/cli-utils.js +204 -0
- package/src/pr-history/comment-processor.js +549 -0
- package/src/pr-history/database.js +819 -0
- package/src/pr-history/github-client.js +629 -0
- package/src/project-analyzer.js +955 -0
- package/src/rag-analyzer.js +2764 -0
- package/src/rag-review.js +566 -0
- package/src/technology-keywords.json +753 -0
- package/src/utils/command.js +48 -0
- package/src/utils/constants.js +263 -0
- package/src/utils/context-inference.js +364 -0
- package/src/utils/document-detection.js +105 -0
- package/src/utils/file-validation.js +271 -0
- package/src/utils/git.js +232 -0
- package/src/utils/language-detection.js +170 -0
- package/src/utils/logging.js +24 -0
- package/src/utils/markdown.js +132 -0
- package/src/utils/mobilebert-tokenizer.js +141 -0
- package/src/utils/pr-chunking.js +276 -0
- package/src/utils/string-utils.js +28 -0
- package/src/zero-shot-classifier-open.js +392 -0
package/src/embeddings/file-processor.js
@@ -0,0 +1,851 @@
/**
 * File Processor Module
 *
 * This module provides centralized file processing capabilities for embeddings.
 * It handles batch processing, directory structure generation, and progress tracking.
 *
 * Features:
 * - Batch file processing with progress tracking
 * - Directory structure generation and embedding
 * - File filtering and exclusion logic
 * - Document chunk processing
 * - Vector index creation
 * - Comprehensive error handling
 */

import { createHash } from 'node:crypto';
import fs from 'node:fs';
import path from 'node:path';
import chalk from 'chalk';
import { isDocumentationFile, shouldProcessFile as utilsShouldProcessFile, batchCheckGitignore } from '../utils/file-validation.js';
import { detectLanguageFromExtension } from '../utils/language-detection.js';
import { debug } from '../utils/logging.js';
import { extractMarkdownChunks } from '../utils/markdown.js';
import { slugify } from '../utils/string-utils.js';
import { TABLE_NAMES, LANCEDB_DIR_NAME, FASTEMBED_CACHE_DIR_NAME } from './constants.js';
import { createFileProcessingError } from './errors.js';

// ============================================================================
// FILE PROCESSOR CLASS
// ============================================================================

export class FileProcessor {
  constructor(options = {}) {
    this.modelManager = options.modelManager || null;
    this.databaseManager = options.databaseManager || null;
    this.cacheManager = options.cacheManager || null;

    // Processing state
    this.processedFiles = new Map();
    this.cleaningUp = false;
    this.progressTracker = {
      totalFiles: 0,
      processedCount: 0,
      skippedCount: 0,
      failedCount: 0,
      startTime: 0,
      reset(total) {
        this.totalFiles = total;
        this.processedCount = 0;
        this.skippedCount = 0;
        this.failedCount = 0;
        this.startTime = Date.now();
      },
      update(type) {
        if (type === 'processed') this.processedCount++;
        if (type === 'skipped') this.skippedCount++;
        if (type === 'failed') this.failedCount++;
        // Progress logging is now handled by the spinner in index.js via onProgress callback
      },
    };

    // Table names
    this.fileEmbeddingsTable = TABLE_NAMES.FILE_EMBEDDINGS;
    this.documentChunkTable = TABLE_NAMES.DOCUMENT_CHUNK;
  }

  // ============================================================================
  // PROGRESS TRACKING
  // ============================================================================

  /**
   * Get progress tracker
   * @returns {Object} Progress tracker object
   */
  getProgressTracker() {
    return this.progressTracker;
  }

  /**
   * Reset progress tracker
   * @param {number} totalFiles - Total number of files to process
   */
  resetProgressTracker(totalFiles = 0) {
    this.progressTracker.reset(totalFiles);
  }

  // ============================================================================
  // DIRECTORY STRUCTURE PROCESSING
  // ============================================================================

  /**
   * Generate directory structure string
   * @param {Object} options - Options for generating directory structure
   * @returns {string} Directory structure as a string
   */
  generateDirectoryStructure(options = {}) {
    const { rootDir = process.cwd(), maxDepth = 5, ignorePatterns = [], showFiles = true } = options;
    debug(`Generating directory structure: rootDir=${rootDir}, maxDepth=${maxDepth}, showFiles=${showFiles}`);

    // Use path.sep for platform compatibility
    const pathSep = path.sep;
    // More robust ignore pattern matching (handles directory separators)
    const shouldIgnore = (relPath) =>
      ignorePatterns.some((pattern) => {
        const normalizedPattern = pattern.replace(/\//g, pathSep); // Normalize pattern separators
        const normalizedPath = relPath.replace(/\//g, pathSep);
        if (normalizedPattern.startsWith(`**${pathSep}`)) {
          return normalizedPath.includes(normalizedPattern.slice(3));
        }
        return normalizedPath.includes(normalizedPattern);
      });

    const buildStructure = (dir, depth = 0, prefix = '') => {
      if (depth > maxDepth) return '';
      let result = '';
      try {
        const entries = fs
          .readdirSync(dir, { withFileTypes: true })
          .sort((a, b) => (a.isDirectory() === b.isDirectory() ? a.name.localeCompare(b.name) : a.isDirectory() ? -1 : 1));
        for (let i = 0; i < entries.length; i++) {
          const entry = entries[i];
          const isLast = i === entries.length - 1;
          const entryPath = path.join(dir, entry.name);
          const relativePath = path.relative(rootDir, entryPath);
          // Skip if ignored
          if (shouldIgnore(relativePath) || entry.name === LANCEDB_DIR_NAME || entry.name === FASTEMBED_CACHE_DIR_NAME) continue; // Also ignore DB/cache dirs

          const connector = isLast ? '└── ' : '├── ';
          const nextPrefix = isLast ? prefix + '    ' : prefix + '│   ';
          if (entry.isDirectory()) {
            result += `${prefix}${connector}${entry.name}/\n`;
            result += buildStructure(entryPath, depth + 1, nextPrefix);
          } else if (showFiles) {
            result += `${prefix}${connector}${entry.name}\n`;
          }
        }
      } catch (error) {
        console.error(`Error reading directory ${dir}:`, error.message);
      }
      return result;
    };
    return buildStructure(rootDir);
  }
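
  // Example: with ignorePatterns = ['**/dist', 'coverage'], the shouldIgnore
  // matcher above skips 'packages/app/dist/index.js' (a leading '**/' reduces
  // to a substring test on the remainder) as well as anything whose relative
  // path merely contains 'coverage'. Surviving entries are rendered with the
  // connectors built above, e.g.:
  //   ├── src/
  //   │   ├── index.js
  //   │   └── utils/
  //   │       └── git.js
  //   └── package.json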

  /**
   * Generate and store an embedding for the project directory structure
   * @param {Object} options - Options for generating the directory structure
   * @returns {Promise<boolean>} True if successful, false otherwise
   */
  async generateDirectoryStructureEmbedding(options = {}) {
    console.log(chalk.cyan('[generateDirEmb] Starting...')); // Log entry

    if (!this.modelManager) {
      throw createFileProcessingError('ModelManager is required for directory structure embedding');
    }

    if (!this.databaseManager) {
      throw createFileProcessingError('DatabaseManager is required for directory structure embedding');
    }

    try {
      await this.databaseManager.getDB();
      const table = await this.databaseManager.getTable(this.fileEmbeddingsTable);
      if (!table) {
        throw new Error(`[generateDirEmb] Table ${this.fileEmbeddingsTable} not found.`);
      }

      // Create project-specific structure ID based on the root directory
      const rootDir = options.rootDir || process.cwd();
      const projectName = path.basename(path.resolve(rootDir));
      const structureId = `__project_structure__${projectName}`;

      try {
        await table.delete(`id = '${structureId}'`);
        debug('[generateDirEmb] Deleted existing project structure embedding');
      } catch (error) {
        if (!error.message.includes('Record not found') && !error.message.includes('cannot find')) {
          debug(`[generateDirEmb] Error deleting existing project structure: ${error.message}`);
        } else {
          debug('[generateDirEmb] No existing project structure to delete.');
        }
      }

      const directoryStructure = this.generateDirectoryStructure(options);
      if (!directoryStructure) throw new Error('[generateDirEmb] Failed to generate directory structure string');
      debug('[generateDirEmb] Directory structure string generated.');

      // *** Calculate embedding explicitly ***
      const embedding = await this.modelManager.calculateEmbedding(directoryStructure);

      if (!embedding) {
        console.error(chalk.red('[generateDirEmb] Failed to calculate embedding for directory structure.'));
        return false; // Indicate failure
      }
      debug(`[generateDirEmb] Embedding calculated, length: ${embedding.length}`);

      const record = {
        vector: embedding, // Include calculated embedding
        id: structureId,
        content: directoryStructure,
        type: 'directory-structure',
        name: `${projectName} Project Structure`,
        path: `${projectName} Project Structure`, // Project-specific path
        project_path: path.resolve(rootDir), // Add project path for consistency with new schema
        language: 'text',
        content_hash: createHash('md5').update(directoryStructure).digest('hex').substring(0, 8),
        last_modified: new Date().toISOString(), // Use current timestamp for directory structure
      };

      debug(`[generateDirEmb] Prepared record: ID=${record.id}, Vector length=${record.vector?.length}`);
      if (record.vector?.length !== this.modelManager.embeddingDimensions) {
        console.error(chalk.red(`[generateDirEmb] !!! Vector dimension mismatch before add !!!`));
        return false; // Don't add invalid record
      }

      // *** Add record with specific try/catch ***
      debug('[generateDirEmb] Attempting table.add...');
      try {
        await table.add([record]);
        console.log(chalk.green('[generateDirEmb] Successfully added directory structure embedding.'));
        return true; // Indicate success
      } catch (addError) {
        console.error(chalk.red(`[generateDirEmb] !!! Error during table.add: ${addError.message}`), addError.stack);
        return false; // Indicate failure
      }
    } catch (error) {
      console.error(chalk.red(`[generateDirEmb] Overall error: ${error.message}`), error.stack);
      return false; // Indicate failure
    }
  }

  // ============================================================================
  // BATCH PROCESSING
  // ============================================================================

  /**
   * Process embeddings for multiple files in batch
   * @param {string[]} filePaths - Array of file paths to process
   * @param {Object} options - Processing options
   * @returns {Promise<Object>} Processing results
   */
  async processBatchEmbeddings(filePaths, options = {}) {
    const {
      excludePatterns = [],
      respectGitignore = true,
      baseDir: optionBaseDir = process.cwd(),
      maxLines = 1000,
      onProgress, // per-file progress callback: (status, filePath) => void
    } = options;
    const resolvedCanonicalBaseDir = path.resolve(optionBaseDir);
    debug(`Resolved canonical base directory: ${resolvedCanonicalBaseDir}`);

    if (!this.modelManager) {
      throw createFileProcessingError('ModelManager is required for batch processing');
    }

    if (!this.databaseManager) {
      throw createFileProcessingError('DatabaseManager is required for batch processing');
    }

    try {
      await this.modelManager.initialize(); // Ensure model is ready
    } catch {
      console.error(chalk.red('Failed to initialize embedding model. Aborting batch process.'));
      return { processed: 0, failed: filePaths.length, skipped: 0, excluded: 0, files: [], failedFiles: [...filePaths], excludedFiles: [] };
    }

    console.log(chalk.blue('Ensuring database tables exist before batch processing...'));
    try {
      await this.databaseManager.getDB();
      console.log(chalk.green('Database table check complete.'));
    } catch (dbError) {
      console.error(chalk.red(`Failed to initialize database or tables: ${dbError.message}. Aborting batch process.`));
      return { processed: 0, failed: filePaths.length, skipped: 0, excluded: 0, files: [], failedFiles: [...filePaths], excludedFiles: [] };
    }

    const results = { processed: 0, failed: 0, skipped: 0, excluded: 0, files: [], failedFiles: [], excludedFiles: [] };
    const exclusionOptions = { excludePatterns, respectGitignore, baseDir: resolvedCanonicalBaseDir };
    this.processedFiles.clear();
    this.progressTracker.reset(filePaths.length);
    console.log(chalk.blue(`Starting batch processing of ${filePaths.length} files...`));

    // Generate directory structure embedding first
    try {
      await this.generateDirectoryStructureEmbedding({
        rootDir: resolvedCanonicalBaseDir,
        maxDepth: 5,
        ignorePatterns: excludePatterns,
        showFiles: true,
      });
    } catch (structureError) {
      console.warn(chalk.yellow(`Warning: Failed to generate directory structure embedding: ${structureError.message}`));
    }

    const fileTable = await this.databaseManager.getTable(this.fileEmbeddingsTable);
    if (!fileTable) {
      console.error(chalk.red(`Table ${this.fileEmbeddingsTable} not found. Aborting batch file embedding.`));
      results.failed = filePaths.length;
      results.failedFiles = [...filePaths];
      // Set the count directly; calling update('failed') here as well would double-count
      this.progressTracker.failedCount = filePaths.length;
      return results;
    }

    // Process files in batches
    console.log(chalk.cyan('--- Starting Phase 1: File Embeddings ---'));
    const BATCH_SIZE = 50; // Process files in smaller batches for better performance

    for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
      const batch = filePaths.slice(i, i + BATCH_SIZE);
      const batchResults = await this._processBatch(batch, resolvedCanonicalBaseDir, exclusionOptions, onProgress, maxLines);

      // Merge results
      results.processed += batchResults.processed;
      results.failed += batchResults.failed;
      results.skipped += batchResults.skipped;
      results.excluded += batchResults.excluded;
      results.files.push(...batchResults.files);
      results.failedFiles.push(...batchResults.failedFiles);
      results.excludedFiles.push(...batchResults.excludedFiles);
    }

    // Process document chunks
    await this._processDocumentChunks(filePaths, resolvedCanonicalBaseDir);

    console.log(chalk.green(`Batch processing complete!`));

    // Update progress tracker counts for internal tracking
    this.progressTracker.processedCount = results.processed;
    this.progressTracker.skippedCount = results.excluded + results.skipped;
    this.progressTracker.failedCount = results.failed;

    return results;
  }
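
  // Usage sketch (assumes modelManager/databaseManager were built elsewhere,
  // e.g. by this package's embeddings factory):
  //
  //   const processor = new FileProcessor({ modelManager, databaseManager });
  //   const results = await processor.processBatchEmbeddings(files, {
  //     baseDir: repoRoot,
  //     maxLines: 1000,
  //     onProgress: (status, file) => {}, // 'processed' | 'skipped' | 'excluded' | 'failed'
  //   });
  //   // results: { processed, failed, skipped, excluded, files, failedFiles, excludedFiles }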

  // ============================================================================
  // PRIVATE METHODS
  // ============================================================================

  /**
   * Process a batch of files
   * @param {string[]} filePaths - File paths to process
   * @param {string} baseDir - Base directory
   * @param {Object} exclusionOptions - Exclusion options
   * @param {Function} onProgress - Progress callback
   * @param {number} maxLines - Maximum number of lines to embed for code files (default 1000)
   * @returns {Promise<Object>} Batch processing results
   * @private
   */
  async _processBatch(filePaths, baseDir, exclusionOptions, onProgress, maxLines = 1000) {
    const results = { processed: 0, failed: 0, skipped: 0, excluded: 0, files: [], failedFiles: [], excludedFiles: [] };

    // ============================================================================
    // PHASE 1: BATCH GITIGNORE CHECK
    // ============================================================================
    let gitignoreCache = new Map();
    if (exclusionOptions.respectGitignore !== false) {
      console.log(chalk.cyan(`Performing batch gitignore check for ${filePaths.length} files...`));
      const gitStartTime = Date.now();
      const absoluteFilePaths = filePaths.map((fp) => (path.isAbsolute(fp) ? path.resolve(fp) : path.resolve(baseDir, fp)));
      gitignoreCache = await batchCheckGitignore(absoluteFilePaths, baseDir);
      const gitDuration = ((Date.now() - gitStartTime) / 1000).toFixed(2);
      console.log(chalk.green(`✓ Batch gitignore check completed in ${gitDuration}s`));
    }

    // ============================================================================
    // PHASE 2: GET EXISTING EMBEDDINGS (for early filtering)
    // ============================================================================
    // Query existing embeddings BEFORE reading any files
    const fileTable = await this.databaseManager.getTable(this.fileEmbeddingsTable);
    let existingFilesMap = new Map();

    try {
      const existingRecords = await fileTable
        .query()
        .where(`project_path = '${baseDir.replace(/'/g, "''")}'`)
        .toArray();

      for (const record of existingRecords) {
        if (!existingFilesMap.has(record.path)) {
          existingFilesMap.set(record.path, []);
        }
        existingFilesMap.get(record.path).push(record);
      }

      console.log(chalk.cyan(`Found ${existingRecords.length} existing embeddings for comparison`));
    } catch (queryError) {
      console.warn(chalk.yellow(`Warning: Could not query existing embeddings: ${queryError.message}`));
    }

    // ============================================================================
    // PHASE 3: FAST PRE-FILTERING (without reading file contents)
    // ============================================================================
    // Filter files based on basic checks and file timestamps before reading content
    const candidateFiles = [];

    for (const filePath of filePaths) {
      const absoluteFilePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(baseDir, filePath);
      const consistentRelativePath = path.relative(baseDir, absoluteFilePath);

      try {
        // Get file stats (size, mtime, etc.)
        const stats = fs.statSync(absoluteFilePath);

        // Check if file should be processed (using cached gitignore results)
        if (
          !utilsShouldProcessFile(absoluteFilePath, '', {
            ...exclusionOptions,
            baseDir: baseDir,
            relativePathToCheck: consistentRelativePath,
            gitignoreCache, // Pass the pre-computed cache
            fileStats: stats, // Pass stats to avoid re-reading
          })
        ) {
          results.excluded++;
          results.excludedFiles.push(filePath);
          this.progressTracker.update('skipped');
          if (typeof onProgress === 'function') onProgress('excluded', filePath);
          this.processedFiles.set(filePath, 'excluded');
          continue;
        }

        // Early skip based on modification time if file exists in database
        const existingRecords = existingFilesMap.get(consistentRelativePath) || [];
        let potentiallyUnchanged = false;

        if (existingRecords.length > 0) {
          // Check if any existing record has the same modification time
          for (const existing of existingRecords) {
            const existingMtime = new Date(existing.last_modified).getTime();
            const currentMtime = stats.mtime.getTime();

            // If modification times match (within 1 second to account for filesystem precision)
            if (Math.abs(existingMtime - currentMtime) < 1000) {
              potentiallyUnchanged = true;
              break;
            }
          }
        }

        candidateFiles.push({
          filePath: absoluteFilePath,
          originalInputPath: filePath,
          relativePath: consistentRelativePath,
          stats,
          potentiallyUnchanged,
          existingRecords,
        });
      } catch {
        results.failed++;
        results.failedFiles.push(filePath);
        this.progressTracker.update('failed');
        if (typeof onProgress === 'function') onProgress('failed', filePath);
        this.processedFiles.set(filePath, 'failed_stat');
      }
    }

    console.log(chalk.cyan(`Pre-filtered to ${candidateFiles.length} candidate files (excluded ${results.excluded})`));

    // ============================================================================
    // PHASE 4: READ FILES AND CONTENT HASH CHECK
    // ============================================================================
    // Now read file contents only for candidates that passed initial filtering
    const filesToProcess = [];
    const contentsForBatch = [];

    for (const fileData of candidateFiles) {
      try {
        // Read file content
        let content = await fs.promises.readFile(fileData.filePath, 'utf8');

        // Check if empty
        if (content.trim().length === 0) {
          results.skipped++;
          this.progressTracker.update('skipped');
          if (typeof onProgress === 'function') onProgress('skipped', fileData.originalInputPath);
          this.processedFiles.set(fileData.originalInputPath, 'skipped_empty');
          continue;
        }

        // Truncate content to maximum specified lines for code files only
        const isDocFile = isDocumentationFile(fileData.filePath);
        if (!isDocFile) {
          const lines = content.split('\n');
          if (lines.length > maxLines) {
            content = lines.slice(0, maxLines).join('\n') + '\n... (truncated from ' + lines.length + ' lines)';
            debug(`Truncated code file ${fileData.relativePath} from ${lines.length} lines to ${maxLines} lines`);
          }
        }

        // Add content to file data
        fileData.content = content;
        filesToProcess.push(fileData);
        contentsForBatch.push(content);
      } catch {
        results.failed++;
        results.failedFiles.push(fileData.originalInputPath);
        this.progressTracker.update('failed');
        if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath);
        this.processedFiles.set(fileData.originalInputPath, 'failed_read');
      }
    }

    // ============================================================================
    // PHASE 5: CONTENT HASH CHECK AND DEDUPLICATION
    // ============================================================================
    // Check each file against existing embeddings using content hash
    const filesToActuallyProcess = [];
    const contentsToActuallyProcess = [];
    const recordsToDelete = [];

    for (let i = 0; i < filesToProcess.length; i++) {
      const fileData = filesToProcess[i];
      const contentHash = createHash('md5').update(fileData.content).digest('hex').substring(0, 8);

      const existingRecords = fileData.existingRecords || [];
      let needsUpdate = true;

      if (existingRecords.length > 0) {
        // Check if any existing record matches our current file state
        for (const existing of existingRecords) {
          if (existing.content_hash === contentHash) {
            // File content hasn't changed - skip processing (CI-friendly)
            // Note: We rely on content_hash rather than last_modified because
            // GitHub Actions checkout changes file timestamps even for unchanged files
            needsUpdate = false;
            results.skipped++;
            this.progressTracker.update('skipped');
            if (typeof onProgress === 'function') onProgress('skipped', fileData.originalInputPath);
            this.processedFiles.set(fileData.originalInputPath, 'skipped_unchanged');
            debug(`Skipping unchanged file: ${fileData.relativePath} (hash: ${contentHash})`);
            break;
          } else if (existing.path === fileData.relativePath) {
            // Same file path but different content - mark old version for deletion
            recordsToDelete.push(existing);
          }
        }
      }

      if (needsUpdate) {
        // File needs processing (new or changed)
        filesToActuallyProcess.push(fileData);
        contentsToActuallyProcess.push(fileData.content);
      }
    }

    // Batch delete old versions if any
    if (recordsToDelete.length > 0) {
      for (const recordToDelete of recordsToDelete) {
        try {
          await fileTable.delete(`id = '${recordToDelete.id.replace(/'/g, "''")}'`);
          debug(`Deleted old version: ${recordToDelete.path} (old hash: ${recordToDelete.content_hash})`);
        } catch (deleteError) {
          console.warn(chalk.yellow(`Warning: Could not delete old version of ${recordToDelete.path}: ${deleteError.message}`));
        }
      }
    }

    // Generate embeddings only for files that need processing
    if (filesToActuallyProcess.length > 0) {
      console.log(
        chalk.cyan(
          `Processing ${filesToActuallyProcess.length} new/changed files (skipped ${filesToProcess.length - filesToActuallyProcess.length} unchanged)`
        )
      );

      try {
        const embeddings = await this.modelManager.calculateEmbeddingBatch(contentsToActuallyProcess);
        const recordsToAdd = [];
        const recordFileData = []; // fileData aligned index-for-index with recordsToAdd

        for (let i = 0; i < embeddings.length; i++) {
          const fileData = filesToActuallyProcess[i];
          const embeddingVector = embeddings[i];

          if (embeddingVector) {
            const contentHash = createHash('md5').update(fileData.content).digest('hex').substring(0, 8);
            const fileId = `${fileData.relativePath}#${contentHash}`;

            const record = {
              vector: embeddingVector,
              id: fileId,
              content: fileData.content,
              type: 'file',
              name: path.basename(fileData.filePath),
              path: fileData.relativePath,
              project_path: baseDir,
              language: detectLanguageFromExtension(path.extname(fileData.filePath)),
              content_hash: contentHash,
              last_modified: fileData.stats.mtime.toISOString(),
            };
            recordsToAdd.push(record);
            recordFileData.push(fileData);
          } else {
            results.failed++;
            results.failedFiles.push(fileData.originalInputPath);
            this.progressTracker.update('failed');
            if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath);
            this.processedFiles.set(fileData.originalInputPath, 'failed_embedding');
          }
        }

        // Add new/updated records to database
        if (recordsToAdd.length > 0) {
          await fileTable.add(recordsToAdd);

          // Optimize table to sync indices with data and prevent TakeExec panics
          try {
            await fileTable.optimize();
          } catch (optimizeError) {
            if (optimizeError.message && optimizeError.message.includes('legacy format')) {
              console.log(
                chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`)
              );
            } else {
              console.warn(
                chalk.yellow(`Warning: Failed to optimize file embeddings table after adding records: ${optimizeError.message}`)
              );
            }
          }

          // recordFileData keeps the bookkeeping aligned with recordsToAdd even
          // when some embeddings in the batch failed
          recordsToAdd.forEach((record, index) => {
            const fileData = recordFileData[index];
            results.processed++;
            results.files.push(fileData.originalInputPath);
            this.progressTracker.update('processed');
            if (typeof onProgress === 'function') onProgress('processed', fileData.originalInputPath);
            this.processedFiles.set(fileData.originalInputPath, 'processed');
          });
        }
      } catch (error) {
        console.error(chalk.red(`Error processing batch: ${error.message}`));
        filesToProcess.forEach((fileData) => {
          results.failed++;
          results.failedFiles.push(fileData.originalInputPath);
          this.progressTracker.update('failed');
          if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath);
          this.processedFiles.set(fileData.originalInputPath, 'failed_batch');
        });
      }
    }

    return results;
  }
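
  // Note: _processBatch decides re-embedding by content hash (first 8 hex chars
  // of an md5 digest), not by mtime alone. A candidate whose hash matches an
  // existing record is reported as 'skipped_unchanged'; a record that shares the
  // relative path but carries a stale hash is deleted before the replacement
  // `${relativePath}#${contentHash}` record is added.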

  /**
   * Process document chunks
   * @param {string[]} filePaths - File paths to process
   * @param {string} baseDir - Base directory
   * @returns {Promise<void>}
   * @private
   */
  async _processDocumentChunks(filePaths, baseDir) {
    console.log(chalk.cyan('--- Starting Phase 2: Document Chunk Embeddings ---'));
    const documentChunkTable = await this.databaseManager.getTable(this.documentChunkTable);
    if (!documentChunkTable) {
      console.warn(chalk.yellow(`Skipping Phase 2: Document Chunk Embeddings because table ${this.documentChunkTable} was not found.`));
      return;
    }

    // Efficient batch check: Get all existing document chunks for this project
    let existingDocChunksMap = new Map();
    try {
      const existingChunks = await documentChunkTable
        .query()
        .where(`project_path = '${baseDir.replace(/'/g, "''")}'`)
        .toArray();

      // Build a map for fast lookup: original_document_path -> [chunks]
      for (const chunk of existingChunks) {
        if (!existingDocChunksMap.has(chunk.original_document_path)) {
          existingDocChunksMap.set(chunk.original_document_path, []);
        }
        existingDocChunksMap.get(chunk.original_document_path).push(chunk);
      }

      console.log(chalk.cyan(`Found ${existingChunks.length} existing document chunks for comparison`));
    } catch (queryError) {
      console.warn(chalk.yellow(`Warning: Could not query existing document chunks, will process all docs: ${queryError.message}`));
      existingDocChunksMap = new Map();
    }

    const allDocChunksToEmbed = [];
    const allDocChunkRecordsToAdd = [];
    const processedDocPathsForDeletion = new Set();
    let skippedDocCount = 0;

    for (const filePath of filePaths) {
      const absoluteFilePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(baseDir, filePath);
      const consistentRelativePath = path.relative(baseDir, absoluteFilePath);
      const language = detectLanguageFromExtension(path.extname(absoluteFilePath));

      if (isDocumentationFile(absoluteFilePath, language)) {
        try {
          const stats = fs.statSync(absoluteFilePath);
          if (stats.size > 5 * 1024 * 1024) {
            // 5MB limit for docs
            continue;
          }

          const content = await fs.promises.readFile(absoluteFilePath, 'utf8');
          if (content.trim().length === 0) {
            continue;
          }

          // Check if document has changed by comparing chunk content hashes
          const existingChunks = existingDocChunksMap.get(consistentRelativePath) || [];

          // Extract chunks to compare with existing ones
          const { chunks: currentChunks, documentH1 } = extractMarkdownChunks(absoluteFilePath, content, consistentRelativePath);
          let hasUnchangedDocument = false;

          if (existingChunks.length > 0 && currentChunks.length === existingChunks.length) {
            // Create a signature of the document by combining all chunk content hashes
            const currentChunkHashes = currentChunks
              .map((chunk) => createHash('md5').update(chunk.content).digest('hex').substring(0, 8))
              .sort()
              .join('|');

            const existingChunkHashes = existingChunks
              .map((chunk) => chunk.content_hash)
              .sort()
              .join('|');

            hasUnchangedDocument = currentChunkHashes === existingChunkHashes;
          }

          if (hasUnchangedDocument) {
            // Document hasn't changed - skip processing
            skippedDocCount++;
            debug(`Skipping unchanged document: ${consistentRelativePath} (${currentChunks.length} chunks match)`);
            continue;
          }

          // Document has changed or is new - process it
          if (!processedDocPathsForDeletion.has(consistentRelativePath)) {
            processedDocPathsForDeletion.add(consistentRelativePath);
          }

          if (currentChunks.length > 0) {
            currentChunks.forEach((chunk) => {
              const chunkWithTitle = {
                ...chunk,
                documentTitle: documentH1 || path.basename(absoluteFilePath, path.extname(absoluteFilePath)),
                fileStats: stats,
              };
              allDocChunksToEmbed.push(chunkWithTitle);
            });
          }
        } catch (docError) {
          console.warn(chalk.yellow(`Error processing document ${consistentRelativePath} for chunking: ${docError.message}`));
        }
      }
    }

    if (skippedDocCount > 0) {
      console.log(chalk.cyan(`Skipped ${skippedDocCount} unchanged documentation files`));
    }

    if (allDocChunksToEmbed.length > 0) {
      console.log(chalk.blue(`Extracted ${allDocChunksToEmbed.length} total document chunks to process for embeddings.`));
      const chunkContentsForBatching = allDocChunksToEmbed.map((chunk) => chunk.content);
      const chunkEmbeddings = await this.modelManager.calculateEmbeddingBatch(chunkContentsForBatching);

      for (let i = 0; i < chunkEmbeddings.length; i++) {
        const chunkData = allDocChunksToEmbed[i];
        const chunkEmbeddingVector = chunkEmbeddings[i];

        if (chunkEmbeddingVector) {
          const chunkContentHash = createHash('md5').update(chunkData.content).digest('hex').substring(0, 8);
          const chunkId = `${chunkData.original_document_path}#${slugify(chunkData.heading || 'section')}_${chunkData.start_line_in_doc}`;

          const record = {
            id: chunkId,
            content: chunkData.content,
            original_document_path: chunkData.original_document_path,
            project_path: baseDir,
            heading_text: chunkData.heading || '',
            document_title: chunkData.documentTitle,
            language: chunkData.language || 'markdown',
            vector: chunkEmbeddingVector,
            content_hash: chunkContentHash,
            last_modified: chunkData.fileStats ? chunkData.fileStats.mtime.toISOString() : new Date().toISOString(),
          };
          allDocChunkRecordsToAdd.push(record);
        }
      }
    }

    // Delete old chunks and add new ones
    if (processedDocPathsForDeletion.size > 0) {
      for (const docPathToDelete of processedDocPathsForDeletion) {
        try {
          await documentChunkTable.delete(`original_document_path = '${docPathToDelete.replace(/'/g, "''")}'`);
        } catch (deleteError) {
          console.warn(chalk.yellow(`Error deleting chunks for document ${docPathToDelete}: ${deleteError.message}`));
        }
      }
    }

    if (allDocChunkRecordsToAdd.length > 0) {
      try {
        await documentChunkTable.add(allDocChunkRecordsToAdd);

        // Optimize table to sync indices with data and prevent TakeExec panics
        try {
          await documentChunkTable.optimize();
        } catch (optimizeError) {
          if (optimizeError.message && optimizeError.message.includes('legacy format')) {
            console.log(chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`));
          } else {
            console.warn(chalk.yellow(`Warning: Failed to optimize document chunk table after adding records: ${optimizeError.message}`));
          }
        }

        console.log(
          chalk.green(`Successfully added ${allDocChunkRecordsToAdd.length} document chunk embeddings to ${this.documentChunkTable}.`)
        );
      } catch (addError) {
        console.error(chalk.red(`Error batch adding document chunk embeddings to DB: ${addError.message}`), addError.stack);
      }
    }

    console.log(chalk.green('--- Finished Phase 2: Document Chunk Embeddings ---'));
  }
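
  // Note: chunk ids are deterministic: `${originalDocumentPath}#${slug}_${startLine}`,
  // so (assuming slugify('Getting Started') yields 'getting-started') a chunk of
  // 'docs/guide.md' under that heading starting at doc line 12 is stored as
  // 'docs/guide.md#getting-started_12'. Unchanged documents are caught earlier by
  // the sorted chunk-hash signature and never reach the delete/re-add step.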

  // ============================================================================
  // CLEANUP
  // ============================================================================

  /**
   * Clean up file processor resources
   */
  async cleanup() {
    if (this.cleaningUp) {
      return; // Already cleaning up, prevent duplicate calls
    }

    this.cleaningUp = true;

    try {
      this.processedFiles.clear();
      this.progressTracker.reset(0);
      console.log(chalk.green('[FileProcessor] Resources cleaned up.'));
    } catch (error) {
      console.error(chalk.red(`[FileProcessor] Error during cleanup: ${error.message}`));
    } finally {
      this.cleaningUp = false;
    }
  }
}
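
For orientation, a minimal wiring sketch follows. It assumes only the interfaces the file above actually calls: a model manager exposing initialize(), calculateEmbedding(), calculateEmbeddingBatch(), and embeddingDimensions, and a database manager exposing getDB() and getTable(). The package ships real implementations (model-manager.js, database.js, factory.js, all listed in this diff), so the in-memory stand-ins and the path-style import below are illustrative only, not the package's documented API.

import { FileProcessor } from 'codecritique/src/embeddings/file-processor.js';

// Illustrative stand-ins satisfying the interfaces FileProcessor uses.
const DIMS = 384; // assumed embedding width; the real value comes from the model
const modelManager = {
  embeddingDimensions: DIMS,
  async initialize() {},
  async calculateEmbedding(_text) { return Array(DIMS).fill(0.1); },
  async calculateEmbeddingBatch(texts) { return texts.map(() => Array(DIMS).fill(0.1)); },
};

const tables = new Map();
const makeTable = () => {
  const rows = [];
  return {
    async add(records) { rows.push(...records); },
    async delete(_predicate) { /* no-op in this sketch */ },
    async optimize() {},
    query() { return { where() { return { async toArray() { return rows; } }; } }; },
  };
};
const databaseManager = {
  async getDB() { return {}; },
  async getTable(name) { if (!tables.has(name)) tables.set(name, makeTable()); return tables.get(name); },
};

const processor = new FileProcessor({ modelManager, databaseManager });
const results = await processor.processBatchEmbeddings(['src/index.js', 'README.md'], {
  baseDir: process.cwd(),
  onProgress: (status, file) => console.log(status, file),
});
console.log(results); // { processed, failed, skipped, excluded, files, failedFiles, excludedFiles }
await processor.cleanup();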