codecritique 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1145 -0
- package/package.json +98 -0
- package/src/content-retrieval.js +747 -0
- package/src/custom-documents.js +597 -0
- package/src/embeddings/cache-manager.js +364 -0
- package/src/embeddings/constants.js +40 -0
- package/src/embeddings/database.js +921 -0
- package/src/embeddings/errors.js +208 -0
- package/src/embeddings/factory.js +447 -0
- package/src/embeddings/file-processor.js +851 -0
- package/src/embeddings/model-manager.js +337 -0
- package/src/embeddings/similarity-calculator.js +97 -0
- package/src/embeddings/types.js +113 -0
- package/src/feedback-loader.js +384 -0
- package/src/index.js +1418 -0
- package/src/llm.js +123 -0
- package/src/pr-history/analyzer.js +579 -0
- package/src/pr-history/bot-detector.js +123 -0
- package/src/pr-history/cli-utils.js +204 -0
- package/src/pr-history/comment-processor.js +549 -0
- package/src/pr-history/database.js +819 -0
- package/src/pr-history/github-client.js +629 -0
- package/src/project-analyzer.js +955 -0
- package/src/rag-analyzer.js +2764 -0
- package/src/rag-review.js +566 -0
- package/src/technology-keywords.json +753 -0
- package/src/utils/command.js +48 -0
- package/src/utils/constants.js +263 -0
- package/src/utils/context-inference.js +364 -0
- package/src/utils/document-detection.js +105 -0
- package/src/utils/file-validation.js +271 -0
- package/src/utils/git.js +232 -0
- package/src/utils/language-detection.js +170 -0
- package/src/utils/logging.js +24 -0
- package/src/utils/markdown.js +132 -0
- package/src/utils/mobilebert-tokenizer.js +141 -0
- package/src/utils/pr-chunking.js +276 -0
- package/src/utils/string-utils.js +28 -0
- package/src/zero-shot-classifier-open.js +392 -0
|
@@ -0,0 +1,597 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Custom Document Processor
|
|
3
|
+
*
|
|
4
|
+
* This module provides advanced custom document processing capabilities:
|
|
5
|
+
* - Intelligent document chunking with metadata preservation
|
|
6
|
+
* - Batch embedding generation for optimal performance
|
|
7
|
+
* - Memory-based document storage with project isolation
|
|
8
|
+
* - Context-aware search and retrieval
|
|
9
|
+
* - Parallel processing with sophisticated reranking
|
|
10
|
+
*
|
|
11
|
+
* @module CustomDocuments
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { createHash } from 'crypto';
|
|
15
|
+
import path from 'path';
|
|
16
|
+
import chalk from 'chalk';
|
|
17
|
+
import { CacheManager } from './embeddings/cache-manager.js';
|
|
18
|
+
import { EmbeddingError, ValidationError } from './embeddings/errors.js';
|
|
19
|
+
import { ModelManager } from './embeddings/model-manager.js';
|
|
20
|
+
import { calculateCosineSimilarity, calculatePathSimilarity } from './embeddings/similarity-calculator.js';
|
|
21
|
+
import { debug } from './utils/logging.js';
|
|
22
|
+
import { slugify } from './utils/string-utils.js';
|
|
23
|
+
|
|
24
|
+
/**
 * CustomDocumentProcessor splits custom documents into chunks, attaches an
 * embedding to every chunk, keeps the chunks in memory keyed by resolved
 * project path (mirrored into the cache manager), and searches them by
 * cosine similarity with optional context-aware reranking.
 */
export class CustomDocumentProcessor {
  /**
   * @param {Object} [options]
   * @param {Object} [options.modelManager] - Embedding provider; this class calls
   *   `calculateEmbeddingBatch`, `calculateEmbedding` and `calculateQueryEmbedding` on it.
   * @param {Object} [options.cacheManager] - Chunk store; this class calls
   *   `storeCustomDocuments`, `getCustomDocuments` and `clearCustomDocuments` on it.
   */
  constructor(options = {}) {
    this.modelManager = options.modelManager || new ModelManager();
    this.cacheManager = options.cacheManager || new CacheManager();

    // Resolved project path -> array of embedded chunks.
    this.customDocumentChunks = new Map();

    // Document title -> title embedding, reused across reranking passes.
    this.h1EmbeddingCache = new Map();

    this.performanceMetrics = this._createEmptyMetrics();

    // Guards cleanup() against re-entrant calls.
    this.cleaningUp = false;
  }

  /**
   * Build a zeroed metrics record (shared by the constructor and cleanup()).
   * @private
   * @returns {Object} Fresh metrics object
   */
  _createEmptyMetrics() {
    return {
      documentsProcessed: 0,
      chunksGenerated: 0,
      embeddingsCalculated: 0,
      batchSuccessRate: 0,
      averageChunkSize: 0,
      processingTime: 0,
    };
  }

  /**
   * Chunk a custom document into manageable pieces.
   *
   * The content is split on blank lines; consecutive sections are merged until
   * adding the next one would push the chunk past 1000 characters (and the
   * chunk already holds more than 100). A single section is never split, so a
   * chunk can exceed 1000 characters.
   *
   * The chunk title is the first markdown `# ` header in the content; failing
   * that, a prettified file name taken from titles shaped like
   * "instruction:./FILENAME.md"; failing that, the given title, or
   * "Untitled Document" when no title was supplied.
   *
   * @param {Object} doc - Document object with title and content
   * @returns {Array} Array of document chunks (empty for whitespace-only content)
   * @throws {EmbeddingError} When the document is missing, has no content, or chunking fails
   */
  chunkDocument(doc) {
    const startTime = Date.now();

    try {
      // Validate before touching any property so a null/undefined document is
      // reported like every other chunking failure instead of a raw TypeError.
      if (!doc || !doc.content) {
        throw new ValidationError('Document must have content');
      }

      const { title, content } = doc;

      let documentTitle = title;
      const headerMatch = content.match(/^#\s+(.+)$/m);
      if (headerMatch) {
        documentTitle = headerMatch[1].trim();
      } else if (typeof title === 'string') {
        // No header found: try to extract the file name from a title like "instruction:./FILENAME.md"
        const filePathMatch = title.match(/:\.\/([^/]+)\.([a-zA-Z]+)$/);
        if (filePathMatch) {
          // Use the file name without extension, capitalized nicely
          documentTitle = filePathMatch[1].replace(/_/g, ' ').replace(/\b\w/g, (l) => l.toUpperCase());
        }
      }
      if (documentTitle == null) {
        documentTitle = 'Untitled Document';
      }

      const maxChunkSize = 1000; // Max characters per chunk
      const minChunkSize = 100; // Min characters to avoid tiny chunks
      const chunks = [];
      let currentChunk = '';

      // Append the accumulated text as the next chunk; its index is its position.
      const flushCurrentChunk = () => {
        const text = currentChunk.trim();
        const chunkIndex = chunks.length;
        chunks.push({
          id: `${slugify(documentTitle)}_chunk_${chunkIndex}`,
          content: text,
          document_title: documentTitle,
          chunk_index: chunkIndex,
          metadata: {
            section_start: chunkIndex === 0,
            total_chunks: 0, // Filled in once all chunks exist
            original_title: title,
            chunk_hash: createHash('md5').update(text).digest('hex').substring(0, 8),
          },
        });
      };

      for (const rawSection of content.split(/\n\s*\n/)) {
        const section = rawSection.trim();
        if (!section) continue;

        if (currentChunk.length + section.length > maxChunkSize && currentChunk.length > minChunkSize) {
          flushCurrentChunk();
          currentChunk = section;
        } else {
          currentChunk = currentChunk ? `${currentChunk}\n\n${section}` : section;
        }
      }

      if (currentChunk.trim()) {
        flushCurrentChunk();
      }

      for (const chunk of chunks) {
        chunk.metadata.total_chunks = chunks.length;
      }

      this.performanceMetrics.chunksGenerated += chunks.length;
      if (chunks.length > 0) {
        // Skip the update for zero chunks: 0 / 0 would store NaN.
        const totalCharacters = chunks.reduce((sum, chunk) => sum + chunk.content.length, 0);
        this.performanceMetrics.averageChunkSize = totalCharacters / chunks.length;
      }
      this.performanceMetrics.processingTime += Date.now() - startTime;

      console.log(chalk.gray(` Chunked document "${documentTitle}" into ${chunks.length} chunks`));
      return chunks;
    } catch (error) {
      console.error(chalk.red(`Error chunking document: ${error.message}`));
      throw new EmbeddingError(`Document chunking failed: ${error.message}`);
    }
  }

  /**
   * Chunk and embed custom documents, then store the result for the project.
   *
   * Each document's chunks are embedded in one batch call; if that call fails,
   * the chunks are embedded one by one. Chunks without an embedding are
   * dropped. The surviving chunks replace whatever was stored in memory for
   * the project and are handed to the cache manager.
   *
   * NOTE: `processingTime` receives both this method's total elapsed time and
   * the time chunkDocument() records for itself.
   *
   * @param {Array} customDocs - Array of custom documents
   * @param {string} projectPath - Project path for isolation
   * @returns {Promise<Array>} Array of processed chunks with embeddings
   * @throws {EmbeddingError} When chunking, path resolution or cache storage fails
   */
  async processDocumentsInMemory(customDocs, projectPath) {
    const startTime = Date.now();

    try {
      if (!customDocs || customDocs.length === 0) {
        console.log(chalk.gray('No custom documents to process'));
        return [];
      }

      console.log(chalk.cyan(`Processing ${customDocs.length} custom documents into chunks...`));

      // Resolved once: it is identical for every chunk and is also the storage key.
      const resolvedProjectPath = path.resolve(projectPath);

      const attachEmbedding = (chunk, embedding) => ({
        ...chunk,
        embedding,
        similarity: 0, // Will be calculated during search
        type: 'custom-document-chunk',
        project_path: resolvedProjectPath,
        created_at: new Date().toISOString(),
      });

      const allChunks = [];
      let totalBatchAttempts = 0;
      let successfulBatches = 0;

      for (const doc of customDocs) {
        console.log(chalk.gray(` Processing document: ${doc.title}`));

        const chunks = this.chunkDocument(doc);
        if (chunks.length === 0) {
          // Nothing to embed; do not issue an empty batch request.
          continue;
        }

        totalBatchAttempts++;
        let embeddedChunks;
        let logSuffix = '';

        try {
          const embeddings = await this.modelManager.calculateEmbeddingBatch(chunks.map((chunk) => chunk.content));

          embeddedChunks = chunks.map((chunk, index) => {
            const embedding = embeddings[index];
            // `== null` also rejects undefined entries (short or sparse results).
            if (embedding == null) {
              console.error(chalk.red(`Error generating embedding for chunk ${chunk.id}: batch processing failed`));
              return null;
            }
            return attachEmbedding(chunk, embedding);
          });

          // Counted only after the result was usable, so a malformed batch
          // response that triggers the fallback is not reported as a success.
          successfulBatches++;
        } catch (error) {
          console.error(chalk.red(`Error in batch embedding generation for document ${doc.title}: ${error.message}`));
          console.log(chalk.yellow(` Falling back to individual processing for ${doc.title}`));
          logSuffix = ' (fallback)';

          embeddedChunks = await Promise.all(
            chunks.map(async (chunk) => {
              try {
                const embedding = await this.modelManager.calculateEmbedding(chunk.content);
                if (embedding == null) {
                  console.error(chalk.red(`Error generating embedding for chunk ${chunk.id}: no embedding returned`));
                  return null;
                }
                return attachEmbedding(chunk, embedding);
              } catch (chunkError) {
                console.error(chalk.red(`Error generating embedding for chunk ${chunk.id}: ${chunkError.message}`));
                return null;
              }
            })
          );
        }

        const validChunks = embeddedChunks.filter((chunk) => chunk !== null);
        allChunks.push(...validChunks);
        this.performanceMetrics.embeddingsCalculated += validChunks.length;

        console.log(chalk.gray(` Generated embeddings for ${validChunks.length}/${chunks.length} chunks${logSuffix}`));
      }

      this.performanceMetrics.batchSuccessRate = totalBatchAttempts > 0 ? (successfulBatches / totalBatchAttempts) * 100 : 0;

      // Store chunks in memory organized by project path
      this.customDocumentChunks.set(resolvedProjectPath, allChunks);

      // Hand the chunks to the cache manager as well
      await this.cacheManager.storeCustomDocuments(resolvedProjectPath, allChunks);

      this.performanceMetrics.documentsProcessed += customDocs.length;
      this.performanceMetrics.processingTime += Date.now() - startTime;

      console.log(chalk.green(`Successfully processed ${allChunks.length} custom document chunks (${Date.now() - startTime}ms)`));
      return allChunks;
    } catch (error) {
      console.error(chalk.red(`Error processing custom documents: ${error.message}`));
      throw new EmbeddingError(`Custom document processing failed: ${error.message}`);
    }
  }

  /**
   * Find relevant custom document chunks with optional reranking.
   *
   * Scores a copy of every chunk by cosine similarity to the query embedding,
   * drops those below the threshold, optionally reranks (needs a reranking
   * context and at least two candidates), then returns the best `limit`
   * results, highest similarity first. The input chunks are not modified.
   *
   * @param {string} queryText - The search query
   * @param {Array} chunks - Array of document chunks to search
   * @param {Object} options - Search configuration
   * @param {number} [options.limit=5] - Maximum number of results
   * @param {number} [options.similarityThreshold=0.3] - Minimum cosine similarity
   * @param {Object|null} [options.queryContextForReranking=null] - Context with `area`, `keywords`, `dominantTech`
   * @param {boolean} [options.useReranking=true] - Whether reranking may run
   * @param {*} [options.precomputedQueryEmbedding=null] - Query embedding to reuse instead of computing one
   * @param {string|null} [options.queryFilePath=null] - File path used for path similarity during reranking
   * @returns {Promise<Array>} Array of relevant chunks
   * @throws {EmbeddingError} When the query is empty or the search fails
   */
  async findRelevantChunks(queryText, chunks = [], options = {}) {
    const {
      limit = 5,
      similarityThreshold = 0.3,
      queryContextForReranking = null,
      useReranking = true,
      precomputedQueryEmbedding = null,
      queryFilePath = null,
    } = options;

    const startTime = Date.now();

    try {
      if (!queryText?.trim()) {
        throw new ValidationError('Empty query text provided for custom document search');
      }

      if (!chunks || chunks.length === 0) {
        console.log(chalk.gray('No custom document chunks available for search'));
        return [];
      }

      console.log(chalk.cyan(`Searching ${chunks.length} custom document chunks...`));

      // Reuse the caller's query embedding when one was supplied.
      const queryEmbedding = precomputedQueryEmbedding || (await this.modelManager.calculateQueryEmbedding(queryText));

      // Score shallow copies so reranking can overwrite `similarity` safely.
      let filteredResults = chunks
        .map((chunk) => ({
          ...chunk,
          similarity: calculateCosineSimilarity(queryEmbedding, chunk.embedding),
          reranked: false,
        }))
        .filter((result) => result.similarity >= similarityThreshold);

      if (useReranking && queryContextForReranking && filteredResults.length >= 2) {
        await this._applyParallelReranking(filteredResults, queryText, queryContextForReranking, queryFilePath, queryEmbedding);
      }

      filteredResults.sort((a, b) => b.similarity - a.similarity);

      if (filteredResults.length > limit) {
        filteredResults = filteredResults.slice(0, limit);
      }

      console.log(chalk.green(`Found ${filteredResults.length} relevant custom document chunks (${Date.now() - startTime}ms)`));

      if (filteredResults.length > 0) {
        debug(`[Custom Doc Search] Top result: ${filteredResults[0].document_title} (${filteredResults[0].similarity.toFixed(3)})`);
      }

      return filteredResults;
    } catch (error) {
      console.error(chalk.red(`Error searching custom document chunks: ${error.message}`));
      throw new EmbeddingError(`Custom document search failed: ${error.message}`);
    }
  }

  /**
   * Get existing custom document chunks for a project.
   *
   * Looks in memory first, then asks the cache manager and copies a hit back
   * into memory. Any error is logged through debug() and reported as "no chunks".
   *
   * @param {string} projectPath - Project path
   * @returns {Promise<Array>} Array of existing chunks (empty when none are found)
   */
  async getExistingChunks(projectPath) {
    try {
      const resolvedProjectPath = path.resolve(projectPath);

      const existingChunks = this.customDocumentChunks.get(resolvedProjectPath);
      if (existingChunks && existingChunks.length > 0) {
        debug(`[getExistingChunks] Found ${existingChunks.length} existing chunks in memory for project: ${resolvedProjectPath}`);
        return existingChunks;
      }

      const cachedChunks = await this.cacheManager.getCustomDocuments(resolvedProjectPath);
      if (cachedChunks && cachedChunks.length > 0) {
        this.customDocumentChunks.set(resolvedProjectPath, cachedChunks);
        debug(`[getExistingChunks] Restored ${cachedChunks.length} chunks from cache for project: ${resolvedProjectPath}`);
        return cachedChunks;
      }

      debug(`[getExistingChunks] No existing chunks found for project: ${resolvedProjectPath}`);
      return [];
    } catch (error) {
      debug(`[getExistingChunks] Error checking existing chunks: ${error.message}`);
      return [];
    }
  }

  /**
   * Rerank search results in place using the query context.
   *
   * Each result's `similarity` is replaced by a score clamped to [0, 1]:
   * 0.4 x the initial similarity, plus area/technology/keyword bonuses or an
   * area-mismatch penalty, plus 0.2 x title-embedding similarity, plus
   * 0.1 x path similarity, minus a penalty for long chunks (over 2000
   * characters) that match fewer than 30% of the query keywords.
   *
   * Area adjustments are applied only when the context actually names an
   * area; a missing area can neither match nor mismatch.
   *
   * The scoring itself is synchronous; only the title-embedding lookup awaits.
   *
   * @private
   * @param {Array} filteredResults - Results to rescore (mutated)
   * @param {string} queryText - The search query (not used by the scoring)
   * @param {Object} queryContextForReranking - Context with `area`, `keywords`, `dominantTech`
   * @param {string|null} queryFilePath - File path compared against each document title
   * @param {*} queryEmbedding - Query embedding used for title relevance
   * @returns {Promise<void>}
   */
  async _applyParallelReranking(filteredResults, queryText, queryContextForReranking, queryFilePath, queryEmbedding) {
    console.log(chalk.cyan('Applying optimized parallel contextual reranking to custom document chunks...'));

    const WEIGHT_INITIAL_SIM = 0.4;
    const WEIGHT_DOCUMENT_TITLE_MATCH = 0.2;
    const HEAVY_BOOST_SAME_AREA = 0.3;
    const MODERATE_BOOST_TECH_MATCH = 0.15;
    const HEAVY_PENALTY_AREA_MISMATCH = -0.1;
    const PENALTY_GENERIC_DOC_LOW_CONTEXT_MATCH = -0.1;

    // Values shared by every result are computed once up front.
    const queryArea = queryContextForReranking.area;
    const queryAreaLower = queryArea?.toLowerCase();
    const queryKeywordsLower = (queryContextForReranking.keywords || []).map((kw) => kw.toLowerCase());
    const queryTechLower = (queryContextForReranking.dominantTech || []).map((tech) => tech.toLowerCase());
    const areaMatchPatterns = queryAreaLower ? [queryAreaLower, queryAreaLower.replace(/[_-]/g, ' ')] : [];
    const hasSpecificArea = Boolean(queryAreaLower) && queryArea !== 'Unknown' && queryArea !== 'General';

    // Fill the title-embedding cache for any titles not seen before.
    await this._batchCalculateDocumentTitleEmbeddings(filteredResults);

    for (const result of filteredResults) {
      const chunkInitialScore = result.similarity * WEIGHT_INITIAL_SIM;
      let contextMatchBonus = 0;
      let titleRelevanceBonus = 0;
      let genericDocPenalty = 0;
      let pathSimilarityScore = 0;

      const docTitle = result.document_title;
      const contentLower = result.content.toLowerCase();

      if (hasSpecificArea) {
        const areaMatch = areaMatchPatterns.some((pattern) => contentLower.includes(pattern));

        if (areaMatch) {
          contextMatchBonus += HEAVY_BOOST_SAME_AREA;

          // The technology boost only applies on top of an area match.
          if (queryTechLower.some((tech) => contentLower.includes(tech))) {
            contextMatchBonus += MODERATE_BOOST_TECH_MATCH;
          }
        } else if (queryArea !== 'GeneralJS_TS') {
          contextMatchBonus += HEAVY_PENALTY_AREA_MISMATCH;
        }
      }

      if (queryKeywordsLower.length > 0) {
        // One pass over the keywords feeds both the bonus and the generic-doc penalty.
        const matchedCount = queryKeywordsLower.filter((keyword) => contentLower.includes(keyword)).length;
        const keywordMatchRatio = matchedCount / queryKeywordsLower.length;
        contextMatchBonus += keywordMatchRatio * 0.1;

        if (result.content.length > 2000 && keywordMatchRatio < 0.3) {
          genericDocPenalty = PENALTY_GENERIC_DOC_LOW_CONTEXT_MATCH;
        }
      }

      if (docTitle && queryEmbedding) {
        const titleEmb = this.h1EmbeddingCache.get(docTitle);
        if (titleEmb) {
          titleRelevanceBonus = calculateCosineSimilarity(queryEmbedding, titleEmb) * WEIGHT_DOCUMENT_TITLE_MATCH;
        }
      }

      if (queryFilePath && docTitle) {
        pathSimilarityScore = calculatePathSimilarity(queryFilePath, docTitle) * 0.1;
      }

      const finalScore = chunkInitialScore + contextMatchBonus + titleRelevanceBonus + pathSimilarityScore + genericDocPenalty;
      result.similarity = Math.max(0, Math.min(1, finalScore));
      result.reranked = true;
    }

    console.log(chalk.cyan(`Parallel reranking completed for ${filteredResults.length} chunks`));

    // Log debug info for the first few results
    for (const result of filteredResults.slice(0, 3)) {
      debug(`[CustomDocRerank] ${result.document_title?.substring(0, 30)}... Final=${result.similarity.toFixed(4)}`);
    }

    debug('Optimized parallel contextual reranking of custom document chunks complete.');
  }

  /**
   * Embed, in one batch, every distinct document title that is not yet in the
   * title-embedding cache. Failures are logged through debug() and ignored, so
   * reranking simply proceeds without a title bonus for those titles.
   *
   * @private
   * @param {Array} results - Results whose `document_title` values should be cached
   * @returns {Promise<void>}
   */
  async _batchCalculateDocumentTitleEmbeddings(results) {
    const docTitlesToCalculate = [
      ...new Set(results.map((result) => result.document_title).filter((docTitle) => docTitle && !this.h1EmbeddingCache.has(docTitle))),
    ];

    if (docTitlesToCalculate.length === 0) {
      return;
    }

    debug(`[OPTIMIZATION] Batch calculating ${docTitlesToCalculate.length} custom document title embeddings`);
    try {
      const titleEmbeddings = await this.modelManager.calculateEmbeddingBatch(docTitlesToCalculate);
      docTitlesToCalculate.forEach((docTitle, index) => {
        if (titleEmbeddings[index]) {
          this.h1EmbeddingCache.set(docTitle, titleEmbeddings[index]);
        }
      });
    } catch (error) {
      debug(`[OPTIMIZATION] Error in batch title embedding calculation: ${error.message}`);
      // Continue without title embeddings
    }
  }

  /**
   * Clear custom document chunks for a project, both in memory and in the
   * cache manager. Errors are logged and not rethrown.
   *
   * @param {string} projectPath - Project path
   * @returns {Promise<void>}
   */
  async clearProjectChunks(projectPath) {
    try {
      const resolvedProjectPath = path.resolve(projectPath);
      this.customDocumentChunks.delete(resolvedProjectPath);
      await this.cacheManager.clearCustomDocuments(resolvedProjectPath);
      console.log(chalk.green(`Cleared custom document chunks for project: ${resolvedProjectPath}`));
    } catch (error) {
      console.error(chalk.red(`Error clearing project chunks: ${error.message}`));
    }
  }

  /**
   * Get all projects that currently have chunks held in memory.
   * @returns {Array} Array of resolved project paths
   */
  getProjectsWithCustomDocuments() {
    return [...this.customDocumentChunks.keys()];
  }

  /**
   * Get performance metrics.
   *
   * Returns a copy of the raw counters plus derived values: average processing
   * time per document, embeddings calculated as a percentage of chunks
   * generated, title-embedding cache size and number of projects in memory.
   *
   * @returns {Object} Performance metrics
   */
  getPerformanceMetrics() {
    const { documentsProcessed, processingTime, chunksGenerated, embeddingsCalculated } = this.performanceMetrics;

    return {
      ...this.performanceMetrics,
      averageProcessingTime: documentsProcessed > 0 ? processingTime / documentsProcessed : 0,
      embeddingEfficiency: chunksGenerated > 0 ? (embeddingsCalculated / chunksGenerated) * 100 : 0,
      cacheSize: this.h1EmbeddingCache.size,
      activeProjects: this.customDocumentChunks.size,
    };
  }

  /**
   * Clear the in-memory title-embedding cache and all in-memory chunks.
   * The cache manager is not touched.
   */
  clearCaches() {
    this.h1EmbeddingCache.clear();
    this.customDocumentChunks.clear();
    console.log(chalk.green('CustomDocumentProcessor caches cleared'));
  }

  /**
   * Cleanup resources: clears this instance's in-memory caches and resets its
   * metrics. Returns immediately if a cleanup is already in progress.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.cleaningUp) {
      return; // Already cleaning up, prevent duplicate calls
    }

    this.cleaningUp = true;

    try {
      // Clear LOCAL caches only (not system-wide caches)
      this.h1EmbeddingCache.clear();
      this.customDocumentChunks.clear();

      // Reset LOCAL performance metrics
      this.performanceMetrics = this._createEmptyMetrics();

      console.log(chalk.green('CustomDocumentProcessor cleanup complete'));
    } finally {
      this.cleaningUp = false;
    }
  }
}
|