codecritique 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +1145 -0
  3. package/package.json +98 -0
  4. package/src/content-retrieval.js +747 -0
  5. package/src/custom-documents.js +597 -0
  6. package/src/embeddings/cache-manager.js +364 -0
  7. package/src/embeddings/constants.js +40 -0
  8. package/src/embeddings/database.js +921 -0
  9. package/src/embeddings/errors.js +208 -0
  10. package/src/embeddings/factory.js +447 -0
  11. package/src/embeddings/file-processor.js +851 -0
  12. package/src/embeddings/model-manager.js +337 -0
  13. package/src/embeddings/similarity-calculator.js +97 -0
  14. package/src/embeddings/types.js +113 -0
  15. package/src/feedback-loader.js +384 -0
  16. package/src/index.js +1418 -0
  17. package/src/llm.js +123 -0
  18. package/src/pr-history/analyzer.js +579 -0
  19. package/src/pr-history/bot-detector.js +123 -0
  20. package/src/pr-history/cli-utils.js +204 -0
  21. package/src/pr-history/comment-processor.js +549 -0
  22. package/src/pr-history/database.js +819 -0
  23. package/src/pr-history/github-client.js +629 -0
  24. package/src/project-analyzer.js +955 -0
  25. package/src/rag-analyzer.js +2764 -0
  26. package/src/rag-review.js +566 -0
  27. package/src/technology-keywords.json +753 -0
  28. package/src/utils/command.js +48 -0
  29. package/src/utils/constants.js +263 -0
  30. package/src/utils/context-inference.js +364 -0
  31. package/src/utils/document-detection.js +105 -0
  32. package/src/utils/file-validation.js +271 -0
  33. package/src/utils/git.js +232 -0
  34. package/src/utils/language-detection.js +170 -0
  35. package/src/utils/logging.js +24 -0
  36. package/src/utils/markdown.js +132 -0
  37. package/src/utils/mobilebert-tokenizer.js +141 -0
  38. package/src/utils/pr-chunking.js +276 -0
  39. package/src/utils/string-utils.js +28 -0
  40. package/src/zero-shot-classifier-open.js +392 -0
@@ -0,0 +1,597 @@
1
+ /**
2
+ * Custom Document Processor
3
+ *
4
+ * This module provides advanced custom document processing capabilities:
5
+ * - Intelligent document chunking with metadata preservation
6
+ * - Batch embedding generation for optimal performance
7
+ * - Memory-based document storage with project isolation
8
+ * - Context-aware search and retrieval
9
+ * - Parallel processing with sophisticated reranking
10
+ *
11
+ * @module CustomDocuments
12
+ */
13
+
14
+ import { createHash } from 'crypto';
15
+ import path from 'path';
16
+ import chalk from 'chalk';
17
+ import { CacheManager } from './embeddings/cache-manager.js';
18
+ import { EmbeddingError, ValidationError } from './embeddings/errors.js';
19
+ import { ModelManager } from './embeddings/model-manager.js';
20
+ import { calculateCosineSimilarity, calculatePathSimilarity } from './embeddings/similarity-calculator.js';
21
+ import { debug } from './utils/logging.js';
22
+ import { slugify } from './utils/string-utils.js';
23
+
24
+ /**
25
+ * CustomDocumentProcessor class for advanced document processing
26
+ */
27
+ export class CustomDocumentProcessor {
28
/**
 * @param {Object} [options] - Optional collaborators.
 * @param {ModelManager} [options.modelManager] - Embedding model wrapper; created fresh when omitted.
 * @param {CacheManager} [options.cacheManager] - Persistent cache; created fresh when omitted.
 */
constructor(options = {}) {
  // Injectable collaborators (useful for tests). Note `||` replaces any
  // falsy value, not only null/undefined.
  this.modelManager = options.modelManager || new ModelManager();
  this.cacheManager = options.cacheManager || new CacheManager();

  // In-memory storage for custom document chunks, keyed by resolved
  // project path (project-isolated).
  this.customDocumentChunks = new Map();

  // Cache of document-title (H1) embeddings reused during reranking.
  this.h1EmbeddingCache = new Map();

  // Running counters surfaced via getPerformanceMetrics().
  this.performanceMetrics = {
    documentsProcessed: 0,
    chunksGenerated: 0,
    embeddingsCalculated: 0,
    batchSuccessRate: 0, // percentage of batch embedding calls that succeeded
    averageChunkSize: 0, // characters; reflects the most recently chunked document
    processingTime: 0, // cumulative milliseconds across chunking + processing
  };

  // Re-entrancy guard for cleanup().
  this.cleaningUp = false;
}
51
+
52
+ /**
53
+ * Chunk a custom document into manageable pieces
54
+ * @param {Object} doc - Document object with title and content
55
+ * @returns {Array} Array of document chunks
56
+ */
57
+ chunkDocument(doc) {
58
+ const { title, content } = doc;
59
+ const startTime = Date.now();
60
+
61
+ try {
62
+ if (!doc || !content) {
63
+ throw new ValidationError('Document must have content');
64
+ }
65
+
66
+ // Extract the actual document title from content
67
+ let documentTitle = title;
68
+
69
+ // Try to find a markdown header in the content
70
+ const headerMatch = content.match(/^#\s+(.+)$/m);
71
+ if (headerMatch) {
72
+ documentTitle = headerMatch[1].trim();
73
+ } else {
74
+ // If no header found, try to extract filename from title like "instruction:./FILENAME.md"
75
+ const filePathMatch = title.match(/:\.\/([^/]+)\.([a-zA-Z]+)$/);
76
+ if (filePathMatch) {
77
+ // Use filename without extension, but capitalize it nicely
78
+ documentTitle = filePathMatch[1].replace(/_/g, ' ').replace(/\b\w/g, (l) => l.toUpperCase());
79
+ }
80
+ }
81
+
82
+ const chunks = [];
83
+ const sections = content.split(/\n\s*\n/);
84
+ let currentChunk = '';
85
+ let chunkIndex = 0;
86
+ const maxChunkSize = 1000; // Max characters per chunk
87
+ const minChunkSize = 100; // Min characters to avoid tiny chunks
88
+
89
+ for (let i = 0; i < sections.length; i++) {
90
+ const section = sections[i].trim();
91
+ if (!section) continue;
92
+
93
+ // Check if adding this section would exceed max chunk size
94
+ if (currentChunk.length + section.length > maxChunkSize && currentChunk.length > minChunkSize) {
95
+ // Save current chunk
96
+ chunks.push({
97
+ id: `${slugify(documentTitle)}_chunk_${chunkIndex}`,
98
+ content: currentChunk.trim(),
99
+ document_title: documentTitle,
100
+ chunk_index: chunkIndex,
101
+ metadata: {
102
+ section_start: chunkIndex === 0,
103
+ total_chunks: 0, // Will be updated after all chunks are created
104
+ original_title: title,
105
+ chunk_hash: createHash('md5').update(currentChunk.trim()).digest('hex').substring(0, 8),
106
+ },
107
+ });
108
+
109
+ chunkIndex++;
110
+ currentChunk = section;
111
+ } else {
112
+ // Add section to current chunk
113
+ currentChunk += (currentChunk ? '\n\n' : '') + section;
114
+ }
115
+ }
116
+
117
+ // Add the last chunk if it has content
118
+ if (currentChunk.trim()) {
119
+ chunks.push({
120
+ id: `${slugify(documentTitle)}_chunk_${chunkIndex}`,
121
+ content: currentChunk.trim(),
122
+ document_title: documentTitle,
123
+ chunk_index: chunkIndex,
124
+ metadata: {
125
+ section_start: chunkIndex === 0,
126
+ total_chunks: 0,
127
+ original_title: title,
128
+ chunk_hash: createHash('md5').update(currentChunk.trim()).digest('hex').substring(0, 8),
129
+ },
130
+ });
131
+ }
132
+
133
+ // Update total_chunks metadata for all chunks
134
+ chunks.forEach((chunk) => {
135
+ chunk.metadata.total_chunks = chunks.length;
136
+ });
137
+
138
+ // Update performance metrics
139
+ this.performanceMetrics.chunksGenerated += chunks.length;
140
+ this.performanceMetrics.averageChunkSize = chunks.reduce((sum, chunk) => sum + chunk.content.length, 0) / chunks.length;
141
+ this.performanceMetrics.processingTime += Date.now() - startTime;
142
+
143
+ console.log(chalk.gray(` Chunked document "${documentTitle}" into ${chunks.length} chunks`));
144
+ return chunks;
145
+ } catch (error) {
146
+ console.error(chalk.red(`Error chunking document: ${error.message}`));
147
+ throw new EmbeddingError(`Document chunking failed: ${error.message}`);
148
+ }
149
+ }
150
+
151
/**
 * Process custom documents in memory with advanced batch processing.
 *
 * For each document: chunk it, then embed all chunks in one batch call.
 * If the batch call throws, fall back to embedding each chunk individually.
 * Surviving chunks are stored in memory (keyed by resolved project path)
 * and persisted through the CacheManager.
 *
 * @param {Array} customDocs - Array of custom documents
 * @param {string} projectPath - Project path for isolation
 * @returns {Promise<Array>} Array of processed chunks with embeddings
 * @throws {EmbeddingError} When processing fails (wraps the underlying error)
 */
async processDocumentsInMemory(customDocs, projectPath) {
  const startTime = Date.now();

  try {
    // Nothing to do for a missing/empty document list.
    if (!customDocs || customDocs.length === 0) {
      console.log(chalk.gray('No custom documents to process'));
      return [];
    }

    console.log(chalk.cyan(`Processing ${customDocs.length} custom documents into chunks...`));

    const allChunks = [];
    let totalBatchAttempts = 0;
    let successfulBatches = 0;

    for (const doc of customDocs) {
      console.log(chalk.gray(` Processing document: ${doc.title}`));

      // Chunk the document (throws EmbeddingError on failure, caught below).
      const chunks = this.chunkDocument(doc);

      // Batch-embed all chunk contents in one model call instead of N calls.
      const chunkContents = chunks.map((chunk) => chunk.content);
      totalBatchAttempts++;

      try {
        const embeddings = await this.modelManager.calculateEmbeddingBatch(chunkContents);
        successfulBatches++;

        // Per-chunk success/failure: a null slot in the batch result marks a
        // failed embedding for that chunk.
        // NOTE(review): this only catches `null`, not `undefined` — presumably
        // calculateEmbeddingBatch signals failures with null; verify contract.
        const chunksWithEmbeddings = chunks.map((chunk, index) => {
          if (embeddings[index] !== null) {
            return {
              ...chunk,
              embedding: embeddings[index],
              similarity: 0, // Will be calculated during search
              type: 'custom-document-chunk',
              project_path: path.resolve(projectPath),
              created_at: new Date().toISOString(),
            };
          } else {
            console.error(chalk.red(`Error generating embedding for chunk ${chunk.id}: batch processing failed`));
            return null;
          }
        });

        // Drop chunks whose embedding failed.
        const validChunks = chunksWithEmbeddings.filter((chunk) => chunk !== null);
        allChunks.push(...validChunks);

        console.log(chalk.gray(` Generated embeddings for ${validChunks.length}/${chunks.length} chunks`));
        this.performanceMetrics.embeddingsCalculated += validChunks.length;
      } catch (error) {
        console.error(chalk.red(`Error in batch embedding generation for document ${doc.title}: ${error.message}`));
        // Fallback: embed each chunk of this document individually so one bad
        // chunk cannot sink the whole document.
        console.log(chalk.yellow(` Falling back to individual processing for ${doc.title}`));

        const chunksWithEmbeddings = await Promise.all(
          chunks.map(async (chunk) => {
            try {
              const embedding = await this.modelManager.calculateEmbedding(chunk.content);
              this.performanceMetrics.embeddingsCalculated++;
              return {
                ...chunk,
                embedding,
                similarity: 0,
                type: 'custom-document-chunk',
                project_path: path.resolve(projectPath),
                created_at: new Date().toISOString(),
              };
            } catch (error) {
              // Failed chunks become null and are filtered out below.
              console.error(chalk.red(`Error generating embedding for chunk ${chunk.id}: ${error.message}`));
              return null;
            }
          })
        );

        const validChunks = chunksWithEmbeddings.filter((chunk) => chunk !== null);
        allChunks.push(...validChunks);

        console.log(chalk.gray(` Generated embeddings for ${validChunks.length}/${chunks.length} chunks (fallback)`));
      }
    }

    // Percentage of documents whose batch embedding call succeeded outright.
    this.performanceMetrics.batchSuccessRate = totalBatchAttempts > 0 ? (successfulBatches / totalBatchAttempts) * 100 : 0;

    // Store chunks in memory, keyed by resolved project path for isolation.
    const resolvedProjectPath = path.resolve(projectPath);
    this.customDocumentChunks.set(resolvedProjectPath, allChunks);

    // Persist through the CacheManager so getExistingChunks() can restore later.
    await this.cacheManager.storeCustomDocuments(resolvedProjectPath, allChunks);

    this.performanceMetrics.documentsProcessed += customDocs.length;
    this.performanceMetrics.processingTime += Date.now() - startTime;

    console.log(chalk.green(`Successfully processed ${allChunks.length} custom document chunks (${Date.now() - startTime}ms)`));
    return allChunks;
  } catch (error) {
    console.error(chalk.red(`Error processing custom documents: ${error.message}`));
    throw new EmbeddingError(`Custom document processing failed: ${error.message}`);
  }
}
262
+
263
+ /**
264
+ * Find relevant custom document chunks with advanced reranking
265
+ * @param {string} queryText - The search query
266
+ * @param {Array} chunks - Array of document chunks to search
267
+ * @param {Object} options - Search configuration
268
+ * @returns {Promise<Array>} Array of relevant chunks
269
+ */
270
+ async findRelevantChunks(queryText, chunks = [], options = {}) {
271
+ const {
272
+ limit = 5,
273
+ similarityThreshold = 0.3,
274
+ queryContextForReranking = null,
275
+ useReranking = true,
276
+ precomputedQueryEmbedding = null,
277
+ queryFilePath = null,
278
+ } = options;
279
+
280
+ const startTime = Date.now();
281
+
282
+ try {
283
+ if (!queryText?.trim()) {
284
+ throw new ValidationError('Empty query text provided for custom document search');
285
+ }
286
+
287
+ if (!chunks || chunks.length === 0) {
288
+ console.log(chalk.gray('No custom document chunks available for search'));
289
+ return [];
290
+ }
291
+
292
+ console.log(chalk.cyan(`Searching ${chunks.length} custom document chunks...`));
293
+
294
+ // OPTIMIZATION: Use pre-computed query embedding if available
295
+ let queryEmbedding = precomputedQueryEmbedding;
296
+ if (!queryEmbedding) {
297
+ queryEmbedding = await this.modelManager.calculateQueryEmbedding(queryText);
298
+ }
299
+
300
+ // OPTIMIZATION: Vectorized similarity calculation for better performance
301
+ const results = chunks.map((chunk) => ({
302
+ ...chunk,
303
+ similarity: calculateCosineSimilarity(queryEmbedding, chunk.embedding),
304
+ reranked: false,
305
+ }));
306
+
307
+ // Filter by similarity threshold
308
+ let filteredResults = results.filter((result) => result.similarity >= similarityThreshold);
309
+
310
+ // Apply sophisticated context-aware reranking if enabled and context is available
311
+ if (useReranking && queryContextForReranking && filteredResults.length >= 2) {
312
+ await this._applyParallelReranking(filteredResults, queryText, queryContextForReranking, queryFilePath, queryEmbedding);
313
+ }
314
+
315
+ // Sort by similarity and limit results
316
+ filteredResults.sort((a, b) => b.similarity - a.similarity);
317
+
318
+ if (filteredResults.length > limit) {
319
+ filteredResults = filteredResults.slice(0, limit);
320
+ }
321
+
322
+ console.log(chalk.green(`Found ${filteredResults.length} relevant custom document chunks (${Date.now() - startTime}ms)`));
323
+
324
+ // Log top results for debugging
325
+ if (filteredResults.length > 0) {
326
+ debug(`[Custom Doc Search] Top result: ${filteredResults[0].document_title} (${filteredResults[0].similarity.toFixed(3)})`);
327
+ }
328
+
329
+ return filteredResults;
330
+ } catch (error) {
331
+ console.error(chalk.red(`Error searching custom document chunks: ${error.message}`));
332
+ throw new EmbeddingError(`Custom document search failed: ${error.message}`);
333
+ }
334
+ }
335
+
336
+ /**
337
+ * Get existing custom document chunks for a project
338
+ * @param {string} projectPath - Project path
339
+ * @returns {Promise<Array>} Array of existing chunks
340
+ */
341
+ async getExistingChunks(projectPath) {
342
+ try {
343
+ const resolvedProjectPath = path.resolve(projectPath);
344
+
345
+ // Try memory first
346
+ const existingChunks = this.customDocumentChunks.get(resolvedProjectPath);
347
+ if (existingChunks && existingChunks.length > 0) {
348
+ debug(`[getExistingChunks] Found ${existingChunks.length} existing chunks in memory for project: ${resolvedProjectPath}`);
349
+ return existingChunks;
350
+ }
351
+
352
+ // Try cache manager
353
+ const cachedChunks = await this.cacheManager.getCustomDocuments(resolvedProjectPath);
354
+ if (cachedChunks && cachedChunks.length > 0) {
355
+ // Restore to memory
356
+ this.customDocumentChunks.set(resolvedProjectPath, cachedChunks);
357
+ debug(`[getExistingChunks] Restored ${cachedChunks.length} chunks from cache for project: ${resolvedProjectPath}`);
358
+ return cachedChunks;
359
+ }
360
+
361
+ debug(`[getExistingChunks] No existing chunks found for project: ${resolvedProjectPath}`);
362
+ return [];
363
+ } catch (error) {
364
+ debug(`[getExistingChunks] Error checking existing chunks: ${error.message}`);
365
+ return [];
366
+ }
367
+ }
368
+
369
+ /**
370
+ * Apply sophisticated parallel reranking to custom document chunks
371
+ * @private
372
+ */
373
+ async _applyParallelReranking(filteredResults, queryText, queryContextForReranking, queryFilePath, queryEmbedding) {
374
+ console.log(chalk.cyan('Applying optimized parallel contextual reranking to custom document chunks...'));
375
+
376
+ const WEIGHT_INITIAL_SIM = 0.4;
377
+ const WEIGHT_DOCUMENT_TITLE_MATCH = 0.2;
378
+ const HEAVY_BOOST_SAME_AREA = 0.3;
379
+ const MODERATE_BOOST_TECH_MATCH = 0.15;
380
+ const HEAVY_PENALTY_AREA_MISMATCH = -0.1;
381
+ const PENALTY_GENERIC_DOC_LOW_CONTEXT_MATCH = -0.1;
382
+
383
+ // Pre-calculate common values to avoid redundant computations
384
+ const queryArea = queryContextForReranking.area;
385
+ const queryAreaLower = queryArea?.toLowerCase();
386
+ const queryKeywords = queryContextForReranking.keywords || [];
387
+ const queryKeywordsLower = queryKeywords.map((kw) => kw.toLowerCase());
388
+ const queryTech = queryContextForReranking.dominantTech || [];
389
+ const queryTechLower = queryTech.map((tech) => tech.toLowerCase());
390
+
391
+ // Pre-calculate area matching patterns
392
+ const areaMatchPatterns = queryAreaLower ? [queryAreaLower, queryAreaLower.replace(/[_-]/g, ' ')] : [];
393
+
394
+ // Batch calculate document title embeddings for cache misses
395
+ await this._batchCalculateDocumentTitleEmbeddings(filteredResults);
396
+
397
+ // True parallel processing with pre-computed values
398
+ const rerankingPromises = filteredResults.map(async (result) => {
399
+ let chunkInitialScore = result.similarity * WEIGHT_INITIAL_SIM;
400
+ let contextMatchBonus = 0;
401
+ let titleRelevanceBonus = 0;
402
+ let genericDocPenalty = 0;
403
+ let pathSimilarityScore = 0;
404
+
405
+ const docTitle = result.document_title;
406
+ const contentLower = result.content.toLowerCase();
407
+
408
+ // Vectorized context matching with pre-computed patterns
409
+ if (queryArea !== 'Unknown' && queryArea !== 'General') {
410
+ const areaMatch = areaMatchPatterns.some((pattern) => contentLower.includes(pattern));
411
+
412
+ if (areaMatch) {
413
+ contextMatchBonus += HEAVY_BOOST_SAME_AREA;
414
+
415
+ // Vectorized technology matching
416
+ if (queryTechLower.length > 0) {
417
+ const techMatch = queryTechLower.some((tech) => contentLower.includes(tech));
418
+ if (techMatch) {
419
+ contextMatchBonus += MODERATE_BOOST_TECH_MATCH;
420
+ }
421
+ }
422
+ } else if (queryArea !== 'GeneralJS_TS') {
423
+ contextMatchBonus += HEAVY_PENALTY_AREA_MISMATCH;
424
+ }
425
+ }
426
+
427
+ // Vectorized keyword matching
428
+ if (queryKeywordsLower.length > 0) {
429
+ const matchingKeywords = queryKeywordsLower.filter((keyword) => contentLower.includes(keyword));
430
+ const keywordMatchRatio = matchingKeywords.length / queryKeywordsLower.length;
431
+ contextMatchBonus += keywordMatchRatio * 0.1;
432
+ }
433
+
434
+ // Cached title relevance calculation
435
+ if (docTitle && queryEmbedding) {
436
+ const titleEmb = this.h1EmbeddingCache.get(docTitle);
437
+ if (titleEmb) {
438
+ titleRelevanceBonus = calculateCosineSimilarity(queryEmbedding, titleEmb) * WEIGHT_DOCUMENT_TITLE_MATCH;
439
+ }
440
+ }
441
+
442
+ // Optimized generic document penalty calculation
443
+ const contentLength = result.content.length;
444
+ if (contentLength > 2000 && queryKeywordsLower.length > 0) {
445
+ const matchingKeywords = queryKeywordsLower.filter((kw) => contentLower.includes(kw));
446
+ const specificityScore = matchingKeywords.length / queryKeywordsLower.length;
447
+
448
+ if (specificityScore < 0.3) {
449
+ genericDocPenalty = PENALTY_GENERIC_DOC_LOW_CONTEXT_MATCH;
450
+ }
451
+ }
452
+
453
+ // Cached path similarity calculation
454
+ if (queryFilePath && result.document_title) {
455
+ const pathSim = calculatePathSimilarity(queryFilePath, result.document_title);
456
+ pathSimilarityScore = pathSim * 0.1;
457
+ }
458
+
459
+ const finalScore = chunkInitialScore + contextMatchBonus + titleRelevanceBonus + pathSimilarityScore + genericDocPenalty;
460
+ result.similarity = Math.max(0, Math.min(1, finalScore));
461
+ result.reranked = true;
462
+
463
+ return result;
464
+ });
465
+
466
+ // Wait for all reranking calculations to complete in parallel
467
+ await Promise.all(rerankingPromises);
468
+
469
+ console.log(chalk.cyan(`Parallel reranking completed for ${filteredResults.length} chunks`));
470
+
471
+ // Log debug info for first few results
472
+ for (let i = 0; i < Math.min(3, filteredResults.length); i++) {
473
+ const result = filteredResults[i];
474
+ debug(`[CustomDocRerank] ${result.document_title?.substring(0, 30)}... Final=${result.similarity.toFixed(4)}`);
475
+ }
476
+
477
+ debug('Optimized parallel contextual reranking of custom document chunks complete.');
478
+ }
479
+
480
+ /**
481
+ * Batch calculate document title embeddings for performance
482
+ * @private
483
+ */
484
+ async _batchCalculateDocumentTitleEmbeddings(results) {
485
+ const uniqueDocTitles = new Set();
486
+ const docTitlesToCalculate = [];
487
+
488
+ for (const result of results) {
489
+ const docTitle = result.document_title;
490
+ if (docTitle && !uniqueDocTitles.has(docTitle)) {
491
+ uniqueDocTitles.add(docTitle);
492
+ if (!this.h1EmbeddingCache.has(docTitle)) {
493
+ docTitlesToCalculate.push(docTitle);
494
+ }
495
+ }
496
+ }
497
+
498
+ // Batch calculate document title embeddings for cache misses
499
+ if (docTitlesToCalculate.length > 0) {
500
+ debug(`[OPTIMIZATION] Batch calculating ${docTitlesToCalculate.length} custom document title embeddings`);
501
+ try {
502
+ const titleEmbeddings = await this.modelManager.calculateEmbeddingBatch(docTitlesToCalculate);
503
+ for (let i = 0; i < docTitlesToCalculate.length; i++) {
504
+ if (titleEmbeddings[i]) {
505
+ this.h1EmbeddingCache.set(docTitlesToCalculate[i], titleEmbeddings[i]);
506
+ }
507
+ }
508
+ } catch (error) {
509
+ debug(`[OPTIMIZATION] Error in batch title embedding calculation: ${error.message}`);
510
+ // Continue without title embeddings
511
+ }
512
+ }
513
+ }
514
+
515
+ /**
516
+ * Clear custom document chunks for a project
517
+ * @param {string} projectPath - Project path
518
+ */
519
+ async clearProjectChunks(projectPath) {
520
+ try {
521
+ const resolvedProjectPath = path.resolve(projectPath);
522
+ this.customDocumentChunks.delete(resolvedProjectPath);
523
+ await this.cacheManager.clearCustomDocuments(resolvedProjectPath);
524
+ console.log(chalk.green(`Cleared custom document chunks for project: ${resolvedProjectPath}`));
525
+ } catch (error) {
526
+ console.error(chalk.red(`Error clearing project chunks: ${error.message}`));
527
+ }
528
+ }
529
+
530
+ /**
531
+ * Get all projects with custom documents
532
+ * @returns {Array} Array of project paths
533
+ */
534
+ getProjectsWithCustomDocuments() {
535
+ return Array.from(this.customDocumentChunks.keys());
536
+ }
537
+
538
+ /**
539
+ * Get performance metrics
540
+ * @returns {Object} Performance metrics
541
+ */
542
+ getPerformanceMetrics() {
543
+ return {
544
+ ...this.performanceMetrics,
545
+ averageProcessingTime:
546
+ this.performanceMetrics.documentsProcessed > 0
547
+ ? this.performanceMetrics.processingTime / this.performanceMetrics.documentsProcessed
548
+ : 0,
549
+ embeddingEfficiency:
550
+ this.performanceMetrics.chunksGenerated > 0
551
+ ? (this.performanceMetrics.embeddingsCalculated / this.performanceMetrics.chunksGenerated) * 100
552
+ : 0,
553
+ cacheSize: this.h1EmbeddingCache.size,
554
+ activeProjects: this.customDocumentChunks.size,
555
+ };
556
+ }
557
+
558
+ /**
559
+ * Clear all caches
560
+ */
561
+ clearCaches() {
562
+ this.h1EmbeddingCache.clear();
563
+ this.customDocumentChunks.clear();
564
+ console.log(chalk.green('CustomDocumentProcessor caches cleared'));
565
+ }
566
+
567
+ /**
568
+ * Cleanup resources
569
+ */
570
+ async cleanup() {
571
+ if (this.cleaningUp) {
572
+ return; // Already cleaning up, prevent duplicate calls
573
+ }
574
+
575
+ this.cleaningUp = true;
576
+
577
+ try {
578
+ // Clear LOCAL caches only (not system-wide caches)
579
+ this.h1EmbeddingCache.clear();
580
+ this.customDocumentChunks.clear();
581
+
582
+ // Reset LOCAL performance metrics
583
+ this.performanceMetrics = {
584
+ documentsProcessed: 0,
585
+ chunksGenerated: 0,
586
+ embeddingsCalculated: 0,
587
+ batchSuccessRate: 0,
588
+ averageChunkSize: 0,
589
+ processingTime: 0,
590
+ };
591
+
592
+ console.log(chalk.green('CustomDocumentProcessor cleanup complete'));
593
+ } finally {
594
+ this.cleaningUp = false;
595
+ }
596
+ }
597
+ }