codecritique 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/LICENSE +21 -0
  2. package/README.md +1145 -0
  3. package/package.json +98 -0
  4. package/src/content-retrieval.js +747 -0
  5. package/src/custom-documents.js +597 -0
  6. package/src/embeddings/cache-manager.js +364 -0
  7. package/src/embeddings/constants.js +40 -0
  8. package/src/embeddings/database.js +921 -0
  9. package/src/embeddings/errors.js +208 -0
  10. package/src/embeddings/factory.js +447 -0
  11. package/src/embeddings/file-processor.js +851 -0
  12. package/src/embeddings/model-manager.js +337 -0
  13. package/src/embeddings/similarity-calculator.js +97 -0
  14. package/src/embeddings/types.js +113 -0
  15. package/src/feedback-loader.js +384 -0
  16. package/src/index.js +1418 -0
  17. package/src/llm.js +123 -0
  18. package/src/pr-history/analyzer.js +579 -0
  19. package/src/pr-history/bot-detector.js +123 -0
  20. package/src/pr-history/cli-utils.js +204 -0
  21. package/src/pr-history/comment-processor.js +549 -0
  22. package/src/pr-history/database.js +819 -0
  23. package/src/pr-history/github-client.js +629 -0
  24. package/src/project-analyzer.js +955 -0
  25. package/src/rag-analyzer.js +2764 -0
  26. package/src/rag-review.js +566 -0
  27. package/src/technology-keywords.json +753 -0
  28. package/src/utils/command.js +48 -0
  29. package/src/utils/constants.js +263 -0
  30. package/src/utils/context-inference.js +364 -0
  31. package/src/utils/document-detection.js +105 -0
  32. package/src/utils/file-validation.js +271 -0
  33. package/src/utils/git.js +232 -0
  34. package/src/utils/language-detection.js +170 -0
  35. package/src/utils/logging.js +24 -0
  36. package/src/utils/markdown.js +132 -0
  37. package/src/utils/mobilebert-tokenizer.js +141 -0
  38. package/src/utils/pr-chunking.js +276 -0
  39. package/src/utils/string-utils.js +28 -0
  40. package/src/zero-shot-classifier-open.js +392 -0
@@ -0,0 +1,747 @@
1
+ /**
2
+ * Content Retrieval Service
3
+ *
4
+ * This module provides sophisticated content retrieval capabilities with:
5
+ * - Hybrid search combining vector similarity and full-text search
6
+ * - Context-aware reranking algorithms
7
+ * - Project-specific filtering and isolation
8
+ * - H1 embedding cache integration
9
+ * - Parallel processing for optimal performance
10
+ *
11
+ * @module ContentRetrieval
12
+ */
13
+
14
+ import fs from 'fs';
15
+ import path from 'path';
16
+ import chalk from 'chalk';
17
+ import { CacheManager } from './embeddings/cache-manager.js';
18
+ import { TABLE_NAMES } from './embeddings/constants.js';
19
+ import { DatabaseManager } from './embeddings/database.js';
20
+ import { EmbeddingError } from './embeddings/errors.js';
21
+ import { ModelManager } from './embeddings/model-manager.js';
22
+ import { calculateCosineSimilarity, calculatePathSimilarity } from './embeddings/similarity-calculator.js';
23
+ import { inferContextFromDocumentContent } from './utils/context-inference.js';
24
+ import { isGenericDocument, getGenericDocumentContext } from './utils/document-detection.js';
25
+ import { isDocumentationFile } from './utils/file-validation.js';
26
+ import { debug } from './utils/logging.js';
27
+
28
// Local aliases for the table names taken from TABLE_NAMES: findSimilarCode reads
// FILE_EMBEDDINGS_TABLE and findRelevantDocs reads DOCUMENT_CHUNK_TABLE.
+ const FILE_EMBEDDINGS_TABLE = TABLE_NAMES.FILE_EMBEDDINGS;
29
+ const DOCUMENT_CHUNK_TABLE = TABLE_NAMES.DOCUMENT_CHUNK;
30
+
31
/** Similarity assigned to a search hit that exposes neither `_distance` nor `_score`. */
const FALLBACK_SIMILARITY = 0.5;

/** Maximum number of document contexts resolved concurrently during reranking. */
const CONTEXT_BATCH_SIZE = 3;

/** Pause, in milliseconds, inserted between two consecutive context batches. */
const CONTEXT_BATCH_PAUSE_MS = 10;

/** A project-structure record is only returned when its cosine similarity exceeds this value. */
const PROJECT_STRUCTURE_MIN_SIMILARITY = 0.5;

/** Weights, boosts and penalties combined into the reranked documentation score. */
const DOC_RERANK_WEIGHTS = Object.freeze({
  WEIGHT_INITIAL_SIM: 0.3,
  WEIGHT_H1_CHUNK_RERANK: 0.15,
  WEIGHT_PATH_SIMILARITY: 0.1,
  HEAVY_BOOST_SAME_AREA: 0.4,
  MODERATE_BOOST_TECH_MATCH: 0.2,
  HEAVY_PENALTY_AREA_MISMATCH: -0.1,
  PENALTY_GENERIC_DOC_LOW_CONTEXT_MATCH: -0.1,
});

/** Build a zeroed metrics record (used by the constructor and by cleanup()). */
function createEmptyMetrics() {
  return {
    searchCount: 0,
    totalSearchTime: 0,
    cacheHitRate: 0,
    parallelRerankingTime: 0,
  };
}

/** Clamp a number into the closed interval [0, 1]. */
function clampToUnitInterval(value) {
  return Math.max(0, Math.min(1, value));
}

/** Double single quotes so a value can be embedded in a quoted filter literal. */
function escapeSqlLiteral(value) {
  return String(value).replace(/'/g, "''");
}

/**
 * Report whether `candidate` is `root` itself or lies below it.
 * Uses path.relative instead of a string-prefix test so that a sibling such as
 * `/x/proj-other` is not mistaken for a child of `/x/proj`, and so that `..`
 * segments in `candidate` are normalized before the comparison.
 */
function isPathWithin(root, candidate) {
  const relative = path.relative(root, candidate);
  if (relative === '') {
    return true;
  }
  if (relative === '..' || relative.startsWith(`..${path.sep}`)) {
    return false;
  }
  // path.relative returns an absolute path when no relative route exists (e.g. another drive).
  return !path.isAbsolute(relative);
}

/** Convert a raw search record into a similarity in [0, 1]. */
function hitSimilarity(record, distanceToSimilarity) {
  if (record._distance !== undefined) {
    return clampToUnitInterval(distanceToSimilarity(record._distance));
  }
  if (record._score !== undefined) {
    return clampToUnitInterval(record._score);
  }
  return FALLBACK_SIMILARITY;
}

/**
 * Check whether the table schema declares a `project_path` field.
 * `table.schema` is read either as a value/promise or, when it is a function, by calling it.
 * NOTE(review): which of the two forms the underlying table object uses is not visible here.
 */
async function tableHasProjectPathColumn(table) {
  try {
    const schemaSource = table.schema;
    const schema = await (typeof schemaSource === 'function' ? schemaSource.call(table) : schemaSource);
    return Boolean(schema?.fields?.some((field) => field.name === 'project_path'));
  } catch (schemaError) {
    debug(`Could not check schema for project_path field: ${schemaError.message}`);
    return false;
  }
}

/**
 * Decide, for every record, whether it belongs to the project rooted at `projectRoot`.
 * Records carrying `project_path` are matched by equality. Otherwise the stored path
 * must lie inside the project root, and relative paths must also exist on disk.
 * @returns {Promise<boolean[]>} One flag per record, in input order.
 */
async function resolveProjectMembership(records, projectRoot, pickPath, label) {
  const membership = new Array(records.length).fill(false);
  const pendingChecks = [];

  records.forEach((record, index) => {
    if (record.project_path) {
      membership[index] = record.project_path === projectRoot;
      return;
    }

    const storedPath = pickPath(record);
    if (!storedPath) {
      return;
    }

    try {
      if (path.isAbsolute(storedPath)) {
        membership[index] = isPathWithin(projectRoot, storedPath);
        return;
      }

      const absolutePath = path.resolve(projectRoot, storedPath);
      if (isPathWithin(projectRoot, absolutePath)) {
        pendingChecks.push({ index, absolutePath, storedPath });
      }
    } catch (error) {
      debug(`Error filtering result for project: ${error.message}`);
    }
  });

  if (pendingChecks.length > 0) {
    debug(`[OPTIMIZATION] Batch checking existence of ${pendingChecks.length} ${label}s`);
    await Promise.all(
      pendingChecks.map(async ({ index, absolutePath, storedPath }) => {
        try {
          await fs.promises.access(absolutePath, fs.constants.F_OK);
          membership[index] = true;
        } catch {
          debug(`Filtering out non-existent ${label}: ${storedPath}`);
        }
      })
    );
  }

  return membership;
}

/** Case-insensitive test for at least one shared entry between two technology lists. */
function sharesTechnology(queryTech, documentTech) {
  if (!Array.isArray(queryTech) || !Array.isArray(documentTech)) {
    return false;
  }
  const known = new Set(documentTech.map((tech) => String(tech).toLowerCase()));
  return queryTech.some((tech) => known.has(String(tech).toLowerCase()));
}

/** Compute and cache embeddings for every document title not yet in the H1 cache. */
async function ensureH1Embeddings(retriever, chunks) {
  const titles = new Set(chunks.map((chunk) => chunk.document_title).filter(Boolean));
  const missing = [...titles].filter((title) => !retriever.h1EmbeddingCache.has(title));
  if (missing.length === 0) {
    return;
  }

  debug(`[OPTIMIZATION] Batch calculating ${missing.length} H1 embeddings`);
  const embeddings = await retriever.modelManager.calculateEmbeddingBatch(missing);
  missing.forEach((title, index) => {
    if (embeddings?.[index]) {
      retriever.h1EmbeddingCache.set(title, embeddings[index]);
    }
  });
}

/**
 * Resolve the context of one document and store it in the retriever's context cache.
 * A computation already in flight for the same path (e.g. started by a concurrent
 * search) is awaited instead of being started a second time.
 */
async function loadDocumentContext(retriever, normalizedPath, job, queryContext) {
  const inFlight = retriever.documentContextPromiseCache.get(normalizedPath);
  if (inFlight) {
    retriever.documentContextCache.set(normalizedPath, await inFlight);
    return;
  }

  const { originalPath, docH1, chunk } = job;
  const contextPromise = (async () => {
    try {
      // Fast path: generic documents get their context without running the inference helper.
      if (isGenericDocument(originalPath, docH1)) {
        const genericContext = getGenericDocumentContext(originalPath, docH1);
        debug(`[FAST-PATH] Using pre-computed context for generic document: ${originalPath}`);
        return genericContext;
      }
      return await inferContextFromDocumentContent(originalPath, docH1, [chunk], queryContext.language || 'typescript');
    } catch (error) {
      debug(`[ERROR] Failed to get context for ${originalPath}: ${error.message}`);
      // Fallback context keeps the reranking pipeline going when inference fails.
      return {
        area: 'Unknown',
        dominantTech: [],
        isGeneralPurposeReadmeStyle: true,
      };
    }
  })();

  retriever.documentContextPromiseCache.set(normalizedPath, contextPromise);
  try {
    retriever.documentContextCache.set(normalizedPath, await contextPromise);
  } finally {
    // Only drop the entry this call created; the cache may have been cleared meanwhile.
    if (retriever.documentContextPromiseCache.get(normalizedPath) === contextPromise) {
      retriever.documentContextPromiseCache.delete(normalizedPath);
    }
  }
}

/** Make sure every document referenced by `chunks` has a cached context, working in small batches. */
async function ensureDocumentContexts(retriever, chunks, projectRoot, queryContext) {
  const pending = new Map();
  for (const chunk of chunks) {
    if (!chunk.path) {
      continue; // nothing to resolve the context against
    }
    const normalizedPath = path.resolve(projectRoot, chunk.path);
    if (pending.has(normalizedPath) || retriever.documentContextCache.has(normalizedPath)) {
      continue;
    }
    pending.set(normalizedPath, { originalPath: chunk.path, docH1: chunk.document_title, chunk });
  }

  if (pending.size === 0) {
    return;
  }

  debug(`[OPTIMIZATION] Batch calculating ${pending.size} document contexts with concurrency limit`);
  const jobs = [...pending.entries()];
  for (let offset = 0; offset < jobs.length; offset += CONTEXT_BATCH_SIZE) {
    const batch = jobs.slice(offset, offset + CONTEXT_BATCH_SIZE);
    await Promise.all(batch.map(([normalizedPath, job]) => loadDocumentContext(retriever, normalizedPath, job, queryContext)));

    if (offset + CONTEXT_BATCH_SIZE < jobs.length) {
      await new Promise((resolve) => setTimeout(resolve, CONTEXT_BATCH_PAUSE_MS));
    }
  }
}

/**
 * Rescore documentation chunks in place using area/technology context, title (H1)
 * similarity, path similarity and a penalty for generic documents from another area.
 * Sets `similarity` (clamped to [0, 1]) and `reranked = true` on every chunk and adds
 * the elapsed time to `performanceMetrics.parallelRerankingTime`.
 */
async function rerankDocumentationChunks(retriever, chunks, settings) {
  const { queryText, queryFilePath, queryContext, projectRoot, precomputedQueryEmbedding } = settings;
  const rerankStartedAt = Date.now();

  const queryEmbedding = precomputedQueryEmbedding || (await retriever.modelManager.calculateQueryEmbedding(queryText));
  await ensureH1Embeddings(retriever, chunks);
  await ensureDocumentContexts(retriever, chunks, projectRoot, queryContext);

  for (const chunk of chunks) {
    const docH1 = chunk.document_title;
    const parentContext = chunk.path ? retriever.documentContextCache.get(path.resolve(projectRoot, chunk.path)) : undefined;

    const chunkInitialScore = chunk.similarity * DOC_RERANK_WEIGHTS.WEIGHT_INITIAL_SIM;
    let contextMatchBonus = 0;
    let h1RelevanceBonus = 0;
    let genericDocPenalty = 0;
    let pathSimilarityScore = 0;

    const areasComparable =
      parentContext && queryContext.area !== 'Unknown' && parentContext.area !== 'Unknown' && parentContext.area !== 'General';
    if (areasComparable) {
      if (queryContext.area === parentContext.area) {
        contextMatchBonus += DOC_RERANK_WEIGHTS.HEAVY_BOOST_SAME_AREA;
        if (sharesTechnology(queryContext.dominantTech, parentContext.dominantTech)) {
          contextMatchBonus += DOC_RERANK_WEIGHTS.MODERATE_BOOST_TECH_MATCH;
        }
      } else if (queryContext.area !== 'GeneralJS_TS') {
        contextMatchBonus += DOC_RERANK_WEIGHTS.HEAVY_PENALTY_AREA_MISMATCH;
      }
    }

    if (docH1) {
      const h1Embedding = retriever.h1EmbeddingCache.get(docH1);
      if (h1Embedding && queryEmbedding) {
        h1RelevanceBonus = calculateCosineSimilarity(queryEmbedding, h1Embedding) * DOC_RERANK_WEIGHTS.WEIGHT_H1_CHUNK_RERANK;
      }
    }

    if (parentContext?.isGeneralPurposeReadmeStyle && queryContext.area !== parentContext.area) {
      genericDocPenalty = DOC_RERANK_WEIGHTS.PENALTY_GENERIC_DOC_LOW_CONTEXT_MATCH;
      debug(`[findRelevantDocs] Doc ${chunk.path} is generic with low context match, applying penalty: ${genericDocPenalty}`);
    }

    if (queryFilePath && chunk.path) {
      pathSimilarityScore = calculatePathSimilarity(queryFilePath, chunk.path) * DOC_RERANK_WEIGHTS.WEIGHT_PATH_SIMILARITY;
    }

    const finalScore = chunkInitialScore + contextMatchBonus + h1RelevanceBonus + pathSimilarityScore + genericDocPenalty;
    chunk.similarity = clampToUnitInterval(finalScore);
    chunk.reranked = true;
  }

  for (const chunk of chunks.slice(0, 5)) {
    debug(`[SophisticatedRerank] ${chunk.path?.substring(0, 30)}... Final=${chunk.similarity.toFixed(4)}`);
  }

  retriever.performanceMetrics.parallelRerankingTime += Date.now() - rerankStartedAt;
  debug('Sophisticated contextual reranking of documentation complete.');
}

/**
 * Look up the stored project-structure record (project-specific id first, generic id
 * as fallback) and turn it into a result when it is similar enough to the query.
 * @returns {Promise<object|null>} A `project-structure` result, or null when nothing qualifies.
 */
async function findProjectStructureHit(table, projectRoot, loadQueryEmbedding) {
  const projectStructureId = `__project_structure__${path.basename(projectRoot)}`;
  let matches = await table.query().where(`id = '${escapeSqlLiteral(projectStructureId)}'`).limit(1).toArray();
  if (matches.length === 0) {
    matches = await table.query().where("id = '__project_structure__'").limit(1).toArray();
  }

  const record = matches[0];
  if (!record?.vector) {
    return null;
  }

  const queryEmbedding = await loadQueryEmbedding();
  if (!queryEmbedding) {
    return null;
  }

  const similarity = calculateCosineSimilarity(queryEmbedding, Array.from(record.vector));
  // Written as a negated ">" so that a NaN similarity is rejected as well.
  if (!(similarity > PROJECT_STRUCTURE_MIN_SIMILARITY)) {
    return null;
  }

  return {
    similarity,
    type: 'project-structure',
    content: record.content,
    path: record.path,
    file_path: record.path,
    language: 'text',
    reranked: false,
  };
}

/**
 * ContentRetriever searches the documentation-chunk and file-embedding tables,
 * restricts hits to the requested project and optionally reranks documentation
 * using cached title embeddings and per-document contexts.
 */
export class ContentRetriever {
  /**
   * @param {Object} [options]
   * @param {Object} [options.modelManager] - Embedding provider; defaults to a new ModelManager.
   * @param {Object} [options.database] - Table provider; defaults to a new DatabaseManager.
   * @param {Object} [options.cacheManager] - Defaults to a new CacheManager (stored, not used by this class).
   */
  constructor(options = {}) {
    this.modelManager = options.modelManager || new ModelManager();
    this.database = options.database || new DatabaseManager();
    this.cacheManager = options.cacheManager || new CacheManager();

    // Instance-local caches reused across searches.
    this.h1EmbeddingCache = new Map(); // document title -> embedding
    this.documentContextCache = new Map(); // absolute document path -> context
    this.documentContextPromiseCache = new Map(); // absolute document path -> in-flight context promise

    // searchCount / totalSearchTime cover both search methods; parallelRerankingTime
    // accumulates documentation reranking time; cacheHitRate is not updated by this class.
    this.performanceMetrics = createEmptyMetrics();

    // Cleanup guard
    this.cleaningUp = false;
  }

  /**
   * Find documentation chunks relevant to a query, limited to one project.
   * @param {string} queryText - The search query
   * @param {Object} [options] - Search configuration
   * @param {number} [options.limit=10] - Maximum number of results returned
   * @param {number} [options.similarityThreshold=0.1] - Minimum pre-rerank similarity
   * @param {boolean} [options.useReranking=true] - Rerank when a query context is given and at least 3 hits remain
   * @param {string|null} [options.queryFilePath=null] - Path of the file under review, used for path similarity
   * @param {Object|null} [options.queryContextForReranking=null] - Context with `area`, `dominantTech`, `language`
   * @param {string} [options.projectPath=process.cwd()] - Project root used for filtering
   * @param {*} [options.precomputedQueryEmbedding=null] - Query embedding to reuse instead of computing one
   * @returns {Promise<Array>} Results sorted by descending similarity; empty for a blank query or missing table
   * @throws {EmbeddingError} When any step of the search fails
   */
  async findRelevantDocs(queryText, options = {}) {
    const {
      limit = 10,
      similarityThreshold = 0.1,
      useReranking = true,
      queryFilePath = null,
      queryContextForReranking = null,
      projectPath = process.cwd(),
      precomputedQueryEmbedding = null,
    } = options;

    const startedAt = Date.now();
    this.performanceMetrics.searchCount++;

    try {
      if (!queryText?.trim()) {
        console.warn(chalk.yellow('Empty query text provided for documentation search'));
        return [];
      }

      console.log(
        chalk.cyan(`Native hybrid documentation search - limit: ${limit}, threshold: ${similarityThreshold}, reranking: ${useReranking}`)
      );

      await this.database.connect();
      const table = await this.database.getTable(DOCUMENT_CHUNK_TABLE);

      if (!table) {
        console.warn(chalk.yellow(`Documentation table ${DOCUMENT_CHUNK_TABLE} not found`));
        return [];
      }

      console.log(chalk.cyan('Performing native hybrid search for documentation...'));
      let query = table.search(queryText).nearestToText(queryText);

      const resolvedProjectPath = path.resolve(projectPath);
      if (await tableHasProjectPathColumn(table)) {
        query = query.where(`project_path = '${escapeSqlLiteral(resolvedProjectPath)}'`);
        debug(`Filtering documentation by project_path: ${resolvedProjectPath}`);
      }

      // Over-fetch so enough candidates survive project filtering and thresholding.
      const results = await query.limit(Math.max(limit * 3, 20)).toArray();
      console.log(chalk.green(`Native hybrid search returned ${results.length} documentation results`));

      const membership = await resolveProjectMembership(
        results,
        resolvedProjectPath,
        (record) => record.original_document_path,
        'documentation file'
      );
      const projectFilteredResults = results.filter((_, index) => membership[index]);
      console.log(chalk.blue(`Filtered to ${projectFilteredResults.length} documentation results from current project`));

      let finalResults = projectFilteredResults
        .map((record) => ({
          similarity: hitSimilarity(record, (distance) => 1 - distance),
          type: 'documentation-chunk',
          content: record.content,
          path: record.original_document_path,
          file_path: record.original_document_path,
          language: record.language,
          headingText: record.heading_text,
          document_title: record.document_title,
          startLine: record.start_line,
          reranked: false,
        }))
        .filter((hit) => hit.similarity >= similarityThreshold);

      if (useReranking && queryContextForReranking && finalResults.length >= 3) {
        console.log(chalk.cyan('Applying sophisticated contextual reranking to documentation...'));
        await rerankDocumentationChunks(this, finalResults, {
          queryText,
          queryFilePath,
          queryContext: queryContextForReranking,
          projectRoot: resolvedProjectPath,
          precomputedQueryEmbedding,
        });
      }

      finalResults.sort((a, b) => b.similarity - a.similarity);
      if (finalResults.length > limit) {
        finalResults = finalResults.slice(0, limit);
      }

      console.log(chalk.green(`Returning ${finalResults.length} documentation results`));
      return finalResults;
    } catch (error) {
      console.error(chalk.red(`Error in findRelevantDocs: ${error.message}`), error);
      throw new EmbeddingError(`Documentation search failed: ${error.message}`);
    } finally {
      this.performanceMetrics.totalSearchTime += Date.now() - startedAt;
    }
  }

  /**
   * Find code files similar to a query, limited to one project.
   * Unlike findRelevantDocs, failures are logged and reported as an empty array.
   * @param {string} queryText - The text query
   * @param {Object} [options] - Search options
   * @param {number} [options.limit=5] - Maximum number of results returned
   * @param {number} [options.similarityThreshold=0.7] - Minimum similarity for file results
   * @param {boolean} [options.includeProjectStructure=false] - Also consider the stored project-structure record
   * @param {string|null} [options.queryFilePath=null] - File under review; excluded from the results
   * @param {string} [options.projectPath=process.cwd()] - Project root used for filtering
   * @param {boolean|null} [options.isTestFile=null] - true: only test files, false: no test files, null: no filter
   * @param {*} [options.precomputedQueryEmbedding=null] - Query embedding to reuse instead of computing one
   * @returns {Promise<Array<object>>} Results sorted by descending similarity
   */
  async findSimilarCode(queryText, options = {}) {
    const {
      limit = 5,
      similarityThreshold = 0.7,
      includeProjectStructure = false,
      queryFilePath = null,
      projectPath = process.cwd(),
      isTestFile = null,
      precomputedQueryEmbedding = null,
    } = options;

    const startedAt = Date.now();
    this.performanceMetrics.searchCount++;

    console.log(chalk.cyan(`Native hybrid code search - limit: ${limit}, threshold: ${similarityThreshold}, isTestFile: ${isTestFile}`));

    try {
      if (!queryText?.trim()) {
        console.warn(chalk.yellow('Empty query text provided'));
        return [];
      }

      await this.database.connect();
      const table = await this.database.getTable(FILE_EMBEDDINGS_TABLE);

      if (!table) {
        console.warn(chalk.yellow(`Table ${FILE_EMBEDDINGS_TABLE} not found`));
        return [];
      }

      console.log(chalk.cyan('Performing native hybrid search for code...'));
      const conditions = ["type != 'directory-structure'"];

      if (isTestFile !== null) {
        if (isTestFile) {
          conditions.push(`(path LIKE '%.test.%' OR path LIKE '%.spec.%' OR path LIKE '%_test.py' OR path LIKE 'test_%.py')`);
          console.log(chalk.blue(`Filtering to include only test files.`));
        } else {
          conditions.push(
            `(path NOT LIKE '%.test.%' AND path NOT LIKE '%.spec.%' AND path NOT LIKE '%_test.py' AND path NOT LIKE 'test_%.py')`
          );
          console.log(chalk.blue(`Filtering to exclude test files.`));
        }
      }

      const resolvedProjectPath = path.resolve(projectPath);

      // Exclude the file being reviewed, in both its absolute and project-relative spelling.
      if (queryFilePath) {
        const normalizedQueryPath = path.resolve(resolvedProjectPath, queryFilePath);
        conditions.push(`path != '${escapeSqlLiteral(normalizedQueryPath)}'`);

        const relativePath = path.relative(resolvedProjectPath, normalizedQueryPath);
        if (relativePath && !relativePath.startsWith('..')) {
          conditions.push(`path != '${escapeSqlLiteral(relativePath)}'`);
        }

        debug(`Excluding file being reviewed from similar code search: ${normalizedQueryPath}`);
      }

      if (await tableHasProjectPathColumn(table)) {
        conditions.push(`project_path = '${escapeSqlLiteral(resolvedProjectPath)}'`);
        debug(`Filtering by project_path: ${resolvedProjectPath}`);
      }

      const query = table.search(queryText).nearestToText(queryText).where(conditions.join(' AND '));

      // Over-fetch so enough candidates survive project filtering and thresholding.
      const results = await query.limit(Math.max(limit * 3, 20)).toArray();
      console.log(chalk.green(`Native hybrid search returned ${results.length} results`));

      const membership = await resolveProjectMembership(
        results,
        resolvedProjectPath,
        (record) => record.original_document_path || record.path,
        'file'
      );
      const projectFilteredResults = results.filter((_, index) => membership[index]);
      console.log(chalk.blue(`Filtered to ${projectFilteredResults.length} results from current project`));

      let finalResults = projectFilteredResults
        .map((record) => ({
          // Exponential decay spreads distances out instead of saturating at 1.
          similarity: hitSimilarity(record, (distance) => Math.exp(-distance * 2)),
          type: 'file',
          content: record.content,
          path: record.path,
          file_path: record.path,
          language: record.language,
          reranked: false,
          isDocumentation: isDocumentationFile(record.path, record.language),
        }))
        .filter((hit) => hit.similarity >= similarityThreshold);

      if (includeProjectStructure) {
        try {
          const structureHit = await findProjectStructureHit(
            table,
            resolvedProjectPath,
            async () => precomputedQueryEmbedding || (await this.modelManager.calculateQueryEmbedding(queryText))
          );
          if (structureHit) {
            finalResults.push(structureHit);
          }
        } catch (error) {
          console.warn(chalk.yellow(`Project structure inclusion failed: ${error.message}`));
        }
      }

      finalResults.sort((a, b) => b.similarity - a.similarity);
      if (finalResults.length > limit) {
        finalResults = finalResults.slice(0, limit);
      }

      console.log(chalk.green(`Returning ${finalResults.length} optimized hybrid search results`));
      return finalResults;
    } catch (error) {
      console.error(chalk.red(`Error in optimized findSimilarCode: ${error.message}`), error);
      return [];
    } finally {
      this.performanceMetrics.totalSearchTime += Date.now() - startedAt;
    }
  }

  /**
   * Get performance metrics
   * @returns {Object} Raw counters plus `averageSearchTime` (ms per search, 0 before the
   *   first search), `cacheSize` (H1 embedding cache) and `documentContextCacheSize`
   */
  getPerformanceMetrics() {
    const { searchCount, totalSearchTime } = this.performanceMetrics;
    return {
      ...this.performanceMetrics,
      averageSearchTime: searchCount > 0 ? totalSearchTime / searchCount : 0,
      cacheSize: this.h1EmbeddingCache.size,
      documentContextCacheSize: this.documentContextCache.size,
    };
  }

  /**
   * Clear all caches held by this instance (metrics are left untouched).
   */
  clearCaches() {
    this.h1EmbeddingCache.clear();
    this.documentContextCache.clear();
    this.documentContextPromiseCache.clear();
    console.log(chalk.green('ContentRetriever caches cleared'));
  }

  /**
   * Cleanup resources: empties the instance caches and resets the metrics.
   * A call made while another cleanup is running returns immediately.
   */
  async cleanup() {
    if (this.cleaningUp) {
      return; // Already cleaning up, prevent duplicate calls
    }

    this.cleaningUp = true;

    try {
      this.h1EmbeddingCache.clear();
      this.documentContextCache.clear();
      this.documentContextPromiseCache.clear();
      this.performanceMetrics = createEmptyMetrics();

      console.log(chalk.green('ContentRetriever cleanup complete'));
    } finally {
      this.cleaningUp = false;
    }
  }
}