codecritique 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1145 -0
- package/package.json +98 -0
- package/src/content-retrieval.js +747 -0
- package/src/custom-documents.js +597 -0
- package/src/embeddings/cache-manager.js +364 -0
- package/src/embeddings/constants.js +40 -0
- package/src/embeddings/database.js +921 -0
- package/src/embeddings/errors.js +208 -0
- package/src/embeddings/factory.js +447 -0
- package/src/embeddings/file-processor.js +851 -0
- package/src/embeddings/model-manager.js +337 -0
- package/src/embeddings/similarity-calculator.js +97 -0
- package/src/embeddings/types.js +113 -0
- package/src/feedback-loader.js +384 -0
- package/src/index.js +1418 -0
- package/src/llm.js +123 -0
- package/src/pr-history/analyzer.js +579 -0
- package/src/pr-history/bot-detector.js +123 -0
- package/src/pr-history/cli-utils.js +204 -0
- package/src/pr-history/comment-processor.js +549 -0
- package/src/pr-history/database.js +819 -0
- package/src/pr-history/github-client.js +629 -0
- package/src/project-analyzer.js +955 -0
- package/src/rag-analyzer.js +2764 -0
- package/src/rag-review.js +566 -0
- package/src/technology-keywords.json +753 -0
- package/src/utils/command.js +48 -0
- package/src/utils/constants.js +263 -0
- package/src/utils/context-inference.js +364 -0
- package/src/utils/document-detection.js +105 -0
- package/src/utils/file-validation.js +271 -0
- package/src/utils/git.js +232 -0
- package/src/utils/language-detection.js +170 -0
- package/src/utils/logging.js +24 -0
- package/src/utils/markdown.js +132 -0
- package/src/utils/mobilebert-tokenizer.js +141 -0
- package/src/utils/pr-chunking.js +276 -0
- package/src/utils/string-utils.js +28 -0
- package/src/zero-shot-classifier-open.js +392 -0
|
@@ -0,0 +1,747 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content Retrieval Service
|
|
3
|
+
*
|
|
4
|
+
* This module provides sophisticated content retrieval capabilities with:
|
|
5
|
+
* - Hybrid search combining vector similarity and full-text search
|
|
6
|
+
* - Context-aware reranking algorithms
|
|
7
|
+
* - Project-specific filtering and isolation
|
|
8
|
+
* - H1 embedding cache integration
|
|
9
|
+
* - Parallel processing for optimal performance
|
|
10
|
+
*
|
|
11
|
+
* @module ContentRetrieval
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import fs from 'fs';
|
|
15
|
+
import path from 'path';
|
|
16
|
+
import chalk from 'chalk';
|
|
17
|
+
import { CacheManager } from './embeddings/cache-manager.js';
|
|
18
|
+
import { TABLE_NAMES } from './embeddings/constants.js';
|
|
19
|
+
import { DatabaseManager } from './embeddings/database.js';
|
|
20
|
+
import { EmbeddingError } from './embeddings/errors.js';
|
|
21
|
+
import { ModelManager } from './embeddings/model-manager.js';
|
|
22
|
+
import { calculateCosineSimilarity, calculatePathSimilarity } from './embeddings/similarity-calculator.js';
|
|
23
|
+
import { inferContextFromDocumentContent } from './utils/context-inference.js';
|
|
24
|
+
import { isGenericDocument, getGenericDocumentContext } from './utils/document-detection.js';
|
|
25
|
+
import { isDocumentationFile } from './utils/file-validation.js';
|
|
26
|
+
import { debug } from './utils/logging.js';
|
|
27
|
+
|
|
28
|
+
// Table names for the two search paths (code files and documentation chunks),
// read once from the shared embeddings constants.
const { FILE_EMBEDDINGS: FILE_EMBEDDINGS_TABLE, DOCUMENT_CHUNK: DOCUMENT_CHUNK_TABLE } = TABLE_NAMES;
|
|
30
|
+
|
|
31
|
+
/** Maximum number of document contexts inferred concurrently while reranking. */
const CONTEXT_BATCH_SIZE = 3;

/** Pause (in milliseconds) inserted between two context-inference batches. */
const CONTEXT_BATCH_DELAY_MS = 10;

/** Weights, boosts and penalties that are summed into the reranked documentation score. */
const DOC_RERANK_WEIGHTS = Object.freeze({
  initialSimilarity: 0.3,
  h1Similarity: 0.15,
  sameAreaBoost: 0.4,
  techMatchBoost: 0.2,
  areaMismatchPenalty: -0.1,
  genericDocPenalty: -0.1,
  pathSimilarity: 0.1,
});

/**
 * Build a fresh, zeroed performance-metrics record.
 * @returns {{searchCount: number, totalSearchTime: number, cacheHitRate: number, parallelRerankingTime: number}}
 */
function createEmptyMetrics() {
  return {
    searchCount: 0,
    totalSearchTime: 0,
    cacheHitRate: 0,
    parallelRerankingTime: 0,
  };
}

/**
 * Clamp a number into the [0, 1] interval.
 * @param {number} value - Raw score
 * @returns {number} The clamped score
 */
function clampToUnitInterval(value) {
  return Math.max(0, Math.min(1, value));
}

/**
 * Escape a value for use inside a single-quoted SQL string literal.
 * @param {string} value - Raw text
 * @returns {string} Text with every single quote doubled
 */
function escapeSqlLiteral(value) {
  return String(value).replace(/'/g, "''");
}

/**
 * Tell whether a path is the given root or lies underneath it.
 *
 * A plain `startsWith` prefix test is not enough: it accepts sibling directories that merely
 * share a prefix (`/repo/app-old` vs `/repo/app`) and un-normalised paths containing `..`.
 *
 * @param {string} candidatePath - Absolute path to test
 * @param {string} rootPath - Absolute project root
 * @returns {boolean} True when the candidate is inside the root
 */
function isPathWithinRoot(candidatePath, rootPath) {
  const relative = path.relative(rootPath, candidatePath);
  if (relative === '') {
    return true;
  }
  // An absolute result means there is no relative route (e.g. another drive on Windows).
  if (path.isAbsolute(relative)) {
    return false;
  }
  return relative !== '..' && !relative.startsWith(`..${path.sep}`);
}

/**
 * Check whether a table's schema declares a `project_path` field.
 * A failure to read the schema is logged through `debug` and treated as "no such field".
 *
 * @param {Object} table - Table handle exposing an awaitable `schema`
 * @returns {Promise<boolean>} True when the field is declared
 */
async function tableHasProjectPathField(table) {
  try {
    const tableSchema = await table.schema;
    return Boolean(tableSchema?.fields?.some((field) => field.name === 'project_path'));
  } catch (schemaError) {
    debug(`Could not check schema for project_path field: ${schemaError.message}`);
    return false;
  }
}

/**
 * Keep only the search rows that belong to the given project.
 *
 * Rows carrying `project_path` are matched on that field alone. Other rows are matched on their
 * file path: absolute paths must lie inside the project root, relative paths must resolve inside
 * the root AND exist on disk (existence is checked for all candidates in parallel).
 *
 * @param {Array<Object>} results - Raw rows returned by the search
 * @param {string} projectRoot - Resolved absolute project root
 * @param {function(Object): (string|undefined)} pickPath - Extracts the file path of a row
 * @param {string} noun - Singular noun used in debug messages (e.g. 'file')
 * @returns {Promise<Array<Object>>} The rows belonging to the project, in their original order
 */
async function filterResultsByProject(results, projectRoot, pickPath, noun) {
  const belongsToProject = new Array(results.length).fill(false);
  const pendingChecks = [];

  results.forEach((result, index) => {
    if (result.project_path) {
      belongsToProject[index] = result.project_path === projectRoot;
      return;
    }

    const filePath = pickPath(result);
    if (!filePath) {
      return;
    }

    try {
      if (path.isAbsolute(filePath)) {
        belongsToProject[index] = isPathWithinRoot(filePath, projectRoot);
        return;
      }

      const absolutePath = path.resolve(projectRoot, filePath);
      if (isPathWithinRoot(absolutePath, projectRoot)) {
        // Relative paths are only trusted when the file really exists in this project.
        pendingChecks.push({ index, absolutePath, filePath });
      }
    } catch (error) {
      debug(`Error filtering result for project: ${error.message}`);
    }
  });

  if (pendingChecks.length > 0) {
    debug(`[OPTIMIZATION] Batch checking existence of ${pendingChecks.length} ${noun}s`);
    await Promise.all(
      pendingChecks.map(async ({ index, absolutePath, filePath }) => {
        try {
          await fs.promises.access(absolutePath, fs.constants.F_OK);
          belongsToProject[index] = true;
        } catch {
          debug(`Filtering out non-existent ${noun}: ${filePath}`);
        }
      })
    );
  }

  return results.filter((_, index) => belongsToProject[index]);
}

/**
 * Compute embeddings for every document title that is not yet in the retriever's H1 cache.
 * All missing titles are sent to the model manager in a single batch call.
 *
 * @param {ContentRetriever} retriever - Owner of the H1 embedding cache and model manager
 * @param {Array<Object>} results - Mapped documentation results
 * @returns {Promise<void>}
 */
async function cacheMissingH1Embeddings(retriever, results) {
  const uniqueTitles = new Set(results.map((result) => result.document_title).filter(Boolean));
  const missingTitles = [...uniqueTitles].filter((title) => !retriever.h1EmbeddingCache.has(title));
  if (missingTitles.length === 0) {
    return;
  }

  debug(`[OPTIMIZATION] Batch calculating ${missingTitles.length} H1 embeddings`);
  const embeddings = await retriever.modelManager.calculateEmbeddingBatch(missingTitles);
  missingTitles.forEach((title, index) => {
    if (embeddings[index]) {
      retriever.h1EmbeddingCache.set(title, embeddings[index]);
    }
  });
}

/**
 * Derive the context of one document, never rejecting.
 * Generic documents take the pre-computed fast path; everything else goes through content
 * inference. On failure a neutral fallback context is returned so reranking can continue.
 *
 * @param {string} originalPath - Document path as stored in the search row
 * @param {string|undefined} docH1 - Document title
 * @param {Object} result - The mapped result, passed to inference as the only known chunk
 * @param {string} language - Language hint forwarded to inference
 * @returns {Promise<Object>} The document context
 */
async function computeDocumentContext(originalPath, docH1, result, language) {
  try {
    if (isGenericDocument(originalPath, docH1)) {
      const genericContext = getGenericDocumentContext(originalPath, docH1);
      debug(`[FAST-PATH] Using pre-computed context for generic document: ${originalPath}`);
      return genericContext;
    }
    return await inferContextFromDocumentContent(originalPath, docH1, [result], language);
  } catch (error) {
    debug(`[ERROR] Failed to get context for ${originalPath}: ${error.message}`);
    return {
      area: 'Unknown',
      dominantTech: [],
      isGeneralPurposeReadmeStyle: true,
    };
  }
}

/**
 * Resolve the context of one document and store it in the retriever's context cache.
 * If a computation for the same document is already in flight (possibly started by a
 * concurrent search), that promise is awaited instead of starting a second computation.
 *
 * @param {ContentRetriever} retriever - Owner of the context caches
 * @param {{docPath: string, originalPath: string, docH1: (string|undefined), result: Object}} entry - Document to resolve
 * @param {Object} queryContext - Context of the query, used for its `language` hint
 * @returns {Promise<void>}
 */
async function resolveDocumentContext(retriever, entry, queryContext) {
  const { docPath, originalPath, docH1, result } = entry;
  const promiseCache = retriever.documentContextPromiseCache;

  let inFlight = promiseCache.get(docPath);
  if (!inFlight) {
    inFlight = computeDocumentContext(originalPath, docH1, result, queryContext.language || 'typescript');
    promiseCache.set(docPath, inFlight);
  }

  const context = await inFlight;
  retriever.documentContextCache.set(docPath, context);
  // Only drop the entry we awaited; a cache clear may have let a newer promise take its place.
  if (promiseCache.get(docPath) === inFlight) {
    promiseCache.delete(docPath);
  }
}

/**
 * Make sure the context of every document referenced by the results is cached.
 * Work is done in batches of CONTEXT_BATCH_SIZE with a short pause between batches.
 *
 * @param {ContentRetriever} retriever - Owner of the context caches
 * @param {Array<Object>} results - Mapped documentation results
 * @param {string} projectRoot - Resolved absolute project root, used to normalise cache keys
 * @param {Object} queryContext - Context of the query
 * @returns {Promise<void>}
 */
async function cacheMissingDocumentContexts(retriever, results, projectRoot, queryContext) {
  const pending = [];
  const seenPaths = new Set();

  for (const result of results) {
    const originalPath = result.path;
    // Rows matched through project_path may carry no document path; path.resolve would throw.
    if (!originalPath) {
      continue;
    }

    const docPath = path.resolve(projectRoot, originalPath);
    if (seenPaths.has(docPath)) {
      continue;
    }
    seenPaths.add(docPath);

    // Documents with an in-flight computation stay in the list so that they are awaited.
    if (!retriever.documentContextCache.has(docPath)) {
      pending.push({ docPath, originalPath, docH1: result.document_title, result });
    }
  }

  if (pending.length === 0) {
    return;
  }

  debug(`[OPTIMIZATION] Batch calculating ${pending.length} document contexts with concurrency limit`);
  for (let offset = 0; offset < pending.length; offset += CONTEXT_BATCH_SIZE) {
    const batch = pending.slice(offset, offset + CONTEXT_BATCH_SIZE);
    await Promise.all(batch.map((entry) => resolveDocumentContext(retriever, entry, queryContext)));

    if (offset + CONTEXT_BATCH_SIZE < pending.length) {
      await new Promise((resolve) => setTimeout(resolve, CONTEXT_BATCH_DELAY_MS));
    }
  }
}

/**
 * Recompute the similarity of one documentation result from its contextual signals.
 * The result is updated in place (`similarity`, `reranked`).
 *
 * @param {ContentRetriever} retriever - Owner of the H1 and context caches
 * @param {Object} result - Mapped documentation result to rerank
 * @param {{queryContext: Object, queryEmbedding: *, queryFilePath: (string|null), projectRoot: string}} rerankInput - Query-side signals
 * @returns {void}
 */
function rerankDocumentationResult(retriever, result, rerankInput) {
  const { queryContext, queryEmbedding, queryFilePath, projectRoot } = rerankInput;
  const weights = DOC_RERANK_WEIGHTS;

  const docContext = result.path ? retriever.documentContextCache.get(path.resolve(projectRoot, result.path)) : undefined;

  let contextMatchBonus = 0;
  const areasAreComparable =
    docContext && queryContext.area !== 'Unknown' && docContext.area !== 'Unknown' && docContext.area !== 'General';
  if (areasAreComparable) {
    if (queryContext.area === docContext.area) {
      contextMatchBonus += weights.sameAreaBoost;
      const queryTech = queryContext.dominantTech;
      if (queryTech && docContext.dominantTech && queryTech.length > 0) {
        const docTech = new Set(docContext.dominantTech.map((tech) => tech.toLowerCase()));
        if (queryTech.some((tech) => docTech.has(tech.toLowerCase()))) {
          contextMatchBonus += weights.techMatchBoost;
        }
      }
    } else if (queryContext.area !== 'GeneralJS_TS') {
      contextMatchBonus += weights.areaMismatchPenalty;
    }
  }

  let h1RelevanceBonus = 0;
  const h1Embedding = result.document_title ? retriever.h1EmbeddingCache.get(result.document_title) : undefined;
  if (h1Embedding && queryEmbedding) {
    h1RelevanceBonus = calculateCosineSimilarity(queryEmbedding, h1Embedding) * weights.h1Similarity;
  }

  let genericDocPenalty = 0;
  if (docContext && docContext.isGeneralPurposeReadmeStyle && queryContext.area !== docContext.area) {
    genericDocPenalty = weights.genericDocPenalty;
    debug(`[findRelevantDocs] Doc ${result.path} is generic with low context match, applying penalty: ${genericDocPenalty}`);
  }

  let pathSimilarityScore = 0;
  if (queryFilePath && result.path) {
    pathSimilarityScore = calculatePathSimilarity(queryFilePath, result.path) * weights.pathSimilarity;
  }

  const finalScore =
    result.similarity * weights.initialSimilarity + contextMatchBonus + h1RelevanceBonus + pathSimilarityScore + genericDocPenalty;
  result.similarity = clampToUnitInterval(finalScore);
  result.reranked = true;
}

/**
 * Load the stored project-structure record for a project.
 * The project-specific record is tried first, then the generic `__project_structure__` one.
 *
 * @param {Object} table - File embeddings table
 * @param {string} projectRoot - Resolved absolute project root
 * @returns {Promise<Object|null>} The record, or null when none is stored
 */
async function fetchProjectStructureRecord(table, projectRoot) {
  const projectStructureId = `__project_structure__${path.basename(projectRoot)}`;
  // The directory name is arbitrary text, so it is escaped like every other filter value.
  let rows = await table.query().where(`id = '${escapeSqlLiteral(projectStructureId)}'`).limit(1).toArray();
  if (rows.length === 0) {
    rows = await table.query().where("id = '__project_structure__'").limit(1).toArray();
  }
  return rows.length > 0 ? rows[0] : null;
}

/**
 * ContentRetriever class for advanced search and discovery.
 *
 * Searches the documentation-chunk and file-embedding tables, restricts results to one project,
 * and (for documentation) reranks them using document context, title embeddings and path
 * similarity. Title embeddings and document contexts are cached per instance.
 */
export class ContentRetriever {
  /**
   * @param {Object} [options] - Collaborators; each one defaults to a new instance
   * @param {ModelManager} [options.modelManager] - Computes query and title embeddings
   * @param {DatabaseManager} [options.database] - Provides the tables to search
   * @param {CacheManager} [options.cacheManager] - Stored on the instance as `cacheManager`
   */
  constructor(options = {}) {
    this.modelManager = options.modelManager || new ModelManager();
    this.database = options.database || new DatabaseManager();
    this.cacheManager = options.cacheManager || new CacheManager();

    // Per-instance caches: title -> embedding, document path -> context / in-flight context.
    this.h1EmbeddingCache = new Map();
    this.documentContextCache = new Map();
    this.documentContextPromiseCache = new Map();

    // Performance tracking
    this.performanceMetrics = createEmptyMetrics();

    // Cleanup guard
    this.cleaningUp = false;
  }

  /**
   * Find relevant documentation with sophisticated reranking
   *
   * @param {string} queryText - The search query
   * @param {Object} [options] - Search configuration
   * @param {number} [options.limit=10] - Maximum number of results returned
   * @param {number} [options.similarityThreshold=0.1] - Minimum initial similarity to keep a result
   * @param {boolean} [options.useReranking=true] - Enables contextual reranking
   * @param {string|null} [options.queryFilePath=null] - Path of the file under review, used for path similarity
   * @param {Object|null} [options.queryContextForReranking=null] - Query context (`area`, `dominantTech`, `language`);
   *   reranking only runs when this is provided and at least 3 results remain
   * @param {string} [options.projectPath=process.cwd()] - Project whose documentation is searched
   * @param {*} [options.precomputedQueryEmbedding=null] - Query embedding to reuse instead of computing one
   * @returns {Promise<Array>} Array of relevant documents, best match first
   * @throws {EmbeddingError} When any step of the search fails
   */
  async findRelevantDocs(queryText, options = {}) {
    const {
      limit = 10,
      similarityThreshold = 0.1,
      useReranking = true,
      queryFilePath = null,
      queryContextForReranking = null,
      projectPath = process.cwd(),
      precomputedQueryEmbedding = null,
    } = options;

    this.performanceMetrics.searchCount++;
    const startedAt = Date.now();

    try {
      if (!queryText?.trim()) {
        console.warn(chalk.yellow('Empty query text provided for documentation search'));
        return [];
      }

      console.log(
        chalk.cyan(`Native hybrid documentation search - limit: ${limit}, threshold: ${similarityThreshold}, reranking: ${useReranking}`)
      );

      await this.database.connect();
      const table = await this.database.getTable(DOCUMENT_CHUNK_TABLE);

      if (!table) {
        console.warn(chalk.yellow(`Documentation table ${DOCUMENT_CHUNK_TABLE} not found`));
        return [];
      }

      console.log(chalk.cyan('Performing native hybrid search for documentation...'));
      let query = table.search(queryText).nearestToText(queryText);

      const resolvedProjectPath = path.resolve(projectPath);
      if (await tableHasProjectPathField(table)) {
        query = query.where(`project_path = '${escapeSqlLiteral(resolvedProjectPath)}'`);
        debug(`Filtering documentation by project_path: ${resolvedProjectPath}`);
      }

      // Over-fetch so that project filtering and thresholding still leave enough candidates.
      const results = await query.limit(Math.max(limit * 3, 20)).toArray();
      console.log(chalk.green(`Native hybrid search returned ${results.length} documentation results`));

      const projectFilteredResults = await filterResultsByProject(
        results,
        resolvedProjectPath,
        (result) => result.original_document_path,
        'documentation file'
      );
      console.log(chalk.blue(`Filtered to ${projectFilteredResults.length} documentation results from current project`));

      let finalResults = projectFilteredResults
        .map((result) => {
          let similarity = 0.5; // Fallback when the row carries no score at all
          if (result._distance !== undefined) {
            similarity = clampToUnitInterval(1 - result._distance);
          } else if (result._score !== undefined) {
            similarity = clampToUnitInterval(result._score);
          }

          return {
            similarity,
            type: 'documentation-chunk',
            content: result.content,
            path: result.original_document_path,
            file_path: result.original_document_path,
            language: result.language,
            headingText: result.heading_text,
            document_title: result.document_title,
            startLine: result.start_line,
            reranked: false,
          };
        })
        .filter((result) => result.similarity >= similarityThreshold);

      if (useReranking && queryContextForReranking && finalResults.length >= 3) {
        console.log(chalk.cyan('Applying sophisticated contextual reranking to documentation...'));

        const queryEmbedding = precomputedQueryEmbedding || (await this.modelManager.calculateQueryEmbedding(queryText));

        // Warm both caches up front so the scoring pass below is purely synchronous.
        await cacheMissingH1Embeddings(this, finalResults);
        await cacheMissingDocumentContexts(this, finalResults, resolvedProjectPath, queryContextForReranking);

        const rerankInput = {
          queryContext: queryContextForReranking,
          queryEmbedding,
          queryFilePath,
          projectRoot: resolvedProjectPath,
        };
        for (const result of finalResults) {
          rerankDocumentationResult(this, result, rerankInput);
        }

        // Log debug info for the first few results (in pre-sort order)
        for (const result of finalResults.slice(0, 5)) {
          debug(`[SophisticatedRerank] ${result.path?.substring(0, 30)}... Final=${result.similarity.toFixed(4)}`);
        }

        debug('Sophisticated contextual reranking of documentation complete.');
      }

      finalResults.sort((a, b) => b.similarity - a.similarity);
      if (finalResults.length > limit) {
        finalResults = finalResults.slice(0, limit);
      }

      console.log(chalk.green(`Returning ${finalResults.length} documentation results`));

      return finalResults;
    } catch (error) {
      console.error(chalk.red(`Error in findRelevantDocs: ${error.message}`), error);
      throw new EmbeddingError(`Documentation search failed: ${error.message}`);
    } finally {
      // Feeds averageSearchTime in getPerformanceMetrics().
      this.performanceMetrics.totalSearchTime += Date.now() - startedAt;
    }
  }

  /**
   * Find similar code using native LanceDB hybrid search
   *
   * Unlike findRelevantDocs, this method never throws: any failure is logged and an empty
   * array is returned.
   *
   * @param {string} queryText - The text query
   * @param {Object} [options] - Search options
   * @param {number} [options.limit=5] - Maximum number of results returned
   * @param {number} [options.similarityThreshold=0.7] - Minimum similarity to keep a result
   * @param {boolean} [options.includeProjectStructure=false] - Also consider the stored project-structure record
   * @param {string|null} [options.queryFilePath=null] - File under review; it is excluded from the results
   * @param {string} [options.projectPath=process.cwd()] - Project whose files are searched
   * @param {boolean|null} [options.isTestFile=null] - true: only test files, false: no test files, null: no filter
   * @param {*} [options.precomputedQueryEmbedding=null] - Query embedding to reuse for the project-structure comparison
   * @returns {Promise<Array<object>>} Search results, best match first
   */
  async findSimilarCode(queryText, options = {}) {
    const {
      limit = 5,
      similarityThreshold = 0.7,
      includeProjectStructure = false,
      queryFilePath = null,
      projectPath = process.cwd(),
      isTestFile = null,
      precomputedQueryEmbedding = null,
    } = options;

    console.log(chalk.cyan(`Native hybrid code search - limit: ${limit}, threshold: ${similarityThreshold}, isTestFile: ${isTestFile}`));

    try {
      if (!queryText?.trim()) {
        console.warn(chalk.yellow('Empty query text provided'));
        return [];
      }

      await this.database.connect();
      const table = await this.database.getTable(FILE_EMBEDDINGS_TABLE);

      if (!table) {
        console.warn(chalk.yellow(`Table ${FILE_EMBEDDINGS_TABLE} not found`));
        return [];
      }

      console.log(chalk.cyan('Performing native hybrid search for code...'));
      let query = table.search(queryText).nearestToText(queryText);

      // Filter conditions, joined with AND below
      const conditions = ["type != 'directory-structure'"];

      if (isTestFile !== null) {
        // NOTE(review): 'test_%.py' only matches paths that START with "test_"; confirm how paths are stored.
        if (isTestFile) {
          conditions.push(`(path LIKE '%.test.%' OR path LIKE '%.spec.%' OR path LIKE '%_test.py' OR path LIKE 'test_%.py')`);
          console.log(chalk.blue(`Filtering to include only test files.`));
        } else {
          conditions.push(
            `(path NOT LIKE '%.test.%' AND path NOT LIKE '%.spec.%' AND path NOT LIKE '%_test.py' AND path NOT LIKE 'test_%.py')`
          );
          console.log(chalk.blue(`Filtering to exclude test files.`));
        }
      }

      const resolvedProjectPath = path.resolve(projectPath);

      // Exclude the file being reviewed, under both its absolute and project-relative spelling
      if (queryFilePath) {
        const normalizedQueryPath = path.resolve(resolvedProjectPath, queryFilePath);
        conditions.push(`path != '${escapeSqlLiteral(normalizedQueryPath)}'`);

        const relativePath = path.relative(resolvedProjectPath, normalizedQueryPath);
        if (relativePath && isPathWithinRoot(normalizedQueryPath, resolvedProjectPath)) {
          conditions.push(`path != '${escapeSqlLiteral(relativePath)}'`);
        }

        debug(`Excluding file being reviewed from similar code search: ${normalizedQueryPath}`);
      }

      // Rows written with the newer schema can be filtered by project inside the query itself
      if (await tableHasProjectPathField(table)) {
        conditions.push(`project_path = '${escapeSqlLiteral(resolvedProjectPath)}'`);
        debug(`Filtering by project_path: ${resolvedProjectPath}`);
      }

      query = query.where(conditions.join(' AND '));

      // Over-fetch so that project filtering and thresholding still leave enough candidates.
      const results = await query.limit(Math.max(limit * 3, 20)).toArray();

      console.log(chalk.green(`Native hybrid search returned ${results.length} results`));

      const projectFilteredResults = await filterResultsByProject(
        results,
        resolvedProjectPath,
        (result) => result.original_document_path || result.path,
        'file'
      );

      console.log(chalk.blue(`Filtered to ${projectFilteredResults.length} results from current project`));

      let finalResults = projectFilteredResults
        .map((result) => {
          let similarity = 0.5; // Fallback when the row carries no score at all
          if (result._distance !== undefined) {
            // Distance 0 is a perfect match; the exponential keeps scores from bunching at 1.
            similarity = clampToUnitInterval(Math.exp(-result._distance * 2));
          } else if (result._score !== undefined) {
            similarity = clampToUnitInterval(result._score);
          }

          return {
            similarity,
            type: 'file',
            content: result.content,
            path: result.path,
            file_path: result.path,
            language: result.language,
            reranked: false,
            // Flag consumed downstream (presumably by rag-analyzer; verify against caller)
            isDocumentation: isDocumentationFile(result.path, result.language),
          };
        })
        .filter((result) => result.similarity >= similarityThreshold);

      // Include project structure if requested (project-specific)
      if (includeProjectStructure) {
        try {
          const structureRecord = await fetchProjectStructureRecord(table, resolvedProjectPath);
          if (structureRecord?.vector) {
            const queryEmbedding = precomputedQueryEmbedding || (await this.modelManager.calculateQueryEmbedding(queryText));
            if (queryEmbedding) {
              const similarity = calculateCosineSimilarity(queryEmbedding, Array.from(structureRecord.vector));
              if (similarity > 0.5) {
                finalResults.push({
                  similarity,
                  type: 'project-structure',
                  content: structureRecord.content,
                  path: structureRecord.path,
                  file_path: structureRecord.path,
                  language: 'text',
                  reranked: false,
                });
              }
            }
          }
        } catch (error) {
          // Best effort: the code results are still returned without the structure record.
          console.warn(chalk.yellow(`Project structure inclusion failed: ${error.message}`));
        }
      }

      // Final sorting and limiting
      finalResults.sort((a, b) => b.similarity - a.similarity);
      if (finalResults.length > limit) {
        finalResults = finalResults.slice(0, limit);
      }

      console.log(chalk.green(`Returning ${finalResults.length} optimized hybrid search results`));
      return finalResults;
    } catch (error) {
      console.error(chalk.red(`Error in optimized findSimilarCode: ${error.message}`), error);
      return [];
    }
  }

  /**
   * Get performance metrics
   *
   * `searchCount`, `totalSearchTime` (milliseconds) and the derived `averageSearchTime` describe
   * findRelevantDocs calls.
   *
   * @returns {Object} Performance metrics plus the current cache sizes
   */
  getPerformanceMetrics() {
    const { searchCount, totalSearchTime } = this.performanceMetrics;
    return {
      ...this.performanceMetrics,
      averageSearchTime: searchCount > 0 ? totalSearchTime / searchCount : 0,
      cacheSize: this.h1EmbeddingCache.size,
      documentContextCacheSize: this.documentContextCache.size,
    };
  }

  /**
   * Clear all caches
   */
  clearCaches() {
    this.h1EmbeddingCache.clear();
    this.documentContextCache.clear();
    this.documentContextPromiseCache.clear();
    console.log(chalk.green('ContentRetriever caches cleared'));
  }

  /**
   * Cleanup resources: empties this instance's caches and resets its performance metrics.
   * A call made while another cleanup is running returns immediately.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.cleaningUp) {
      return; // Already cleaning up, prevent duplicate calls
    }

    this.cleaningUp = true;

    try {
      // Clear LOCAL caches only (not system-wide caches)
      this.h1EmbeddingCache.clear();
      this.documentContextCache.clear();
      this.documentContextPromiseCache.clear();

      // Reset LOCAL performance metrics
      this.performanceMetrics = createEmptyMetrics();

      console.log(chalk.green('ContentRetriever cleanup complete'));
    } finally {
      this.cleaningUp = false;
    }
  }
}
|