codecritique 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1145 -0
- package/package.json +98 -0
- package/src/content-retrieval.js +747 -0
- package/src/custom-documents.js +597 -0
- package/src/embeddings/cache-manager.js +364 -0
- package/src/embeddings/constants.js +40 -0
- package/src/embeddings/database.js +921 -0
- package/src/embeddings/errors.js +208 -0
- package/src/embeddings/factory.js +447 -0
- package/src/embeddings/file-processor.js +851 -0
- package/src/embeddings/model-manager.js +337 -0
- package/src/embeddings/similarity-calculator.js +97 -0
- package/src/embeddings/types.js +113 -0
- package/src/feedback-loader.js +384 -0
- package/src/index.js +1418 -0
- package/src/llm.js +123 -0
- package/src/pr-history/analyzer.js +579 -0
- package/src/pr-history/bot-detector.js +123 -0
- package/src/pr-history/cli-utils.js +204 -0
- package/src/pr-history/comment-processor.js +549 -0
- package/src/pr-history/database.js +819 -0
- package/src/pr-history/github-client.js +629 -0
- package/src/project-analyzer.js +955 -0
- package/src/rag-analyzer.js +2764 -0
- package/src/rag-review.js +566 -0
- package/src/technology-keywords.json +753 -0
- package/src/utils/command.js +48 -0
- package/src/utils/constants.js +263 -0
- package/src/utils/context-inference.js +364 -0
- package/src/utils/document-detection.js +105 -0
- package/src/utils/file-validation.js +271 -0
- package/src/utils/git.js +232 -0
- package/src/utils/language-detection.js +170 -0
- package/src/utils/logging.js +24 -0
- package/src/utils/markdown.js +132 -0
- package/src/utils/mobilebert-tokenizer.js +141 -0
- package/src/utils/pr-chunking.js +276 -0
- package/src/utils/string-utils.js +28 -0
- package/src/zero-shot-classifier-open.js +392 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Feedback Loader Module
|
|
3
|
+
*
|
|
4
|
+
* Loads and processes feedback artifacts from previous PR review runs.
|
|
5
|
+
* Used by CLI tool to filter dismissed issues and improve analysis quality.
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - Semantic similarity comparison using embeddings for accurate issue matching
|
|
9
|
+
* - Fallback to word-based similarity when embeddings are not available
|
|
10
|
+
* - Supports comparing LLM-generated text that may be lexically different but semantically similar
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import fs from 'fs';
|
|
14
|
+
import path from 'path';
|
|
15
|
+
import chalk from 'chalk';
|
|
16
|
+
import { getDefaultEmbeddingsSystem } from './embeddings/factory.js';
|
|
17
|
+
import { calculateCosineSimilarity } from './embeddings/similarity-calculator.js';
|
|
18
|
+
|
|
19
|
+
/**
 * Load and merge feedback data from an artifacts directory.
 *
 * Every `feedback-*.json` file directly inside `feedbackPath` is read and the
 * object stored under its top-level `feedback` key is merged into one result.
 * Files are processed in sorted file-name order, so when the same key occurs
 * in several files the entry from the lexicographically last file wins,
 * independent of the order in which the file system lists the directory.
 *
 * Failures are reported on the console instead of being thrown: a file that
 * cannot be read or parsed is skipped, and any other error yields `{}`.
 *
 * @param {string} feedbackPath - Path to feedback artifacts directory
 * @param {Object} options - Loading options
 * @param {boolean} options.verbose - Enable progress logging (default: false)
 * @returns {Promise<Object>} Merged feedback entries, or `{}` when nothing was loaded
 */
export async function loadFeedbackData(feedbackPath, options = {}) {
  const { verbose = false } = options;

  if (!feedbackPath) {
    if (verbose) console.log(chalk.gray('No feedback path provided'));
    return {};
  }

  try {
    if (!fs.existsSync(feedbackPath)) {
      if (verbose) console.log(chalk.gray(`Feedback directory not found: ${feedbackPath}`));
      return {};
    }

    if (verbose) console.log(chalk.cyan(`📁 Loading feedback from: ${feedbackPath}`));

    // Sort the listing so that merge precedence is reproducible across platforms
    const feedbackFiles = fs
      .readdirSync(feedbackPath)
      .filter((file) => file.startsWith('feedback-') && file.endsWith('.json'))
      .sort();

    if (feedbackFiles.length === 0) {
      if (verbose) console.log(chalk.gray('No feedback files found'));
      return {};
    }

    if (verbose) console.log(chalk.cyan(`📥 Found ${feedbackFiles.length} feedback file(s)`));

    const allFeedback = {};

    for (const file of feedbackFiles) {
      try {
        const fileContent = fs.readFileSync(path.join(feedbackPath, file), 'utf8');
        const entries = JSON.parse(fileContent)?.feedback;

        // Only objects can be merged key by key; a string here would be spread
        // character by character into the result by Object.assign.
        if (entries === null || typeof entries !== 'object') {
          if (verbose) console.log(chalk.gray(`Skipping ${file}: no feedback object found`));
          continue;
        }

        Object.assign(allFeedback, entries);
        if (verbose) {
          console.log(chalk.cyan(`📋 Loaded feedback from ${file}: ${Object.keys(entries).length} items`));
        }
      } catch (parseError) {
        // One broken file must not prevent the remaining files from loading
        console.log(chalk.yellow(`⚠️ Error parsing feedback file ${file}: ${parseError.message}`));
      }
    }

    // Count the merged keys so entries repeated across files are reported once
    const totalItems = Object.keys(allFeedback).length;
    if (verbose && totalItems > 0) {
      console.log(chalk.green(`✅ Successfully loaded ${totalItems} feedback items total`));
    }

    return allFeedback;
  } catch (error) {
    console.log(chalk.red(`❌ Error loading feedback data: ${error.message}`));
    return {};
  }
}
|
|
89
|
+
|
|
90
|
+
// ============================================================================
|
|
91
|
+
// SEMANTIC SIMILARITY USING EXISTING EMBEDDINGS SYSTEM
|
|
92
|
+
// ============================================================================
|
|
93
|
+
|
|
94
|
+
// Use the existing embeddings system from the codebase for semantic similarity
|
|
95
|
+
// This avoids code duplication with custom-documents.js and pr-history/comment-processor.js
|
|
96
|
+
// Embeddings system handle; stays null until initializeSemanticSimilarity() assigns it.
let embeddingsSystem = null;
|
|
97
|
+
// Set to true only after initializeSemanticSimilarity() completes without error; later calls then return immediately.
let semanticSimilarityInitialized = false;
|
|
98
|
+
// True once initialization has succeeded; set to false when an initialization attempt throws.
let semanticSimilarityAvailable = false;
|
|
99
|
+
|
|
100
|
+
/**
 * Prepare the shared embeddings system so issue texts can be compared semantically.
 *
 * A successful run is remembered and later calls return immediately. A failed
 * run is logged, leaves semantic similarity marked as unavailable, and is
 * attempted again on the next call.
 *
 * @returns {Promise<void>}
 */
export async function initializeSemanticSimilarity() {
  if (!semanticSimilarityInitialized) {
    try {
      embeddingsSystem = getDefaultEmbeddingsSystem();
      await embeddingsSystem.initialize();

      // Flip both flags only after initialize() has resolved
      semanticSimilarityInitialized = true;
      semanticSimilarityAvailable = true;
      console.log(chalk.green('[FeedbackLoader] Semantic similarity initialized using embeddings system'));
    } catch (initError) {
      console.log(chalk.yellow(`[FeedbackLoader] Semantic similarity initialization failed: ${initError.message}`));
      semanticSimilarityAvailable = false;
    }
  }
}
|
|
122
|
+
|
|
123
|
+
/**
 * Report whether embedding-based comparison can currently be used.
 *
 * @returns {boolean} True when initialization succeeded and an embeddings system is set
 */
export function isSemanticSimilarityAvailable() {
  if (!semanticSimilarityAvailable) {
    return false;
  }

  return embeddingsSystem !== null;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Compare two texts through their embeddings.
 *
 * Both embeddings are requested together from the shared embeddings system and
 * their cosine similarity is rescaled from [-1, 1] to [0, 1].
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {Promise<number|null>} Score in the 0-1 range, or null when a text is
 *   empty, semantic similarity is unavailable, an embedding is missing, or the
 *   calculation throws
 */
async function calculateSemanticSimilarity(text1, text2) {
  const hasBothTexts = Boolean(text1) && Boolean(text2);
  if (!hasBothTexts || !isSemanticSimilarityAvailable()) {
    return null;
  }

  try {
    // Start both embedding requests before awaiting either one
    const pendingEmbeddings = [text1, text2].map((text) => embeddingsSystem.calculateEmbedding(text));
    const [firstVector, secondVector] = await Promise.all(pendingEmbeddings);

    if (!(firstVector && secondVector)) {
      return null;
    }

    const cosine = calculateCosineSimilarity(firstVector, secondVector);

    // Shift and scale the cosine range [-1, 1] onto [0, 1]
    return (cosine + 1) / 2;
  } catch (similarityError) {
    console.log(chalk.yellow(`[FeedbackLoader] Semantic similarity calculation failed: ${similarityError.message}`));
    return null;
  }
}
|
|
164
|
+
|
|
165
|
+
// ============================================================================
|
|
166
|
+
// ISSUE SIMILARITY CHECKING
|
|
167
|
+
// ============================================================================
|
|
168
|
+
|
|
169
|
+
/**
 * Check if an issue should be skipped based on previous feedback.
 *
 * A feedback entry counts as dismissed when its overall sentiment is
 * 'negative' or when any user reply contains one of the phrases
 * 'false positive', 'not relevant', 'ignore' or 'resolved' (case-insensitive).
 * The issue is skipped as soon as its similarity to the original text of one
 * dismissed entry exceeds the threshold. Semantic similarity is used when
 * available; word-based similarity is the fallback.
 *
 * Feedback is loaded from JSON artifacts, so malformed entries are tolerated:
 * replies without a string `body`, a `userReplies` value that is not an array,
 * and entries whose `originalIssue` is not a string are ignored instead of
 * raising a TypeError.
 *
 * @param {string} issueDescription - Description of the current issue
 * @param {Object} feedbackData - Loaded feedback data
 * @param {Object} options - Filtering options
 * @param {number} options.similarityThreshold - Threshold for considering issues similar (default: 0.7)
 * @param {boolean} options.verbose - Enable verbose logging
 * @param {boolean} options.useSemanticSimilarity - Use semantic similarity when available (default: true)
 * @returns {Promise<boolean>} True if issue should be skipped
 */
export async function shouldSkipSimilarIssue(issueDescription, feedbackData, options = {}) {
  const { similarityThreshold = 0.7, verbose = false, useSemanticSimilarity = true } = options;

  if (!feedbackData || Object.keys(feedbackData).length === 0) {
    return false;
  }

  const dismissalPhrases = ['false positive', 'not relevant', 'ignore', 'resolved'];

  // Lowercase each reply once and ignore replies that carry no text
  const isDismissalReply = (reply) => {
    if (typeof reply?.body !== 'string') return false;
    const body = reply.body.toLowerCase();
    return dismissalPhrases.some((phrase) => body.includes(phrase));
  };

  // Check if similar issues were previously dismissed
  const dismissedIssues = Object.values(feedbackData).filter(
    (feedback) =>
      feedback?.overallSentiment === 'negative' ||
      (Array.isArray(feedback?.userReplies) && feedback.userReplies.some(isDismissalReply))
  );

  if (dismissedIssues.length === 0) {
    return false;
  }

  // Determine if we should use semantic similarity
  const canUseSemanticSimilarity = useSemanticSimilarity && isSemanticSimilarityAvailable();

  if (verbose && canUseSemanticSimilarity) {
    console.log(chalk.cyan('🔍 Using semantic similarity for issue comparison'));
  }

  for (const dismissed of dismissedIssues) {
    const previousIssue = dismissed.originalIssue;

    // Nothing to compare against when the original text is missing or not text
    if (!previousIssue || typeof previousIssue !== 'string') continue;

    let similarity = canUseSemanticSimilarity ? await calculateSemanticSimilarity(issueDescription, previousIssue) : null;
    let similarityMethod = 'semantic';

    // Word-based comparison covers both "semantic disabled" and "semantic failed"
    if (similarity === null) {
      similarity = calculateWordSimilarity(issueDescription, previousIssue);
      similarityMethod = 'word-based';
    }

    if (similarity > similarityThreshold) {
      if (verbose) {
        console.log(chalk.yellow(`⏭️ Skipping similar dismissed issue (${(similarity * 100).toFixed(1)}% ${similarityMethod} similarity)`));
        console.log(chalk.gray(` Current: ${issueDescription.substring(0, 80)}...`));
        console.log(chalk.gray(` Previous: ${previousIssue.substring(0, 80)}...`));
      }
      return true;
    }
  }

  return false;
}
|
|
247
|
+
|
|
248
|
+
/**
 * Score how similar two issue descriptions are.
 *
 * When a semantic score can be computed, the result is a hybrid of 70%
 * semantic similarity and 30% word-based similarity and both component scores
 * are included. Otherwise the word-based score is returned on its own.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @param {Object} options - Options
 * @param {boolean} options.useSemanticSimilarity - Use semantic similarity when available (default: true)
 * @returns {Promise<{similarity: number, method: string}>} Similarity result with method used
 *   ('none' when a text is empty, otherwise 'hybrid' or 'word-based')
 */
export async function calculateIssueSimilarity(text1, text2, options = {}) {
  const { useSemanticSimilarity: semanticRequested = true } = options;

  if (!text1 || !text2) {
    return { similarity: 0, method: 'none' };
  }

  const semanticScore =
    semanticRequested && isSemanticSimilarityAvailable() ? await calculateSemanticSimilarity(text1, text2) : null;

  // No semantic score: report the lexical comparison alone
  if (semanticScore === null) {
    return {
      similarity: calculateWordSimilarity(text1, text2),
      method: 'word-based',
    };
  }

  // Blend both signals, weighting meaning more heavily than shared wording
  const wordScore = calculateWordSimilarity(text1, text2);

  return {
    similarity: semanticScore * 0.7 + wordScore * 0.3,
    method: 'hybrid',
    semanticScore,
    wordScore,
  };
}
|
|
293
|
+
|
|
294
|
+
/**
 * Calculate word-based similarity between two strings using Jaccard similarity.
 * This is the fallback method when embeddings are not available.
 *
 * Each text is lowercased, punctuation is replaced by spaces, and words of one
 * or two characters are dropped. The score is the number of distinct words
 * the texts share divided by the number of distinct words in either text.
 *
 * Tokenization is Unicode-aware: letters, combining marks and digits of any
 * script are kept, so non-English descriptions are compared by their actual
 * words instead of being reduced to nothing.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {number} Similarity score (0-1); 0 when either argument is not a
 *   string or has no word longer than two characters
 */
export function calculateWordSimilarity(text1, text2) {
  // Values read from feedback JSON may not be strings; treat those as "no overlap"
  if (typeof text1 !== 'string' || typeof text2 !== 'string') return 0;

  // Normalize and tokenize into a set of distinct words
  const tokenize = (text) =>
    new Set(
      text
        .toLowerCase()
        .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ') // Remove punctuation, keep word characters of any script
        .split(/\s+/)
        .filter((word) => word.length > 2) // Filter short words
    );

  const words1 = tokenize(text1);
  const words2 = tokenize(text2);

  if (words1.size === 0 || words2.size === 0) return 0;

  // Calculate Jaccard similarity (intersection over union)
  let intersection = 0;
  for (const word of words1) {
    if (words2.has(word)) intersection += 1;
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|; non-zero because both sets are non-empty
  const union = words1.size + words2.size - intersection;

  return intersection / union;
}
|
|
324
|
+
|
|
325
|
+
/**
 * Extract dismissed issue patterns for LLM context.
 *
 * A feedback entry counts as dismissed when its overall sentiment is
 * 'negative' or when any user reply contains 'false positive', 'not relevant'
 * or 'ignore' (case-insensitive). Each pattern carries the original issue
 * text, the first 100 characters of the first user reply as the reason, and
 * the recorded sentiment.
 *
 * Malformed entries are tolerated: replies without a string `body` and a
 * `userReplies` value that is not an array are ignored instead of raising a
 * TypeError.
 *
 * @param {Object} feedbackData - Loaded feedback data
 * @param {Object} options - Extraction options
 * @param {number} options.maxPatterns - Maximum number of patterns to return (default: 10)
 * @param {boolean} options.verbose - Enable verbose logging (default: false)
 * @returns {Array} Array of dismissed issue patterns ({ issue, reason, sentiment })
 */
export function extractDismissedPatterns(feedbackData, options = {}) {
  const { maxPatterns = 10, verbose = false } = options;

  if (!feedbackData || Object.keys(feedbackData).length === 0) {
    return [];
  }

  const dismissalPhrases = ['false positive', 'not relevant', 'ignore'];

  // Lowercase each reply once and ignore replies that carry no text
  const isDismissalReply = (reply) => {
    if (typeof reply?.body !== 'string') return false;
    const body = reply.body.toLowerCase();
    return dismissalPhrases.some((phrase) => body.includes(phrase));
  };

  // Reason shown to the LLM: the opening of the first reply, when it has text
  const describeReason = (feedback) => {
    const firstBody = feedback.userReplies?.[0]?.body;
    return (typeof firstBody === 'string' && firstBody.substring(0, 100)) || 'Negative feedback';
  };

  // Find dismissed issues with clear patterns; truncate before building the patterns
  const dismissedIssues = Object.values(feedbackData)
    .filter(
      (feedback) =>
        feedback?.overallSentiment === 'negative' ||
        (Array.isArray(feedback?.userReplies) && feedback.userReplies.some(isDismissalReply))
    )
    .slice(0, maxPatterns)
    .map((feedback) => ({
      issue: feedback.originalIssue || 'Unknown issue',
      reason: describeReason(feedback),
      sentiment: feedback.overallSentiment,
    }));

  if (verbose && dismissedIssues.length > 0) {
    console.log(chalk.cyan(`📋 Extracted ${dismissedIssues.length} dismissed issue patterns for LLM context`));
  }

  return dismissedIssues;
}
|
|
364
|
+
|
|
365
|
+
/**
 * Build the prompt section that tells the LLM which issues users already dismissed.
 *
 * The patterns are rendered as a numbered list of quoted issues with their
 * reasons, framed by a fixed introduction and a closing instruction.
 *
 * @param {Array} dismissedPatterns - Array of dismissed patterns
 * @returns {string} Context text for LLM, or an empty string when there are no patterns
 */
export function generateFeedbackContext(dismissedPatterns) {
  if (!dismissedPatterns) {
    return '';
  }
  if (dismissedPatterns.length === 0) {
    return '';
  }

  const numberedPatterns = dismissedPatterns.map((pattern, position) => {
    const ordinal = position + 1;
    return `${ordinal}. "${pattern.issue}" (Reason: ${pattern.reason})`;
  });

  // The leading empty entry makes the text start on a fresh line
  const sections = [
    '',
    'IMPORTANT: The following types of issues have been previously dismissed or marked as not relevant by users in this project:',
    '',
    numberedPatterns.join('\n'),
    '',
    "Please avoid suggesting similar issues unless they represent genuinely different problems. Focus on identifying new, actionable, and relevant issues that haven't been previously dismissed.",
  ];

  return sections.join('\n');
}
|