codecritique 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +1145 -0
  3. package/package.json +98 -0
  4. package/src/content-retrieval.js +747 -0
  5. package/src/custom-documents.js +597 -0
  6. package/src/embeddings/cache-manager.js +364 -0
  7. package/src/embeddings/constants.js +40 -0
  8. package/src/embeddings/database.js +921 -0
  9. package/src/embeddings/errors.js +208 -0
  10. package/src/embeddings/factory.js +447 -0
  11. package/src/embeddings/file-processor.js +851 -0
  12. package/src/embeddings/model-manager.js +337 -0
  13. package/src/embeddings/similarity-calculator.js +97 -0
  14. package/src/embeddings/types.js +113 -0
  15. package/src/feedback-loader.js +384 -0
  16. package/src/index.js +1418 -0
  17. package/src/llm.js +123 -0
  18. package/src/pr-history/analyzer.js +579 -0
  19. package/src/pr-history/bot-detector.js +123 -0
  20. package/src/pr-history/cli-utils.js +204 -0
  21. package/src/pr-history/comment-processor.js +549 -0
  22. package/src/pr-history/database.js +819 -0
  23. package/src/pr-history/github-client.js +629 -0
  24. package/src/project-analyzer.js +955 -0
  25. package/src/rag-analyzer.js +2764 -0
  26. package/src/rag-review.js +566 -0
  27. package/src/technology-keywords.json +753 -0
  28. package/src/utils/command.js +48 -0
  29. package/src/utils/constants.js +263 -0
  30. package/src/utils/context-inference.js +364 -0
  31. package/src/utils/document-detection.js +105 -0
  32. package/src/utils/file-validation.js +271 -0
  33. package/src/utils/git.js +232 -0
  34. package/src/utils/language-detection.js +170 -0
  35. package/src/utils/logging.js +24 -0
  36. package/src/utils/markdown.js +132 -0
  37. package/src/utils/mobilebert-tokenizer.js +141 -0
  38. package/src/utils/pr-chunking.js +276 -0
  39. package/src/utils/string-utils.js +28 -0
  40. package/src/zero-shot-classifier-open.js +392 -0
@@ -0,0 +1,384 @@
1
+ /**
2
+ * Feedback Loader Module
3
+ *
4
+ * Loads and processes feedback artifacts from previous PR review runs.
5
+ * Used by the CLI tool to filter dismissed issues and improve analysis quality.
6
+ *
7
+ * Features:
8
+ * - Semantic similarity comparison using embeddings for accurate issue matching
9
+ * - Fallback to word-based similarity when embeddings are not available
10
+ * - Supports comparing LLM-generated text that may be lexically different but semantically similar
11
+ */
12
+
13
+ import fs from 'fs';
14
+ import path from 'path';
15
+ import chalk from 'chalk';
16
+ import { getDefaultEmbeddingsSystem } from './embeddings/factory.js';
17
+ import { calculateCosineSimilarity } from './embeddings/similarity-calculator.js';
18
+
19
/**
 * Load feedback data from an artifacts directory.
 *
 * Reads every `feedback-*.json` file located directly inside `feedbackPath`
 * and merges the `feedback` map of each file into one object.
 *
 * Files are merged in sorted filename order, so when the same key appears in
 * several files the result no longer depends on the directory listing order of
 * the file system: the lexicographically last file wins.
 *
 * This function never throws. A missing directory, unreadable directory,
 * malformed JSON file, or a `feedback` field that is not a plain object is
 * logged and skipped.
 *
 * @param {string} feedbackPath - Path to feedback artifacts directory
 * @param {Object} options - Loading options
 * @param {boolean} [options.verbose=false] - Log progress information
 * @returns {Promise<Object>} Merged feedback data, or `{}` when nothing was loaded
 */
export async function loadFeedbackData(feedbackPath, options = {}) {
  const { verbose = false } = options;

  if (!feedbackPath) {
    if (verbose) console.log(chalk.gray('No feedback path provided'));
    return {};
  }

  try {
    if (!fs.existsSync(feedbackPath)) {
      if (verbose) console.log(chalk.gray(`Feedback directory not found: ${feedbackPath}`));
      return {};
    }

    if (verbose) console.log(chalk.cyan(`📁 Loading feedback from: ${feedbackPath}`));

    // Sort so that the merge result is deterministic across platforms;
    // readdirSync makes no ordering guarantee.
    const feedbackFiles = fs
      .readdirSync(feedbackPath)
      .filter((file) => file.startsWith('feedback-') && file.endsWith('.json'))
      .sort();

    if (feedbackFiles.length === 0) {
      if (verbose) console.log(chalk.gray('No feedback files found'));
      return {};
    }

    if (verbose) console.log(chalk.cyan(`📥 Found ${feedbackFiles.length} feedback file(s)`));

    // Load and merge all feedback files
    const allFeedback = {};

    for (const file of feedbackFiles) {
      try {
        const filePath = path.join(feedbackPath, file);
        const feedbackData = JSON.parse(fs.readFileSync(filePath, 'utf8'));
        const feedback = feedbackData.feedback;

        // A file without a feedback section contributes nothing.
        if (!feedback) continue;

        // Object.assign would copy array indices / string characters as keys,
        // polluting the merged map with entries that are not feedback items.
        if (typeof feedback !== 'object' || Array.isArray(feedback)) {
          console.log(chalk.yellow(`⚠️ Skipping feedback file ${file}: "feedback" is not an object`));
          continue;
        }

        Object.assign(allFeedback, feedback);
        if (verbose) {
          console.log(chalk.cyan(`📋 Loaded feedback from ${file}: ${Object.keys(feedback).length} items`));
        }
      } catch (parseError) {
        // One broken file must not prevent the remaining files from loading.
        console.log(chalk.yellow(`⚠️ Error parsing feedback file ${file}: ${parseError.message}`));
      }
    }

    // Count merged keys so duplicates across files are not counted twice.
    const totalItems = Object.keys(allFeedback).length;
    if (totalItems === 0) {
      return {};
    }

    if (verbose) {
      console.log(chalk.green(`✅ Successfully loaded ${totalItems} feedback items total`));
    }
    return allFeedback;
  } catch (error) {
    console.log(chalk.red(`❌ Error loading feedback data: ${error.message}`));
    return {};
  }
}
89
+
90
+ // ============================================================================
91
+ // SEMANTIC SIMILARITY USING EXISTING EMBEDDINGS SYSTEM
92
+ // ============================================================================
93
+
94
+ // Use the existing embeddings system from the codebase for semantic similarity
95
+ // This avoids code duplication with custom-documents.js and pr-history/comment-processor.js
96
+ let embeddingsSystem = null; // Assigned by initializeSemanticSimilarity() from getDefaultEmbeddingsSystem(); null until then
97
+ let semanticSimilarityInitialized = false; // Set to true only after initialize() succeeds; a failed attempt leaves it false so a later call tries again
98
+ let semanticSimilarityAvailable = false; // True after a successful initialization, reset to false when initialization fails
99
+
100
/**
 * Prepare semantic similarity by initializing the shared embeddings system.
 *
 * Call this early in the application lifecycle when semantic comparison is
 * wanted. Once an initialization has succeeded, further calls do nothing.
 * A failed initialization is logged, marks semantic similarity as unavailable,
 * and is attempted again on the next call.
 *
 * @returns {Promise<void>}
 */
export async function initializeSemanticSimilarity() {
  if (!semanticSimilarityInitialized) {
    try {
      const system = getDefaultEmbeddingsSystem();
      embeddingsSystem = system;
      await system.initialize();

      // Only flip the flags once initialize() has resolved.
      semanticSimilarityInitialized = true;
      semanticSimilarityAvailable = true;
      console.log(chalk.green('[FeedbackLoader] Semantic similarity initialized using embeddings system'));
    } catch (initError) {
      console.log(chalk.yellow(`[FeedbackLoader] Semantic similarity initialization failed: ${initError.message}`));
      semanticSimilarityAvailable = false;
    }
  }
}
122
+
123
/**
 * Report whether semantic similarity can currently be used.
 *
 * @returns {boolean} True when an embeddings system is present and the
 *   availability flag is set
 */
export function isSemanticSimilarityAvailable() {
  if (embeddingsSystem === null) {
    return false;
  }
  return semanticSimilarityAvailable;
}
130
+
131
/**
 * Compute an embedding-based similarity score for two texts.
 *
 * Both texts are embedded through the module's embeddings system and compared
 * with the shared cosine similarity helper. The cosine value (-1..1) is mapped
 * linearly onto 0..1 before it is returned.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {Promise<number|null>} Similarity score (0-1), or null when a text
 *   is empty, semantic similarity is unavailable, an embedding is missing, or
 *   the calculation throws
 */
async function calculateSemanticSimilarity(text1, text2) {
  const hasBothTexts = Boolean(text1) && Boolean(text2);
  if (!hasBothTexts || !isSemanticSimilarityAvailable()) {
    return null;
  }

  try {
    // Embed both texts concurrently.
    const vectors = await Promise.all([text1, text2].map((text) => embeddingsSystem.calculateEmbedding(text)));

    if (vectors.some((vector) => !vector)) {
      return null;
    }

    const [firstVector, secondVector] = vectors;
    const cosine = calculateCosineSimilarity(firstVector, secondVector);

    // Shift and scale from [-1, 1] to [0, 1].
    return (cosine + 1) / 2;
  } catch (error) {
    console.log(chalk.yellow(`[FeedbackLoader] Semantic similarity calculation failed: ${error.message}`));
    return null;
  }
}
164
+
165
+ // ============================================================================
166
+ // ISSUE SIMILARITY CHECKING
167
+ // ============================================================================
168
+
169
/**
 * Check if an issue should be skipped based on previous feedback.
 *
 * A feedback entry counts as dismissed when its overall sentiment is
 * 'negative' or when any user reply contains one of the dismissal phrases
 * ('false positive', 'not relevant', 'ignore', 'resolved'; case-insensitive).
 * The current issue is skipped as soon as its similarity to the original issue
 * text of a dismissed entry exceeds the threshold.
 *
 * Uses semantic similarity when available and falls back to word-based
 * similarity otherwise, or when the semantic calculation returns null.
 *
 * Malformed feedback entries are tolerated: replies that are null or have no
 * string body, a `userReplies` value that is not an array, and a non-string
 * `originalIssue` are ignored instead of throwing.
 *
 * @param {string} issueDescription - Description of the current issue
 * @param {Object} feedbackData - Loaded feedback data
 * @param {Object} options - Filtering options
 * @param {number} options.similarityThreshold - Threshold for considering issues similar (default: 0.7)
 * @param {boolean} options.verbose - Enable verbose logging
 * @param {boolean} options.useSemanticSimilarity - Use semantic similarity when available (default: true)
 * @returns {Promise<boolean>} True if issue should be skipped
 */
export async function shouldSkipSimilarIssue(issueDescription, feedbackData, options = {}) {
  const { similarityThreshold = 0.7, verbose = false, useSemanticSimilarity = true } = options;

  if (!feedbackData || Object.keys(feedbackData).length === 0) {
    return false;
  }

  const dismissalPhrases = ['false positive', 'not relevant', 'ignore', 'resolved'];

  // Feedback artifacts come from external JSON, so a reply may be null or lack a body.
  const isDismissalReply = (reply) => {
    if (typeof reply?.body !== 'string') return false;
    const body = reply.body.toLowerCase();
    return dismissalPhrases.some((phrase) => body.includes(phrase));
  };

  // Check if similar issues were previously dismissed
  const dismissedIssues = Object.values(feedbackData).filter(
    (feedback) =>
      feedback?.overallSentiment === 'negative' ||
      (Array.isArray(feedback?.userReplies) && feedback.userReplies.some(isDismissalReply))
  );

  if (dismissedIssues.length === 0) {
    return false;
  }

  // Determine if we should use semantic similarity
  const canUseSemanticSimilarity = useSemanticSimilarity && isSemanticSimilarityAvailable();

  if (verbose && canUseSemanticSimilarity) {
    console.log(chalk.cyan('🔍 Using semantic similarity for issue comparison'));
  }

  // Check similarity with dismissed issues
  for (const dismissed of dismissedIssues) {
    const previousIssue = dismissed.originalIssue;
    if (typeof previousIssue !== 'string' || previousIssue.length === 0) continue;

    let similarity = null;
    let similarityMethod = 'semantic';

    if (canUseSemanticSimilarity) {
      similarity = await calculateSemanticSimilarity(issueDescription, previousIssue);
    }

    // Word-based comparison is used when semantic similarity is disabled,
    // unavailable, or failed for this pair.
    if (similarity === null) {
      similarity = calculateWordSimilarity(issueDescription, previousIssue);
      similarityMethod = 'word-based';
    }

    if (similarity > similarityThreshold) {
      if (verbose) {
        console.log(chalk.yellow(`⏭️ Skipping similar dismissed issue (${(similarity * 100).toFixed(1)}% ${similarityMethod} similarity)`));
        console.log(chalk.gray(` Current: ${issueDescription.substring(0, 80)}...`));
        console.log(chalk.gray(` Previous: ${previousIssue.substring(0, 80)}...`));
      }
      return true;
    }
  }

  return false;
}
247
+
248
/**
 * Score how similar two issue descriptions are.
 *
 * When semantic similarity is enabled, available, and yields a score, the
 * result is a hybrid: 70% semantic score plus 30% word-based score. In every
 * other case the plain word-based score is returned.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @param {Object} options - Options
 * @param {boolean} options.useSemanticSimilarity - Use semantic similarity when available (default: true)
 * @returns {Promise<{similarity: number, method: string}>} Similarity result with
 *   the method used ('none', 'word-based' or 'hybrid'); hybrid results also carry
 *   `semanticScore` and `wordScore`
 */
export async function calculateIssueSimilarity(text1, text2, options = {}) {
  const { useSemanticSimilarity = true } = options;

  if (!text1 || !text2) {
    return { similarity: 0, method: 'none' };
  }

  // null means "no semantic score": disabled, unavailable, or the calculation failed.
  const semanticScore =
    useSemanticSimilarity && isSemanticSimilarityAvailable() ? await calculateSemanticSimilarity(text1, text2) : null;

  if (semanticScore === null) {
    return {
      similarity: calculateWordSimilarity(text1, text2),
      method: 'word-based',
    };
  }

  // Blend both signals, weighting meaning above shared vocabulary.
  const wordScore = calculateWordSimilarity(text1, text2);
  return {
    similarity: semanticScore * 0.7 + wordScore * 0.3,
    method: 'hybrid',
    semanticScore,
    wordScore,
  };
}
293
+
294
/**
 * Jaccard similarity over the word sets of two strings.
 *
 * Each text is lower-cased, characters other than word characters and
 * whitespace are replaced by spaces, and words of one or two characters are
 * dropped. The score is the number of shared words divided by the number of
 * distinct words across both texts. Serves as the fallback when embeddings are
 * not available.
 *
 * @param {string} text1 - First text
 * @param {string} text2 - Second text
 * @returns {number} Similarity score (0-1); 0 when either text is empty or has
 *   no words left after filtering
 */
export function calculateWordSimilarity(text1, text2) {
  if (!text1 || !text2) return 0;

  // Build the set of significant words for one text.
  const toWordSet = (text) => {
    const cleaned = text.toLowerCase().replace(/[^\w\s]/g, ' ');
    const significant = new Set();
    for (const token of cleaned.split(/\s+/)) {
      if (token.length > 2) {
        significant.add(token);
      }
    }
    return significant;
  };

  const firstWords = toWordSet(text1);
  const secondWords = toWordSet(text2);

  if (firstWords.size === 0 || secondWords.size === 0) return 0;

  let shared = 0;
  for (const word of firstWords) {
    if (secondWords.has(word)) {
      shared += 1;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|; both sets are non-empty here, so this is > 0.
  const unionSize = firstWords.size + secondWords.size - shared;
  return shared / unionSize;
}
324
+
325
/**
 * Extract dismissed issue patterns for LLM context.
 *
 * A feedback entry counts as dismissed when its overall sentiment is
 * 'negative' or when any user reply contains 'false positive', 'not relevant'
 * or 'ignore' (case-insensitive). At most `maxPatterns` entries are returned,
 * in the iteration order of `feedbackData`.
 *
 * Malformed entries are tolerated: null entries, replies that are null or have
 * no string body, and a `userReplies` value that is not an array are ignored
 * instead of throwing.
 *
 * @param {Object} feedbackData - Loaded feedback data
 * @param {Object} options - Extraction options
 * @param {number} [options.maxPatterns=10] - Maximum number of patterns to return
 * @param {boolean} [options.verbose=false] - Enable verbose logging
 * @returns {Array<{issue: string, reason: string, sentiment: *}>} Array of dismissed issue patterns
 */
export function extractDismissedPatterns(feedbackData, options = {}) {
  const { maxPatterns = 10, verbose = false } = options;

  if (!feedbackData || Object.keys(feedbackData).length === 0) {
    return [];
  }

  const dismissalPhrases = ['false positive', 'not relevant', 'ignore'];

  // Feedback artifacts come from external JSON, so a reply may be null or lack a body.
  const isDismissalReply = (reply) => {
    if (typeof reply?.body !== 'string') return false;
    const body = reply.body.toLowerCase();
    return dismissalPhrases.some((phrase) => body.includes(phrase));
  };

  // Find dismissed issues with clear patterns
  const dismissedIssues = Object.values(feedbackData)
    .filter(
      (feedback) =>
        feedback?.overallSentiment === 'negative' ||
        (Array.isArray(feedback?.userReplies) && feedback.userReplies.some(isDismissalReply))
    )
    // Truncate before mapping so only the returned entries are transformed.
    .slice(0, maxPatterns)
    .map((feedback) => {
      const firstReplyBody = feedback.userReplies?.[0]?.body;
      const reason = typeof firstReplyBody === 'string' ? firstReplyBody.substring(0, 100) : '';
      return {
        issue: feedback.originalIssue || 'Unknown issue',
        reason: reason || 'Negative feedback',
        sentiment: feedback.overallSentiment,
      };
    });

  if (verbose && dismissedIssues.length > 0) {
    console.log(chalk.cyan(`📋 Extracted ${dismissedIssues.length} dismissed issue patterns for LLM context`));
  }

  return dismissedIssues;
}
364
+
365
/**
 * Build the prompt section that tells the LLM which issues users dismissed.
 *
 * Each pattern becomes one numbered line with its issue text and reason,
 * framed by a fixed introduction and closing instruction.
 *
 * @param {Array} dismissedPatterns - Array of dismissed patterns
 * @returns {string} Context text for LLM, or an empty string when there are no patterns
 */
export function generateFeedbackContext(dismissedPatterns) {
  const hasPatterns = Boolean(dismissedPatterns) && dismissedPatterns.length !== 0;
  if (!hasPatterns) {
    return '';
  }

  const numberedPatterns = dismissedPatterns
    .map((pattern, position) => `${position + 1}. "${pattern.issue}" (Reason: ${pattern.reason})`)
    .join('\n');

  // The empty strings produce the leading newline and the blank separator lines.
  const sections = [
    '',
    'IMPORTANT: The following types of issues have been previously dismissed or marked as not relevant by users in this project:',
    '',
    numberedPatterns,
    '',
    "Please avoid suggesting similar issues unless they represent genuinely different problems. Focus on identifying new, actionable, and relevant issues that haven't been previously dismissed.",
  ];

  return sections.join('\n');
}