codecritique 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +1145 -0
  3. package/package.json +98 -0
  4. package/src/content-retrieval.js +747 -0
  5. package/src/custom-documents.js +597 -0
  6. package/src/embeddings/cache-manager.js +364 -0
  7. package/src/embeddings/constants.js +40 -0
  8. package/src/embeddings/database.js +921 -0
  9. package/src/embeddings/errors.js +208 -0
  10. package/src/embeddings/factory.js +447 -0
  11. package/src/embeddings/file-processor.js +851 -0
  12. package/src/embeddings/model-manager.js +337 -0
  13. package/src/embeddings/similarity-calculator.js +97 -0
  14. package/src/embeddings/types.js +113 -0
  15. package/src/feedback-loader.js +384 -0
  16. package/src/index.js +1418 -0
  17. package/src/llm.js +123 -0
  18. package/src/pr-history/analyzer.js +579 -0
  19. package/src/pr-history/bot-detector.js +123 -0
  20. package/src/pr-history/cli-utils.js +204 -0
  21. package/src/pr-history/comment-processor.js +549 -0
  22. package/src/pr-history/database.js +819 -0
  23. package/src/pr-history/github-client.js +629 -0
  24. package/src/project-analyzer.js +955 -0
  25. package/src/rag-analyzer.js +2764 -0
  26. package/src/rag-review.js +566 -0
  27. package/src/technology-keywords.json +753 -0
  28. package/src/utils/command.js +48 -0
  29. package/src/utils/constants.js +263 -0
  30. package/src/utils/context-inference.js +364 -0
  31. package/src/utils/document-detection.js +105 -0
  32. package/src/utils/file-validation.js +271 -0
  33. package/src/utils/git.js +232 -0
  34. package/src/utils/language-detection.js +170 -0
  35. package/src/utils/logging.js +24 -0
  36. package/src/utils/markdown.js +132 -0
  37. package/src/utils/mobilebert-tokenizer.js +141 -0
  38. package/src/utils/pr-chunking.js +276 -0
  39. package/src/utils/string-utils.js +28 -0
  40. package/src/zero-shot-classifier-open.js +392 -0
@@ -0,0 +1,549 @@
1
+ /**
2
+ * PR Comment Processor
3
+ *
4
+ * Processes GitHub PR comments, extracts code context, generates embeddings,
5
+ * and classifies comments for storage in the embeddings database.
6
+ */
7
+
8
+ import chalk from 'chalk';
9
+ import { getDefaultEmbeddingsSystem } from '../embeddings/factory.js';
10
+ import { filterBotComments } from './bot-detector.js';
11
+
12
+ // Embeddings system obtained from the factory at module load; the processor's embedding methods call its calculateEmbedding().
13
+ const embeddingsSystem = getDefaultEmbeddingsSystem();
14
+
15
// Vector length that every embedding produced by this module must have;
// embeddings of any other length are rejected with an error.
const EMBEDDING_DIMENSIONS = 384;

// Unified-diff hunk header, e.g. "@@ -10,4 +12,6 @@".
// Groups: 1 = old start, 2 = old count, 3 = new start, 4 = new count (counts are optional).
const HUNK_HEADER_PATTERN = /@@ -(\d+),?(\d+)? \+(\d+),?(\d+)? @@/;

// Diff marker lines such as "\ No newline at end of file" start with a backslash
// and are not part of either version of the file.
const DIFF_MARKER_PREFIX = '\\';

// Number of comments processed concurrently by processBatch().
const BATCH_SIZE = 10;

// Pause between two consecutive batches in processBatch(), in milliseconds.
const BATCH_DELAY_MS = 100;

/**
 * Calculate an embedding through the shared embeddings system and validate its length.
 * @param {string} text - Text to embed
 * @param {string} label - Label used in the error message (e.g. 'embedding')
 * @returns {Promise<Array<number>>} Embedding with EMBEDDING_DIMENSIONS entries
 * @throws {Error} When no embedding is returned or its length is unexpected
 */
async function calculateValidatedEmbedding(text, label) {
  const embedding = await embeddingsSystem.calculateEmbedding(text);
  if (!embedding || embedding.length !== EMBEDDING_DIMENSIONS) {
    throw new Error(`Invalid ${label} dimensions: expected ${EMBEDDING_DIMENSIONS}, got ${embedding?.length}`);
  }
  return embedding;
}

/**
 * Processes PR comments: extracts metadata and code context, generates embeddings,
 * and classifies each comment by category, severity and pattern tags.
 */
export class PRCommentProcessor {
  constructor() {
    // Classification patterns for different issue categories
    this.classificationPatterns = {
      security: [
        /sql injection/i,
        /xss/i,
        /cross.?site/i,
        /sanitize/i,
        /vulnerability/i,
        /security/i,
        /authentication/i,
        /authorization/i,
        /password/i,
        /token/i,
        /secret/i,
        /encryption/i,
        /sensitive/i,
        /exploit/i,
        /attack/i,
      ],
      performance: [
        /inefficient/i,
        /performance/i,
        /slow/i,
        /memory leak/i,
        /optimization/i,
        /algorithm/i,
        /complexity/i,
        /bottleneck/i,
        /cache/i,
        /database.*query/i,
        /n\+1/i,
        /timeout/i,
      ],
      style: [
        /naming/i,
        /convention/i,
        /documentation/i,
        /comment/i,
        /indentation/i,
        /formatting/i,
        /camelcase/i,
        /snake_case/i,
        /consistency/i,
        /readability/i,
        /typo/i,
      ],
      logic: [
        /condition/i,
        /always false/i,
        /always true/i,
        /error handling/i,
        /edge case/i,
        /logic/i,
        /simplified/i,
        /missing/i,
        /handle/i,
        /check/i,
        /validation/i,
      ],
    };

    // Severity patterns, checked in declaration order (first matching level wins)
    this.severityPatterns = {
      critical: [/critical/i, /crash/i, /security flaw/i, /data loss/i, /system down/i, /fatal/i],
      major: [/major/i, /serious/i, /important/i, /significant/i, /will cause/i, /breaks/i, /failure/i],
      style: [/typo/i, /formatting/i, /whitespace/i, /spacing/i, /minor style/i],
    };

    // Pattern recognition keywords (matched as lower-case substrings)
    this.patternKeywords = {
      error_handling: ['error handling', 'error', 'exception', 'try catch', 'handle'],
      input_validation: ['validation', 'validate', 'sanitize', 'check input'],
      null_check: ['null check', 'null', 'undefined', 'falsy'],
      async_await: ['async', 'await', 'promise', 'callback'],
      performance: ['performance', 'optimize', 'efficient', 'slow'],
      security: ['security', 'sanitize', 'escape', 'auth'],
      documentation: ['documentation', 'comment', 'doc', 'readme'],
      testing: ['test', 'unit test', 'coverage', 'spec'],
    };
  }

  /**
   * Process a single comment with its PR context
   * @param {Object} comment - The comment object from GitHub API
   * @param {Object} prContext - PR context including files and metadata
   * @returns {Promise<Object>} Processed comment with embeddings and classification
   * @throws {Error} When the comment is invalid or an embedding cannot be generated
   */
  async processComment(comment, prContext) {
    try {
      // Validate comment data
      if (!comment || !comment.body || !comment.user) {
        throw new Error('Invalid comment data');
      }

      const metadata = this.extractMetadata(comment, prContext);
      const codeContext = await this.extractCodeContext(comment, prContext);

      const commentEmbedding = await this.generateCommentEmbedding(comment.body);
      if (!commentEmbedding) {
        throw new Error('Failed to generate comment embedding');
      }

      const originalCode = codeContext.original_code;
      const codeEmbedding = originalCode ? await this.generateCodeEmbedding(originalCode) : null;

      // Without code the combined embedding is simply the comment embedding
      const combinedEmbedding = originalCode ? await this.combineEmbeddings(comment.body, originalCode) : commentEmbedding;

      let classification;
      try {
        classification = await this.classifyComment(comment.body, codeContext);
      } catch (classificationError) {
        // Graceful degradation: keep the comment, but do not hide the failure
        console.warn('Comment classification failed; using fallback classification:', classificationError);
        classification = {
          issue_category: 'unknown',
          severity: 'minor',
          pattern_tags: [],
        };
      }

      return {
        ...metadata,
        ...codeContext,
        comment_embedding: commentEmbedding,
        code_embedding: codeEmbedding,
        combined_embedding: combinedEmbedding,
        ...classification,
      };
    } catch (error) {
      console.error('Error processing comment:', error);
      throw error;
    }
  }

  /**
   * Extract metadata from comment
   * @param {Object} comment - Comment object
   * @param {Object} prContext - PR context (may be omitted; PR fields are then null)
   * @returns {Object} Extracted metadata
   */
  extractMetadata(comment, prContext) {
    const pr = prContext?.pr;

    return {
      id: comment.id.toString(),
      pr_number: pr?.number || null,
      repository: pr?.repository || null,
      comment_type: this.determineCommentType(comment),
      comment_text: comment.body,
      author: comment.user?.login || 'unknown',
      created_at: comment.created_at,
      updated_at: comment.updated_at,
      review_id: comment.pull_request_review_id?.toString() || null,
      review_state: comment.review_state || null,
    };
  }

  /**
   * Determine the type of comment
   * @param {Object} comment - Comment object
   * @returns {string} Comment type: 'review', 'issue', or 'inline'
   */
  determineCommentType(comment) {
    if (!comment.path) {
      return 'issue';
    }
    // A present `position` key (even null) takes precedence over `line`
    if (comment.position !== undefined) {
      return 'review';
    }
    return comment.line !== undefined ? 'inline' : 'issue';
  }

  /**
   * Extract code context from comment and PR context
   *
   * Values derived from the comment's own diff hunk are authoritative; the file
   * patch is only used to fill fields that are still empty.
   * @param {Object} comment - Comment object
   * @param {Object} prContext - PR context (may be omitted)
   * @returns {Object} Code context
   */
  extractCodeContext(comment, prContext) {
    const context = {
      file_path: comment.path || null,
      line_number: comment.line || comment.position || null,
      line_range_start: null,
      line_range_end: null,
      original_code: null,
      suggested_code: null,
      diff_hunk: comment.diff_hunk || null,
    };

    if (comment.diff_hunk) {
      const { start, end } = this.extractLineRange(comment.diff_hunk);
      context.line_range_start = start;
      context.line_range_end = end;

      const fromHunk = this.extractCodeFromDiff(comment.diff_hunk);
      context.original_code = fromHunk.original_code;
      context.suggested_code = fromHunk.suggested_code;
    }

    // Fall back to the file patch when the hunk gave no original code
    const files = prContext?.files;
    if (!context.original_code && comment.path && files) {
      const file = files.find((candidate) => candidate.filename === comment.path);
      if (file && file.patch) {
        const fromPatch = this.extractCodeFromPatch(file.patch, comment.line);
        // Only fill gaps: never clobber what the comment's own hunk provided
        context.original_code = fromPatch.original_code ?? context.original_code;
        context.suggested_code = context.suggested_code ?? fromPatch.suggested_code;
        context.diff_hunk = context.diff_hunk ?? file.patch;
      }
    }

    return context;
  }

  /**
   * Extract line range from diff hunk
   * @param {string} diffHunk - Git diff hunk
   * @returns {Object} Line range information ({ start, end, contextLines }) for the new file side
   */
  extractLineRange(diffHunk) {
    const header = typeof diffHunk === 'string' ? diffHunk.match(HUNK_HEADER_PATTERN) : null;
    if (!header) {
      return { start: null, end: null, contextLines: 0 };
    }

    const start = Number.parseInt(header[3], 10);
    // An omitted count means one line; an explicit 0 (empty new side) must stay 0
    const contextLines = header[4] === undefined ? 1 : Number.parseInt(header[4], 10);

    return {
      start,
      end: contextLines > 0 ? start + contextLines - 1 : start,
      contextLines,
    };
  }

  /**
   * Extract code from diff hunk
   * @param {string} diffHunk - Git diff hunk
   * @returns {Object} Extracted code ({ original_code, suggested_code, context_lines })
   */
  extractCodeFromDiff(diffHunk) {
    const lines = diffHunk.split(/\r?\n/);
    const removed = [];
    const added = [];
    const context = [];

    // When the text has hunk headers, everything before the first one is file-level
    // header material ("--- a/x", "+++ b/x", "diff --git ...") and not code.
    let insideHunk = !lines.some((line) => line.startsWith('@@'));

    for (const line of lines) {
      if (line.startsWith('@@')) {
        insideHunk = true;
        continue;
      }
      if (!insideHunk || line.startsWith(DIFF_MARKER_PREFIX)) {
        continue;
      }

      if (line.startsWith('-')) {
        removed.push(line.substring(1));
      } else if (line.startsWith('+')) {
        added.push(line.substring(1));
      } else if (line.trim()) {
        // Drop only the one-space context prefix; keep malformed lines intact
        context.push(line.startsWith(' ') ? line.substring(1) : line);
      }
    }

    return {
      original_code: removed.length > 0 ? removed.join('\n') : null,
      suggested_code: added.length > 0 ? added.join('\n') : null,
      context_lines: context.join('\n'),
    };
  }

  /**
   * Extract code from file patch at specific line
   *
   * Line numbers refer to the new version of the file. Removed lines sitting
   * directly at the target position are returned as original code, and the
   * added line that replaces them as suggested code.
   * @param {string} filePatch - Complete file patch
   * @param {number} line - Target line number
   * @returns {Object} Extracted code ({ original_code, suggested_code })
   */
  extractCodeFromPatch(filePatch, line) {
    let currentLine = 0;
    let removedAtTarget = [];
    let suggestedCode = null;

    for (const patchLine of filePatch.split('\n')) {
      if (patchLine.startsWith('@@')) {
        const header = patchLine.match(HUNK_HEADER_PATTERN);
        if (header) {
          currentLine = Number.parseInt(header[3], 10);
        }
        removedAtTarget = [];
        continue;
      }

      // "\ No newline at end of file" is not a file line and must not be counted
      if (patchLine.startsWith(DIFF_MARKER_PREFIX)) {
        continue;
      }

      const isRemoval = patchLine.startsWith('-');

      if (currentLine === line) {
        if (isRemoval) {
          // Keep scanning: the replacement '+' line follows the removed lines
          removedAtTarget.push(patchLine.substring(1));
          continue;
        }
        if (patchLine.startsWith('+')) {
          suggestedCode = patchLine.substring(1);
        }
        break;
      }

      // Removed lines do not exist in the new file, so they do not advance the counter
      if (!isRemoval) {
        currentLine++;
      }
    }

    return {
      original_code: removedAtTarget.length > 0 ? removedAtTarget.join('\n') : null,
      suggested_code: suggestedCode,
    };
  }

  /**
   * Generate embedding for comment text
   * @param {string} text - Comment text
   * @returns {Promise<Array<number>>} Comment embedding
   * @throws {Error} When the embedding is missing or has an unexpected length
   */
  async generateCommentEmbedding(text) {
    return calculateValidatedEmbedding(text, 'embedding');
  }

  /**
   * Generate embedding for code
   * @param {string} code - Code snippet
   * @returns {Promise<Array<number>>} Code embedding
   * @throws {Error} When the embedding is missing or has an unexpected length
   */
  async generateCodeEmbedding(code) {
    return calculateValidatedEmbedding(code, 'embedding');
  }

  /**
   * Combine comment and code text, then generate embedding from concatenated content
   * @param {string} commentText - Comment text
   * @param {string} codeText - Code text
   * @returns {Promise<Array<number>>} Combined embedding from concatenated text, or null when both texts are empty
   * @throws {Error} When the embedding is missing or has an unexpected length
   */
  async combineEmbeddings(commentText, codeText) {
    const parts = [commentText, codeText].filter(Boolean);
    if (parts.length === 0) {
      return null;
    }

    // Concatenate comment and code text with clear separation
    const combinedText = parts.join('\n\n--- CODE CONTEXT ---\n\n');
    return calculateValidatedEmbedding(combinedText, 'combined embedding');
  }

  /**
   * Classify comment by category and severity
   *
   * Each category pattern scores 2 for a match in the comment text, 1 for a match
   * in the code and 0.5 for a match in the file path; the highest total wins and
   * the first category wins ties.
   * @param {string} commentText - Comment text
   * @param {Object} codeContext - Code context
   * @returns {Promise<Object>} Classification result ({ issue_category, severity, pattern_tags })
   */
  async classifyComment(commentText, codeContext = {}) {
    const context = codeContext ?? {};
    const text = typeof commentText === 'string' ? commentText.toLowerCase() : '';
    const code = (context.code || context.original_code || '').toLowerCase();
    const filePath = (context.file_path || '').toLowerCase();

    // Determine category
    let category = 'general';
    let bestScore = 0;
    for (const [candidate, patterns] of Object.entries(this.classificationPatterns)) {
      const score = patterns.reduce(
        (total, pattern) => total + (pattern.test(text) ? 2 : 0) + (pattern.test(code) ? 1 : 0) + (pattern.test(filePath) ? 0.5 : 0),
        0
      );
      if (score > bestScore) {
        bestScore = score;
        category = candidate;
      }
    }

    // Special handling for security context
    const touchesSecrets = code.includes('password') || code.includes('token') || filePath.includes('auth');
    if (touchesSecrets && category === 'general') {
      category = 'security';
    }

    // Determine severity: first level with any matching pattern, otherwise 'minor'
    const matchedLevel = Object.entries(this.severityPatterns).find(([, patterns]) => patterns.some((pattern) => pattern.test(text)));
    let severity = matchedLevel ? matchedLevel[0] : 'minor';

    // Security findings are never reported below 'major'
    if (category === 'security' && severity === 'minor') {
      severity = 'major';
    }

    return {
      issue_category: category,
      severity,
      pattern_tags: this.generatePatternTags(commentText),
    };
  }

  /**
   * Generate pattern tags for comment
   * @param {string} commentText - Comment text (non-string input yields no tags)
   * @returns {Array<string>} Pattern tags, in the declaration order of patternKeywords
   */
  generatePatternTags(commentText) {
    if (typeof commentText !== 'string') {
      return [];
    }
    const text = commentText.toLowerCase();

    // Tag names are unique object keys, so no de-duplication is needed
    return Object.entries(this.patternKeywords)
      .filter(([, keywords]) => keywords.some((keyword) => text.includes(keyword.toLowerCase())))
      .map(([tag]) => tag);
  }

  /**
   * Identify recurring patterns in comments
   * @param {Array<string>} comments - Array of comment texts
   * @returns {Array<string>} Patterns that occur in at least two comments
   */
  identifyPatterns(comments) {
    const occurrences = {};

    for (const comment of comments) {
      for (const tag of this.generatePatternTags(comment)) {
        occurrences[tag] = (occurrences[tag] || 0) + 1;
      }
    }

    return Object.entries(occurrences)
      .filter(([, count]) => count >= 2)
      .map(([tag]) => tag);
  }

  /**
   * Calculate pattern weights by frequency
   * @param {Array<string>} commentHistory - Array of comment texts
   * @returns {Object} Pattern weights: share of comments (0..1) carrying each tag
   */
  calculatePatternWeights(commentHistory) {
    const weights = {};
    const totalComments = commentHistory.length;

    for (const comment of commentHistory) {
      for (const tag of this.generatePatternTags(comment)) {
        weights[tag] = (weights[tag] || 0) + 1;
      }
    }

    // Normalize counts to fractions; with no comments there are no keys to divide
    for (const tag of Object.keys(weights)) {
      weights[tag] = weights[tag] / totalComments;
    }

    return weights;
  }

  /**
   * Process comments in batch
   *
   * Bot comments are filtered out first. Comments that fail to process are
   * logged and left out of the result instead of failing the whole batch.
   * @param {Array<Object>} comments - Array of comments
   * @param {Object} prContext - PR context
   * @returns {Promise<Array<Object>>} Processed comments
   */
  async processBatch(comments, prContext) {
    const results = [];

    if (!Array.isArray(comments) || comments.length === 0) {
      return results;
    }

    // Filter out bot comments before processing
    const humanComments = filterBotComments(comments);
    if (humanComments.length === 0) {
      return results;
    }

    for (let offset = 0; offset < humanComments.length; offset += BATCH_SIZE) {
      const batch = humanComments.slice(offset, offset + BATCH_SIZE);

      const settled = await Promise.all(
        batch.map((comment) =>
          this.processComment(comment, prContext).catch((error) => {
            console.error(chalk.red(`Error processing comment ${comment.id}:`), error);
            return null; // Failed comments are dropped below
          })
        )
      );
      results.push(...settled.filter((result) => result !== null));

      // Small delay between batches to be gentle on the embedding service
      const hasMoreBatches = offset + BATCH_SIZE < humanComments.length;
      if (hasMoreBatches) {
        await new Promise((resolve) => setTimeout(resolve, BATCH_DELAY_MS));
      }
    }

    return results;
  }
}