codecritique 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +1145 -0
  3. package/package.json +98 -0
  4. package/src/content-retrieval.js +747 -0
  5. package/src/custom-documents.js +597 -0
  6. package/src/embeddings/cache-manager.js +364 -0
  7. package/src/embeddings/constants.js +40 -0
  8. package/src/embeddings/database.js +921 -0
  9. package/src/embeddings/errors.js +208 -0
  10. package/src/embeddings/factory.js +447 -0
  11. package/src/embeddings/file-processor.js +851 -0
  12. package/src/embeddings/model-manager.js +337 -0
  13. package/src/embeddings/similarity-calculator.js +97 -0
  14. package/src/embeddings/types.js +113 -0
  15. package/src/feedback-loader.js +384 -0
  16. package/src/index.js +1418 -0
  17. package/src/llm.js +123 -0
  18. package/src/pr-history/analyzer.js +579 -0
  19. package/src/pr-history/bot-detector.js +123 -0
  20. package/src/pr-history/cli-utils.js +204 -0
  21. package/src/pr-history/comment-processor.js +549 -0
  22. package/src/pr-history/database.js +819 -0
  23. package/src/pr-history/github-client.js +629 -0
  24. package/src/project-analyzer.js +955 -0
  25. package/src/rag-analyzer.js +2764 -0
  26. package/src/rag-review.js +566 -0
  27. package/src/technology-keywords.json +753 -0
  28. package/src/utils/command.js +48 -0
  29. package/src/utils/constants.js +263 -0
  30. package/src/utils/context-inference.js +364 -0
  31. package/src/utils/document-detection.js +105 -0
  32. package/src/utils/file-validation.js +271 -0
  33. package/src/utils/git.js +232 -0
  34. package/src/utils/language-detection.js +170 -0
  35. package/src/utils/logging.js +24 -0
  36. package/src/utils/markdown.js +132 -0
  37. package/src/utils/mobilebert-tokenizer.js +141 -0
  38. package/src/utils/pr-chunking.js +276 -0
  39. package/src/utils/string-utils.js +28 -0
  40. package/src/zero-shot-classifier-open.js +392 -0
@@ -0,0 +1,2764 @@
1
+ /**
2
+ * RAG Analyzer Module
3
+ *
4
+ * This module provides functionality for analyzing code using context
5
+ * extracted by the Retrieval Augmented Generation (RAG) approach for code review.
6
+ * It identifies patterns, best practices, and generates review comments.
7
+ */
8
+
9
+ import fs from 'node:fs';
10
+ import path from 'node:path';
11
+ import chalk from 'chalk';
12
+ import { getDefaultEmbeddingsSystem } from './embeddings/factory.js';
13
+ import { calculateCosineSimilarity } from './embeddings/similarity-calculator.js';
14
+ import {
15
+ loadFeedbackData,
16
+ shouldSkipSimilarIssue,
17
+ extractDismissedPatterns,
18
+ generateFeedbackContext,
19
+ initializeSemanticSimilarity,
20
+ isSemanticSimilarityAvailable,
21
+ } from './feedback-loader.js';
22
+ import * as llm from './llm.js';
23
+ import { findRelevantPRComments } from './pr-history/database.js';
24
+ import { inferContextFromCodeContent, inferContextFromDocumentContent } from './utils/context-inference.js';
25
+ import { isGenericDocument, getGenericDocumentContext } from './utils/document-detection.js';
26
+ import { isTestFile, shouldProcessFile } from './utils/file-validation.js';
27
+ import { detectFileType, detectLanguageFromExtension } from './utils/language-detection.js';
28
+ import { debug } from './utils/logging.js';
29
+
30
// Constants for content processing
// Max characters of the reviewed code snippet inlined into a guideline-retrieval
// query (see createGuidelineQueryForLLMRetrieval / createTestGuidelineQueryForLLMRetrieval).
const MAX_QUERY_CONTEXT_LENGTH = 1500;
// Max characters of content considered for embedding — usage not visible in this
// chunk; presumably applied before embedding file content. TODO confirm.
const MAX_EMBEDDING_CONTENT_LENGTH = 10000;
// Default line cap when truncating retrieved code examples (see truncateContent).
const DEFAULT_TRUNCATE_LINES = 300;
// Guideline/documentation snippets get a larger line cap than code examples.
const GUIDELINE_TRUNCATE_LINES = 400;
// Cap on historical PR comments folded into the prompt — usage not visible in
// this chunk; presumably applied when building PR comment context. TODO confirm.
const MAX_PR_COMMENTS_FOR_CONTEXT = 15;

// Create embeddings system instance (shared by all retrieval helpers in this module)
const embeddingsSystem = getDefaultEmbeddingsSystem();

// Track if semantic similarity has been initialized
// (module-level flag; set once by ensureSemanticSimilarityInitialized)
let semanticSimilarityInitialized = false;
42
+
43
/**
 * Lazily initialize semantic similarity for feedback filtering, at most once
 * per process. Delegates to the shared embeddings system in feedback-loader.js.
 * On failure, logs a warning and continues — callers fall back to word-based
 * similarity.
 *
 * @returns {Promise<void>}
 */
async function ensureSemanticSimilarityInitialized() {
  // Already done — nothing to do.
  if (semanticSimilarityInitialized) return;

  try {
    // Initialize semantic similarity using the shared embeddings system.
    await initializeSemanticSimilarity();
    semanticSimilarityInitialized = true;
  } catch (initError) {
    // Non-fatal: the word-based fallback will be used instead.
    console.log(chalk.yellow(`⚠️ Could not initialize semantic similarity: ${initError.message}`));
  }
}
61
+
62
+ // ============================================================================
63
+ // COMMON PROMPT INSTRUCTIONS
64
+ // ============================================================================
65
+
66
/**
 * Build the "critical rules" preamble shared by all review prompts.
 *
 * The import-statement rule is the only context-sensitive part; everything
 * else (line-number limits, no-low-severity, actionable-issues-only) is fixed
 * text.
 *
 * @param {Object} options - Options for customization
 * @param {string} options.importRuleContext - 'code' (default), 'test', or 'pr';
 *   any unrecognized value falls back to the 'code' wording
 * @returns {string} Critical rules block
 */
function getCriticalRulesBlock(options = {}) {
  const { importRuleContext = 'code' } = options;

  const testImportRule =
    'DO NOT flag missing imports or files referenced in import statements as issues. Focus only on test quality, logic, and patterns within the provided test files.';
  const prImportRule =
    'DO NOT flag missing imports or files referenced in import statements as issues. In PR analysis, some files (especially assets like images, fonts, or excluded files) may not be included in the review scope. Focus only on code quality, logic, and patterns within the provided PR files.';
  const codeImportRule =
    'DO NOT flag missing imports or files referenced in import statements as issues. Focus only on code quality, logic, and patterns within the provided files.';

  // Pick the wording that matches the review context; default to 'code'.
  const importRuleText =
    importRuleContext === 'test' ? testImportRule : importRuleContext === 'pr' ? prImportRule : codeImportRule;

  return `**🚨 CRITICAL: LINE NUMBER REPORTING RULE - READ CAREFULLY 🚨**
When reporting issues in the JSON output, NEVER provide exhaustive lists of line numbers. For repeated issues, list only 3-5 representative line numbers maximum. Exhaustive line number lists are considered errors and must be avoided.

**🚨 CRITICAL: IMPORT STATEMENT RULE - READ CAREFULLY 🚨**
${importRuleText}

**🚨 CRITICAL: NO LOW SEVERITY ISSUES - READ CAREFULLY 🚨**
DO NOT report "low" severity issues. Low severity issues typically include:
- Import statement ordering or grouping
- Code formatting and whitespace
- Minor stylistic preferences
- Comment placement or formatting
- Line length or wrapping suggestions
These concerns are handled by project linters (ESLint, Prettier, etc.) and should NOT be included in your review.
Only report issues with severity: "critical", "high", or "medium".

**🚨 CRITICAL: ACTIONABLE CODE ISSUES ONLY - NO VERIFICATION REQUESTS 🚨**
Your review must contain ONLY issues where you have identified a DEFINITE problem and can provide a SPECIFIC code fix.

**AUTOMATIC REJECTION - If your suggestion contains ANY of these phrases, DO NOT include it:**
- "Verify that..." / "Verify the..." / "Verify if..."
- "Ensure that..." / "Ensure the..."
- "Confirm that..." / "Confirm the..."
- "Validate that..." / "Validate the..."
- "Check that..." / "Check if..." / "Check whether..."
- "Add a comment explaining..." / "Add documentation..."
- "Review the documentation..." / "Reference the migration guide..."
- "Consider whether..." / "Consider if..."
- "This could potentially..." / "This might..." / "This may..."
- "If this is intentional..." / "If this change is to fix..."
- "...should be validated" / "...should be verified"
- "...but there's no validation..." / "...but there's no verification..."

**THE RULE**: If you cannot point to a SPECIFIC BUG or SPECIFIC VIOLATION and provide EXACT CODE to fix it, do not report it.

**GOOD issue**: "The function returns null on line 42 but the return type doesn't allow null. Fix: Change return type to \`string | null\`"
**BAD issue**: "Verify that the function handles null correctly" (This asks for verification, not a code fix)
**BAD issue**: "The type cast may bypass type safety" (This expresses uncertainty - "may" - without identifying a definite problem)
**BAD issue**: "Add a comment explaining why this type was changed" (This requests documentation, not a code fix)

When in doubt, leave it out. Only report issues you are CERTAIN about.`;
}
133
+
134
/**
 * Build the fixed citation-requirement section of the prompt, which instructs
 * the model to cite the custom-instruction document whenever it reports a
 * violation of one.
 *
 * @returns {string} Citation requirement block
 */
function getCitationRequirementBlock() {
  const citationRules = `**🚨 CRITICAL CITATION REQUIREMENT 🚨**
When you identify issues that violate custom instructions provided at the beginning of this prompt, you MUST:
- Include the source document name in your issue description (e.g., "violates the coding standards specified in '[Document Name]'")
- Reference the source document in your suggestion (e.g., "as required by '[Document Name]'" or "according to '[Document Name]'")
- Do NOT provide generic suggestions - always tie violations back to the specific custom instruction source`;

  return citationRules;
}
145
+
146
/**
 * Build the fixed section describing the optional `codeSuggestion` object
 * (startLine / endLine / oldCode / newCode) that enables GitHub-applyable
 * suggestions, and the conditions under which one should be emitted.
 *
 * @returns {string} Code suggestions format block
 */
function getCodeSuggestionsFormatBlock() {
  const suggestionFormat = `**🚨 CODE SUGGESTIONS FORMAT 🚨**
When suggesting code changes, you can optionally include a codeSuggestion object with:
- startLine: The starting line number of the code to replace
- endLine: (optional) The ending line number if replacing multiple lines
- oldCode: The exact current code that should be replaced (must match exactly)
- newCode: The proposed replacement code

Code suggestions enable reviewers to apply fixes directly as GitHub suggestions. Only provide code suggestions when:
1. The fix is concrete and can be applied automatically
2. You have the exact current code from the file content
3. The suggestion is a direct code replacement (not architectural changes)`;

  return suggestionFormat;
}
163
+
164
/**
 * Build the closing reminder that custom instructions, when present, take
 * precedence over every other guideline in the prompt.
 *
 * @returns {string} Final reminder block
 */
function getFinalReminderBlock() {
  const reminder =
    '**FINAL REMINDER: If custom instructions were provided at the start of this prompt, they MUST be followed and take precedence over all other guidelines.**';
  return reminder;
}
171
+
172
/**
 * Format custom document chunks into the "CUSTOM INSTRUCTIONS" prompt section.
 *
 * Chunks are grouped by their source document title so the model sees each
 * authoritative document once, with all of its retrieved sections under it.
 *
 * @param {Array} customDocs - Array of custom document chunks; each chunk has
 *   `content`, optionally `document_title` (preferred) or `title`, and
 *   optionally `chunk_index` (0-based position within the source document)
 * @returns {string} Formatted custom docs section, or '' when there are no chunks
 */
function formatCustomDocsSection(customDocs) {
  // Nothing to format — callers concatenate this into a prompt, so '' is safe.
  if (!customDocs || customDocs.length === 0) {
    return '';
  }

  let section = `

CRITICAL: CUSTOM INSTRUCTIONS - FOLLOW THESE BEFORE ALL OTHER INSTRUCTIONS
=====================================================================

`;

  // Group chunks by document title to provide better context
  // (Map preserves first-seen document order).
  const chunksByDocument = new Map();
  customDocs.forEach((doc) => {
    // Prefer the chunk-level document_title; fall back to a plain title field.
    const title = doc.document_title || doc.title;
    if (!chunksByDocument.has(title)) {
      chunksByDocument.set(title, []);
    }
    chunksByDocument.get(title).push(doc);
  });

  chunksByDocument.forEach((chunks, docTitle) => {
    // Per-document header instructing the model to cite this document by name.
    section += `
### AUTHORITATIVE CUSTOM INSTRUCTION: "${docTitle}"

IMPORTANT: This is an authoritative document that defines mandatory review standards for this project.
When you find violations of these standards, you MUST cite "${docTitle}" as the source in your response.

`;
    chunks.forEach((chunk, index) => {
      // Section numbering is local (1-based position in this group); the
      // optional "(Chunk N)" label reflects the chunk's 0-based position in
      // the original document, shown 1-based.
      section += `
**Section ${index + 1}${chunk.chunk_index !== undefined ? ` (Chunk ${chunk.chunk_index + 1})` : ''}:**

${chunk.content}

`;
    });
    // Horizontal rule between documents.
    section += `
---

`;
  });

  section += `
=====================================================================
END OF CUSTOM INSTRUCTIONS - These are authoritative project guidelines that take precedence over all other standards
`;

  return section;
}
228
+
229
/**
 * Append custom-instruction references to a base role description.
 *
 * When custom docs are present, the role gains one "CUSTOM INSTRUCTION SOURCE"
 * entry per distinct document title (deduplicated, first-seen order) plus a
 * closing directive that those instructions govern the whole analysis.
 *
 * @param {string} baseRole - Base role description
 * @param {Array} customDocs - Array of custom document chunks (may be null/empty)
 * @param {string} reviewType - 'code' (default), 'test', or 'pr'
 * @returns {string} Complete role definition
 */
function buildRoleDefinition(baseRole, customDocs, reviewType = 'code') {
  // Without custom docs the base role is returned untouched.
  if (!customDocs || customDocs.length === 0) {
    return baseRole;
  }

  // Deduplicate document titles while preserving first-seen order.
  const uniqueTitles = [...new Set(customDocs.map((doc) => doc.document_title || doc.title))];

  // Wording varies with the review context.
  let reviewTypeText;
  if (reviewType === 'test') {
    reviewTypeText = 'test reviews';
  } else if (reviewType === 'pr') {
    reviewTypeText = 'PR reviews';
  } else {
    reviewTypeText = 'review';
  }
  const perTitleReviewText = reviewType === 'test' ? 'test review' : 'review';

  const parts = [baseRole];
  parts.push(
    `\n\nIMPORTANT: You have been given specific custom instructions that define how you should conduct your ${reviewTypeText}:`
  );
  uniqueTitles.forEach((title, index) => {
    parts.push(`\n\n**CUSTOM INSTRUCTION SOURCE ${index + 1}: "${title}"**`);
    parts.push(`\nThis contains specific instructions for your ${perTitleReviewText} approach and criteria.`);
  });
  parts.push(
    '\n\nThese custom instructions define your review methodology and must be followed throughout your analysis. When you apply these instructions, reference the source document that informed your decision.'
  );

  return parts.join('');
}
254
+
255
/**
 * Render retrieved code examples as a prompt section.
 *
 * Each example becomes a labeled entry (index, similarity, path, language)
 * followed by a fenced code block; entries are joined with blank lines.
 *
 * @param {Array} codeExamples - Array of code examples ({index, similarity, path, language, content})
 * @param {string} labelPrefix - Label prefix (e.g., 'CODE EXAMPLE', 'TEST EXAMPLE')
 * @returns {string} Formatted code examples, or a "none found" placeholder
 */
function formatCodeExamplesBlock(codeExamples, labelPrefix = 'CODE EXAMPLE') {
  if (!codeExamples || codeExamples.length === 0) {
    // Placeholder wording depends on whether this is the test-example section.
    return labelPrefix.includes('TEST') ? 'No relevant test examples found.' : 'No relevant code examples found.';
  }

  const rendered = [];
  for (const example of codeExamples) {
    // Fence language tag falls back to an untagged block when unknown.
    const fenceLanguage = example.language || '';
    rendered.push(`
${labelPrefix} ${example.index} (Similarity: ${example.similarity})
Path: ${example.path}
Language: ${example.language}

\`\`\`${fenceLanguage}
${example.content}
\`\`\`
`);
  }
  return rendered.join('\n');
}
281
+
282
/**
 * Render retrieved guideline snippets as a prompt section.
 *
 * Each snippet gets a title line (index, source path, similarity, and the
 * originating heading when known) followed by a fenced block of its content.
 *
 * @param {Array} guidelineSnippets - Array of guideline snippets ({index, path, similarity, language, headingText, content})
 * @param {string} labelPrefix - Label prefix (e.g., 'GUIDELINE', 'TESTING GUIDELINE')
 * @returns {string} Formatted guideline snippets, or a "none found" placeholder
 */
function formatGuidelinesBlock(guidelineSnippets, labelPrefix = 'GUIDELINE') {
  if (!guidelineSnippets || guidelineSnippets.length === 0) {
    // Placeholder wording depends on whether this is the testing-guideline section.
    return labelPrefix.includes('TESTING') ? 'No specific testing guideline snippets found.' : 'No specific guideline snippets found.';
  }

  const rendered = [];
  for (const snippet of guidelineSnippets) {
    // Guidelines are prose by default, so the fence falls back to 'text'.
    const fenceLanguage = snippet.language || 'text';
    let header = `${labelPrefix} ${snippet.index} (Source: ${snippet.path}, Similarity: ${snippet.similarity})`;
    if (snippet.headingText) {
      header += `, Heading: "${snippet.headingText}"`;
    }

    rendered.push(`
${header}

\`\`\`${fenceLanguage}
${snippet.content}
\`\`\`
`);
  }
  return rendered.join('\n');
}
311
+
312
+ // ============================================================================
313
+ // END COMMON PROMPT INSTRUCTIONS
314
+ // ============================================================================
315
+
316
/**
 * Fetch the stored project summary for a project path from the embeddings
 * database.
 *
 * @param {string} projectPath - Project path (relative paths are resolved
 *   against the current working directory)
 * @returns {Promise<Object|null>} Project summary, or null when none exists
 *   or retrieval fails (failure is logged, not thrown)
 */
async function getProjectSummary(projectPath) {
  const absoluteProjectPath = path.resolve(projectPath);

  try {
    // Retrieve from database
    const storedSummary = await embeddingsSystem.getProjectSummary(absoluteProjectPath);

    if (storedSummary) {
      console.log(chalk.cyan(`📋 Retrieved project summary for: ${path.basename(absoluteProjectPath)}`));
    }

    return storedSummary;
  } catch (error) {
    // Missing/unreadable summaries are non-fatal — the caller proceeds without
    // architecture context.
    console.error(chalk.red(`Error retrieving project summary: ${error.message}`));
    return null;
  }
}
338
+
339
/**
 * Format a project summary object into a markdown "PROJECT ARCHITECTURE
 * CONTEXT" section for the LLM prompt.
 *
 * All summary fields are optional and defensively validated; list-valued
 * fields are capped (8 technologies, 5 custom implementations with 3
 * properties each, 6 review guidelines) to keep the prompt compact.
 *
 * @param {Object} summary - Project summary object (fields: projectName,
 *   projectType, technologies, mainFrameworks, customImplementations,
 *   apiPatterns, stateManagement, reviewGuidelines)
 * @returns {string} Formatted context string ('' when summary is falsy)
 */
function formatProjectSummaryForLLM(summary) {
  if (!summary) return '';

  // True when the value is a non-empty array. Array.isArray already rejects
  // null/undefined, so no extra truthiness check is needed.
  const hasItems = (value) => Array.isArray(value) && value.length > 0;

  let context = `\n## PROJECT ARCHITECTURE CONTEXT\n\n`;

  context += `**Project:** ${summary.projectName || 'Unknown'} (${summary.projectType || 'Unknown'})\n`;

  if (hasItems(summary.technologies)) {
    // Show at most 8 technologies; an ellipsis signals there are more.
    context += `**Technologies:** ${summary.technologies.slice(0, 8).join(', ')}${summary.technologies.length > 8 ? '...' : ''}\n`;
  }

  if (hasItems(summary.mainFrameworks)) {
    context += `**Main Frameworks:** ${summary.mainFrameworks.join(', ')}\n`;
  }

  context += '\n';

  if (hasItems(summary.customImplementations)) {
    context += `**Custom Implementations to Recognize:**\n`;
    // Limit to top 5 to avoid overwhelming the LLM (slice instead of iterating
    // the whole array and skipping indexes >= 5).
    summary.customImplementations.slice(0, 5).forEach((impl) => {
      if (!impl) return; // skip null/undefined entries
      context += `- **${impl.name || 'Unknown'}**: ${impl.description || 'No description'}\n`;
      if (hasItems(impl.properties)) {
        context += ` Properties: ${impl.properties.slice(0, 3).join(', ')}\n`;
      }
    });
    context += '\n';
  }

  if (hasItems(summary.apiPatterns)) {
    context += `**API Patterns:**\n`;
    summary.apiPatterns.forEach((pattern) => {
      if (pattern) {
        context += `- ${pattern.type || 'Unknown'}: ${pattern.description || 'No description'}\n`;
      }
    });
    context += '\n';
  }

  // 'Unknown' is the analyzer's sentinel for "could not determine" — omit it.
  if (summary.stateManagement?.approach && summary.stateManagement.approach !== 'Unknown') {
    context += `**State Management:** ${summary.stateManagement.approach}\n`;
    if (hasItems(summary.stateManagement.patterns)) {
      context += `- Patterns: ${summary.stateManagement.patterns.join(', ')}\n`;
    }
    context += '\n';
  }

  if (hasItems(summary.reviewGuidelines)) {
    context += `**Project-Specific Review Guidelines:**\n`;
    summary.reviewGuidelines.slice(0, 6).forEach((guideline) => {
      if (guideline) {
        context += `- ${guideline}\n`;
      }
    });
  }

  return context;
}
410
+
411
/**
 * Truncate multi-line content to at most `maxLines` lines.
 *
 * @param {string} content - Text to truncate
 * @param {number} maxLines - Line cap (defaults to DEFAULT_TRUNCATE_LINES)
 * @returns {{content: string, wasTruncated: boolean, originalLineCount: number}}
 *   Truncated text (with a "... (truncated, N more lines)" suffix when cut),
 *   a truncation flag, and the original line count.
 */
function truncateContent(content, maxLines = DEFAULT_TRUNCATE_LINES) {
  const allLines = content.split('\n');
  const totalLines = allLines.length;

  // Under the cap: return the content untouched.
  if (totalLines <= maxLines) {
    return {
      content,
      wasTruncated: false,
      originalLineCount: totalLines,
    };
  }

  const keptText = allLines.slice(0, maxLines).join('\n');
  return {
    content: `${keptText}\n... (truncated, ${totalLines - maxLines} more lines)`,
    wasTruncated: true,
    originalLineCount: totalLines,
  };
}
427
+
428
/**
 * Normalize retrieved context items (code examples or guideline snippets)
 * into the plain objects the prompt formatters consume.
 *
 * @param {Array} items - Raw retrieved items ({path, similarity, language, content, ...})
 * @param {string} type - 'code' (default) or 'guideline'; guidelines get a
 *   larger truncation cap and extra headingText/type fields
 * @returns {Array<Object>} Formatted items with 1-based index, formatted
 *   similarity, language fallback, and truncated content
 */
function formatContextItems(items, type = 'code') {
  const lineLimit = type === 'guideline' ? GUIDELINE_TRUNCATE_LINES : DEFAULT_TRUNCATE_LINES;

  return items.map((item, position) => {
    // Similarity may be missing or non-numeric; display 'N/A' in that case.
    const similarityLabel = typeof item.similarity === 'number' ? item.similarity.toFixed(2) : 'N/A';

    // Cap the content length so a single item cannot dominate the prompt.
    const { content: truncatedContent } = truncateContent(item.content, lineLimit);

    const formatted = {
      index: position + 1,
      path: item.path,
      similarity: similarityLabel,
      language: item.language || (type === 'guideline' ? 'text' : 'unknown'),
      content: truncatedContent,
    };

    // Guidelines carry heading/type metadata used by the prompt header line.
    if (type === 'guideline') {
      formatted.headingText = item.headingText || null;
      formatted.type = item.type || 'documentation';
    }

    return formatted;
  });
}
455
+
456
/**
 * Build the natural-language query used to retrieve documentation/guideline
 * snippets relevant to a code snippet under review.
 *
 * Fix: the original used `\\n` inside the template literal, which emitted the
 * literal two-character sequence "\n" into the query instead of real newlines
 * around the embedded code fence.
 *
 * @param {string} codeSnippet - Code being reviewed (truncated to
 *   MAX_QUERY_CONTEXT_LENGTH characters in the query)
 * @param {Object} reviewedSnippetContext - Inferred context ({area, dominantTech, keywords})
 * @param {string} language - Language of the snippet
 * @returns {string} Retrieval query text
 */
function createGuidelineQueryForLLMRetrieval(codeSnippet, reviewedSnippetContext, language) {
  const codeContext = codeSnippet.substring(0, MAX_QUERY_CONTEXT_LENGTH); // Limit snippet length in query
  let query = 'Retrieve technical documentation, architectural guidelines, and best practices. ';

  // Only mention the area when it is specific enough to help retrieval.
  if (
    reviewedSnippetContext.area !== 'Unknown' &&
    reviewedSnippetContext.area !== 'GeneralJS_TS' &&
    reviewedSnippetContext.area !== 'General'
  ) {
    query += `Specifically looking for ${reviewedSnippetContext.area} related information. `;
  }
  if (reviewedSnippetContext.dominantTech.length > 0) {
    query += `Focus on technologies like: ${reviewedSnippetContext.dominantTech.join(', ')}. `;
  }
  // Keywords that are not already covered by the dominant technologies.
  const generalKeywords = reviewedSnippetContext.keywords.filter(
    (kw) => !reviewedSnippetContext.dominantTech.map((t) => t.toLowerCase()).includes(kw.toLowerCase())
  );
  if (generalKeywords.length > 0) {
    query += `Consider relevance to concepts such as: ${generalKeywords.slice(0, 3).join(', ')}. `;
  }
  // Real newlines (\n, not the escaped \\n) around the fenced snippet.
  query += `Relevant to the following ${language} code snippet context: \n\`\`\`${language}\n${codeContext}...\n\`\`\``;
  return query;
}
480
+
481
/**
 * Build the natural-language query used to retrieve testing documentation and
 * patterns relevant to a test file under review.
 *
 * Fix: the original used `\\n` inside the template literal, which emitted the
 * literal two-character sequence "\n" into the query instead of real newlines
 * around the embedded code fence.
 *
 * @param {string} codeSnippet - Test code being reviewed (truncated to
 *   MAX_QUERY_CONTEXT_LENGTH characters in the query)
 * @param {Object} reviewedSnippetContext - Inferred context ({area, dominantTech, keywords})
 * @param {string} language - Language of the snippet
 * @returns {string} Retrieval query text
 */
function createTestGuidelineQueryForLLMRetrieval(codeSnippet, reviewedSnippetContext, language) {
  const codeContext = codeSnippet.substring(0, MAX_QUERY_CONTEXT_LENGTH); // Limit snippet length in query
  let query = 'Retrieve testing documentation, test patterns, and testing best practices. ';

  query += 'Focus on test coverage, test naming conventions, assertion patterns, mocking strategies, and test organization. ';

  // Only mention the area when it is specific enough to help retrieval.
  if (
    reviewedSnippetContext.area !== 'Unknown' &&
    reviewedSnippetContext.area !== 'GeneralJS_TS' &&
    reviewedSnippetContext.area !== 'General'
  ) {
    query += `Specifically looking for ${reviewedSnippetContext.area} testing patterns and practices. `;
  }

  if (reviewedSnippetContext.dominantTech.length > 0) {
    query += `Focus on testing frameworks and patterns for: ${reviewedSnippetContext.dominantTech.join(', ')}. `;
  }

  // Keep only context keywords that contain a testing-related term
  // (substring match, e.g. "testing" matches "test").
  const testingKeywords = [
    'test',
    'spec',
    'mock',
    'stub',
    'assertion',
    'coverage',
    'fixture',
    'beforeEach',
    'afterEach',
    'describe',
    'it',
    'expect',
  ];
  const relevantKeywords = reviewedSnippetContext.keywords.filter((kw) => testingKeywords.some((tk) => kw.toLowerCase().includes(tk)));

  if (relevantKeywords.length > 0) {
    query += `Consider testing concepts such as: ${relevantKeywords.slice(0, 3).join(', ')}. `;
  }

  // Real newlines (\n, not the escaped \\n) around the fenced snippet.
  query += `Relevant to the following ${language} test file context: \n\`\`\`${language}\n${codeContext}...\n\`\`\``;
  return query;
}
523
+
524
/**
 * Run an analysis using the RAG approach (single file or holistic PR).
 *
 * Pipeline: (0) optional holistic-PR branch → (1) load feedback data →
 * (2) read file/diff content → (3) exclusion check → Stage 1 context
 * retrieval → Stage 1.5 project architecture summary → Stage 2 prompt
 * assembly → LLM call → low-severity filter → feedback-based filter →
 * result envelope. Errors are caught and returned as {success: false}.
 *
 * @param {string} filePath - Path to the file to analyze, or the
 *   'PR_HOLISTIC_REVIEW' marker for PR reviews
 * @param {Object} options - Analysis options (diffOnly, diffContent,
 *   trackFeedback, feedbackPath, feedbackThreshold, projectPath, verbose,
 *   isHolisticPRReview, prFiles, ...)
 * @returns {Promise<Object>} Analysis results envelope
 */
async function runAnalysis(filePath, options = {}) {
  try {
    // Check if this is a holistic PR review — delegates entirely and returns.
    if (options.isHolisticPRReview && filePath === 'PR_HOLISTIC_REVIEW') {
      console.log(chalk.blue(`Performing holistic PR review for ${options.prFiles?.length || 0} files`));
      return await performHolisticPRAnalysis(options);
    }

    console.log(chalk.blue(`Analyzing file: ${filePath}`));

    // Load feedback data if feedback tracking is enabled
    let feedbackData = {};
    if (options.trackFeedback && options.feedbackPath) {
      console.log(chalk.cyan('--- Loading Feedback Data ---'));
      feedbackData = await loadFeedbackData(options.feedbackPath, { verbose: options.verbose });
    }

    // Check if file exists
    if (!fs.existsSync(filePath)) {
      throw new Error(`File not found: ${filePath}`);
    }

    // Read file content - use diff content if this is a diff-only review.
    // `content` is what gets reviewed; `fullFileContent` is the surrounding context.
    let content;
    let fullFileContent;
    if (options.diffOnly && options.diffContent) {
      content = options.diffContent;
      // For PR reviews, always read the full file content for context awareness
      fullFileContent = fs.existsSync(filePath) ? fs.readFileSync(filePath, 'utf8') : null;
      console.log(chalk.blue(`Analyzing diff only for ${path.basename(filePath)}`));
    } else {
      content = fs.readFileSync(filePath, 'utf8');
      fullFileContent = content;
      console.log(chalk.blue(`Analyzing full file ${path.basename(filePath)}`));
    }

    // Check if file should be processed (exclusion patterns, binary files, etc.)
    if (!shouldProcessFile(filePath, content)) {
      console.log(chalk.yellow(`Skipping file based on exclusion patterns: ${filePath}`));
      return {
        success: true,
        skipped: true,
        message: 'File skipped based on exclusion patterns',
      };
    }

    // --- Stage 1: CONTEXT RETRIEVAL ---
    console.log(chalk.blue('--- Stage 1: Context Retrieval ---'));
    // NOTE(review): the destructured `isTestFile` boolean shadows the
    // `isTestFile` function imported from './utils/file-validation.js' for the
    // rest of this function — apparently intentional, but worth confirming.
    const {
      language,
      isTestFile,
      finalCodeExamples,
      finalGuidelineSnippets,
      prCommentContext,
      prContextAvailable,
      relevantCustomDocChunks,
    } = await getContextForFile(filePath, content, options);

    // --- Stage 1.5: PROJECT ARCHITECTURE CONTEXT ---
    console.log(chalk.blue('--- Stage 1.5: Retrieving Project Architecture Context ---'));
    const projectPath = options.projectPath || process.cwd();
    const projectSummary = await getProjectSummary(projectPath);

    // --- Stage 2: PREPARE CONTEXT FOR LLM ---
    console.log(chalk.blue('--- Stage 2: Preparing Context for LLM ---'));

    // Format the lists that will be passed
    const formattedCodeExamples = formatContextItems(finalCodeExamples, 'code');
    const formattedGuidelines = formatContextItems(finalGuidelineSnippets, 'guideline');

    // --- Log the context being sent to the LLM --- >
    // NOTE(review): the preview strings below use .replace(/\\n/g, ' '), which
    // matches the literal two-character sequence "\n", not newline characters —
    // /\n/g was presumably intended. Left unchanged here; affects log output only.
    console.log(chalk.magenta('--- Guidelines Sent to LLM ---'));
    if (formattedGuidelines.length > 0) {
      formattedGuidelines.forEach((g, i) => {
        console.log(chalk.magenta(` [${i + 1}] Path: ${g.path} ${g.headingText ? `(Heading: "${g.headingText}")` : ''}`));
        console.log(chalk.gray(` Content: ${g.content.substring(0, 100).replace(/\\n/g, ' ')}...`));
      });
    } else {
      console.log(chalk.magenta(' (None)'));
    }

    console.log(chalk.magenta('--- Code Examples Sent to LLM ---'));
    if (finalCodeExamples.length > 0) {
      finalCodeExamples.forEach((ex, i) => {
        console.log(chalk.magenta(` [${i + 1}] Path: ${ex.path} (Similarity: ${ex.similarity?.toFixed(3) || 'N/A'})`));
        console.log(chalk.gray(` Content: ${ex.content.substring(0, 100).replace(/\\n/g, ' ')}...`));
      });
    } else {
      console.log(chalk.magenta(' (None)'));
    }

    console.log(chalk.magenta('--- Custom Document Chunks Sent to LLM ---'));
    if (relevantCustomDocChunks && relevantCustomDocChunks.length > 0) {
      relevantCustomDocChunks.forEach((chunk, i) => {
        console.log(chalk.magenta(` [${i + 1}] Document: "${chunk.document_title}" (Chunk ${chunk.chunk_index + 1})`));
        console.log(chalk.magenta(` Similarity: ${chunk.similarity?.toFixed(3) || 'N/A'}`));
        console.log(chalk.gray(` Content: ${chunk.content.substring(0, 100).replace(/\\n/g, ' ')}...`));
      });
    } else {
      console.log(chalk.magenta(' (None)'));
    }
    console.log(chalk.magenta('---------------------------------'));
    // --- End Logging --->

    // Prepare context for LLM with the potentially reduced lists
    const context = prepareContextForLLM(
      filePath,
      content,
      language,
      // Pass the formatted lists
      formattedCodeExamples,
      formattedGuidelines, // Always pass the formatted guidelines
      prCommentContext, // Pass PR comment context
      { ...options, isTestFile, relevantCustomDocChunks, feedbackData, projectSummary, fullFileContent } // Pass full file content for context
    );

    // Call LLM for analysis
    const analysisResults = await callLLMForAnalysis(context, { ...options, isTestFile, feedbackData });

    // Filter out low severity issues (formatting/style concerns handled by linters)
    // Note: The LLM prompt instructs not to generate low severity issues, but this filter
    // serves as a safety net in case any slip through despite the prompt instructions
    const lowSeverityFiltered = filterLowSeverityIssues(analysisResults, { verbose: options.verbose });

    // Post-process results to filter dismissed issues
    let filteredResults = lowSeverityFiltered;
    if (options.trackFeedback && feedbackData && Object.keys(feedbackData).length > 0) {
      console.log(chalk.cyan('--- Filtering Results Based on Feedback ---'));
      filteredResults = await filterAnalysisResults(lowSeverityFiltered, feedbackData, {
        similarityThreshold: options.feedbackThreshold || 0.7, // default similarity cutoff
        verbose: options.verbose,
      });
    }

    // Assemble the result envelope: filtered issues plus metadata about which
    // context sources contributed to the review.
    return {
      success: true,
      filePath,
      language,
      results: filteredResults,
      context: {
        codeExamples: finalCodeExamples.length,
        guidelines: finalGuidelineSnippets.length,
        prComments: prCommentContext.length,
        prContextAvailable,
      },
      // NOTE(review): extractCommentPatterns(prCommentContext) is computed twice
      // below — once directly and once inside generateContextSummary's arguments.
      prHistory: prContextAvailable
        ? {
            commentsFound: prCommentContext.length,
            patterns: extractCommentPatterns(prCommentContext),
            summary: generateContextSummary(prCommentContext, extractCommentPatterns(prCommentContext)),
          }
        : null,
      similarExamples: finalCodeExamples.map((ex) => ({
        path: ex.path,
        similarity: ex.similarity,
      })),
      metadata: {
        analysisTimestamp: new Date().toISOString(),
        featuresUsed: {
          codeExamples: finalCodeExamples.length > 0,
          guidelines: finalGuidelineSnippets.length > 0,
          prHistory: prContextAvailable,
          feedbackFiltering: options.trackFeedback && Object.keys(feedbackData).length > 0,
        },
        // Merge any metadata produced by the filtering step (may overwrite keys above).
        ...(filteredResults.metadata || {}),
      },
    };
  } catch (error) {
    // Any failure (I/O, retrieval, LLM) is reported, not thrown.
    console.error(chalk.red(`Error analyzing file: ${error.message}`));
    return {
      success: false,
      error: error.message,
      filePath,
    };
  }
}
707
+
708
+ /**
709
+ * Prepare context for LLM analysis
710
+ *
711
+ * @param {string} filePath - Path to the file
712
+ * @param {string} content - File content
713
+ * @param {string} language - File language
714
+ * @param {Array<Object>} codeExamples - Processed list of code examples
715
+ * @param {Array<Object>} guidelineSnippets - Processed list of guideline snippets
716
+ * @param {Array<Object>} prCommentContext - PR comment context
717
+ * @param {Object} options - Options
718
+ * @returns {Object} Context for LLM
719
+ */
720
+ function prepareContextForLLM(filePath, content, language, finalCodeExamples, finalGuidelineSnippets, prCommentContext = [], options = {}) {
721
+ const { customDocs, relevantCustomDocChunks, feedbackData, projectSummary } = options;
722
+
723
+ // Extract file name and directory
724
+ const fileName = path.basename(filePath);
725
+ const dirPath = path.dirname(filePath);
726
+ const dirName = path.basename(dirPath);
727
+
728
+ // Determine if this is a diff-only review
729
+ const isDiffReview = options.diffOnly && options.diffContent;
730
+ const reviewType = isDiffReview ? 'DIFF REVIEW' : 'FULL FILE REVIEW';
731
+
732
+ // For PR reviews, we need both the full file content and the diff
733
+ // content represents the diff (what to review)
734
+ // options.fullFileContent represents the complete file context
735
+ const fullFileContent = isDiffReview && options.fullFileContent ? options.fullFileContent : content;
736
+
737
+ // Format similar code examples and guideline snippets
738
+ const codeExamples = formatContextItems(finalCodeExamples, 'code');
739
+ const guidelineSnippets = formatContextItems(finalGuidelineSnippets, 'guideline');
740
+
741
+ const contextSections = [];
742
+
743
+ // Add existing context sections
744
+ if (codeExamples.length > 0) {
745
+ contextSections.push({
746
+ title: 'Similar Code Examples',
747
+ description: 'Code patterns from the project that are similar to the file being reviewed',
748
+ items: codeExamples,
749
+ });
750
+ }
751
+
752
+ if (guidelineSnippets.length > 0) {
753
+ contextSections.push({
754
+ title: 'Project Guidelines',
755
+ description: 'Documentation and guidelines relevant to this code',
756
+ items: guidelineSnippets,
757
+ });
758
+ }
759
+
760
+ // Add PR Comment Context Section
761
+ if (prCommentContext && prCommentContext.length > 0) {
762
+ contextSections.push({
763
+ title: 'Historical Review Comments',
764
+ description: 'Similar code patterns and issues identified by human reviewers in past PRs',
765
+ items: prCommentContext,
766
+ });
767
+ }
768
+
769
+ // Add feedback context if available
770
+ const dismissedPatterns = feedbackData ? extractDismissedPatterns(feedbackData, { maxPatterns: 10 }) : [];
771
+ if (dismissedPatterns.length > 0) {
772
+ contextSections.push({
773
+ title: 'Dismissed Issue Patterns',
774
+ description: 'Types of issues previously dismissed or marked as not relevant by users',
775
+ items: dismissedPatterns.map((pattern, index) => ({
776
+ index: index + 1,
777
+ issue: pattern.issue,
778
+ reason: pattern.reason,
779
+ sentiment: pattern.sentiment,
780
+ })),
781
+ });
782
+ }
783
+
784
+ return {
785
+ file: {
786
+ path: filePath,
787
+ name: fileName,
788
+ directory: dirPath,
789
+ directoryName: dirName,
790
+ language,
791
+ content,
792
+ fullFileContent, // Include full file content for context awareness
793
+ reviewType: reviewType,
794
+ isDiffReview: isDiffReview,
795
+ // Add PR context if available
796
+ ...(options.prContext && {
797
+ prContext: {
798
+ totalFiles: options.prContext.totalFiles,
799
+ testFiles: options.prContext.testFiles,
800
+ sourceFiles: options.prContext.sourceFiles,
801
+ allFiles: options.prContext.allFiles,
802
+ },
803
+ }),
804
+ // Add diff-specific info if this is a diff review
805
+ ...(isDiffReview &&
806
+ options.diffInfo && {
807
+ diffInfo: {
808
+ addedLines: options.diffInfo.addedLines.length,
809
+ removedLines: options.diffInfo.removedLines.length,
810
+ baseBranch: options.baseBranch,
811
+ targetBranch: options.targetBranch,
812
+ },
813
+ }),
814
+ },
815
+ context: contextSections,
816
+ codeExamples,
817
+ guidelineSnippets,
818
+ customDocs: relevantCustomDocChunks || customDocs, // Use relevant chunks if available, fallback to full docs
819
+ feedbackContext: generateFeedbackContext(dismissedPatterns), // Add feedback context for LLM
820
+ projectSummary: projectSummary, // Add project architecture summary
821
+ metadata: {
822
+ hasCodeExamples: finalCodeExamples.length > 0,
823
+ hasGuidelines: finalGuidelineSnippets.length > 0,
824
+ hasPRHistory: prCommentContext.length > 0,
825
+ hasFeedbackContext: dismissedPatterns.length > 0,
826
+ hasProjectSummary: !!projectSummary,
827
+ analysisTimestamp: new Date().toISOString(),
828
+ reviewType: reviewType,
829
+ isPRReview: options.isPRReview || false,
830
+ },
831
+ options,
832
+ };
833
+ }
834
+
835
+ /**
836
+ * Call LLM for code analysis
837
+ *
838
+ * @param {Object} context - Context for LLM
839
+ * @param {Object} options - Options
840
+ * @returns {Promise<Object>} Analysis results
841
+ */
842
+ async function callLLMForAnalysis(context, options = {}) {
843
+ try {
844
+ let prompt;
845
+ const model = options.model || 'claude-sonnet-4-5';
846
+ const maxTokens = options.maxTokens || 8192; // Default to a safe limit
847
+
848
+ if (options.isHolisticPRReview) {
849
+ prompt = generateHolisticPRAnalysisPrompt(context);
850
+ } else {
851
+ prompt = options.isTestFile ? generateTestFileAnalysisPrompt(context) : generateAnalysisPrompt(context);
852
+ }
853
+
854
+ // Call LLM with the prompt
855
+ const llmResponse = await sendPromptToLLM(prompt, {
856
+ temperature: 0,
857
+ maxTokens: maxTokens,
858
+ model: model,
859
+ isJsonMode: true, // Standardize on using JSON mode if available
860
+ });
861
+
862
+ console.log(chalk.blue('Received LLM response, attempting to parse...'));
863
+
864
+ console.log(chalk.gray(`Response type: ${typeof llmResponse}`));
865
+ console.log(chalk.gray(`Response length: ${llmResponse?.length || 0} characters`));
866
+
867
+ // Parse the raw LLM response
868
+ const analysisResponse = parseAnalysisResponse(llmResponse);
869
+
870
+ // Validate the parsed response has the expected structure
871
+ if (!options.isHolisticPRReview && (!analysisResponse.summary || !Array.isArray(analysisResponse.issues))) {
872
+ console.warn(chalk.yellow('Parsed response missing expected structure, attempting to reconstruct...'));
873
+
874
+ return {
875
+ summary: analysisResponse.summary || 'Analysis completed with parsing issues',
876
+ issues: Array.isArray(analysisResponse.issues) ? analysisResponse.issues : [],
877
+ rawResponse: analysisResponse.rawResponse || llmResponse.substring(0, 500),
878
+ parseWarning: 'Response structure was reconstructed due to parsing issues',
879
+ };
880
+ }
881
+
882
+ console.log(chalk.green('Successfully parsed LLM response with expected structure'));
883
+ return analysisResponse;
884
+ } catch (error) {
885
+ console.error(chalk.red(`Error calling LLM for analysis: ${error.message}`));
886
+ console.error(error.stack);
887
+ throw error;
888
+ }
889
+ }
890
+
891
+ /**
892
+ * Appends critical JSON formatting requirements to a prompt.
893
+ * @param {string} promptBody - The main body of the prompt.
894
+ * @returns {string} The finalized prompt with JSON formatting instructions.
895
+ */
896
+ function finalizePrompt(promptBody) {
897
+ return `${promptBody}
898
+
899
+ CRITICAL FORMATTING REQUIREMENTS:
900
+ - Respond ONLY with a valid JSON object
901
+ - Do not include any text before or after the JSON
902
+ - Do not wrap the JSON in markdown code blocks
903
+ - Ensure all strings are properly escaped
904
+ - Use double quotes for all string values
905
+ - Do not include trailing commas
906
+ - Validate that your response is parseable JSON before sending
907
+
908
+ MARKDOWN FORMATTING IN DESCRIPTIONS AND SUGGESTIONS:
909
+ - Use backticks (\`) around code elements like commands, flags, file names, variable names, function names, etc.
910
+ - Examples: \`git fetch\`, \`--unshallow\`, \`timeout-minutes\`, \`process.env.NODE_ENV\`, \`handleClick()\`
911
+ - Use backticks for any technical terms that would be considered "code" including:
912
+ - Command line tools and commands
913
+ - Command line flags and options
914
+ - Configuration keys and values
915
+ - File names and extensions
916
+ - Environment variables
917
+ - Function and variable names
918
+ - CSS classes and IDs
919
+ - HTML attributes
920
+ - API endpoints and parameters
921
+ - Do NOT use backticks around regular English words or common nouns
922
+ - Use proper markdown formatting for emphasis (*italics*, **bold**) when appropriate
923
+
924
+ Your response must start with { and end with } with no additional text.`;
925
+ }
926
+
927
/**
 * Send a prompt to the LLM with a strict code-review JSON schema attached.
 *
 * Validates that the `llm` module exposes `sendPromptToClaude`, attaches the
 * code-review response schema, and returns the raw response object so
 * `parseAnalysisResponse` can access its `json` property.
 *
 * IMPROVEMENT: the original inlined two byte-identical copies of the per-issue
 * schema (for `issues` and for `fileSpecificIssues`); it is now defined once
 * as `issueItemSchema` and reused, so future schema edits cannot drift apart.
 * The resulting schema value is unchanged.
 *
 * @param {string} prompt - Fully assembled prompt text
 * @param {Object} llmOptions - Options forwarded to the LLM (temperature, maxTokens, model, ...)
 * @returns {Promise<Object>} Raw LLM response (string or object with a `json` property)
 * @throws {Error} If the llm module is missing the required function, or the call fails
 */
async function sendPromptToLLM(prompt, llmOptions) {
  try {
    if (!llm || typeof llm.sendPromptToClaude !== 'function') {
      throw new Error('LLM module does not contain required function: sendPromptToClaude');
    }

    // Schema for a single per-file issue; shared by `issues` and
    // `fileSpecificIssues` below.
    const issueItemSchema = {
      type: 'object',
      properties: {
        type: { type: 'string' },
        severity: { type: 'string' },
        description: { type: 'string' },
        lineNumbers: {
          type: 'array',
          items: { type: 'number' },
        },
        suggestion: { type: 'string' },
        codeSuggestion: {
          type: 'object',
          properties: {
            startLine: { type: 'number' },
            endLine: { type: 'number' },
            oldCode: { type: 'string' },
            newCode: { type: 'string' },
          },
          required: ['startLine', 'oldCode', 'newCode'],
        },
        category: { type: 'string' },
      },
      required: ['type', 'severity', 'description', 'lineNumbers'],
    };

    // Define schema for code review responses
    const codeReviewSchema = {
      type: 'object',
      additionalProperties: false,
      properties: {
        summary: { type: 'string' },
        issues: {
          type: 'array',
          items: issueItemSchema,
        },
        crossFileIssues: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              type: { type: 'string' },
              severity: { type: 'string' },
              message: { type: 'string' },
              files: {
                type: 'array',
                items: { type: 'string' },
              },
              suggestion: { type: 'string' },
              category: { type: 'string' },
            },
            required: ['type', 'severity', 'message', 'files'],
          },
        },
        // Map of file path -> list of issues for that file (holistic reviews)
        fileSpecificIssues: {
          type: 'object',
          additionalProperties: {
            type: 'array',
            items: issueItemSchema,
          },
        },
        recommendations: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              category: { type: 'string' },
              suggestion: { type: 'string' },
              priority: { type: 'string' },
              impact: { type: 'string' },
            },
            required: ['category', 'suggestion'],
          },
        },
      },
      required: ['summary'],
    };

    const response = await llm.sendPromptToClaude(prompt, {
      ...llmOptions,
      jsonSchema: codeReviewSchema,
    });

    // Return the response object so parseAnalysisResponse can access the json property
    return response;
  } catch (error) {
    console.error(chalk.red(`Error in LLM call: ${error.message}`));
    throw error; // Re-throw to properly handle the error
  }
}
1046
+
1047
+ /**
1048
+ * Generate analysis prompt for LLM
1049
+ *
1050
+ * @param {Object} context - Context for LLM
1051
+ * @returns {string} Analysis prompt
1052
+ */
1053
+ function generateAnalysisPrompt(context) {
1054
+ const { file, codeExamples, guidelineSnippets, customDocs, feedbackContext } = context;
1055
+
1056
+ // Format code examples and guidelines using shared helpers
1057
+ const formattedCodeExamples = formatCodeExamplesBlock(codeExamples, 'CODE EXAMPLE');
1058
+ const formattedGuidelines = formatGuidelinesBlock(guidelineSnippets, 'GUIDELINE');
1059
+
1060
+ // Check for PR comment context in the context object
1061
+ const { context: contextSections } = context;
1062
+ let prHistorySection = '';
1063
+
1064
+ console.log(chalk.blue(`🔍 Checking for PR comments in prompt generation...`));
1065
+ console.log(chalk.gray(`Context sections available: ${contextSections ? contextSections.length : 0}`));
1066
+
1067
+ if (contextSections && contextSections.length > 0) {
1068
+ contextSections.forEach((section, idx) => {
1069
+ console.log(chalk.gray(` Section ${idx + 1}: ${section.title} (${section.items?.length || 0} items)`));
1070
+ });
1071
+
1072
+ const prComments = contextSections.find((section) => section.title === 'Historical Review Comments');
1073
+ if (prComments && prComments.items.length > 0) {
1074
+ console.log(chalk.green(`✅ Adding ${prComments.items.length} PR comments to LLM prompt`));
1075
+ prHistorySection += `
1076
+
1077
+ CONTEXT C: HISTORICAL REVIEW COMMENTS
1078
+ Similar code patterns and issues identified by human reviewers in past PRs
1079
+
1080
+ `;
1081
+ prComments.items.slice(0, MAX_PR_COMMENTS_FOR_CONTEXT).forEach((comment, idx) => {
1082
+ prHistorySection += `### Historical Comment ${idx + 1}\n`;
1083
+ prHistorySection += `- **PR**: #${comment.pr_number} by ${comment.author}\n`;
1084
+ prHistorySection += `- **File**: ${comment.file_path}\n`;
1085
+ prHistorySection += `- **Type**: ${comment.comment_type}\n`;
1086
+ prHistorySection += `- **Relevance**: ${(comment.similarity_score * 100).toFixed(1)}%\n`;
1087
+ prHistorySection += `- **Review**: ${comment.comment_text}\n\n`;
1088
+ });
1089
+
1090
+ prHistorySection += `Use these historical patterns to identify DEFINITE issues in the current code. `;
1091
+ prHistorySection += `Only report issues that EXACTLY match historical patterns with SPECIFIC code fixes.\n\n`;
1092
+
1093
+ console.log(chalk.blue(`PR History section preview: ${prHistorySection.substring(0, 200)}...`));
1094
+ } else {
1095
+ console.log(chalk.yellow(`❌ No PR comments section found in context`));
1096
+ }
1097
+ } else {
1098
+ console.log(chalk.yellow(`❌ No context sections available for PR comments`));
1099
+ }
1100
+
1101
+ // Detect if this is a diff review
1102
+ const isDiffReview = file.reviewType === 'DIFF REVIEW';
1103
+ const reviewInstructions = isDiffReview
1104
+ ? 'Your task is to review the git diff by performing a two-stage analysis based **only** on the provided context, prioritizing documented guidelines and historical review patterns. Follow the context awareness instructions provided with the file content below.'
1105
+ : 'Your task is to review the following code file by performing a two-stage analysis based **only** on the provided context, prioritizing documented guidelines and historical review patterns.';
1106
+
1107
+ const fileSection = isDiffReview
1108
+ ? `GIT DIFF TO REVIEW (FOCUS ONLY ON CHANGED LINES):
1109
+ Path: ${file.path}
1110
+ Language: ${file.language}
1111
+ Base Branch: ${file.diffInfo?.baseBranch || 'master'}
1112
+ Target Branch: ${file.diffInfo?.targetBranch || 'HEAD'}
1113
+
1114
+ **CRITICAL CONTEXT AWARENESS INSTRUCTIONS:**
1115
+
1116
+ You have access to TWO pieces of information:
1117
+ 1. **FULL FILE CONTENT** - The complete file for understanding context
1118
+ 2. **GIT DIFF** - Only the changes to review
1119
+
1120
+ **Review Rules:**
1121
+ - ONLY critique the CHANGED lines shown in the diff (lines with + or -)
1122
+ - USE the full file content to understand context and dependencies
1123
+ - DO NOT suggest adding code that already exists in the unchanged portions
1124
+ - DO NOT flag issues about missing code if it exists in the full file
1125
+ - Do NOT flag functions/variables as missing if they exist elsewhere in the full file
1126
+ - The unchanged code is part of the file - check it before making assumptions
1127
+
1128
+ **FULL FILE CONTENT (for context - DO NOT review unchanged code):**
1129
+
1130
+ \`\`\`${file.language}
1131
+ ${file.fullFileContent || file.content}
1132
+ \`\`\`
1133
+
1134
+ **GIT DIFF TO REVIEW (critique ONLY these changes):**
1135
+
1136
+ \`\`\`diff
1137
+ ${file.content}
1138
+ \`\`\``
1139
+ : `FILE TO REVIEW:
1140
+ Path: ${file.path}
1141
+ Language: ${file.language}
1142
+
1143
+ \`\`\`${file.language}
1144
+ ${file.content}
1145
+ \`\`\``;
1146
+
1147
+ // Add project architecture context if available
1148
+ let projectArchitectureSection = '';
1149
+ if (context.projectSummary) {
1150
+ projectArchitectureSection = formatProjectSummaryForLLM(context.projectSummary);
1151
+ }
1152
+
1153
+ // Use shared helpers for custom docs and role definition
1154
+ const customDocsSection = formatCustomDocsSection(customDocs);
1155
+ const roleDefinition = buildRoleDefinition(
1156
+ 'You are an expert code reviewer acting as a senior developer on this specific project.',
1157
+ customDocs,
1158
+ 'code'
1159
+ );
1160
+
1161
+ // Corrected prompt with full two-stage analysis + combined output stage
1162
+ return finalizePrompt(`
1163
+ ${roleDefinition}
1164
+
1165
+ ${reviewInstructions}
1166
+
1167
+ ${customDocsSection}
1168
+
1169
+ ${fileSection}
1170
+
1171
+ CONTEXT FROM PROJECT:
1172
+ ${projectArchitectureSection}
1173
+
1174
+ CONTEXT A: EXPLICIT GUIDELINES FROM DOCUMENTATION
1175
+ ${formattedGuidelines}
1176
+
1177
+ CONTEXT B: SIMILAR CODE EXAMPLES FROM PROJECT
1178
+ ${formattedCodeExamples}
1179
+
1180
+ ${prHistorySection}
1181
+
1182
+ ${feedbackContext || ''}
1183
+
1184
+ INSTRUCTIONS:
1185
+
1186
+ ${getCriticalRulesBlock({ importRuleContext: 'code' })}
1187
+
1188
+ **Perform the following analysis stages sequentially:**
1189
+
1190
+ **STAGE 1: Custom Instructions & Guideline-Based Review**
1191
+ 1. **FIRST AND MOST IMPORTANT**: If custom instructions were provided at the beginning of this prompt, analyze the 'FILE TO REVIEW' against those custom instructions BEFORE all other analysis. Custom instructions always take precedence.
1192
+ 2. Analyze the 'FILE TO REVIEW' strictly against the standards, rules, and explanations provided in 'CONTEXT A: EXPLICIT GUIDELINES'.
1193
+ 3. Identify any specific deviations where the reviewed code violates custom instructions OR explicit guidelines. **CRITICAL**: When you find violations of custom instructions, you MUST cite the specific custom instruction source document name in your issue description and suggestion.
1194
+ 4. Temporarily ignore 'CONTEXT B: SIMILAR CODE EXAMPLES' during this stage.
1195
+
1196
+ **STAGE 2: Code Example-Based Review (CRITICAL FOR IMPLICIT PATTERNS)**
1197
+ 1. **CRITICAL FIRST STEP**: Scan ALL code examples in Context B and create a mental list of:
1198
+ - Common import statements (especially those containing 'helper', 'util', 'shared', 'common', 'test')
1199
+ - Frequently used function calls that appear across multiple examples
1200
+ - Project-specific wrappers or utilities (e.g., \`renderWithTestHelpers\` instead of direct \`render\`)
1201
+ - Consistent patterns in how operations are performed
1202
+ 2. **IMPORTANT**: For each common utility or pattern you identify, note:
1203
+ - Which files use it (cite specific examples)
1204
+ - What the pattern appears to do
1205
+ - Whether the reviewed file is using this pattern or not
1206
+ 3. Analyze the 'FILE TO REVIEW' against these discovered patterns. Focus on:
1207
+ - Missing imports of commonly used utilities
1208
+ - Direct library usage where others use project wrappers
1209
+ - Deviations from established patterns
1210
+ 4. **HIGH PRIORITY**: Flag any instances where:
1211
+ - The reviewed code uses a direct library call (e.g., \`render\`) when multiple examples use a project wrapper (e.g., \`renderWithTestHelpers\`)
1212
+ - Common utility functions available in the project are not being imported or used
1213
+ - The code deviates from patterns that appear in 3+ examples
1214
+ 5. Pay special attention to imports - if most similar files import certain utilities, the reviewed file should too.
1215
+
1216
+ **STAGE 3: Historical Review Comments Analysis**
1217
+ 1. **CRITICAL**: If 'CONTEXT C: HISTORICAL REVIEW COMMENTS' is present, analyze each historical comment:
1218
+ - Look for patterns in the types of issues human reviewers have identified in similar code
1219
+ - Identify if the SAME DEFINITE issue exists in the current file (not similar - the SAME)
1220
+ - Pay special attention to comments with high relevance scores (>70%)
1221
+ 2. **Apply Historical Insights**: For each historical comment:
1222
+ - Only report if the EXACT same issue type exists with a SPECIFIC code fix
1223
+ - Do NOT report speculative issues based on historical patterns
1224
+ 3. **Prioritize Historical Issues**: Issues DEFINITELY matching historical patterns get high priority
1225
+
1226
+ **STAGE 4: Consolidate, Prioritize, and Generate Output**
1227
+ 1. **CRITICAL REMINDER**: If custom instructions were provided at the beginning of this prompt, they take ABSOLUTE PRECEDENCE over all other guidelines and must be followed strictly.
1228
+ 2. Combine the potential issues identified in Stage 1 (Guideline-Based), Stage 2 (Example-Based), and Stage 3 (Historical Review Comments).
1229
+ 3. **Apply Conflict Resolution AND Citation Rules:**
1230
+ * **Guideline Precedence:** If an issue identified in Stage 2 (from code examples) or Stage 3 (from historical comments) **contradicts** an explicit guideline from Stage 1, **discard the conflicting issue**. Guidelines always take precedence.
1231
+ * **Citation Priority:** When reporting an issue:
1232
+ * **CRITICAL FOR CUSTOM INSTRUCTIONS**: If the issue violates a custom instruction provided at the beginning of this prompt, you MUST include the source document name in both the description and suggestion. For example: "violates the coding standards specified in '[Document Name]'" or "as required by '[Document Name]'".
1233
+ * If the relevant convention or standard is defined in 'CONTEXT A: EXPLICIT GUIDELINES', cite the guideline document.
1234
+ * For implicit patterns discovered from code examples (like helper utilities, common practices), cite the specific code examples that demonstrate the pattern.
1235
+ * For issues identified from historical review comments, report them as standard code review findings without referencing the historical source.
1236
+ * **IMPORTANT**: When citing implicit patterns from Context B, be specific about which files demonstrate the pattern and what the pattern is.
1237
+ 4. **Special attention to implicit patterns**: Issues related to not using project-specific utilities or helpers should be marked as high priority if the pattern appears consistently across multiple examples in Context B.
1238
+ 5. **Special attention to historical patterns**: Issues DEFINITELY matching historical patterns get high priority.
1239
+ 6. Assess for DEFINITE logic errors or bugs only - do NOT report speculative issues.
1240
+ 7. **CRITICAL OUTPUT FILTER**: Before reporting ANY issue, ask yourself: "Do I have a SPECIFIC code fix?" If not, do NOT report it. Do NOT ask the developer to verify, ensure, or check anything.
1241
+ 8. **CRITICAL 'lineNumbers' RULE - MANDATORY COMPLIANCE**:
1242
+ - **ALWAYS provide line numbers** - this field is REQUIRED for every issue
1243
+ - If you can identify specific lines, provide them (max 3-5 for repeated issues)
1244
+ - If the issue affects the entire file or cannot be pinpointed, provide [1] or relevant section line numbers
1245
+ - For ANY issue that occurs multiple times in a file, list ONLY the first 3-5 occurrences maximum
1246
+ - NEVER provide exhaustive lists of line numbers (e.g., [1,2,3,4,5,6,7,8,9,10...])
1247
+ - If an issue affects many lines, use representative examples only
1248
+ - Exhaustive line number lists are considered hallucination and must be avoided
1249
+ - Example: Instead of listing 20+ line numbers, use [15, 23, 47]
1250
+ - **NEVER omit lineNumbers** - empty arrays [] are not allowed
1251
+ 9. Format the final, consolidated, and prioritized list of issues, along with a brief overall summary, **strictly** according to the JSON structure below.
1252
+ 10. CRITICAL: Respond ONLY with valid JSON - start with { and end with }, no additional text.
1253
+
1254
+ ${getFinalReminderBlock()}
1255
+
1256
+ ${getCitationRequirementBlock()}
1257
+
1258
+ REQUIRED JSON OUTPUT FORMAT:
1259
+
1260
+ **REMINDER: lineNumbers is REQUIRED - always provide at least one line number. Use ONLY 3-5 representative line numbers for repeated issues. NEVER provide exhaustive lists or empty arrays.**
1261
+
1262
+ ${getCodeSuggestionsFormatBlock()}
1263
+
1264
+ You must respond with EXACTLY this JSON structure, with no additional text:
1265
+
1266
+ {
1267
+ "summary": "Brief summary of the review, highlighting adherence to documented guidelines and consistency with code examples, plus any major issues found.",
1268
+ "issues": [
1269
+ {
1270
+ "type": "bug | improvement | convention | performance | security",
1271
+ "severity": "critical | high | medium",
1272
+ "description": "Description of the issue, clearly stating the deviation from the prioritized project pattern (guideline or example) OR the nature of the bug/improvement.",
1273
+ "lineNumbers": [42, 55, 61],
1274
+ "suggestion": "Concrete suggestion for fixing the issue or aligning with the prioritized inferred pattern. Ensure the suggestion is additive if adding missing functionality (like a hook) and doesn't wrongly suggest replacing existing, unrelated code.",
1275
+ "codeSuggestion": {
1276
+ "startLine": 42,
1277
+ "endLine": 44,
1278
+ "oldCode": " const result = data.map(item => item.value);",
1279
+ "newCode": " const result = data?.map(item => item?.value) ?? [];"
1280
+ }
1281
+ }
1282
+ ]
1283
+ }
1284
+ `);
1285
+ }
1286
+
1287
+ /**
1288
+ * Generate test file analysis prompt for LLM
1289
+ *
1290
+ * @param {Object} context - Context for LLM
1291
+ * @returns {string} Test file analysis prompt
1292
+ */
1293
+ function generateTestFileAnalysisPrompt(context) {
1294
+ const { file, codeExamples, guidelineSnippets, customDocs } = context;
1295
+
1296
+ // Format code examples and guidelines using shared helpers
1297
+ const formattedCodeExamples = formatCodeExamplesBlock(codeExamples, 'TEST EXAMPLE');
1298
+ const formattedGuidelines = formatGuidelinesBlock(guidelineSnippets, 'TESTING GUIDELINE');
1299
+
1300
+ // Detect if this is a diff review
1301
+ const isDiffReview = file.reviewType === 'DIFF REVIEW';
1302
+ const reviewInstructions = isDiffReview
1303
+ ? 'Your task is to review the test file git diff by performing a comprehensive analysis focused on testing best practices and patterns. Follow the context awareness instructions provided with the file content below.'
1304
+ : 'Your task is to review the following test file by performing a comprehensive analysis focused on testing best practices and patterns.';
1305
+
1306
+ const fileSection = isDiffReview
1307
+ ? `TEST FILE GIT DIFF TO REVIEW (FOCUS ONLY ON CHANGED LINES):
1308
+ Path: ${file.path}
1309
+ Language: ${file.language}
1310
+ Base Branch: ${file.diffInfo?.baseBranch || 'master'}
1311
+ Target Branch: ${file.diffInfo?.targetBranch || 'HEAD'}
1312
+
1313
+ **CRITICAL CONTEXT AWARENESS INSTRUCTIONS:**
1314
+
1315
+ You have access to TWO pieces of information:
1316
+ 1. **FULL TEST FILE CONTENT** - The complete test file for understanding existing test coverage
1317
+ 2. **GIT DIFF** - Only the test changes to review
1318
+
1319
+ **Review Rules:**
1320
+ - ONLY critique the CHANGED lines in the diff (lines with + or -)
1321
+ - USE the full file to verify existing test coverage before suggesting new tests
1322
+ - DO NOT suggest adding tests that already exist in the unchanged portions
1323
+ - DO NOT flag missing test coverage if tests exist elsewhere in the file
1324
+ - Check the full file for existing test cases before making assumptions
1325
+ - The unchanged test code is part of the file - review it before suggesting additions
1326
+
1327
+ **FULL TEST FILE CONTENT (for context - check for existing tests):**
1328
+
1329
+ \`\`\`${file.language}
1330
+ ${file.fullFileContent || file.content}
1331
+ \`\`\`
1332
+
1333
+ **GIT DIFF TO REVIEW (critique ONLY these changes):**
1334
+
1335
+ \`\`\`diff
1336
+ ${file.content}
1337
+ \`\`\``
1338
+ : `TEST FILE TO REVIEW:
1339
+ Path: ${file.path}
1340
+ Language: ${file.language}
1341
+
1342
+ \`\`\`${file.language}
1343
+ ${file.content}
1344
+ \`\`\``;
1345
+
1346
+ // Use shared helpers for custom docs and role definition
1347
+ const customDocsSection = formatCustomDocsSection(customDocs);
1348
+ const roleDefinition = buildRoleDefinition(
1349
+ 'You are an expert test code reviewer acting as a senior developer on this specific project.',
1350
+ customDocs,
1351
+ 'test'
1352
+ );
1353
+
1354
+ // Add project architecture context if available
1355
+ let projectArchitectureSection = '';
1356
+ if (context.projectSummary) {
1357
+ projectArchitectureSection = formatProjectSummaryForLLM(context.projectSummary);
1358
+ }
1359
+
1360
+ // Test-specific prompt
1361
+ return finalizePrompt(`
1362
+ ${roleDefinition}
1363
+
1364
+ ${reviewInstructions}
1365
+
1366
+ ${fileSection}
1367
+
1368
+ ## ANALYSIS CONTEXT
1369
+ ${customDocsSection}
1370
+
1371
+ CONTEXT FROM PROJECT:
1372
+ ${projectArchitectureSection}
1373
+
1374
+ CONTEXT A: TESTING GUIDELINES AND BEST PRACTICES
1375
+ ${formattedGuidelines}
1376
+
1377
+ CONTEXT B: SIMILAR TEST EXAMPLES FROM PROJECT
1378
+ ${formattedCodeExamples}
1379
+
1380
+ INSTRUCTIONS:
1381
+
1382
+ ${getCriticalRulesBlock({ importRuleContext: 'test' })}
1383
+
1384
+ **Perform the following test-specific analysis:**
1385
+
1386
+ **STAGE 1: Custom Instructions & Test Coverage Analysis**
1387
+ 1. **FIRST AND MOST IMPORTANT**: If custom instructions were provided at the beginning of this prompt, analyze the test file against those custom instructions BEFORE all other analysis. Custom instructions always take precedence.
1388
+ 2. Analyze test coverage - identify SPECIFIC missing test cases only if you can name the exact scenario that should be tested.
1389
+ 3. Only report coverage gaps where you can provide a concrete test case to add.
1390
+
1391
+ **STAGE 2: Test Quality and Best Practices**
1392
+ 1. Evaluate test naming conventions - report only DEFINITE violations where you can show the correct naming.
1393
+ 2. Analyze test organization - report only if tests are clearly misorganized with a specific fix.
1394
+ 3. Assess assertion quality - report only weak assertions where you can provide a stronger alternative.
1395
+ 4. Review test isolation - report only if you find a DEFINITE side effect issue with a specific fix.
1396
+
1397
+ **STAGE 3: Testing Patterns and Conventions (CRITICAL)**
1398
+ 1. **IMPORTANT**: Carefully analyze ALL code examples in Context B to identify:
1399
+ - Common helper functions or utilities that appear across multiple test files
1400
+ - Consistent patterns in how certain operations are performed (e.g., rendering, mocking, assertions)
1401
+ - Any project-specific abstractions or wrappers around standard testing libraries
1402
+ 2. **CRITICAL**: Compare the reviewed test file against these discovered patterns. Flag ONLY instances where:
1403
+ - The test DEFINITELY uses a direct library call when a project wrapper exists (cite the wrapper)
1404
+ - A common utility is DEFINITELY available but not used (cite where it's defined)
1405
+ - The test CLEARLY deviates from a pattern shown in 3+ examples (cite the examples)
1406
+ 3. Report mocking/stubbing issues only with a specific code fix.
1407
+ 4. Report fixture issues only with a specific code fix showing the correct pattern.
1408
+ 5. Report async handling issues only with specific code showing the correct approach.
1409
+
1410
+ **STAGE 4: Performance and Maintainability**
1411
+ 1. Report slow tests only if you can identify the specific cause and fix.
1412
+ 2. Report code duplication only with a specific refactoring suggestion.
1413
+
1414
+ **STAGE 5: Consolidate and Generate Output**
1415
+ 1. **CRITICAL**: Prioritize issues where the test deviates from implicit project patterns shown in Context B (similar test examples), especially regarding test utilities and helper functions.
1416
+ 2. Provide concrete suggestions that align with the project's testing patterns, referencing specific examples from Context B when applicable.
1417
+ 3. Assess for any potential logic errors or bugs within the reviewed code itself, independent of conventions, and include them as separate issues.
1418
+ 4. **CRITICAL 'lineNumbers' RULE - MANDATORY COMPLIANCE**:
1419
+ - For ANY issue that occurs multiple times in a test file, list ONLY the first 3-5 occurrences maximum
1420
+ - NEVER provide exhaustive lists of line numbers (e.g., [1,2,3,4,5,6,7,8,9,10...])
1421
+ - If an issue affects many lines, use representative examples only
1422
+ - Exhaustive line number lists are considered hallucination and must be avoided
1423
+ - Example: Instead of listing 20+ line numbers, use [15, 23, 47, "...and 12 other occurrences"]
1424
+ 5. Format the output according to the JSON structure below.
1425
+
1426
+ ${getFinalReminderBlock()}
1427
+
1428
+ ${getCitationRequirementBlock()}
1429
+
1430
+ REQUIRED JSON OUTPUT FORMAT:
1431
+
1432
+ **REMINDER: For lineNumbers array, use ONLY 3-5 representative line numbers for repeated issues. NEVER provide exhaustive lists.**
1433
+
1434
+ ${getCodeSuggestionsFormatBlock()}
1435
+
1436
+ You must respond with EXACTLY this JSON structure, with no additional text:
1437
+
1438
+ {
1439
+ "summary": "Brief summary of the test file review, highlighting coverage completeness, adherence to testing best practices, and any critical issues found.",
1440
+ "issues": [
1441
+ {
1442
+ "type": "bug | improvement | convention | performance | coverage",
1443
+ "severity": "critical | high | medium",
1444
+ "description": "Description of the issue, clearly stating the problem with the test implementation or coverage gap.",
1445
+ "lineNumbers": [25, 38],
1446
+ "suggestion": "Concrete suggestion for improving the test, adding missing coverage, or following testing best practices.",
1447
+ "codeSuggestion": {
1448
+ "startLine": 25,
1449
+ "endLine": 27,
1450
+ "oldCode": " expect(result).toBe(true);",
1451
+ "newCode": " expect(result).toBe(true);\n expect(result).not.toBeNull();"
1452
+ }
1453
+ }
1454
+ ]
1455
+ }
1456
+ `);
1457
+ }
1458
+
1459
/**
 * Generate holistic PR analysis prompt for LLM
 *
 * Assembles one prompt covering every file in the PR (git diff plus full file
 * content) together with unified project context: similar code examples,
 * project guidelines, historical review comments, custom documentation, and an
 * optional project architecture summary.
 *
 * @param {Object} context - Holistic context for LLM
 * @param {Object} context.file - Synthetic PR "file"; its `prFiles` array holds the real PR files
 * @param {Array<Object>} context.context - Unified context sections, looked up by `title`
 * @param {*} context.customDocs - Custom documentation forwarded to the docs/role helpers
 * @returns {string} Holistic PR analysis prompt
 */
function generateHolisticPRAnalysisPrompt(context) {
  const { file, context: contextSections, customDocs } = context;

  // Format unified context sections
  // Each section is optional: if the titled section or its items are missing,
  // optional chaining yields undefined and the `||` fallback text is used.
  const formattedCodeExamples =
    contextSections
      .find((s) => s.title === 'Similar Code Examples')
      ?.items?.slice(0, 10)
      .map((ex, idx) => {
        return `
CODE EXAMPLE ${idx + 1} (Similarity: ${ex.similarity?.toFixed(3) || 'N/A'})
Path: ${ex.path}
Language: ${ex.language}

\`\`\`${ex.language || ''}
${ex.content}
\`\`\`
`;
      })
      .join('\n') || 'No relevant code examples found.';

  // Guidelines are capped at 8 entries; each cites its source path and heading.
  const formattedGuidelines =
    contextSections
      .find((s) => s.title === 'Project Guidelines')
      ?.items?.slice(0, 8)
      .map((g, idx) => {
        return `
GUIDELINE ${idx + 1} (Source: ${g.path})
${g.headingText ? `Heading: "${g.headingText}"` : ''}

\`\`\`
${g.content}
\`\`\`
`;
      })
      .join('\n') || 'No specific guidelines found.';

  // Historical comments are capped by the module-level MAX_PR_COMMENTS_FOR_CONTEXT.
  // NOTE(review): assumes `relevanceScore` is always numeric here — a missing
  // score would render "NaN%"; confirm upstream always sets it.
  const formattedPRComments =
    contextSections
      .find((s) => s.title === 'Historical Review Comments')
      ?.items?.slice(0, MAX_PR_COMMENTS_FOR_CONTEXT)
      .map((comment, idx) => {
        return `### Historical Comment ${idx + 1}
- **PR**: #${comment.prNumber} by ${comment.author}
- **File**: ${comment.filePath}
- **Type**: ${comment.commentType || 'review'}
- **Relevance**: ${(comment.relevanceScore * 100).toFixed(1)}%
- **Review**: ${comment.body}

`;
      })
      .join('\n') || 'No historical PR comments found.';

  // Format PR files with their diffs
  // Every PR file contributes both its diff (what to review) and its full
  // content (context only) to the prompt.
  const prFiles = file.prFiles || [];
  const formattedPRFiles = prFiles
    .map((prFile, idx) => {
      return `
## FILE ${idx + 1}: ${prFile.path}
**Language**: ${prFile.language}
**Type**: ${prFile.isTest ? 'Test' : 'Source'} file
**Summary**: ${prFile.summary}

### Changes (Git Diff):
\`\`\`diff
${prFile.diff}
\`\`\`

### Full File Content (For Context):
\`\`\`${prFile.language}
${prFile.fullContent}
\`\`\`
`;
    })
    .join('\n');

  // Use shared helper for custom docs section
  const customDocsSection = formatCustomDocsSection(customDocs);

  // Build the role definition - PR analysis has additional context awareness instructions
  const baseRole = `You are an expert code reviewer performing a holistic review of a Pull Request with ${prFiles.length} files.

**CRITICAL CONTEXT AWARENESS INSTRUCTIONS:**

For each file in this PR, you have access to:
1. **FULL FILE CONTENT** - The complete file for understanding context and existing code
2. **GIT DIFF** - Only the changes to review

**Review Rules:**
- ONLY critique the CHANGED lines shown in each file's diff (lines with + or -)
- USE the full file content to understand context, dependencies, and existing implementations
- DO NOT suggest adding code that already exists in the unchanged portions of any file
- DO NOT flag issues about missing code if it exists elsewhere in the full file
- Before flagging cross-file issues, verify the code doesn't already exist in unchanged portions
- Do NOT flag functions/variables as missing if they exist elsewhere in the full file
- The unchanged code is part of each file - always check it before making assumptions`;

  // Wrap the base role with custom-doc aware instructions for the 'pr' context,
  // then append the cross-file analysis directive.
  let roleDefinition = buildRoleDefinition(baseRole, customDocs, 'pr');
  roleDefinition += '\nAnalyze ALL files together to identify cross-file issues, consistency problems, and overall code quality.';

  // Add project architecture context if available
  let projectArchitectureSection = '';
  if (context.projectSummary) {
    projectArchitectureSection = formatProjectSummaryForLLM(context.projectSummary);
  }

  // Final prompt: overview, unified context, per-file diffs, staged analysis
  // instructions, and the mandatory JSON output schema.
  return finalizePrompt(`
${roleDefinition}

## PULL REQUEST OVERVIEW
- **Total Files**: ${prFiles.length}
- **Source Files**: ${prFiles.filter((f) => !f.isTest).length}
- **Test Files**: ${prFiles.filter((f) => f.isTest).length}

## UNIFIED CONTEXT FROM PROJECT
${projectArchitectureSection}

### PROJECT CODE EXAMPLES
${formattedCodeExamples}

### PROJECT GUIDELINES
${formattedGuidelines}

### HISTORICAL REVIEW COMMENTS
${formattedPRComments}

## PR FILES WITH CHANGES
${formattedPRFiles}

## ANALYSIS CONTEXT
${customDocsSection}

## ANALYSIS INSTRUCTIONS

${getCriticalRulesBlock({ importRuleContext: 'pr' })}

**Perform the following holistic analysis stages sequentially for all PR files:**

### **STAGE 1: Project Pattern Analysis (CRITICAL FOR CONSISTENCY)**

1. **CRITICAL FIRST STEP**: Scan ALL code examples in PROJECT CODE EXAMPLES and create a comprehensive list of:
   - Common import statements (especially those containing 'helper', 'util', 'shared', 'common', 'test')
   - Frequently used function calls that appear across multiple examples
   - Project-specific wrappers or utilities (e.g., \`renderWithTestHelpers\` instead of direct \`render\`)
   - Consistent patterns in how operations are performed
   - Testing patterns and helper functions
   - Component patterns and architectural approaches

2. **IMPORTANT**: For each common utility or pattern you identify, note:
   - Which example files demonstrate it (cite specific examples)
   - What the pattern appears to do
   - Whether ALL PR files are using this pattern consistently

3. **HIGH PRIORITY CROSS-FILE CHECKS**: Flag any instances where:
   - Files use direct library calls when multiple examples use project wrappers
   - Common utility functions available in the project are not being imported/used consistently
   - Files deviate from patterns that appear in 3+ examples
   - Test files don't follow established test helper patterns
   - Import statements are inconsistent across similar files

### **STAGE 2: Custom Instructions & Guideline Compliance Analysis**

1. **FIRST AND MOST IMPORTANT**: If custom instructions were provided at the beginning of this prompt, analyze ALL PR files against those custom instructions BEFORE all other analysis. Custom instructions always take precedence.
2. Analyze ALL PR files strictly against the standards, rules, and explanations in PROJECT GUIDELINES
3. Identify specific deviations where any file violates custom instructions OR explicit guidelines. Note the source for each deviation found.
4. Check for consistency of guideline application across all files
5. Ensure architectural decisions are consistent across the PR

### **STAGE 3: Historical Pattern Recognition**

1. **CRITICAL**: Analyze HISTORICAL REVIEW COMMENTS to identify patterns:
   - Types of issues human reviewers frequently flag in similar code
   - Recurring themes across multiple historical comments
   - High-relevance issues (>70% relevance score) that apply to current PR

2. **Apply Historical Insights to Each File**:
   - Identify DEFINITE issues that match historical patterns across PR files
   - Apply reviewer suggestions that are relevant to current changes
   - Look for patterns that span multiple files in the PR

### **STAGE 4: Cross-File Integration Analysis**

1. **Naming and Import Consistency**:
   - Report naming inconsistencies only with specific examples and fixes
   - Report import/export issues only with specific missing/incorrect imports identified
   - Report duplicated logic only with specific refactoring suggestions

2. **Test Coverage and Quality**:
   - Report missing tests only if you can specify EXACTLY which test case should be added
   - Report test pattern deviations only with specific code fixes
   - Do NOT suggest "adding tests" without specifying the exact test

3. **Architectural Integration**:
   - Report breaking changes only if you can identify the SPECIFIC break
   - Report API inconsistencies only with SPECIFIC mismatches identified
   - Report separation of concerns issues only with SPECIFIC refactoring suggestions

### **STAGE 5: Consolidate and Prioritize Issues**

1. **Apply Conflict Resolution Rules**:
   - **Guideline Precedence**: If pattern-based or historical insights contradict explicit guidelines, guidelines take precedence
   - **Cross-File Priority**: Issues affecting multiple files get higher priority
   - **Pattern Consistency**: Missing project-specific utilities/helpers are high priority if pattern appears in 3+ examples

2. **Citation Rules**:
   - For guideline violations: cite the specific guideline document
   - For pattern deviations: cite specific code examples that demonstrate the correct pattern
   - For historical issues: report as standard findings without referencing historical source
   - For cross-file issues: specify all affected files

3. **CRITICAL OUTPUT FILTER - Apply before reporting ANY issue**:
   - **Only report issues where you have a DEFINITE problem AND a SPECIFIC code fix**
   - **Do NOT report issues that require the developer to "verify", "ensure", or "check" something**
   - **Do NOT report issues where you are uncertain** - if you find yourself writing "may", "might", "could", or "consider", do not report it
   - **Do NOT suggest adding comments or documentation**

4. Assess for DEFINITE logic errors or bugs only - do not report speculative issues.
5. DO NOT check if any file referenced in a import statement, is missing.
6. **CRITICAL 'lineNumbers' RULE - MANDATORY COMPLIANCE**:
   - For ANY issue that occurs multiple times in a file, list ONLY the first 3-5 occurrences maximum
   - NEVER provide exhaustive lists of line numbers (e.g., [1,2,3,4,5,6,7,8,9,10...])
   - If an issue affects many lines, use representative examples only
   - Exhaustive line number lists are considered hallucination and must be avoided
   - Example: Instead of listing 20+ line numbers, use [15, 23, 47, "...and 12 other occurrences"]

${getFinalReminderBlock()}

${getCitationRequirementBlock()}

REQUIRED JSON OUTPUT FORMAT:

**REMINDER: For lineNumbers array, use ONLY 3-5 representative line numbers for repeated issues. NEVER provide exhaustive lists.**

${getCodeSuggestionsFormatBlock()}

You must respond with EXACTLY this JSON structure, with no additional text:

{
  "summary": "Brief, high-level summary of the entire PR review...",
  "crossFileIssues": [
    {
      "type": "bug | improvement | convention | architecture",
      "severity": "critical | high | medium",
      "description": "Detailed description of an issue that spans multiple files...",
      "suggestion": "Actionable suggestion to resolve the cross-file issue.",
      "filesInvolved": ["path/to/file1.js", "path/to/file2.ts"]
    }
  ],
  "fileSpecificIssues": {
    "path/to/file1.js": [
      {
        "type": "bug | improvement | convention | performance | security",
        "severity": "critical | high | medium",
        "description": "Description of the issue specific to this file.",
        "lineNumbers": [10, 15],
        "suggestion": "Concrete suggestion for fixing the issue in this file.",
        "codeSuggestion": {
          "startLine": 10,
          "endLine": 15,
          "oldCode": "  const result = data.map(item => item.value);",
          "newCode": "  const result = data?.map(item => item?.value) ?? [];"
        }
      }
    ]
  },
  "recommendations": [
    {
      "type": "refactoring | testing | documentation",
      "description": "A high-level recommendation for improving the codebase...",
      "filesInvolved": ["path/to/relevant/file.js"]
    }
  ]
}
`);
}
1740
+
1741
/**
 * Parse LLM analysis response
 *
 * Normalizes the structured tool-calling output into one of two shapes:
 * a holistic review (crossFileIssues / fileSpecificIssues / recommendations)
 * or a single-file review (issues). When no JSON payload is present, returns
 * an error-shaped object instead of throwing.
 *
 * @param {Object} rawResponse - Raw LLM response; structured JSON is expected on `.json`
 * @returns {Object} Parsed analysis response (always includes `summary` and `rawResponse`)
 */
function parseAnalysisResponse(rawResponse) {
  // rawResponse is now the full LLM response object with structured JSON from tool calling.
  // Guard with optional chaining so a null/undefined response falls into the
  // error branch below instead of throwing a TypeError.
  const parsedResponse = rawResponse?.json;

  if (!parsedResponse) {
    return {
      summary: 'Error parsing LLM response',
      issues: [],
      crossFileIssues: [],
      fileSpecificIssues: {},
      recommendations: [],
      rawResponse,
      parseError: 'Failed to parse JSON from LLM response',
    };
  }

  // Check for holistic review structure, which contains fileSpecificIssues
  if (parsedResponse.fileSpecificIssues || parsedResponse.crossFileIssues || parsedResponse.recommendations) {
    return {
      summary: parsedResponse.summary || 'No summary provided',
      crossFileIssues: parsedResponse.crossFileIssues || [],
      fileSpecificIssues: parsedResponse.fileSpecificIssues || {},
      recommendations: parsedResponse.recommendations || [],
      rawResponse,
    };
  }

  // Fallback to single-file review structure
  return {
    summary: parsedResponse.summary || 'No summary provided',
    issues: parsedResponse.issues || [],
    rawResponse,
  };
}
1781
+
1782
/**
 * Get PR comment context for historical analysis integration
 *
 * Reads the target file, runs a hybrid semantic search over stored PR review
 * comments, and returns the relevant comments plus extracted keyword patterns
 * and a human-readable summary. Never throws: all failures produce a
 * `{ success: false, ... }` result.
 *
 * @param {string} filePath - Path to the file being analyzed
 * @param {Object} options - Options for context retrieval
 * @param {number} [options.maxComments=20] - Maximum comments to return
 * @param {number} [options.similarityThreshold=0.15] - Similarity threshold (logged for diagnostics)
 * @param {string} [options.projectPath=process.cwd()] - Project root used for the search
 * @param {*} [options.precomputedQueryEmbedding=null] - Pre-computed query embedding to reuse
 * @returns {Promise<Object>} Historical PR comment context
 */
async function getPRCommentContext(filePath, options = {}) {
  try {
    const { maxComments = 20, similarityThreshold = 0.15, projectPath = process.cwd(), precomputedQueryEmbedding = null } = options;

    // Normalize file path for comparison
    const normalizedPath = path.normalize(filePath);
    const fileName = path.basename(normalizedPath);

    debug(`[getPRCommentContext] Getting context for ${normalizedPath}`);

    if (precomputedQueryEmbedding) {
      console.log(chalk.blue(`🔍 Using pre-computed query embedding for PR comment search`));
    }

    // Read the file once. Previously this try/catch (read + identical error
    // return) was duplicated verbatim in both the pre-computed-embedding and
    // fallback branches; the content is needed by the search function either way.
    let fileContent = '';
    try {
      fileContent = fs.readFileSync(filePath, 'utf8');
    } catch (readError) {
      debug(`[getPRCommentContext] Could not read file ${filePath}: ${readError.message}`);
      return {
        success: false,
        hasContext: false,
        error: `Could not read file: ${readError.message}`,
        comments: [],
        summary: 'Failed to read file for context analysis',
      };
    }

    // Truncate content for embedding if too long
    const maxEmbeddingLength = 8000; // Reasonable limit for embedding
    const contentForSearch = fileContent.length > maxEmbeddingLength ? fileContent.substring(0, maxEmbeddingLength) : fileContent;

    // Detect if this is a test file using existing utility
    const isTest = isTestFile(filePath);

    // Use semantic search to find similar PR comments
    let relevantComments = [];

    console.log(chalk.blue(`🔍 Searching for PR comments with:`));

    console.log(chalk.gray(` Project Path: ${projectPath}`));
    console.log(chalk.gray(` File: ${fileName}`));
    console.log(chalk.gray(` Similarity Threshold: ${similarityThreshold}`));
    console.log(chalk.gray(` Content Length: ${contentForSearch.length} chars`));
    console.log(chalk.gray(` Using Pre-computed Embedding: ${precomputedQueryEmbedding ? 'Yes' : 'No'}`));

    try {
      console.log(chalk.blue(`🔍 Attempting hybrid search with chunking...`));
      relevantComments = await findRelevantPRComments(contentForSearch, {
        projectPath,
        limit: maxComments,
        isTestFile: isTest, // Pass test file context for filtering
        precomputedQueryEmbedding, // Pass pre-computed embedding if available
      });
      console.log(chalk.green(`✅ Hybrid search returned ${relevantComments.length} comments`));
      if (relevantComments.length > 0) {
        console.log(chalk.blue(`Top comment similarities:`));
        relevantComments.slice(0, 3).forEach((comment, idx) => {
          console.log(
            chalk.gray(` ${idx + 1}. Score: ${comment.similarity_score?.toFixed(3)} - ${comment.comment_text?.substring(0, 80)}...`)
          );
        });
      }
    } catch (dbError) {
      console.log(chalk.yellow(`⚠️ Hybrid search failed: ${dbError.message}`));
      debug(`[getPRCommentContext] Hybrid search failed: ${dbError.message}`);
      // No fallback needed - if hybrid search fails, we just return empty results
      relevantComments = [];
    }

    console.log('Total relevant comments number:', relevantComments.length);

    // Extract patterns and insights
    const patterns = extractCommentPatterns(relevantComments);
    const summary = generateContextSummary(relevantComments, patterns);

    debug(`[getPRCommentContext] Found ${relevantComments.length} relevant comments for ${normalizedPath}`);

    return {
      success: true,
      hasContext: relevantComments.length > 0,
      filePath: normalizedPath,
      comments: relevantComments.map(formatCommentForContext),
      patterns,
      summary,
      metadata: {
        totalCommentsFound: relevantComments.length,
        relevantCommentsReturned: relevantComments.length,
        averageRelevanceScore:
          relevantComments.length > 0 ? relevantComments.reduce((sum, c) => sum + c.similarity_score, 0) / relevantComments.length : 0,
        // NOTE(review): 0.5 appears to be the sentinel score of the file-path
        // fallback search — confirm against findRelevantPRComments.
        searchMethod:
          relevantComments.length > 0 && relevantComments[0].similarity_score !== 0.5 ? 'semantic_embedding' : 'file_path_fallback',
      },
    };
  } catch (error) {
    debug(`[getPRCommentContext] Error getting PR comment context: ${error.message}`);
    return {
      success: false,
      hasContext: false,
      error: error.message,
      comments: [],
      summary: 'Failed to retrieve historical context',
    };
  }
}
1913
+
1914
/**
 * Extract patterns from historical comments
 *
 * Scans the combined text of all comments for keyword groups and reports
 * which keywords appear, bucketed by category.
 *
 * @param {Array<Object>} comments - Historical PR comments; raw DB rows carry
 *   the text as `comment_text`, formatted comments carry it as `body`
 * @returns {Object} `{ commonIssues, reviewPatterns, technicalConcerns, suggestedImprovements }`
 */
function extractCommentPatterns(comments) {
  const patterns = {
    commonIssues: [],
    reviewPatterns: [],
    technicalConcerns: [],
    suggestedImprovements: [],
  };

  // Analyze comment content for patterns.
  // Bug fix: raw search results expose the text as `comment_text` (see the
  // search logging and formatCommentForContext), so reading only `body`
  // produced an empty corpus and no patterns for those rows.
  const allText = comments
    .map((c) => c.comment_text || c.body || '')
    .join(' ')
    .toLowerCase();

  // Common issue keywords
  const issueKeywords = ['bug', 'error', 'issue', 'problem', 'broken', 'fail'];
  patterns.commonIssues = issueKeywords.filter((keyword) => allText.includes(keyword));

  // Review pattern keywords
  const reviewKeywords = ['suggest', 'recommend', 'consider', 'improve', 'better'];
  patterns.reviewPatterns = reviewKeywords.filter((keyword) => allText.includes(keyword));

  // Technical concern keywords
  const techKeywords = ['performance', 'security', 'memory', 'optimization', 'scalability'];
  patterns.technicalConcerns = techKeywords.filter((keyword) => allText.includes(keyword));

  return patterns;
}
1945
+
1946
/**
 * Generate summary of historical context
 *
 * Builds a short paragraph describing how many relevant comments were found,
 * which keyword categories appeared, and how many comments are recent.
 *
 * @param {Array<Object>} comments - Relevant historical comments (each with `created_at`)
 * @param {Object} patterns - Keyword-pattern buckets from extractCommentPatterns
 * @returns {string} One-paragraph summary
 */
function generateContextSummary(comments, patterns) {
  // Nothing to summarize without historical data.
  if (comments.length === 0) {
    return 'No relevant historical comments found for this file.';
  }

  const sentences = [`Found ${comments.length} relevant historical comments.`];

  if (patterns.commonIssues.length > 0) {
    sentences.push(`Common issues mentioned: ${patterns.commonIssues.join(', ')}.`);
  }

  if (patterns.reviewPatterns.length > 0) {
    sentences.push(`Review suggestions often involve: ${patterns.reviewPatterns.join(', ')}.`);
  }

  if (patterns.technicalConcerns.length > 0) {
    sentences.push(`Technical concerns raised: ${patterns.technicalConcerns.join(', ')}.`);
  }

  // Count comments created within the last 30 days.
  const msPerDay = 1000 * 60 * 60 * 24;
  const now = Date.now();
  const recentCount = comments.filter((c) => (now - new Date(c.created_at).getTime()) / msPerDay <= 30).length;

  if (recentCount > 0) {
    sentences.push(`${recentCount} comments from the last 30 days.`);
  }

  return sentences.join(' ');
}
1980
+
1981
/**
 * Format comment for context usage
 *
 * Maps a raw comment row (or an already-formatted comment) onto the
 * camelCase shape used in LLM context, truncating the body to 500 chars.
 * Handles both DB field names (`comment_text`, `similarity_score`, ...) and
 * formatted field names (`body`, `relevanceScore`, ...).
 *
 * @param {Object} comment - Comment in either raw-row or formatted shape
 * @returns {Object} Normalized comment for context
 */
function formatCommentForContext(comment) {
  const {
    id,
    author,
    author_login: authorLogin,
    comment_text: commentText,
    body,
    created_at: createdAt,
    comment_type: commentType,
    file_path: filePath,
    pr_number: prNumber,
    pr_title: prTitle,
    similarity_score: similarityScore,
    relevanceScore,
  } = comment;

  // Prefer the raw-row field names, falling back to the formatted ones.
  const rawBody = commentText || body || '';

  return {
    id,
    author: author || authorLogin,
    body: rawBody.substring(0, 500), // truncate to keep prompt size bounded
    createdAt,
    commentType,
    filePath,
    prNumber,
    prTitle,
    relevanceScore: similarityScore || relevanceScore,
  };
}
1997
+
1998
+ /**
1999
+ * Perform holistic PR analysis using unified context
2000
+ * @param {Object} options - Analysis options including prFiles and unifiedContext
2001
+ * @returns {Promise<Object>} Holistic analysis results
2002
+ */
2003
+ async function performHolisticPRAnalysis(options) {
2004
+ try {
2005
+ const { prFiles, unifiedContext, customDocs } = options;
2006
+
2007
+ console.log(chalk.blue(`🔍 Performing holistic analysis of ${prFiles.length} files with unified context...`));
2008
+
2009
+ // Retrieve project architecture summary
2010
+ console.log(chalk.blue('--- Retrieving Project Architecture Context for Holistic PR Review ---'));
2011
+ const projectPath = options.projectPath || process.cwd();
2012
+ const projectSummary = await getProjectSummary(projectPath);
2013
+
2014
+ // Create a synthetic file context for holistic analysis
2015
+ const holisticContext = {
2016
+ file: {
2017
+ path: 'PR_HOLISTIC_REVIEW',
2018
+ name: 'Pull Request',
2019
+ directory: '.',
2020
+ directoryName: '.',
2021
+ language: 'diff',
2022
+ content: prFiles.map((f) => f.diff).join('\n\n'),
2023
+ reviewType: 'PR HOLISTIC REVIEW',
2024
+ isDiffReview: true,
2025
+ prFiles: prFiles, // Add all PR files for context
2026
+ },
2027
+ context: [
2028
+ {
2029
+ title: 'Similar Code Examples',
2030
+ description: 'Code patterns from the project that are similar to the files being reviewed',
2031
+ items: unifiedContext.codeExamples.slice(0, 10),
2032
+ },
2033
+ {
2034
+ title: 'Project Guidelines',
2035
+ description: 'Documentation and guidelines relevant to this code',
2036
+ items: unifiedContext.guidelines.slice(0, 8),
2037
+ },
2038
+ {
2039
+ title: 'Historical Review Comments',
2040
+ description: 'Similar code patterns and issues identified by human reviewers in past PRs',
2041
+ items: unifiedContext.prComments.slice(0, 10),
2042
+ },
2043
+ ],
2044
+ customDocs: unifiedContext.customDocChunks || options.relevantCustomDocChunks || customDocs, // Use unified chunks first, then relevant chunks, then full docs
2045
+ projectSummary: projectSummary, // Add project architecture summary
2046
+ metadata: {
2047
+ hasCodeExamples: unifiedContext.codeExamples.length > 0,
2048
+ hasGuidelines: unifiedContext.guidelines.length > 0,
2049
+ hasPRHistory: unifiedContext.prComments.length > 0,
2050
+ hasProjectSummary: !!projectSummary,
2051
+ analysisTimestamp: new Date().toISOString(),
2052
+ reviewType: 'PR HOLISTIC REVIEW',
2053
+ isPRReview: true,
2054
+ isHolisticReview: true,
2055
+ },
2056
+ options: options,
2057
+ };
2058
+
2059
+ // Add verbose debug logging similar to individual file reviews
2060
+ console.log(chalk.magenta('--- Holistic PR Review: Guidelines Sent to LLM ---'));
2061
+ if (unifiedContext.guidelines.length > 0) {
2062
+ unifiedContext.guidelines.slice(0, 10).forEach((g, i) => {
2063
+ console.log(
2064
+ chalk.magenta(
2065
+ ` [${i + 1}] Path: ${g.path} ${g.headingText || g.heading_text ? `(Heading: "${g.headingText || g.heading_text}")` : ''}`
2066
+ )
2067
+ );
2068
+ console.log(chalk.gray(` Content: ${g.content.substring(0, 100).replace(/\n/g, ' ')}...`));
2069
+ });
2070
+ } else {
2071
+ console.log(chalk.magenta(' (None)'));
2072
+ }
2073
+
2074
+ console.log(chalk.magenta('--- Holistic PR Review: Code Examples Sent to LLM ---'));
2075
+ if (unifiedContext.codeExamples.length > 0) {
2076
+ unifiedContext.codeExamples.slice(0, 10).forEach((ex, i) => {
2077
+ console.log(chalk.magenta(` [${i + 1}] Path: ${ex.path} (Similarity: ${ex.similarity?.toFixed(3) || 'N/A'})`));
2078
+ console.log(chalk.gray(` Content: ${ex.content.substring(0, 100).replace(/\\n/g, ' ')}...`));
2079
+ });
2080
+ } else {
2081
+ console.log(chalk.magenta(' (None)'));
2082
+ }
2083
+
2084
+ console.log(chalk.magenta('--- Holistic PR Review: Top Historic Comments Sent to LLM ---'));
2085
+ if (unifiedContext.prComments.length > 0) {
2086
+ unifiedContext.prComments.slice(0, 5).forEach((comment, i) => {
2087
+ console.log(
2088
+ chalk.magenta(
2089
+ ` [${i + 1}] PR #${comment.prNumber} by ${comment.author} (Relevance: ${(comment.relevanceScore * 100).toFixed(1)}%)`
2090
+ )
2091
+ );
2092
+ console.log(chalk.gray(` File: ${comment.filePath}`));
2093
+ console.log(chalk.gray(` Comment: ${comment.body.substring(0, 100).replace(/\n/g, ' ')}...`));
2094
+ });
2095
+ } else {
2096
+ console.log(chalk.magenta(' (None)'));
2097
+ }
2098
+
2099
+ console.log(chalk.magenta('--- Holistic PR Review: Custom Document Chunks Sent to LLM ---'));
2100
+ if (unifiedContext.customDocChunks && unifiedContext.customDocChunks.length > 0) {
2101
+ unifiedContext.customDocChunks.forEach((chunk, i) => {
2102
+ console.log(chalk.magenta(` [${i + 1}] Document: "${chunk.document_title}" (Chunk ${chunk.chunk_index + 1})`));
2103
+ console.log(chalk.gray(` Similarity: ${chunk.similarity?.toFixed(3) || 'N/A'}`));
2104
+ console.log(chalk.gray(` Content: ${chunk.content.substring(0, 100).replace(/\n/g, ' ')}...`));
2105
+ });
2106
+ } else {
2107
+ console.log(chalk.magenta(' (None)'));
2108
+ }
2109
+ console.log(chalk.magenta('--- Sending Holistic PR Analysis Prompt to LLM ---'));
2110
+
2111
+ // Call the centralized analysis function
2112
+ const parsedResponse = await callLLMForAnalysis(holisticContext, {
2113
+ ...options,
2114
+ isHolisticPRReview: true,
2115
+ });
2116
+
2117
+ // Debug logging
2118
+ console.log(chalk.blue(`🐛 Holistic analysis parsed response:`));
2119
+ console.log(chalk.gray(`Summary: ${parsedResponse.summary?.substring(0, 100)}...`));
2120
+ console.log(chalk.gray(`Cross-file issues: ${parsedResponse.crossFileIssues?.length || 0}`));
2121
+ console.log(chalk.gray(`File-specific issues keys: ${Object.keys(parsedResponse.fileSpecificIssues || {}).join(', ')}`));
2122
+ console.log(chalk.gray(`Recommendations: ${parsedResponse.recommendations?.length || 0}`));
2123
+
2124
+ // Filter out low severity issues (formatting/style concerns handled by linters)
2125
+ // Note: The LLM prompt instructs not to generate low severity issues, but this filter
2126
+ // serves as a safety net in case any slip through despite the prompt instructions
2127
+ const filteredResponse = filterLowSeverityIssues(parsedResponse, { verbose: options.verbose });
2128
+
2129
+ return {
2130
+ success: true,
2131
+ filePath: 'PR_HOLISTIC_REVIEW',
2132
+ language: 'diff',
2133
+ results: {
2134
+ summary: filteredResponse.summary || 'Holistic PR review completed',
2135
+ crossFileIssues: filteredResponse.crossFileIssues || [],
2136
+ fileSpecificIssues: filteredResponse.fileSpecificIssues || {},
2137
+ recommendations: filteredResponse.recommendations || [],
2138
+ },
2139
+ context: {
2140
+ codeExamples: unifiedContext.codeExamples.length,
2141
+ guidelines: unifiedContext.guidelines.length,
2142
+ prComments: unifiedContext.prComments.length,
2143
+ },
2144
+ metadata: {
2145
+ analysisTimestamp: new Date().toISOString(),
2146
+ featuresUsed: {
2147
+ codeExamples: unifiedContext.codeExamples.length > 0,
2148
+ guidelines: unifiedContext.guidelines.length > 0,
2149
+ prHistory: unifiedContext.prComments.length > 0,
2150
+ },
2151
+ },
2152
+ };
2153
+ } catch (error) {
2154
+ console.error(chalk.red(`Error in holistic PR analysis: ${error.message}`));
2155
+ return {
2156
+ success: false,
2157
+ error: error.message,
2158
+ filePath: 'PR_HOLISTIC_REVIEW',
2159
+ };
2160
+ }
2161
+ }
2162
+
2163
+ /**
2164
+ * NEW: Gathers all context for a single file.
2165
+ * This encapsulates the logic for finding docs, code, and PR comments.
2166
+ * @param {string} filePath - Path to the file to get context for.
2167
+ * @param {string} content - The content of the file (or diff).
2168
+ * @param {Object} options - Analysis options.
2169
+ * @returns {Promise<Object>} An object containing the gathered context.
2170
+ */
2171
+ async function getContextForFile(filePath, content, options = {}) {
2172
+ const RELEVANT_CHUNK_THRESHOLD = 0.1;
2173
+ const W_H1_SIM = 0.2;
2174
+ const W_DOC_CONTEXT_MATCH = 0.6;
2175
+ const GENERIC_DOC_PENALTY_FACTOR = 0.7;
2176
+ const GUIDELINE_CANDIDATE_LIMIT = 100;
2177
+ const CODE_EXAMPLE_LIMIT = 40;
2178
+ const MAX_FINAL_EXAMPLES = 8;
2179
+
2180
+ // --- Stage 0: Initialize Tables (ONE-TIME SETUP) ---
2181
+ // Note: This may be called concurrently. `initializeTables` should be idempotent.
2182
+ try {
2183
+ await embeddingsSystem.initialize();
2184
+ } catch (initError) {
2185
+ console.warn(chalk.yellow(`Database initialization warning: ${initError.message}`));
2186
+ }
2187
+
2188
+ const projectPath = options.projectPath || (options.directory ? path.resolve(options.directory) : null) || process.cwd();
2189
+ const language = detectLanguageFromExtension(path.extname(filePath).toLowerCase());
2190
+ const fileTypeInfo = detectFileType(filePath, content);
2191
+ const isTestFile = fileTypeInfo.isTest;
2192
+
2193
+ const reviewedSnippetContext = inferContextFromCodeContent(content, language);
2194
+ debug('[getContextForFile] Reviewed Snippet Context:', reviewedSnippetContext);
2195
+
2196
+ let analyzedFileEmbedding = null;
2197
+ let fileContentQueryEmbedding = null;
2198
+ let guidelineQueryEmbedding = null;
2199
+
2200
+ if (content.trim().length > 0) {
2201
+ analyzedFileEmbedding = await embeddingsSystem.calculateEmbedding(content.substring(0, MAX_EMBEDDING_CONTENT_LENGTH));
2202
+ const queryContent = isTestFile ? `${content}\\n// Looking for similar test files and testing patterns` : content;
2203
+ fileContentQueryEmbedding = await embeddingsSystem.calculateQueryEmbedding(queryContent);
2204
+ }
2205
+
2206
+ const guidelineQuery = isTestFile
2207
+ ? createTestGuidelineQueryForLLMRetrieval(content, reviewedSnippetContext, language)
2208
+ : createGuidelineQueryForLLMRetrieval(content, reviewedSnippetContext, language);
2209
+
2210
+ if (guidelineQuery && guidelineQuery.trim().length > 0) {
2211
+ guidelineQueryEmbedding = await embeddingsSystem.calculateQueryEmbedding(guidelineQuery);
2212
+ }
2213
+
2214
+ console.log(chalk.blue('� Starting parallel context retrieval...'));
2215
+ // Helper function to process custom documents in parallel (with caching)
2216
+ const processCustomDocuments = async () => {
2217
+ // Check if preprocessed chunks are available (from PR-level processing)
2218
+ if (options.preprocessedCustomDocChunks && options.preprocessedCustomDocChunks.length > 0) {
2219
+ console.log(chalk.blue(`📄 Using preprocessed custom document chunks (${options.preprocessedCustomDocChunks.length} available)`));
2220
+
2221
+ // Use the guideline query for finding relevant custom document chunks
2222
+ const relevantChunks = await embeddingsSystem.findRelevantCustomDocChunks(guidelineQuery, options.preprocessedCustomDocChunks, {
2223
+ limit: 5,
2224
+ similarityThreshold: 0.3,
2225
+ queryContextForReranking: reviewedSnippetContext,
2226
+ useReranking: true,
2227
+ precomputedQueryEmbedding: guidelineQueryEmbedding,
2228
+ queryFilePath: filePath,
2229
+ });
2230
+
2231
+ console.log(chalk.green(`📄 Found ${relevantChunks.length} relevant custom document chunks`));
2232
+
2233
+ // Log which chunks made the cut
2234
+ if (relevantChunks.length > 0) {
2235
+ console.log(chalk.cyan('📋 Custom Document Chunks Selected:'));
2236
+ relevantChunks.forEach((chunk, i) => {
2237
+ console.log(chalk.cyan(` [${i + 1}] "${chunk.document_title}" (Chunk ${chunk.chunk_index + 1})`));
2238
+ console.log(chalk.gray(` Similarity: ${chunk.similarity?.toFixed(3) || 'N/A'}`));
2239
+ console.log(chalk.gray(` Content: ${chunk.content.substring(0, 80).replace(/\n/g, ' ')}...`));
2240
+ });
2241
+ }
2242
+
2243
+ return relevantChunks;
2244
+ }
2245
+
2246
+ // Fallback to original processing if no preprocessed chunks available
2247
+ if (!options.customDocs || options.customDocs.length === 0) {
2248
+ return [];
2249
+ }
2250
+
2251
+ try {
2252
+ console.log(chalk.blue('📄 Processing custom documents for context...'));
2253
+
2254
+ // Check if custom documents are already processed for this project
2255
+ let processedChunks = await checkExistingCustomDocumentChunks(projectPath);
2256
+
2257
+ if (!processedChunks || processedChunks.length === 0) {
2258
+ console.log(chalk.cyan('📄 Custom documents not yet processed for this project, processing now...'));
2259
+ // Process custom documents into chunks (only if not already processed)
2260
+ processedChunks = await embeddingsSystem.processCustomDocumentsInMemory(options.customDocs, projectPath);
2261
+ } else {
2262
+ console.log(chalk.green(`📄 Reusing ${processedChunks.length} already processed custom document chunks`));
2263
+ }
2264
+
2265
+ if (processedChunks.length > 0) {
2266
+ // Use the guideline query for finding relevant custom document chunks
2267
+ const relevantChunks = await embeddingsSystem.findRelevantCustomDocChunks(guidelineQuery, processedChunks, {
2268
+ limit: 5,
2269
+ similarityThreshold: 0.3,
2270
+ queryContextForReranking: reviewedSnippetContext,
2271
+ useReranking: true,
2272
+ precomputedQueryEmbedding: guidelineQueryEmbedding,
2273
+ queryFilePath: filePath,
2274
+ });
2275
+
2276
+ console.log(chalk.green(`📄 Found ${relevantChunks.length} relevant custom document chunks`));
2277
+
2278
+ // Log which chunks made the cut
2279
+ if (relevantChunks.length > 0) {
2280
+ console.log(chalk.cyan('📋 Custom Document Chunks Selected:'));
2281
+ relevantChunks.forEach((chunk, i) => {
2282
+ console.log(chalk.cyan(` [${i + 1}] "${chunk.document_title}" (Chunk ${chunk.chunk_index + 1})`));
2283
+ console.log(chalk.gray(` Similarity: ${chunk.similarity?.toFixed(3) || 'N/A'}`));
2284
+ console.log(chalk.gray(` Content: ${chunk.content.substring(0, 80).replace(/\n/g, ' ')}...`));
2285
+ });
2286
+ }
2287
+
2288
+ return relevantChunks;
2289
+ }
2290
+ } catch (error) {
2291
+ console.error(chalk.red(`Error processing custom documents: ${error.message}`));
2292
+ }
2293
+
2294
+ return [];
2295
+ };
2296
+
2297
+ // Helper function to check if custom documents are already processed
2298
+ const checkExistingCustomDocumentChunks = async (projectPath) => {
2299
+ try {
2300
+ // Use the statically imported function
2301
+ return await embeddingsSystem.getExistingCustomDocumentChunks(projectPath);
2302
+ } catch {
2303
+ console.log(chalk.gray('No existing custom document chunks found, will process from scratch'));
2304
+ return [];
2305
+ }
2306
+ };
2307
+
2308
+ const [prContextResult, guidelineCandidates, codeExampleCandidates, relevantCustomDocChunks] = await Promise.all([
2309
+ getPRCommentContext(filePath, {
2310
+ ...options,
2311
+ projectPath,
2312
+ precomputedQueryEmbedding: fileContentQueryEmbedding,
2313
+ maxComments: MAX_PR_COMMENTS_FOR_CONTEXT,
2314
+ similarityThreshold: options.prSimilarityThreshold || 0.3,
2315
+ timeout: options.prTimeout || 300000,
2316
+ repository: options.repository || null,
2317
+ }),
2318
+ embeddingsSystem.findRelevantDocs(guidelineQuery, {
2319
+ ...options,
2320
+ projectPath,
2321
+ precomputedQueryEmbedding: guidelineQueryEmbedding,
2322
+ limit: GUIDELINE_CANDIDATE_LIMIT,
2323
+ similarityThreshold: 0.05,
2324
+ useReranking: true,
2325
+ queryContextForReranking: reviewedSnippetContext,
2326
+ }),
2327
+ embeddingsSystem.findSimilarCode(isTestFile ? `${content}\\n// Looking for similar test files and testing patterns` : content, {
2328
+ ...options,
2329
+ projectPath,
2330
+ isTestFile,
2331
+ precomputedQueryEmbedding: fileContentQueryEmbedding,
2332
+ limit: CODE_EXAMPLE_LIMIT,
2333
+ similarityThreshold: 0.3,
2334
+ queryFilePath: filePath,
2335
+ includeProjectStructure: false,
2336
+ }),
2337
+ processCustomDocuments(), // Add custom document processing as 4th parallel operation
2338
+ ]).catch((error) => {
2339
+ console.warn(chalk.yellow(`Parallel context retrieval failed: ${error.message}`));
2340
+ return [[], [], [], []];
2341
+ });
2342
+
2343
+ const prCommentContext = prContextResult?.comments || [];
2344
+ const prContextAvailable = prCommentContext.length > 0;
2345
+ console.log(chalk.green(`✅ Found ${prCommentContext.length} relevant PR comments`));
2346
+
2347
+ const documentChunks = Array.isArray(guidelineCandidates) ? guidelineCandidates.filter((c) => c.type === 'documentation-chunk') : [];
2348
+ const chunksByDocument = new Map();
2349
+ for (const chunk of documentChunks) {
2350
+ if (!chunksByDocument.has(chunk.path)) {
2351
+ chunksByDocument.set(chunk.path, []);
2352
+ }
2353
+ chunksByDocument.get(chunk.path).push(chunk);
2354
+ }
2355
+
2356
+ const scoredDocuments = [];
2357
+
2358
+ for (const [docPath, docChunks] of chunksByDocument.entries()) {
2359
+ const docH1 = docChunks[0]?.document_title || path.basename(docPath, path.extname(docPath));
2360
+
2361
+ // FAST-PATH OPTIMIZATION: Use shared utility for generic documents
2362
+ let candidateDocFullContext;
2363
+ if (isGenericDocument(docPath, docH1)) {
2364
+ candidateDocFullContext = getGenericDocumentContext(docPath, docH1);
2365
+ debug(`[FAST-PATH] Using pre-computed context for generic document in RAG: ${docPath}`);
2366
+ } else {
2367
+ candidateDocFullContext = await inferContextFromDocumentContent(docPath, docH1, docChunks, language);
2368
+ }
2369
+ const relevantChunksForDoc = docChunks.filter((c) => c.similarity >= RELEVANT_CHUNK_THRESHOLD);
2370
+ if (relevantChunksForDoc.length === 0) continue;
2371
+
2372
+ const maxChunkScoreInDoc = Math.max(...relevantChunksForDoc.map((c) => c.similarity));
2373
+ const avgChunkScoreInDoc = relevantChunksForDoc.reduce((sum, c) => sum + c.similarity, 0) / relevantChunksForDoc.length;
2374
+ const numRelevantChunks = relevantChunksForDoc.length;
2375
+ const semanticQualityScore = maxChunkScoreInDoc * 0.5 + avgChunkScoreInDoc * 0.3 + Math.min(numRelevantChunks, 5) * 0.04;
2376
+
2377
+ let docLevelContextMatchScore = 0;
2378
+ if (
2379
+ reviewedSnippetContext.area !== 'Unknown' &&
2380
+ candidateDocFullContext.area !== 'Unknown' &&
2381
+ candidateDocFullContext.area !== 'General'
2382
+ ) {
2383
+ if (reviewedSnippetContext.area === candidateDocFullContext.area) {
2384
+ docLevelContextMatchScore += 0.8;
2385
+ for (const tech of reviewedSnippetContext.dominantTech) {
2386
+ if (candidateDocFullContext.dominantTech.map((t) => t.toLowerCase()).includes(tech.toLowerCase())) {
2387
+ docLevelContextMatchScore += 0.2;
2388
+ break;
2389
+ }
2390
+ }
2391
+ } else if (reviewedSnippetContext.area !== 'GeneralJS_TS') {
2392
+ docLevelContextMatchScore -= 0.2;
2393
+ }
2394
+ }
2395
+
2396
+ let docH1RelevanceToReviewedFile = 0;
2397
+ if (docH1 && analyzedFileEmbedding) {
2398
+ const docH1Embedding = await embeddingsSystem.calculateEmbedding(docH1);
2399
+ if (docH1Embedding) {
2400
+ docH1RelevanceToReviewedFile = calculateCosineSimilarity(analyzedFileEmbedding, docH1Embedding);
2401
+ }
2402
+ }
2403
+
2404
+ const isGenericByName = isGenericDocument(docPath, docH1);
2405
+ let genericDocPenaltyFactor = 1.0;
2406
+ if (candidateDocFullContext.isGeneralPurposeReadmeStyle || isGenericByName) {
2407
+ if (reviewedSnippetContext.area !== 'DevOps' && (docLevelContextMatchScore < 0.8 || isGenericByName)) {
2408
+ genericDocPenaltyFactor = GENERIC_DOC_PENALTY_FACTOR;
2409
+ }
2410
+ }
2411
+
2412
+ let finalDocScore =
2413
+ semanticQualityScore * 0.2 + docLevelContextMatchScore * W_DOC_CONTEXT_MATCH + docH1RelevanceToReviewedFile * W_H1_SIM;
2414
+ finalDocScore *= genericDocPenaltyFactor;
2415
+
2416
+ scoredDocuments.push({
2417
+ path: docPath,
2418
+ score: finalDocScore,
2419
+ chunks: docChunks.sort((a, b) => b.similarity - a.similarity),
2420
+ debug: {
2421
+ area: candidateDocFullContext.area,
2422
+ tech: candidateDocFullContext.dominantTech.join(', '),
2423
+ isGenericStyle: candidateDocFullContext.isGeneralPurposeReadmeStyle || isGenericByName,
2424
+ semanticQualityScore: semanticQualityScore.toFixed(4),
2425
+ docLevelContextMatchScore: docLevelContextMatchScore.toFixed(4),
2426
+ docH1RelevanceToReviewedFile: docH1RelevanceToReviewedFile.toFixed(4),
2427
+ genericDocPenaltyFactor: genericDocPenaltyFactor.toFixed(4),
2428
+ finalScore: finalDocScore.toFixed(4),
2429
+ },
2430
+ });
2431
+ }
2432
+ scoredDocuments.sort((a, b) => b.score - a.score);
2433
+
2434
+ debug('[getContextForFile] Top Scored Documents:');
2435
+ scoredDocuments.slice(0, 7).forEach((d) => {
2436
+ debug(
2437
+ ` Path: ${d.path}, Score: ${d.score.toFixed(4)}, Area: ${d.debug.area}, Tech: ${d.debug.tech}, Generic: ${d.debug.isGenericStyle}`
2438
+ );
2439
+ });
2440
+
2441
+ const finalGuidelineSnippets = [];
2442
+ const relevantDocs = scoredDocuments.filter((doc) => {
2443
+ if (doc.score < 0.3) {
2444
+ debug(`[getContextForFile] Excluding doc ${doc.path} - score too low: ${doc.score.toFixed(4)}`);
2445
+ return false;
2446
+ }
2447
+ if (
2448
+ reviewedSnippetContext.area !== 'Unknown' &&
2449
+ doc.debug.area !== 'Unknown' &&
2450
+ doc.debug.area !== 'General' &&
2451
+ reviewedSnippetContext.area !== doc.debug.area
2452
+ ) {
2453
+ const hasTechMatch = reviewedSnippetContext.dominantTech.some((tech) => doc.debug.tech.toLowerCase().includes(tech.toLowerCase()));
2454
+ if (!hasTechMatch) {
2455
+ debug(
2456
+ `[getContextForFile] Excluding doc ${doc.path} - area mismatch without tech match: ${doc.debug.area} vs ${reviewedSnippetContext.area}`
2457
+ );
2458
+ return false;
2459
+ }
2460
+ }
2461
+ return true;
2462
+ });
2463
+
2464
+ for (const doc of relevantDocs.slice(0, 4)) {
2465
+ if (doc.chunks && doc.chunks.length > 0) {
2466
+ finalGuidelineSnippets.push(doc.chunks[0]);
2467
+ }
2468
+ }
2469
+
2470
+ const uniqueCandidates = [];
2471
+ const seenPaths = new Set();
2472
+ const normalizedReviewPath = path.resolve(projectPath, filePath);
2473
+
2474
+ for (const candidate of codeExampleCandidates || []) {
2475
+ const normalizedCandidatePath = path.resolve(projectPath, candidate.path);
2476
+ if (normalizedCandidatePath !== normalizedReviewPath && !candidate.isDocumentation && !seenPaths.has(candidate.path)) {
2477
+ uniqueCandidates.push(candidate);
2478
+ seenPaths.add(candidate.path);
2479
+ }
2480
+ }
2481
+ uniqueCandidates.sort((a, b) => b.similarity - a.similarity);
2482
+ const finalCodeExamples = uniqueCandidates.slice(0, MAX_FINAL_EXAMPLES);
2483
+
2484
+ return {
2485
+ language,
2486
+ isTestFile,
2487
+ finalCodeExamples,
2488
+ finalGuidelineSnippets,
2489
+ prCommentContext,
2490
+ prContextAvailable,
2491
+ relevantCustomDocChunks, // Add relevant custom document chunks
2492
+ };
2493
+ }
2494
+
2495
/**
 * Gathers retrieval context for every file in a PR and merges it into one
 * deduplicated, ranked bundle (code examples, guidelines, PR comments, and
 * custom-document chunks) for the holistic review prompt.
 * @param {Array<Object>} prFiles - PR files; each has filePath and diffContent/content.
 * @param {Object} options - Analysis options (customDocs, projectPath, maxExamples, ...).
 * @returns {Promise<Object>} { codeExamples, guidelines, prComments, customDocChunks }.
 */
async function gatherUnifiedContextForPR(prFiles, options = {}) {
  // Best-scoring entry per dedup key, accumulated across all files.
  const bestCodeExamples = new Map();
  const bestGuidelines = new Map();
  const bestPRComments = new Map();
  const bestCustomDocChunks = new Map();

  // Chunk custom documents once up front so per-file calls can reuse the result.
  let globalCustomDocChunks = [];
  if (options.customDocs && options.customDocs.length > 0) {
    const projectPath = options.projectPath || process.cwd();
    console.log(chalk.blue('📄 Processing custom documents once for entire PR...'));

    try {
      // Reuse previously processed chunks for this project when available.
      let chunks = await embeddingsSystem.getExistingCustomDocumentChunks(projectPath);

      if (!chunks || chunks.length === 0) {
        console.log(chalk.cyan('📄 Custom documents not yet processed for this project, processing now...'));
        chunks = await embeddingsSystem.processCustomDocumentsInMemory(options.customDocs, projectPath);
      } else {
        console.log(chalk.green(`📄 Reusing ${chunks.length} already processed custom document chunks`));
      }

      globalCustomDocChunks = chunks;
      console.log(chalk.green(`📄 Custom documents processed: ${globalCustomDocChunks.length} chunks available for PR analysis`));
    } catch (error) {
      console.error(chalk.red(`Error processing custom documents for PR: ${error.message}`));
    }
  }

  // Fetch context for all files in parallel; a failing file yields null and is dropped.
  const perFileContexts = await Promise.all(
    prFiles.map(async (file) => {
      try {
        const context = await getContextForFile(file.filePath, file.diffContent || file.content, {
          ...options,
          customDocs: [], // Remove original custom docs to avoid reprocessing
          preprocessedCustomDocChunks: globalCustomDocChunks, // Pass pre-processed chunks
        });
        return { ...context, filePath: file.filePath };
      } catch (error) {
        console.error(chalk.red(`Error gathering context for file ${file.filePath}: ${error.message}`));
        return null;
      }
    })
  );

  // Keep the item only if the key is new or this item scores strictly higher.
  const keepBest = (map, key, item, isBetter) => {
    if (key && (!map.has(key) || isBetter(item, map.get(key)))) {
      map.set(key, item);
    }
  };
  const bySimilarity = (a, b) => a.similarity > b.similarity;
  const byRelevance = (a, b) => a.relevanceScore > b.relevanceScore;

  for (const context of perFileContexts) {
    if (!context) continue;

    for (const example of context.finalCodeExamples || []) {
      keepBest(bestCodeExamples, example.path, example, bySimilarity);
    }
    for (const guideline of context.finalGuidelineSnippets || []) {
      keepBest(bestGuidelines, `${guideline.path}-${guideline.heading_text || ''}`, guideline, bySimilarity);
    }
    for (const comment of context.prCommentContext || []) {
      keepBest(bestPRComments, comment.id, comment, byRelevance);
    }
    for (const chunk of context.relevantCustomDocChunks || []) {
      keepBest(bestCustomDocChunks, chunk.id, chunk, bySimilarity);
    }
  }

  // Rank each pool descending by its score field and cap it.
  const topOf = (map, compare, limit) => Array.from(map.values()).sort(compare).slice(0, limit);
  const descBySimilarity = (a, b) => b.similarity - a.similarity;

  return {
    codeExamples: topOf(bestCodeExamples, descBySimilarity, options.maxExamples || 40),
    guidelines: topOf(bestGuidelines, descBySimilarity, 100),
    // Keep a larger pool of 40 candidates for the final prompt selection.
    prComments: topOf(bestPRComments, (a, b) => b.relevanceScore - a.relevanceScore, 40),
    customDocChunks: topOf(bestCustomDocChunks, descBySimilarity, 10),
  };
}
2614
+
2615
/**
 * Filter out low severity issues from analysis results
 * Low severity issues are typically formatting/style concerns better handled by linters
 *
 * Mutates `analysisResults` in place (same object is returned), matching the
 * original contract relied on by callers.
 *
 * @param {Object} analysisResults - Analysis results from LLM
 * @param {Object} options - Filtering options
 * @param {boolean} [options.verbose=false] - Log each filtered issue
 * @returns {Object} Filtered analysis results without low severity issues
 */
function filterLowSeverityIssues(analysisResults, options = {}) {
  const { verbose = false } = options;

  if (!analysisResults) {
    return analysisResults;
  }

  // Shared filter used for all three issue collections: keeps every issue whose
  // severity is not 'low' (case-insensitive) and reports how many were dropped.
  // `describe` builds the per-issue verbose log message.
  const splitLowSeverity = (issues, describe) => {
    const kept = issues.filter((issue) => {
      const severity = (issue.severity || '').toLowerCase();
      if (severity !== 'low') {
        return true;
      }
      if (verbose) {
        console.log(chalk.yellow(describe(issue)));
      }
      return false;
    });
    return { kept, removed: issues.length - kept.length };
  };

  let filteredCount = 0;

  // Filter single-file issues array
  if (analysisResults.issues && Array.isArray(analysisResults.issues)) {
    const { kept, removed } = splitLowSeverity(
      analysisResults.issues,
      (issue) => ` Filtering low severity issue: "${(issue.description || '').substring(0, 50)}..."`
    );
    analysisResults.issues = kept;
    filteredCount += removed;
  }

  // Filter cross-file issues (for holistic PR review)
  if (analysisResults.crossFileIssues && Array.isArray(analysisResults.crossFileIssues)) {
    const { kept, removed } = splitLowSeverity(
      analysisResults.crossFileIssues,
      (issue) => ` Filtering low severity cross-file issue: "${(issue.message || issue.description || '').substring(0, 50)}..."`
    );
    analysisResults.crossFileIssues = kept;
    filteredCount += removed;
  }

  // Filter file-specific issues (for holistic PR review)
  if (analysisResults.fileSpecificIssues && typeof analysisResults.fileSpecificIssues === 'object') {
    for (const filePath of Object.keys(analysisResults.fileSpecificIssues)) {
      const issues = analysisResults.fileSpecificIssues[filePath];
      if (Array.isArray(issues)) {
        const { kept, removed } = splitLowSeverity(
          issues,
          (issue) => ` Filtering low severity issue in ${filePath}: "${(issue.description || '').substring(0, 50)}..."`
        );
        analysisResults.fileSpecificIssues[filePath] = kept;
        filteredCount += removed;
      }
    }
  }

  if (filteredCount > 0) {
    console.log(chalk.cyan(`🔇 Filtered ${filteredCount} low severity issue(s) (formatting/style concerns handled by linters)`));
  }

  return analysisResults;
}
2695
+
2696
/**
 * Filter analysis results based on feedback data using semantic similarity
 *
 * Issues similar to previously dismissed feedback are removed; filtering
 * statistics are recorded under `metadata.feedbackFiltering`.
 *
 * @param {Object} analysisResults - Raw analysis results from LLM
 * @param {Object} feedbackData - Loaded feedback data
 * @param {Object} options - Filtering options
 * @returns {Promise<Object>} Filtered analysis results
 */
async function filterAnalysisResults(analysisResults, feedbackData, options = {}) {
  const { similarityThreshold = 0.7, verbose = false } = options;

  // Nothing to filter without a well-formed issues array.
  if (!analysisResults?.issues || !Array.isArray(analysisResults.issues)) {
    return analysisResults;
  }

  const originalCount = analysisResults.issues.length;

  // Make sure the semantic model is ready so matching is as strong as possible.
  await ensureSemanticSimilarityInitialized();

  if (verbose) {
    const usingSemanticSimilarity = isSemanticSimilarityAvailable();
    console.log(
      chalk.cyan(`🔍 Filtering issues using ${usingSemanticSimilarity ? 'semantic + word-based similarity' : 'word-based similarity only'}`)
    );
  }

  // Evaluate every issue concurrently; each entry becomes a boolean skip flag.
  const skipFlags = await Promise.all(
    analysisResults.issues.map(async (issue, index) => {
      const issueDescription = issue.description || issue.summary || '';
      const shouldSkip = await shouldSkipSimilarIssue(issueDescription, feedbackData, {
        similarityThreshold,
        verbose,
      });

      if (shouldSkip && verbose) {
        console.log(chalk.yellow(` Filtered issue ${index + 1}: "${issueDescription.substring(0, 50)}..."`));
      }

      return shouldSkip;
    })
  );

  const filteredIssues = analysisResults.issues.filter((_, index) => !skipFlags[index]);
  const filteredCount = originalCount - filteredIssues.length;

  if (verbose && filteredCount > 0) {
    console.log(chalk.green(`✅ Filtered ${filteredCount} dismissed issues, ${filteredIssues.length} remaining`));
  }

  return {
    ...analysisResults,
    issues: filteredIssues,
    metadata: {
      ...analysisResults.metadata,
      feedbackFiltering: {
        originalIssueCount: originalCount,
        filteredIssueCount: filteredCount,
        finalIssueCount: filteredIssues.length,
        usedSemanticSimilarity: isSemanticSimilarityAvailable(),
      },
    },
  };
}
2763
+
2764
+ export { runAnalysis, gatherUnifiedContextForPR };