@juspay/yama 1.3.0 → 1.4.0
This diff shows the changes between two publicly released versions of the package as published to their public registries. The information is provided for informational purposes only and reflects the package contents as they appear in those registries.
- package/CHANGELOG.md +6 -0
- package/README.md +29 -1
- package/dist/core/Guardian.js +6 -3
- package/dist/core/providers/BitbucketProvider.d.ts +1 -1
- package/dist/core/providers/BitbucketProvider.js +31 -1
- package/dist/features/CodeReviewer.d.ts +17 -9
- package/dist/features/CodeReviewer.js +274 -215
- package/dist/features/MultiInstanceProcessor.d.ts +74 -0
- package/dist/features/MultiInstanceProcessor.js +359 -0
- package/dist/types/index.d.ts +120 -1
- package/dist/utils/ContentSimilarityService.d.ts +74 -0
- package/dist/utils/ContentSimilarityService.js +215 -0
- package/dist/utils/ExactDuplicateRemover.d.ts +77 -0
- package/dist/utils/ExactDuplicateRemover.js +361 -0
- package/dist/utils/ParallelProcessing.d.ts +112 -0
- package/dist/utils/ParallelProcessing.js +228 -0
- package/package.json +17 -17
- package/yama.config.example.yaml +42 -4
package/dist/utils/ContentSimilarityService.js

@@ -0,0 +1,215 @@
+/**
+ * Content Similarity Service for Semantic Deduplication
+ * Uses AI to compare violations with existing PR comments for semantic similarity
+ */
+import { logger } from "./Logger.js";
+/**
+ * Service for calculating semantic similarity between violations and PR comments
+ */
+export class ContentSimilarityService {
+    neurolink;
+    aiConfig;
+    constructor(aiConfig) {
+        this.aiConfig = aiConfig;
+    }
+    /**
+     * Calculate similarity scores between violations and comments in batches
+     */
+    async batchCalculateSimilarity(violations, comments, batchSize = 15) {
+        const startTime = Date.now();
+        logger.debug(`🔍 Starting semantic similarity analysis: ${violations.length} violations vs ${comments.length} comments`);
+        if (violations.length === 0 || comments.length === 0) {
+            logger.debug("⏭️ No violations or comments to compare, skipping similarity analysis");
+            return [];
+        }
+        // Prepare violation and comment content for AI analysis
+        const violationData = this.prepareViolationContent(violations);
+        const commentData = this.prepareCommentContent(comments);
+        logger.debug(`📝 Prepared ${violationData.length} violations and ${commentData.length} comments for analysis`);
+        // Process in batches to manage token limits
+        const allResults = [];
+        const totalBatches = Math.ceil(violations.length / batchSize);
+        for (let i = 0; i < violations.length; i += batchSize) {
+            const batchIndex = Math.floor(i / batchSize) + 1;
+            const violationBatch = violationData.slice(i, i + batchSize);
+            logger.debug(`🔄 Processing batch ${batchIndex}/${totalBatches} (${violationBatch.length} violations)`);
+            try {
+                const batchResults = await this.processSimilarityBatch(violationBatch, commentData);
+                allResults.push(...batchResults);
+                logger.debug(`✅ Batch ${batchIndex} completed: ${batchResults.length} similarity scores calculated`);
+                // Add delay between batches to avoid rate limiting
+                if (batchIndex < totalBatches) {
+                    await this.delay(1000);
+                }
+            }
+            catch (error) {
+                logger.error(`❌ Batch ${batchIndex} failed: ${error.message}`);
+                // Continue with next batch instead of failing entirely
+            }
+        }
+        const processingTime = Date.now() - startTime;
+        logger.success(`✅ Semantic similarity analysis completed: ${allResults.length} comparisons in ${processingTime}ms`);
+        return allResults;
+    }
+    /**
+     * Prepare violation content for AI analysis
+     */
+    prepareViolationContent(violations) {
+        return violations.map((violation, index) => ({
+            index,
+            id: `violation_${index}`,
+            content: this.extractViolationContent(violation),
+        }));
+    }
+    /**
+     * Prepare comment content for AI analysis
+     */
+    prepareCommentContent(comments) {
+        return comments.map((comment, index) => ({
+            index,
+            id: comment.id,
+            content: this.extractCommentContent(comment),
+        }));
+    }
+    /**
+     * Extract meaningful content from violation for comparison
+     */
+    extractViolationContent(violation) {
+        const parts = [
+            `Issue: ${violation.issue}`,
+            `Message: ${violation.message}`,
+            violation.file ? `File: ${violation.file}` : "",
+            violation.code_snippet ? `Code: ${violation.code_snippet}` : "",
+            `Severity: ${violation.severity}`,
+            `Category: ${violation.category}`,
+        ].filter(Boolean);
+        return parts.join(" | ");
+    }
+    /**
+     * Extract meaningful content from comment for comparison
+     */
+    extractCommentContent(comment) {
+        const parts = [
+            `Comment: ${comment.text}`,
+            comment.anchor?.filePath ? `File: ${comment.anchor.filePath}` : "",
+            `Author: ${comment.author.displayName || comment.author.name}`,
+        ].filter(Boolean);
+        return parts.join(" | ");
+    }
+    /**
+     * Process a single batch of violations against all comments
+     */
+    async processSimilarityBatch(violationBatch, commentData) {
+        const prompt = this.createSimilarityPrompt(violationBatch, commentData);
+        try {
+            // Initialize NeuroLink if not already done
+            if (!this.neurolink) {
+                const { NeuroLink } = await import("@juspay/neurolink");
+                this.neurolink = new NeuroLink();
+            }
+            // Use NeuroLink for AI analysis
+            const result = await this.neurolink.generate({
+                input: { text: prompt },
+                systemPrompt: "You are an expert code reviewer analyzing semantic similarity between violations and comments. Provide accurate similarity scores based on content analysis.",
+                provider: this.aiConfig.provider || "auto",
+                model: this.aiConfig.model || "best",
+                temperature: 0.1, // Low temperature for consistent similarity scoring
+                maxTokens: this.aiConfig.maxTokens || 4000,
+                timeout: "5m",
+                enableAnalytics: this.aiConfig.enableAnalytics || false,
+                enableEvaluation: false,
+            });
+            return this.parseSimilarityResponse(result.content, violationBatch, commentData);
+        }
+        catch (error) {
+            logger.error(`Failed to process similarity batch: ${error.message}`);
+            throw error;
+        }
+    }
+    /**
+     * Create AI prompt for similarity analysis
+     */
+    createSimilarityPrompt(violations, comments) {
+        const violationList = violations
+            .map((v, i) => `${i + 1}. ${v.content}`)
+            .join("\n");
+        const commentList = comments
+            .map((c, i) => `${i + 1}. ${c.content.substring(0, 300)}${c.content.length > 300 ? "..." : ""}`)
+            .join("\n");
+        return `
+Analyze the semantic similarity between these code review violations and existing PR comments.
+
+NEW VIOLATIONS TO CHECK:
+${violationList}
+
+EXISTING PR COMMENTS:
+${commentList}
+
+For each violation, determine if it's semantically similar to any existing comment. Consider:
+- Same or similar issues being reported
+- Same file or code area being discussed
+- Similar concerns or suggestions
+- Related security, performance, or code quality topics
+
+Return a JSON array with similarity scores (0-100) for each violation-comment pair that has meaningful similarity (score >= 70).
+
+Format: [{"violation": 1, "comment": 2, "score": 85, "reasoning": "Both discuss the same security vulnerability in authentication"}, ...]
+
+Only include pairs with scores >= 70. If no meaningful similarities exist, return an empty array [].
+`.trim();
+    }
+    /**
+     * Parse AI response to extract similarity results
+     */
+    parseSimilarityResponse(response, violationBatch, commentData) {
+        try {
+            // Extract JSON from response
+            const jsonMatch = response.match(/\[[\s\S]*\]/);
+            if (!jsonMatch) {
+                logger.debug("No JSON array found in AI response, assuming no similarities");
+                return [];
+            }
+            const similarities = JSON.parse(jsonMatch[0]);
+            const results = [];
+            for (const similarity of similarities) {
+                const violationIndex = similarity.violation - 1; // Convert from 1-based to 0-based
+                const commentIndex = similarity.comment - 1;
+                if (violationIndex >= 0 &&
+                    violationIndex < violationBatch.length &&
+                    commentIndex >= 0 &&
+                    commentIndex < commentData.length) {
+                    const violation = violationBatch[violationIndex];
+                    const comment = commentData[commentIndex];
+                    results.push({
+                        violationIndex: violation.index,
+                        commentIndex: comment.index,
+                        violationId: violation.id,
+                        commentId: comment.id,
+                        similarityScore: similarity.score,
+                        reasoning: similarity.reasoning,
+                    });
+                }
+            }
+            logger.debug(`📊 Parsed ${results.length} similarity results from AI response`);
+            return results;
+        }
+        catch (error) {
+            logger.error(`Failed to parse similarity response: ${error.message}`);
+            logger.debug(`Raw response: ${response}`);
+            return [];
+        }
+    }
+    /**
+     * Simple delay utility for rate limiting
+     */
+    delay(ms) {
+        return new Promise((resolve) => setTimeout(resolve, ms));
+    }
+}
+/**
+ * Factory function to create ContentSimilarityService
+ */
+export function createContentSimilarityService(aiConfig) {
+    return new ContentSimilarityService(aiConfig);
+}
+//# sourceMappingURL=ContentSimilarityService.js.map
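For orientation, here is a minimal usage sketch of the new similarity API. It is not part of the published diff: the deep import path, the `aiConfig` keys, and the violation/comment shapes below are assumptions inferred from the fields the compiled code above reads (Bitbucket-style comments with `id`, `text`, `anchor.filePath`, `author`), and running it requires a configured @juspay/neurolink provider.

```ts
// Hypothetical sketch — shapes and import path inferred from the compiled source above.
import { createContentSimilarityService } from "@juspay/yama/dist/utils/ContentSimilarityService.js";

const similarityService = createContentSimilarityService({
  provider: "auto",   // falls back to "auto"/"best" when omitted, per the compiled defaults
  model: "best",
  maxTokens: 4000,
});

// A violation uses the fields extractViolationContent() reads.
const violations = [
  {
    issue: "Hardcoded API key",
    message: "Secret committed in source",
    file: "src/auth.ts",
    code_snippet: 'const key = "EXAMPLE_KEY";',
    severity: "CRITICAL",
    category: "security",
  },
];

// A PR comment uses the fields extractCommentContent() reads.
const existingComments = [
  {
    id: 101,
    text: "Please move this API key into environment configuration.",
    anchor: { filePath: "src/auth.ts" },
    author: { name: "reviewer", displayName: "Reviewer" },
  },
];

// Scores violations against comments in batches of 15; the prompt asks the
// model to emit only pairs it rates >= 70% similar.
const matches = await similarityService.batchCalculateSimilarity(violations, existingComments, 15);
console.log(matches); // [{ violationIndex, commentIndex, violationId, commentId, similarityScore, reasoning }]
```

Each returned entry pairs a violation index with a comment index plus the model-assigned score and reasoning; callers such as ExactDuplicateRemover (below) then apply their own, stricter threshold on top of the 70% floor baked into the prompt.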
package/dist/utils/ExactDuplicateRemover.d.ts

@@ -0,0 +1,77 @@
+/**
+ * Exact Duplicate Removal Utility for Multi-Instance Processing
+ * Handles deduplication of violations from multiple Neurolink SDK instances
+ */
+import { Violation, DeduplicationResult, InstanceResult, PRComment, CommentDeduplicationResult, AIProviderConfig } from "../types/index.js";
+/**
+ * Exact Duplicate Remover for Multi-Instance Results
+ * Implements multi-level deduplication strategy
+ */
+export declare class ExactDuplicateRemover {
+    /**
+     * Remove exact duplicates from multiple instance results
+     */
+    removeDuplicates(instanceResults: InstanceResult[]): DeduplicationResult;
+    /**
+     * Flatten violations from all instances with source tracking
+     */
+    private flattenViolationsWithSource;
+    /**
+     * Remove exact hash duplicates (Level 1)
+     */
+    private removeExactHashDuplicates;
+    /**
+     * Remove normalized duplicates (Level 2)
+     */
+    private removeNormalizedDuplicates;
+    /**
+     * Remove same file+line duplicates (Level 3)
+     */
+    private removeSameLineDuplicates;
+    /**
+     * Create hash for exact violation matching
+     */
+    private createViolationHash;
+    /**
+     * Create hash for normalized violation matching
+     */
+    private createNormalizedViolationHash;
+    /**
+     * Normalize code snippet for comparison
+     */
+    private normalizeCodeSnippet;
+    /**
+     * Normalize text for comparison
+     */
+    private normalizeText;
+    /**
+     * Resolve duplicate by severity (and potentially other factors)
+     */
+    private resolveDuplicateBySeverity;
+    /**
+     * Track which instance contributed how many violations
+     */
+    private trackContributions;
+    /**
+     * Remove source tracking information from violation
+     */
+    private stripSourceInfo;
+    /**
+     * Remove violations that duplicate existing PR comments using semantic similarity
+     * Uses AI-powered ContentSimilarityService for intelligent deduplication
+     */
+    removeAgainstExistingComments(newViolations: Violation[], existingComments: PRComment[], aiConfig: AIProviderConfig, similarityThreshold?: number): Promise<CommentDeduplicationResult>;
+    /**
+     * Get detailed deduplication statistics
+     */
+    getDeduplicationStats(result: DeduplicationResult): string;
+    /**
+     * Get detailed comment deduplication statistics
+     */
+    getCommentDeduplicationStats(result: CommentDeduplicationResult): string;
+}
+/**
+ * Factory function to create ExactDuplicateRemover
+ */
+export declare function createExactDuplicateRemover(): ExactDuplicateRemover;
+//# sourceMappingURL=ExactDuplicateRemover.d.ts.map
package/dist/utils/ExactDuplicateRemover.js

@@ -0,0 +1,361 @@
+/**
+ * Exact Duplicate Removal Utility for Multi-Instance Processing
+ * Handles deduplication of violations from multiple Neurolink SDK instances
+ */
+import { createHash } from "crypto";
+import { logger } from "./Logger.js";
+/**
+ * Exact Duplicate Remover for Multi-Instance Results
+ * Implements multi-level deduplication strategy
+ */
+export class ExactDuplicateRemover {
+    /**
+     * Remove exact duplicates from multiple instance results
+     */
+    removeDuplicates(instanceResults) {
+        const startTime = Date.now();
+        logger.debug("🔍 Starting exact duplicate removal process");
+        // Step 1: Flatten all violations with source tracking
+        const allViolations = this.flattenViolationsWithSource(instanceResults);
+        logger.debug(`📊 Total violations from all instances: ${allViolations.length}`);
+        // Step 2: Remove exact hash duplicates
+        const exactDuplicates = this.removeExactHashDuplicates(allViolations);
+        logger.debug(`🎯 Exact duplicates removed: ${exactDuplicates.removed}`);
+        // Step 3: Remove normalized duplicates
+        const normalizedDuplicates = this.removeNormalizedDuplicates(exactDuplicates.unique);
+        logger.debug(`📝 Normalized duplicates removed: ${normalizedDuplicates.removed}`);
+        // Step 4: Remove same file+line duplicates
+        const finalResult = this.removeSameLineDuplicates(normalizedDuplicates.unique);
+        logger.debug(`📍 Same-line duplicates removed: ${finalResult.removed}`);
+        // Step 5: Track contributions and create metrics
+        const instanceContributions = this.trackContributions(finalResult.unique);
+        const processingTime = Date.now() - startTime;
+        const metrics = {
+            totalViolationsInput: allViolations.length,
+            exactDuplicatesRemoved: exactDuplicates.removed,
+            normalizedDuplicatesRemoved: normalizedDuplicates.removed,
+            sameLineDuplicatesRemoved: finalResult.removed,
+            finalUniqueViolations: finalResult.unique.length,
+            deduplicationRate: ((allViolations.length - finalResult.unique.length) /
+                allViolations.length) *
+                100,
+            instanceContributions: Object.fromEntries(instanceContributions),
+            processingTimeMs: processingTime,
+        };
+        logger.success(`✅ Deduplication completed: ${allViolations.length} → ${finalResult.unique.length} violations ` +
+            `(${metrics.deduplicationRate.toFixed(1)}% reduction) in ${processingTime}ms`);
+        return {
+            uniqueViolations: finalResult.unique.map((v) => this.stripSourceInfo(v)),
+            duplicatesRemoved: {
+                exactDuplicates: exactDuplicates.removed,
+                normalizedDuplicates: normalizedDuplicates.removed,
+                sameLineDuplicates: finalResult.removed,
+            },
+            instanceContributions,
+            processingMetrics: metrics,
+        };
+    }
+    /**
+     * Flatten violations from all instances with source tracking
+     */
+    flattenViolationsWithSource(instanceResults) {
+        const allViolations = [];
+        for (const result of instanceResults) {
+            if (!result.success || !result.violations) {
+                logger.debug(`⚠️ Skipping failed instance: ${result.instanceName}`);
+                continue;
+            }
+            result.violations.forEach((violation, index) => {
+                allViolations.push({
+                    ...violation,
+                    source: result.instanceName,
+                    originalIndex: index,
+                });
+            });
+        }
+        return allViolations;
+    }
+    /**
+     * Remove exact hash duplicates (Level 1)
+     */
+    removeExactHashDuplicates(violations) {
+        const seenHashes = new Set();
+        const unique = [];
+        let removed = 0;
+        for (const violation of violations) {
+            const hash = this.createViolationHash(violation);
+            if (!seenHashes.has(hash)) {
+                seenHashes.add(hash);
+                unique.push(violation);
+            }
+            else {
+                removed++;
+                logger.debug(`🔄 Exact duplicate removed: ${violation.issue} (${violation.source})`);
+            }
+        }
+        return { unique, removed };
+    }
+    /**
+     * Remove normalized duplicates (Level 2)
+     */
+    removeNormalizedDuplicates(violations) {
+        const seenNormalizedHashes = new Set();
+        const unique = [];
+        let removed = 0;
+        for (const violation of violations) {
+            const normalizedHash = this.createNormalizedViolationHash(violation);
+            if (!seenNormalizedHashes.has(normalizedHash)) {
+                seenNormalizedHashes.add(normalizedHash);
+                unique.push(violation);
+            }
+            else {
+                removed++;
+                logger.debug(`📝 Normalized duplicate removed: ${violation.issue} (${violation.source})`);
+            }
+        }
+        return { unique, removed };
+    }
+    /**
+     * Remove same file+line duplicates (Level 3)
+     */
+    removeSameLineDuplicates(violations) {
+        const fileLineMap = new Map();
+        const uniqueMap = new Map();
+        let removed = 0;
+        for (const violation of violations) {
+            if (!violation.file || !violation.code_snippet) {
+                uniqueMap.set(`${violation.file}_${violation.originalIndex}`, violation);
+                continue;
+            }
+            const fileKey = violation.file;
+            const lineKey = this.normalizeCodeSnippet(violation.code_snippet);
+            const uniqueKey = `${violation.file}_${violation.originalIndex}`;
+            if (!fileLineMap.has(fileKey)) {
+                fileLineMap.set(fileKey, new Map());
+            }
+            const linesInFile = fileLineMap.get(fileKey);
+            if (linesInFile.has(lineKey)) {
+                // Duplicate found - resolve by severity and instance quality
+                const existing = linesInFile.get(lineKey);
+                const better = this.resolveDuplicateBySeverity([existing, violation]);
+                if (better === violation) {
+                    // Replace existing with current
+                    linesInFile.set(lineKey, violation);
+                    // Remove existing from unique map and add current
+                    const existingKey = `${existing.file}_${existing.originalIndex}`;
+                    uniqueMap.delete(existingKey);
+                    uniqueMap.set(uniqueKey, violation);
+                }
+                removed++;
+                logger.debug(`📍 Same-line duplicate resolved: ${violation.issue} (${violation.source})`);
+            }
+            else {
+                linesInFile.set(lineKey, violation);
+                uniqueMap.set(uniqueKey, violation);
+            }
+        }
+        return { unique: Array.from(uniqueMap.values()), removed };
+    }
+    /**
+     * Create hash for exact violation matching
+     */
+    createViolationHash(violation) {
+        const key = {
+            file: violation.file?.trim(),
+            code_snippet: violation.code_snippet?.trim(),
+            severity: violation.severity,
+            category: violation.category,
+            issue: violation.issue.trim(),
+            message: violation.message.trim(),
+        };
+        return createHash("sha256").update(JSON.stringify(key)).digest("hex");
+    }
+    /**
+     * Create hash for normalized violation matching
+     */
+    createNormalizedViolationHash(violation) {
+        const normalized = {
+            file: violation.file?.toLowerCase().trim(),
+            code_snippet: this.normalizeCodeSnippet(violation.code_snippet || ""),
+            severity: violation.severity,
+            category: violation.category,
+            issue: this.normalizeText(violation.issue),
+            message: this.normalizeText(violation.message),
+        };
+        return createHash("sha256")
+            .update(JSON.stringify(normalized))
+            .digest("hex");
+    }
+    /**
+     * Normalize code snippet for comparison
+     */
+    normalizeCodeSnippet(snippet) {
+        return snippet
+            .replace(/\s+/g, " ") // Normalize whitespace
+            .replace(/['"]/g, '"') // Normalize quotes
+            .replace(/;+$/, "") // Remove trailing semicolons
+            .replace(/[{}]/g, "") // Remove braces for comparison
+            .trim()
+            .toLowerCase();
+    }
+    /**
+     * Normalize text for comparison
+     */
+    normalizeText(text) {
+        return text
+            .toLowerCase()
+            .replace(/[^\w\s]/g, "") // Remove punctuation
+            .replace(/\s+/g, " ") // Normalize whitespace
+            .trim();
+    }
+    /**
+     * Resolve duplicate by severity (and potentially other factors)
+     */
+    resolveDuplicateBySeverity(duplicates) {
+        const severityOrder = {
+            CRITICAL: 4,
+            MAJOR: 3,
+            MINOR: 2,
+            SUGGESTION: 1,
+        };
+        return duplicates.reduce((best, current) => {
+            const bestScore = severityOrder[best.severity] || 0;
+            const currentScore = severityOrder[current.severity] || 0;
+            if (currentScore > bestScore) {
+                return current;
+            }
+            else if (currentScore === bestScore) {
+                // Same severity - could add more sophisticated logic here
+                // For now, prefer the first one (could be based on instance quality)
+                return best;
+            }
+            return best;
+        });
+    }
+    /**
+     * Track which instance contributed how many violations
+     */
+    trackContributions(violations) {
+        const contributions = new Map();
+        for (const violation of violations) {
+            const current = contributions.get(violation.source) || 0;
+            contributions.set(violation.source, current + 1);
+        }
+        return contributions;
+    }
+    /**
+     * Remove source tracking information from violation
+     */
+    stripSourceInfo(violation) {
+        const { source, originalIndex, ...cleanViolation } = violation;
+        return cleanViolation;
+    }
+    /**
+     * Remove violations that duplicate existing PR comments using semantic similarity
+     * Uses AI-powered ContentSimilarityService for intelligent deduplication
+     */
+    async removeAgainstExistingComments(newViolations, existingComments, aiConfig, similarityThreshold = 85) {
+        const startTime = Date.now();
+        logger.debug("🔍 Starting semantic comment deduplication process");
+        logger.debug(`📊 New violations: ${newViolations.length}, Existing comments: ${existingComments.length}`);
+        logger.debug(`🎯 Similarity threshold: ${similarityThreshold}%`);
+        if (newViolations.length === 0 || existingComments.length === 0) {
+            logger.debug("⏭️ No violations or comments to compare, skipping deduplication");
+            return {
+                uniqueViolations: newViolations,
+                duplicatesRemoved: 0,
+                semanticMatches: [],
+            };
+        }
+        try {
+            // Use ContentSimilarityService for semantic analysis
+            const { ContentSimilarityService } = await import("./ContentSimilarityService.js");
+            const similarityService = new ContentSimilarityService(aiConfig);
+            // Get similarity results
+            const similarityResults = await similarityService.batchCalculateSimilarity(newViolations, existingComments, 15);
+            // Filter violations based on similarity threshold
+            const duplicateViolationIndices = new Set();
+            const semanticMatches = [];
+            for (const result of similarityResults) {
+                if (result.similarityScore >= similarityThreshold) {
+                    duplicateViolationIndices.add(result.violationIndex);
+                    const violation = newViolations[result.violationIndex];
+                    const comment = existingComments[result.commentIndex];
+                    semanticMatches.push({
+                        violation: violation.issue,
+                        comment: `Comment ${comment.id}`,
+                        similarityScore: result.similarityScore,
+                        reasoning: result.reasoning,
+                    });
+                    logger.debug(`🎯 Semantic duplicate found: "${violation.issue}" matches comment ${comment.id} ` +
+                        `(${result.similarityScore}% similarity)`);
+                }
+            }
+            // Create final list of unique violations
+            const uniqueViolations = newViolations.filter((_, index) => !duplicateViolationIndices.has(index));
+            const processingTime = Date.now() - startTime;
+            const duplicatesRemoved = duplicateViolationIndices.size;
+            logger.success(`✅ Semantic deduplication completed: ${newViolations.length} → ${uniqueViolations.length} violations ` +
+                `(${duplicatesRemoved} duplicates removed) in ${processingTime}ms`);
+            return {
+                uniqueViolations,
+                duplicatesRemoved,
+                semanticMatches,
+            };
+        }
+        catch (error) {
+            logger.error(`❌ Semantic deduplication failed: ${error.message}`);
+            logger.warn("⚠️ Falling back to no deduplication - returning all violations");
+            // Graceful fallback: return all violations if AI analysis fails
+            return {
+                uniqueViolations: newViolations,
+                duplicatesRemoved: 0,
+                semanticMatches: [],
+            };
+        }
+    }
+    /**
+     * Get detailed deduplication statistics
+     */
+    getDeduplicationStats(result) {
+        const metrics = result.processingMetrics;
+        const contributions = Array.from(result.instanceContributions.entries())
+            .map(([instance, count]) => `${instance}: ${count}`)
+            .join(", ");
+        return `
+📊 Deduplication Statistics:
+• Input violations: ${metrics.totalViolationsInput}
+• Exact duplicates removed: ${metrics.exactDuplicatesRemoved}
+• Normalized duplicates removed: ${metrics.normalizedDuplicatesRemoved}
+• Same-line duplicates removed: ${metrics.sameLineDuplicatesRemoved}
+• Final unique violations: ${metrics.finalUniqueViolations}
+• Deduplication rate: ${metrics.deduplicationRate.toFixed(1)}%
+• Processing time: ${metrics.processingTimeMs}ms
+• Instance contributions: ${contributions}
+`.trim();
+    }
+    /**
+     * Get detailed comment deduplication statistics
+     */
+    getCommentDeduplicationStats(result) {
+        const averageSimilarity = result.semanticMatches.length > 0
+            ? result.semanticMatches.reduce((sum, match) => sum + match.similarityScore, 0) / result.semanticMatches.length
+            : 0;
+        return `
+📊 Comment Deduplication Statistics:
+• Input violations: ${result.uniqueViolations.length + result.duplicatesRemoved}
+• Unique violations: ${result.uniqueViolations.length}
+• Duplicates removed: ${result.duplicatesRemoved}
+• Deduplication rate: ${((result.duplicatesRemoved / (result.uniqueViolations.length + result.duplicatesRemoved)) * 100).toFixed(1)}%
+• Semantic matches: ${result.semanticMatches.length}
+• Average similarity score: ${averageSimilarity.toFixed(1)}%
+`.trim();
+    }
+}
+/**
+ * Factory function to create ExactDuplicateRemover
+ */
+export function createExactDuplicateRemover() {
+    return new ExactDuplicateRemover();
+}
+//# sourceMappingURL=ExactDuplicateRemover.js.map
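Likewise, a minimal sketch of driving the multi-level deduplication from consumer code. It is not part of the published diff: the `InstanceResult` shape (`instanceName`, `success`, `violations`) and the import path are inferred from what the compiled source above reads, not from documented API.

```ts
// Hypothetical sketch — InstanceResult fields inferred from removeDuplicates()
// and flattenViolationsWithSource() in the compiled source above.
import { createExactDuplicateRemover } from "@juspay/yama/dist/utils/ExactDuplicateRemover.js";

const remover = createExactDuplicateRemover();

// The same finding reported by two instances; Level 1 hashing collapses it.
const sharedViolation = {
  issue: "Unbounded loop",
  message: "Loop has no termination guard",
  file: "src/worker.ts",
  code_snippet: "while (true) { poll(); }",
  severity: "MAJOR",
  category: "performance",
};

const result = remover.removeDuplicates([
  { instanceName: "instance-1", success: true, violations: [sharedViolation] },
  { instanceName: "instance-2", success: true, violations: [sharedViolation] }, // exact duplicate
  { instanceName: "instance-3", success: false, violations: undefined },        // failed instance, skipped
]);

console.log(result.uniqueViolations.length);        // 1 after Level 1–3 dedup
console.log(remover.getDeduplicationStats(result)); // human-readable summary
```

When two instances flag the same file and normalized snippet with different severities, resolveDuplicateBySeverity keeps the higher-severity copy; `removeAgainstExistingComments` then runs the surviving violations through the ContentSimilarityService shown earlier, dropping any that score at or above the 85% default threshold against existing PR comments.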