@aiready/pattern-detect 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +103 -3
- package/dist/chunk-4CZGZIDL.mjs +409 -0
- package/dist/chunk-57O7FEEM.mjs +400 -0
- package/dist/chunk-6VQTQRDW.mjs +245 -0
- package/dist/chunk-DNI7S33V.mjs +399 -0
- package/dist/chunk-JTJXOIO2.mjs +378 -0
- package/dist/chunk-N5DE7IYX.mjs +416 -0
- package/dist/chunk-YA3N6EC5.mjs +351 -0
- package/dist/chunk-ZNZ5O435.mjs +400 -0
- package/dist/cli.js +258 -48
- package/dist/cli.mjs +45 -24
- package/dist/index.d.mts +23 -1
- package/dist/index.d.ts +23 -1
- package/dist/index.js +214 -25
- package/dist/index.mjs +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -80,6 +80,7 @@ function extractCodeBlocks(content, minLines) {
|
|
|
80
80
|
blocks.push({
|
|
81
81
|
content: blockContent,
|
|
82
82
|
startLine: blockStart + 1,
|
|
83
|
+
endLine: i + 1,
|
|
83
84
|
patternType: categorizePattern(blockContent),
|
|
84
85
|
linesOfCode
|
|
85
86
|
});
|
|
@@ -95,6 +96,16 @@ function extractCodeBlocks(content, minLines) {
|
|
|
95
96
|
function normalizeCode(code) {
|
|
96
97
|
return code.replace(/\/\/.*$/gm, "").replace(/\/\*[\s\S]*?\*\//g, "").replace(/"[^"]*"/g, '"STR"').replace(/'[^']*'/g, "'STR'").replace(/`[^`]*`/g, "`STR`").replace(/\b\d+\b/g, "NUM").replace(/\s+/g, " ").trim();
|
|
97
98
|
}
|
|
99
|
+
function jaccardSimilarity(tokens1, tokens2) {
|
|
100
|
+
const set1 = new Set(tokens1);
|
|
101
|
+
const set2 = new Set(tokens2);
|
|
102
|
+
let intersection = 0;
|
|
103
|
+
for (const token of set1) {
|
|
104
|
+
if (set2.has(token)) intersection++;
|
|
105
|
+
}
|
|
106
|
+
const union = set1.size + set2.size - intersection;
|
|
107
|
+
return union === 0 ? 0 : intersection / union;
|
|
108
|
+
}
|
|
98
109
|
function calculateSimilarity(block1, block2) {
|
|
99
110
|
const norm1 = normalizeCode(block1);
|
|
100
111
|
const norm2 = normalizeCode(block2);
|
|
@@ -104,38 +115,188 @@ function calculateSimilarity(block1, block2) {
|
|
|
104
115
|
const tokenSimilarity = (0, import_core.similarityScore)(tokens1.join(" "), tokens2.join(" "));
|
|
105
116
|
return baseSimilarity * 0.4 + tokenSimilarity * 0.6;
|
|
106
117
|
}
|
|
107
|
-
function detectDuplicatePatterns(files, options) {
|
|
108
|
-
const { minSimilarity, minLines } = options;
|
|
118
|
+
async function detectDuplicatePatterns(files, options) {
|
|
119
|
+
const {
|
|
120
|
+
minSimilarity,
|
|
121
|
+
minLines,
|
|
122
|
+
maxBlocks = 500,
|
|
123
|
+
batchSize = 100,
|
|
124
|
+
approx = true,
|
|
125
|
+
minSharedTokens = 8,
|
|
126
|
+
maxCandidatesPerBlock = 100,
|
|
127
|
+
fastMode = true,
|
|
128
|
+
maxComparisons = 5e4,
|
|
129
|
+
// Cap at 50K comparisons by default
|
|
130
|
+
streamResults = false
|
|
131
|
+
} = options;
|
|
109
132
|
const duplicates = [];
|
|
110
|
-
|
|
133
|
+
let allBlocks = files.flatMap(
|
|
111
134
|
(file) => extractCodeBlocks(file.content, minLines).map((block) => ({
|
|
112
|
-
|
|
135
|
+
content: block.content,
|
|
136
|
+
startLine: block.startLine,
|
|
137
|
+
endLine: block.endLine,
|
|
113
138
|
file: file.file,
|
|
114
139
|
normalized: normalizeCode(block.content),
|
|
115
|
-
|
|
140
|
+
patternType: block.patternType,
|
|
141
|
+
tokenCost: (0, import_core.estimateTokens)(block.content),
|
|
142
|
+
linesOfCode: block.linesOfCode
|
|
116
143
|
}))
|
|
117
144
|
);
|
|
118
145
|
console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
|
|
146
|
+
if (allBlocks.length > maxBlocks) {
|
|
147
|
+
console.log(`\u26A0\uFE0F Limiting to ${maxBlocks} blocks (sorted by size) to prevent memory issues`);
|
|
148
|
+
console.log(` Use --max-blocks to increase limit or --min-lines to filter smaller blocks`);
|
|
149
|
+
allBlocks = allBlocks.sort((a, b) => b.linesOfCode - a.linesOfCode).slice(0, maxBlocks);
|
|
150
|
+
}
|
|
151
|
+
const stopwords = /* @__PURE__ */ new Set([
|
|
152
|
+
"return",
|
|
153
|
+
"const",
|
|
154
|
+
"let",
|
|
155
|
+
"var",
|
|
156
|
+
"function",
|
|
157
|
+
"class",
|
|
158
|
+
"new",
|
|
159
|
+
"if",
|
|
160
|
+
"else",
|
|
161
|
+
"for",
|
|
162
|
+
"while",
|
|
163
|
+
"async",
|
|
164
|
+
"await",
|
|
165
|
+
"try",
|
|
166
|
+
"catch",
|
|
167
|
+
"switch",
|
|
168
|
+
"case",
|
|
169
|
+
"default",
|
|
170
|
+
"import",
|
|
171
|
+
"export",
|
|
172
|
+
"from",
|
|
173
|
+
"true",
|
|
174
|
+
"false",
|
|
175
|
+
"null",
|
|
176
|
+
"undefined",
|
|
177
|
+
"this"
|
|
178
|
+
]);
|
|
179
|
+
const tokenize = (norm) => norm.split(/[\s(){}\[\];,\.]+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
|
|
180
|
+
const blockTokens = allBlocks.map((b) => tokenize(b.normalized));
|
|
181
|
+
const invertedIndex = /* @__PURE__ */ new Map();
|
|
182
|
+
if (approx) {
|
|
183
|
+
for (let i = 0; i < blockTokens.length; i++) {
|
|
184
|
+
for (const tok of blockTokens[i]) {
|
|
185
|
+
let arr = invertedIndex.get(tok);
|
|
186
|
+
if (!arr) {
|
|
187
|
+
arr = [];
|
|
188
|
+
invertedIndex.set(tok, arr);
|
|
189
|
+
}
|
|
190
|
+
arr.push(i);
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
const totalComparisons = approx ? void 0 : allBlocks.length * (allBlocks.length - 1) / 2;
|
|
195
|
+
if (totalComparisons !== void 0) {
|
|
196
|
+
console.log(`Processing ${totalComparisons.toLocaleString()} comparisons in batches...`);
|
|
197
|
+
} else {
|
|
198
|
+
console.log(`Using approximate candidate selection to reduce comparisons...`);
|
|
199
|
+
}
|
|
200
|
+
let comparisonsProcessed = 0;
|
|
201
|
+
let comparisonsBudgetExhausted = false;
|
|
202
|
+
const startTime = Date.now();
|
|
119
203
|
for (let i = 0; i < allBlocks.length; i++) {
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
204
|
+
if (maxComparisons && comparisonsProcessed >= maxComparisons) {
|
|
205
|
+
comparisonsBudgetExhausted = true;
|
|
206
|
+
break;
|
|
207
|
+
}
|
|
208
|
+
if (i % batchSize === 0 && i > 0) {
|
|
209
|
+
const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
|
|
210
|
+
const duplicatesFound = duplicates.length;
|
|
211
|
+
if (totalComparisons !== void 0) {
|
|
212
|
+
const progress = (comparisonsProcessed / totalComparisons * 100).toFixed(1);
|
|
213
|
+
const remaining = totalComparisons - comparisonsProcessed;
|
|
214
|
+
const rate = comparisonsProcessed / parseFloat(elapsed);
|
|
215
|
+
const eta = remaining > 0 ? (remaining / rate).toFixed(0) : 0;
|
|
216
|
+
console.log(` ${progress}% (${comparisonsProcessed.toLocaleString()}/${totalComparisons.toLocaleString()} comparisons, ${elapsed}s elapsed, ~${eta}s remaining, ${duplicatesFound} duplicates)`);
|
|
217
|
+
} else {
|
|
218
|
+
console.log(` Processed ${i.toLocaleString()}/${allBlocks.length} blocks (${elapsed}s elapsed, ${duplicatesFound} duplicates)`);
|
|
219
|
+
}
|
|
220
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
221
|
+
}
|
|
222
|
+
const block1 = allBlocks[i];
|
|
223
|
+
let candidates = null;
|
|
224
|
+
if (approx) {
|
|
225
|
+
const counts = /* @__PURE__ */ new Map();
|
|
226
|
+
for (const tok of blockTokens[i]) {
|
|
227
|
+
const ids = invertedIndex.get(tok);
|
|
228
|
+
if (!ids) continue;
|
|
229
|
+
for (const j of ids) {
|
|
230
|
+
if (j <= i) continue;
|
|
231
|
+
if (allBlocks[j].file === block1.file) continue;
|
|
232
|
+
counts.set(j, (counts.get(j) || 0) + 1);
|
|
233
|
+
}
|
|
137
234
|
}
|
|
235
|
+
candidates = Array.from(counts.entries()).filter(([, shared]) => shared >= minSharedTokens).sort((a, b) => b[1] - a[1]).slice(0, maxCandidatesPerBlock).map(([j, shared]) => ({ j, shared }));
|
|
138
236
|
}
|
|
237
|
+
if (approx && candidates) {
|
|
238
|
+
for (const { j } of candidates) {
|
|
239
|
+
if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
|
|
240
|
+
comparisonsProcessed++;
|
|
241
|
+
const block2 = allBlocks[j];
|
|
242
|
+
const similarity = fastMode ? jaccardSimilarity(blockTokens[i], blockTokens[j]) : calculateSimilarity(block1.content, block2.content);
|
|
243
|
+
if (similarity >= minSimilarity) {
|
|
244
|
+
const duplicate = {
|
|
245
|
+
file1: block1.file,
|
|
246
|
+
file2: block2.file,
|
|
247
|
+
line1: block1.startLine,
|
|
248
|
+
line2: block2.startLine,
|
|
249
|
+
endLine1: block1.endLine,
|
|
250
|
+
endLine2: block2.endLine,
|
|
251
|
+
similarity,
|
|
252
|
+
snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
|
|
253
|
+
patternType: block1.patternType,
|
|
254
|
+
tokenCost: block1.tokenCost + block2.tokenCost,
|
|
255
|
+
linesOfCode: block1.linesOfCode
|
|
256
|
+
};
|
|
257
|
+
duplicates.push(duplicate);
|
|
258
|
+
if (streamResults) {
|
|
259
|
+
console.log(`
|
|
260
|
+
\u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
|
|
261
|
+
console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
|
|
262
|
+
console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
} else {
|
|
267
|
+
for (let j = i + 1; j < allBlocks.length; j++) {
|
|
268
|
+
if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
|
|
269
|
+
comparisonsProcessed++;
|
|
270
|
+
const block2 = allBlocks[j];
|
|
271
|
+
if (block1.file === block2.file) continue;
|
|
272
|
+
const similarity = fastMode ? jaccardSimilarity(blockTokens[i], blockTokens[j]) : calculateSimilarity(block1.content, block2.content);
|
|
273
|
+
if (similarity >= minSimilarity) {
|
|
274
|
+
const duplicate = {
|
|
275
|
+
file1: block1.file,
|
|
276
|
+
file2: block2.file,
|
|
277
|
+
line1: block1.startLine,
|
|
278
|
+
line2: block2.startLine,
|
|
279
|
+
endLine1: block1.endLine,
|
|
280
|
+
endLine2: block2.endLine,
|
|
281
|
+
similarity,
|
|
282
|
+
snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
|
|
283
|
+
patternType: block1.patternType,
|
|
284
|
+
tokenCost: block1.tokenCost + block2.tokenCost,
|
|
285
|
+
linesOfCode: block1.linesOfCode
|
|
286
|
+
};
|
|
287
|
+
duplicates.push(duplicate);
|
|
288
|
+
if (streamResults) {
|
|
289
|
+
console.log(`
|
|
290
|
+
\u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
|
|
291
|
+
console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
|
|
292
|
+
console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
if (comparisonsBudgetExhausted) {
|
|
299
|
+
console.log(`\u26A0\uFE0F Comparison budget exhausted (${maxComparisons.toLocaleString()} comparisons). Use --max-comparisons to increase.`);
|
|
139
300
|
}
|
|
140
301
|
return duplicates.sort(
|
|
141
302
|
(a, b) => b.similarity - a.similarity || b.tokenCost - a.tokenCost
|
|
@@ -157,7 +318,20 @@ function getRefactoringSuggestion(patternType, similarity) {
|
|
|
157
318
|
return baseMessages[patternType] + urgency;
|
|
158
319
|
}
|
|
159
320
|
async function analyzePatterns(options) {
|
|
160
|
-
const {
|
|
321
|
+
const {
|
|
322
|
+
minSimilarity = 0.65,
|
|
323
|
+
// Lower default for fast Jaccard mode (Levenshtein would be 0.85+)
|
|
324
|
+
minLines = 5,
|
|
325
|
+
maxBlocks = 500,
|
|
326
|
+
batchSize = 100,
|
|
327
|
+
approx = true,
|
|
328
|
+
minSharedTokens = 8,
|
|
329
|
+
maxCandidatesPerBlock = 100,
|
|
330
|
+
fastMode = true,
|
|
331
|
+
maxComparisons = 5e4,
|
|
332
|
+
streamResults = false,
|
|
333
|
+
...scanOptions
|
|
334
|
+
} = options;
|
|
161
335
|
const files = await (0, import_core2.scanFiles)(scanOptions);
|
|
162
336
|
const results = [];
|
|
163
337
|
const fileContents = await Promise.all(
|
|
@@ -166,9 +340,17 @@ async function analyzePatterns(options) {
|
|
|
166
340
|
content: await (0, import_core2.readFileContent)(file)
|
|
167
341
|
}))
|
|
168
342
|
);
|
|
169
|
-
const duplicates = detectDuplicatePatterns(fileContents, {
|
|
343
|
+
const duplicates = await detectDuplicatePatterns(fileContents, {
|
|
170
344
|
minSimilarity,
|
|
171
|
-
minLines
|
|
345
|
+
minLines,
|
|
346
|
+
maxBlocks,
|
|
347
|
+
batchSize,
|
|
348
|
+
approx,
|
|
349
|
+
minSharedTokens,
|
|
350
|
+
maxCandidatesPerBlock,
|
|
351
|
+
fastMode,
|
|
352
|
+
maxComparisons,
|
|
353
|
+
streamResults
|
|
172
354
|
});
|
|
173
355
|
for (const file of files) {
|
|
174
356
|
const fileDuplicates = duplicates.filter(
|
|
@@ -233,6 +415,13 @@ function generateSummary(results) {
|
|
|
233
415
|
return {
|
|
234
416
|
file1: issue.location.file,
|
|
235
417
|
file2: fileMatch?.[1] || "unknown",
|
|
418
|
+
line1: issue.location.line,
|
|
419
|
+
line2: 0,
|
|
420
|
+
// Not available from Issue
|
|
421
|
+
endLine1: 0,
|
|
422
|
+
// Not available from Issue
|
|
423
|
+
endLine2: 0,
|
|
424
|
+
// Not available from Issue
|
|
236
425
|
similarity: similarityMatch ? parseInt(similarityMatch[1]) / 100 : 0,
|
|
237
426
|
patternType: typeMatch?.[1] || "unknown",
|
|
238
427
|
tokenCost: tokenMatch ? parseInt(tokenMatch[1]) : 0
|
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aiready/pattern-detect",
|
|
3
|
-
"version": "0.1.2",
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"description": "Semantic duplicate pattern detection for AI-generated code - finds similar implementations that waste AI context tokens",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|