@aiready/pattern-detect 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -80,6 +80,7 @@ function extractCodeBlocks(content, minLines) {
80
80
  blocks.push({
81
81
  content: blockContent,
82
82
  startLine: blockStart + 1,
83
+ endLine: i + 1,
83
84
  patternType: categorizePattern(blockContent),
84
85
  linesOfCode
85
86
  });
@@ -95,6 +96,16 @@ function extractCodeBlocks(content, minLines) {
95
96
  function normalizeCode(code) {
96
97
  return code.replace(/\/\/.*$/gm, "").replace(/\/\*[\s\S]*?\*\//g, "").replace(/"[^"]*"/g, '"STR"').replace(/'[^']*'/g, "'STR'").replace(/`[^`]*`/g, "`STR`").replace(/\b\d+\b/g, "NUM").replace(/\s+/g, " ").trim();
97
98
  }
99
+ function jaccardSimilarity(tokens1, tokens2) {
100
+ const set1 = new Set(tokens1);
101
+ const set2 = new Set(tokens2);
102
+ let intersection = 0;
103
+ for (const token of set1) {
104
+ if (set2.has(token)) intersection++;
105
+ }
106
+ const union = set1.size + set2.size - intersection;
107
+ return union === 0 ? 0 : intersection / union;
108
+ }
98
109
  function calculateSimilarity(block1, block2) {
99
110
  const norm1 = normalizeCode(block1);
100
111
  const norm2 = normalizeCode(block2);
@@ -104,38 +115,188 @@ function calculateSimilarity(block1, block2) {
104
115
  const tokenSimilarity = (0, import_core.similarityScore)(tokens1.join(" "), tokens2.join(" "));
105
116
  return baseSimilarity * 0.4 + tokenSimilarity * 0.6;
106
117
  }
107
- function detectDuplicatePatterns(files, options) {
108
- const { minSimilarity, minLines } = options;
118
+ async function detectDuplicatePatterns(files, options) {
119
+ const {
120
+ minSimilarity,
121
+ minLines,
122
+ maxBlocks = 500,
123
+ batchSize = 100,
124
+ approx = true,
125
+ minSharedTokens = 8,
126
+ maxCandidatesPerBlock = 100,
127
+ fastMode = true,
128
+ maxComparisons = 5e4,
129
+ // Cap at 50K comparisons by default
130
+ streamResults = false
131
+ } = options;
109
132
  const duplicates = [];
110
- const allBlocks = files.flatMap(
133
+ let allBlocks = files.flatMap(
111
134
  (file) => extractCodeBlocks(file.content, minLines).map((block) => ({
112
- ...block,
135
+ content: block.content,
136
+ startLine: block.startLine,
137
+ endLine: block.endLine,
113
138
  file: file.file,
114
139
  normalized: normalizeCode(block.content),
115
- tokenCost: (0, import_core.estimateTokens)(block.content)
140
+ patternType: block.patternType,
141
+ tokenCost: (0, import_core.estimateTokens)(block.content),
142
+ linesOfCode: block.linesOfCode
116
143
  }))
117
144
  );
118
145
  console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
146
+ if (allBlocks.length > maxBlocks) {
147
+ console.log(`\u26A0\uFE0F Limiting to ${maxBlocks} blocks (sorted by size) to prevent memory issues`);
148
+ console.log(` Use --max-blocks to increase limit or --min-lines to filter smaller blocks`);
149
+ allBlocks = allBlocks.sort((a, b) => b.linesOfCode - a.linesOfCode).slice(0, maxBlocks);
150
+ }
151
+ const stopwords = /* @__PURE__ */ new Set([
152
+ "return",
153
+ "const",
154
+ "let",
155
+ "var",
156
+ "function",
157
+ "class",
158
+ "new",
159
+ "if",
160
+ "else",
161
+ "for",
162
+ "while",
163
+ "async",
164
+ "await",
165
+ "try",
166
+ "catch",
167
+ "switch",
168
+ "case",
169
+ "default",
170
+ "import",
171
+ "export",
172
+ "from",
173
+ "true",
174
+ "false",
175
+ "null",
176
+ "undefined",
177
+ "this"
178
+ ]);
179
+ const tokenize = (norm) => norm.split(/[\s(){}\[\];,\.]+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
180
+ const blockTokens = allBlocks.map((b) => tokenize(b.normalized));
181
+ const invertedIndex = /* @__PURE__ */ new Map();
182
+ if (approx) {
183
+ for (let i = 0; i < blockTokens.length; i++) {
184
+ for (const tok of blockTokens[i]) {
185
+ let arr = invertedIndex.get(tok);
186
+ if (!arr) {
187
+ arr = [];
188
+ invertedIndex.set(tok, arr);
189
+ }
190
+ arr.push(i);
191
+ }
192
+ }
193
+ }
194
+ const totalComparisons = approx ? void 0 : allBlocks.length * (allBlocks.length - 1) / 2;
195
+ if (totalComparisons !== void 0) {
196
+ console.log(`Processing ${totalComparisons.toLocaleString()} comparisons in batches...`);
197
+ } else {
198
+ console.log(`Using approximate candidate selection to reduce comparisons...`);
199
+ }
200
+ let comparisonsProcessed = 0;
201
+ let comparisonsBudgetExhausted = false;
202
+ const startTime = Date.now();
119
203
  for (let i = 0; i < allBlocks.length; i++) {
120
- for (let j = i + 1; j < allBlocks.length; j++) {
121
- const block1 = allBlocks[i];
122
- const block2 = allBlocks[j];
123
- if (block1.file === block2.file) continue;
124
- const similarity = calculateSimilarity(block1.content, block2.content);
125
- if (similarity >= minSimilarity) {
126
- duplicates.push({
127
- file1: block1.file,
128
- file2: block2.file,
129
- line1: block1.startLine,
130
- line2: block2.startLine,
131
- similarity,
132
- snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
133
- patternType: block1.patternType,
134
- tokenCost: block1.tokenCost + block2.tokenCost,
135
- linesOfCode: block1.linesOfCode
136
- });
204
+ if (maxComparisons && comparisonsProcessed >= maxComparisons) {
205
+ comparisonsBudgetExhausted = true;
206
+ break;
207
+ }
208
+ if (i % batchSize === 0 && i > 0) {
209
+ const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
210
+ const duplicatesFound = duplicates.length;
211
+ if (totalComparisons !== void 0) {
212
+ const progress = (comparisonsProcessed / totalComparisons * 100).toFixed(1);
213
+ const remaining = totalComparisons - comparisonsProcessed;
214
+ const rate = comparisonsProcessed / parseFloat(elapsed);
215
+ const eta = remaining > 0 ? (remaining / rate).toFixed(0) : 0;
216
+ console.log(` ${progress}% (${comparisonsProcessed.toLocaleString()}/${totalComparisons.toLocaleString()} comparisons, ${elapsed}s elapsed, ~${eta}s remaining, ${duplicatesFound} duplicates)`);
217
+ } else {
218
+ console.log(` Processed ${i.toLocaleString()}/${allBlocks.length} blocks (${elapsed}s elapsed, ${duplicatesFound} duplicates)`);
219
+ }
220
+ await new Promise((resolve) => setImmediate(resolve));
221
+ }
222
+ const block1 = allBlocks[i];
223
+ let candidates = null;
224
+ if (approx) {
225
+ const counts = /* @__PURE__ */ new Map();
226
+ for (const tok of blockTokens[i]) {
227
+ const ids = invertedIndex.get(tok);
228
+ if (!ids) continue;
229
+ for (const j of ids) {
230
+ if (j <= i) continue;
231
+ if (allBlocks[j].file === block1.file) continue;
232
+ counts.set(j, (counts.get(j) || 0) + 1);
233
+ }
137
234
  }
235
+ candidates = Array.from(counts.entries()).filter(([, shared]) => shared >= minSharedTokens).sort((a, b) => b[1] - a[1]).slice(0, maxCandidatesPerBlock).map(([j, shared]) => ({ j, shared }));
138
236
  }
237
+ if (approx && candidates) {
238
+ for (const { j } of candidates) {
239
+ if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
240
+ comparisonsProcessed++;
241
+ const block2 = allBlocks[j];
242
+ const similarity = fastMode ? jaccardSimilarity(blockTokens[i], blockTokens[j]) : calculateSimilarity(block1.content, block2.content);
243
+ if (similarity >= minSimilarity) {
244
+ const duplicate = {
245
+ file1: block1.file,
246
+ file2: block2.file,
247
+ line1: block1.startLine,
248
+ line2: block2.startLine,
249
+ endLine1: block1.endLine,
250
+ endLine2: block2.endLine,
251
+ similarity,
252
+ snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
253
+ patternType: block1.patternType,
254
+ tokenCost: block1.tokenCost + block2.tokenCost,
255
+ linesOfCode: block1.linesOfCode
256
+ };
257
+ duplicates.push(duplicate);
258
+ if (streamResults) {
259
+ console.log(`
260
+ \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
261
+ console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
262
+ console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
263
+ }
264
+ }
265
+ }
266
+ } else {
267
+ for (let j = i + 1; j < allBlocks.length; j++) {
268
+ if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
269
+ comparisonsProcessed++;
270
+ const block2 = allBlocks[j];
271
+ if (block1.file === block2.file) continue;
272
+ const similarity = fastMode ? jaccardSimilarity(blockTokens[i], blockTokens[j]) : calculateSimilarity(block1.content, block2.content);
273
+ if (similarity >= minSimilarity) {
274
+ const duplicate = {
275
+ file1: block1.file,
276
+ file2: block2.file,
277
+ line1: block1.startLine,
278
+ line2: block2.startLine,
279
+ endLine1: block1.endLine,
280
+ endLine2: block2.endLine,
281
+ similarity,
282
+ snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
283
+ patternType: block1.patternType,
284
+ tokenCost: block1.tokenCost + block2.tokenCost,
285
+ linesOfCode: block1.linesOfCode
286
+ };
287
+ duplicates.push(duplicate);
288
+ if (streamResults) {
289
+ console.log(`
290
+ \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
291
+ console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
292
+ console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
293
+ }
294
+ }
295
+ }
296
+ }
297
+ }
298
+ if (comparisonsBudgetExhausted) {
299
+ console.log(`\u26A0\uFE0F Comparison budget exhausted (${maxComparisons.toLocaleString()} comparisons). Use --max-comparisons to increase.`);
139
300
  }
140
301
  return duplicates.sort(
141
302
  (a, b) => b.similarity - a.similarity || b.tokenCost - a.tokenCost
@@ -157,7 +318,20 @@ function getRefactoringSuggestion(patternType, similarity) {
157
318
  return baseMessages[patternType] + urgency;
158
319
  }
159
320
  async function analyzePatterns(options) {
160
- const { minSimilarity = 0.85, minLines = 5, ...scanOptions } = options;
321
+ const {
322
+ minSimilarity = 0.65,
323
+ // Lower default for fast Jaccard mode (Levenshtein would be 0.85+)
324
+ minLines = 5,
325
+ maxBlocks = 500,
326
+ batchSize = 100,
327
+ approx = true,
328
+ minSharedTokens = 8,
329
+ maxCandidatesPerBlock = 100,
330
+ fastMode = true,
331
+ maxComparisons = 5e4,
332
+ streamResults = false,
333
+ ...scanOptions
334
+ } = options;
161
335
  const files = await (0, import_core2.scanFiles)(scanOptions);
162
336
  const results = [];
163
337
  const fileContents = await Promise.all(
@@ -166,9 +340,17 @@ async function analyzePatterns(options) {
166
340
  content: await (0, import_core2.readFileContent)(file)
167
341
  }))
168
342
  );
169
- const duplicates = detectDuplicatePatterns(fileContents, {
343
+ const duplicates = await detectDuplicatePatterns(fileContents, {
170
344
  minSimilarity,
171
- minLines
345
+ minLines,
346
+ maxBlocks,
347
+ batchSize,
348
+ approx,
349
+ minSharedTokens,
350
+ maxCandidatesPerBlock,
351
+ fastMode,
352
+ maxComparisons,
353
+ streamResults
172
354
  });
173
355
  for (const file of files) {
174
356
  const fileDuplicates = duplicates.filter(
@@ -233,6 +415,13 @@ function generateSummary(results) {
233
415
  return {
234
416
  file1: issue.location.file,
235
417
  file2: fileMatch?.[1] || "unknown",
418
+ line1: issue.location.line,
419
+ line2: 0,
420
+ // Not available from Issue
421
+ endLine1: 0,
422
+ // Not available from Issue
423
+ endLine2: 0,
424
+ // Not available from Issue
236
425
  similarity: similarityMatch ? parseInt(similarityMatch[1]) / 100 : 0,
237
426
  patternType: typeMatch?.[1] || "unknown",
238
427
  tokenCost: tokenMatch ? parseInt(tokenMatch[1]) : 0
package/dist/index.mjs CHANGED
@@ -2,7 +2,7 @@ import {
2
2
  analyzePatterns,
3
3
  detectDuplicatePatterns,
4
4
  generateSummary
5
- } from "./chunk-RLWJXASG.mjs";
5
+ } from "./chunk-N5DE7IYX.mjs";
6
6
  export {
7
7
  analyzePatterns,
8
8
  detectDuplicatePatterns,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aiready/pattern-detect",
3
- "version": "0.1.2",
3
+ "version": "0.1.3",
4
4
  "description": "Semantic duplicate pattern detection for AI-generated code - finds similar implementations that waste AI context tokens",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",