@aiready/pattern-detect 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -26,7 +26,6 @@ interface DetectionOptions {
26
26
  approx?: boolean;
27
27
  minSharedTokens?: number;
28
28
  maxCandidatesPerBlock?: number;
29
- fastMode?: boolean;
30
29
  maxComparisons?: number;
31
30
  streamResults?: boolean;
32
31
  }
@@ -38,13 +37,10 @@ declare function detectDuplicatePatterns(files: FileContent[], options: Detectio
38
37
  interface PatternDetectOptions extends ScanOptions {
39
38
  minSimilarity?: number;
40
39
  minLines?: number;
41
- maxBlocks?: number;
42
40
  batchSize?: number;
43
41
  approx?: boolean;
44
42
  minSharedTokens?: number;
45
43
  maxCandidatesPerBlock?: number;
46
- fastMode?: boolean;
47
- maxComparisons?: number;
48
44
  streamResults?: boolean;
49
45
  }
50
46
  interface PatternSummary {
@@ -52,12 +48,11 @@ interface PatternSummary {
52
48
  totalTokenCost: number;
53
49
  patternsByType: Record<PatternType, number>;
54
50
  topDuplicates: Array<{
55
- file1: string;
56
- file2: string;
57
- line1: number;
58
- line2: number;
59
- endLine1: number;
60
- endLine2: number;
51
+ files: Array<{
52
+ path: string;
53
+ startLine: number;
54
+ endLine: number;
55
+ }>;
61
56
  similarity: number;
62
57
  patternType: PatternType;
63
58
  tokenCost: number;
package/dist/index.js CHANGED
@@ -106,31 +106,19 @@ function jaccardSimilarity(tokens1, tokens2) {
106
106
  const union = set1.size + set2.size - intersection;
107
107
  return union === 0 ? 0 : intersection / union;
108
108
  }
109
- function calculateSimilarity(block1, block2) {
110
- const norm1 = normalizeCode(block1);
111
- const norm2 = normalizeCode(block2);
112
- const baseSimilarity = (0, import_core.similarityScore)(norm1, norm2);
113
- const tokens1 = norm1.split(/[\s(){}[\];,]+/).filter(Boolean);
114
- const tokens2 = norm2.split(/[\s(){}[\];,]+/).filter(Boolean);
115
- const tokenSimilarity = (0, import_core.similarityScore)(tokens1.join(" "), tokens2.join(" "));
116
- return baseSimilarity * 0.4 + tokenSimilarity * 0.6;
117
- }
118
109
  async function detectDuplicatePatterns(files, options) {
119
110
  const {
120
111
  minSimilarity,
121
112
  minLines,
122
- maxBlocks = 500,
123
113
  batchSize = 100,
124
114
  approx = true,
125
115
  minSharedTokens = 8,
126
116
  maxCandidatesPerBlock = 100,
127
- fastMode = true,
128
- maxComparisons = 5e4,
129
- // Cap at 50K comparisons by default
130
117
  streamResults = false
131
118
  } = options;
132
119
  const duplicates = [];
133
- let allBlocks = files.flatMap(
120
+ const maxComparisons = approx ? Infinity : 5e5;
121
+ const allBlocks = files.flatMap(
134
122
  (file) => extractCodeBlocks(file.content, minLines).map((block) => ({
135
123
  content: block.content,
136
124
  startLine: block.startLine,
@@ -143,10 +131,9 @@ async function detectDuplicatePatterns(files, options) {
143
131
  }))
144
132
  );
145
133
  console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
146
- if (allBlocks.length > maxBlocks) {
147
- console.log(`\u26A0\uFE0F Limiting to ${maxBlocks} blocks (sorted by size) to prevent memory issues`);
148
- console.log(` Use --max-blocks to increase limit or --min-lines to filter smaller blocks`);
149
- allBlocks = allBlocks.sort((a, b) => b.linesOfCode - a.linesOfCode).slice(0, maxBlocks);
134
+ if (!approx && allBlocks.length > 500) {
135
+ console.log(`\u26A0\uFE0F Using --no-approx mode with ${allBlocks.length} blocks may be slow (O(B\xB2) complexity).`);
136
+ console.log(` Consider using approximate mode (default) for better performance.`);
150
137
  }
151
138
  const stopwords = /* @__PURE__ */ new Set([
152
139
  "return",
@@ -236,10 +223,14 @@ async function detectDuplicatePatterns(files, options) {
236
223
  }
237
224
  if (approx && candidates) {
238
225
  for (const { j } of candidates) {
239
- if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
226
+ if (!approx && maxComparisons !== Infinity && comparisonsProcessed >= maxComparisons) {
227
+ console.log(`\u26A0\uFE0F Comparison safety limit reached (${maxComparisons.toLocaleString()} comparisons in --no-approx mode).`);
228
+ console.log(` This prevents excessive runtime on large repos. Consider using approximate mode (default) or --min-lines to reduce blocks.`);
229
+ break;
230
+ }
240
231
  comparisonsProcessed++;
241
232
  const block2 = allBlocks[j];
242
- const similarity = fastMode ? jaccardSimilarity(blockTokens[i], blockTokens[j]) : calculateSimilarity(block1.content, block2.content);
233
+ const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
243
234
  if (similarity >= minSimilarity) {
244
235
  const duplicate = {
245
236
  file1: block1.file,
@@ -269,7 +260,7 @@ async function detectDuplicatePatterns(files, options) {
269
260
  comparisonsProcessed++;
270
261
  const block2 = allBlocks[j];
271
262
  if (block1.file === block2.file) continue;
272
- const similarity = fastMode ? jaccardSimilarity(blockTokens[i], blockTokens[j]) : calculateSimilarity(block1.content, block2.content);
263
+ const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
273
264
  if (similarity >= minSimilarity) {
274
265
  const duplicate = {
275
266
  file1: block1.file,
@@ -319,16 +310,13 @@ function getRefactoringSuggestion(patternType, similarity) {
319
310
  }
320
311
  async function analyzePatterns(options) {
321
312
  const {
322
- minSimilarity = 0.65,
323
- // Lower default for fast Jaccard mode (Levenshtein would be 0.85+)
313
+ minSimilarity = 0.4,
314
+ // Jaccard similarity default (40% threshold)
324
315
  minLines = 5,
325
- maxBlocks = 500,
326
316
  batchSize = 100,
327
317
  approx = true,
328
318
  minSharedTokens = 8,
329
319
  maxCandidatesPerBlock = 100,
330
- fastMode = true,
331
- maxComparisons = 5e4,
332
320
  streamResults = false,
333
321
  ...scanOptions
334
322
  } = options;
@@ -343,13 +331,10 @@ async function analyzePatterns(options) {
343
331
  const duplicates = await detectDuplicatePatterns(fileContents, {
344
332
  minSimilarity,
345
333
  minLines,
346
- maxBlocks,
347
334
  batchSize,
348
335
  approx,
349
336
  minSharedTokens,
350
337
  maxCandidatesPerBlock,
351
- fastMode,
352
- maxComparisons,
353
338
  streamResults
354
339
  });
355
340
  for (const file of files) {
@@ -413,15 +398,21 @@ function generateSummary(results) {
413
398
  const typeMatch = issue.message.match(/^(\S+(?:-\S+)*) pattern/);
414
399
  const fileMatch = issue.message.match(/similar to (.+?) \(/);
415
400
  return {
416
- file1: issue.location.file,
417
- file2: fileMatch?.[1] || "unknown",
418
- line1: issue.location.line,
419
- line2: 0,
420
- // Not available from Issue
421
- endLine1: 0,
422
- // Not available from Issue
423
- endLine2: 0,
424
- // Not available from Issue
401
+ files: [
402
+ {
403
+ path: issue.location.file,
404
+ startLine: issue.location.line,
405
+ endLine: 0
406
+ // Not available from Issue
407
+ },
408
+ {
409
+ path: fileMatch?.[1] || "unknown",
410
+ startLine: 0,
411
+ // Not available from Issue
412
+ endLine: 0
413
+ // Not available from Issue
414
+ }
415
+ ],
425
416
  similarity: similarityMatch ? parseInt(similarityMatch[1]) / 100 : 0,
426
417
  patternType: typeMatch?.[1] || "unknown",
427
418
  tokenCost: tokenMatch ? parseInt(tokenMatch[1]) : 0
package/dist/index.mjs CHANGED
@@ -2,7 +2,7 @@ import {
2
2
  analyzePatterns,
3
3
  detectDuplicatePatterns,
4
4
  generateSummary
5
- } from "./chunk-N5DE7IYX.mjs";
5
+ } from "./chunk-JKVKOXYR.mjs";
6
6
  export {
7
7
  analyzePatterns,
8
8
  detectDuplicatePatterns,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aiready/pattern-detect",
3
- "version": "0.1.3",
3
+ "version": "0.2.0",
4
4
  "description": "Semantic duplicate pattern detection for AI-generated code - finds similar implementations that waste AI context tokens",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",
@@ -15,6 +15,13 @@
15
15
  "import": "./dist/index.mjs"
16
16
  }
17
17
  },
18
+ "scripts": {
19
+ "build": "tsup src/index.ts src/cli.ts --format cjs,esm --dts",
20
+ "dev": "tsup src/index.ts src/cli.ts --format cjs,esm --dts --watch",
21
+ "test": "vitest run",
22
+ "lint": "eslint src",
23
+ "clean": "rm -rf dist"
24
+ },
18
25
  "keywords": [
19
26
  "aiready",
20
27
  "duplicate-detection",
@@ -43,9 +50,9 @@
43
50
  "url": "https://github.com/caopengau/aiready-pattern-detect/issues"
44
51
  },
45
52
  "dependencies": {
53
+ "@aiready/core": "workspace:*",
46
54
  "commander": "^12.1.0",
47
- "chalk": "^5.3.0",
48
- "@aiready/core": "0.1.2"
55
+ "chalk": "^5.3.0"
49
56
  },
50
57
  "devDependencies": {
51
58
  "tsup": "^8.3.5",
@@ -62,12 +69,5 @@
62
69
  },
63
70
  "publishConfig": {
64
71
  "access": "public"
65
- },
66
- "scripts": {
67
- "build": "tsup src/index.ts src/cli.ts --format cjs,esm --dts",
68
- "dev": "tsup src/index.ts src/cli.ts --format cjs,esm --dts --watch",
69
- "test": "vitest run",
70
- "lint": "eslint src",
71
- "clean": "rm -rf dist"
72
72
  }
73
- }
73
+ }
@@ -1,245 +0,0 @@
1
- // src/index.ts
2
- import { scanFiles, readFileContent } from "@aiready/core";
3
-
4
- // src/detector.ts
5
- import { similarityScore, estimateTokens } from "@aiready/core";
6
- function categorizePattern(code) {
7
- const lower = code.toLowerCase();
8
- if (lower.includes("request") && lower.includes("response") || lower.includes("router.") || lower.includes("app.get") || lower.includes("app.post") || lower.includes("express") || lower.includes("ctx.body")) {
9
- return "api-handler";
10
- }
11
- if (lower.includes("validate") || lower.includes("schema") || lower.includes("zod") || lower.includes("yup") || lower.includes("if") && lower.includes("throw")) {
12
- return "validator";
13
- }
14
- if (lower.includes("return (") || lower.includes("jsx") || lower.includes("component") || lower.includes("props")) {
15
- return "component";
16
- }
17
- if (lower.includes("class ") || lower.includes("this.")) {
18
- return "class-method";
19
- }
20
- if (lower.includes("return ") && !lower.includes("this") && !lower.includes("new ")) {
21
- return "utility";
22
- }
23
- if (lower.includes("function") || lower.includes("=>")) {
24
- return "function";
25
- }
26
- return "unknown";
27
- }
28
- function extractCodeBlocks(content, minLines) {
29
- const lines = content.split("\n");
30
- const blocks = [];
31
- let currentBlock = [];
32
- let blockStart = 0;
33
- let braceDepth = 0;
34
- let inFunction = false;
35
- for (let i = 0; i < lines.length; i++) {
36
- const line = lines[i];
37
- const trimmed = line.trim();
38
- if (!inFunction && (trimmed.includes("function ") || trimmed.includes("=>") || trimmed.includes("async ") || /^(export\s+)?(async\s+)?function\s+/.test(trimmed) || /^(export\s+)?const\s+\w+\s*=\s*(async\s*)?\(/.test(trimmed))) {
39
- inFunction = true;
40
- blockStart = i;
41
- }
42
- for (const char of line) {
43
- if (char === "{") braceDepth++;
44
- if (char === "}") braceDepth--;
45
- }
46
- if (inFunction) {
47
- currentBlock.push(line);
48
- }
49
- if (inFunction && braceDepth === 0 && currentBlock.length >= minLines) {
50
- const blockContent = currentBlock.join("\n");
51
- const linesOfCode = currentBlock.filter(
52
- (l) => l.trim() && !l.trim().startsWith("//")
53
- ).length;
54
- blocks.push({
55
- content: blockContent,
56
- startLine: blockStart + 1,
57
- patternType: categorizePattern(blockContent),
58
- linesOfCode
59
- });
60
- currentBlock = [];
61
- inFunction = false;
62
- } else if (inFunction && braceDepth === 0) {
63
- currentBlock = [];
64
- inFunction = false;
65
- }
66
- }
67
- return blocks;
68
- }
69
- function normalizeCode(code) {
70
- return code.replace(/\/\/.*$/gm, "").replace(/\/\*[\s\S]*?\*\//g, "").replace(/"[^"]*"/g, '"STR"').replace(/'[^']*'/g, "'STR'").replace(/`[^`]*`/g, "`STR`").replace(/\b\d+\b/g, "NUM").replace(/\s+/g, " ").trim();
71
- }
72
- function calculateSimilarity(block1, block2) {
73
- const norm1 = normalizeCode(block1);
74
- const norm2 = normalizeCode(block2);
75
- const baseSimilarity = similarityScore(norm1, norm2);
76
- const tokens1 = norm1.split(/[\s(){}[\];,]+/).filter(Boolean);
77
- const tokens2 = norm2.split(/[\s(){}[\];,]+/).filter(Boolean);
78
- const tokenSimilarity = similarityScore(tokens1.join(" "), tokens2.join(" "));
79
- return baseSimilarity * 0.4 + tokenSimilarity * 0.6;
80
- }
81
- async function detectDuplicatePatterns(files, options) {
82
- const { minSimilarity, minLines, maxBlocks = 500, batchSize = 100 } = options;
83
- const duplicates = [];
84
- let allBlocks = files.flatMap(
85
- (file) => extractCodeBlocks(file.content, minLines).map((block) => ({
86
- ...block,
87
- file: file.file,
88
- normalized: normalizeCode(block.content),
89
- tokenCost: estimateTokens(block.content)
90
- }))
91
- );
92
- console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
93
- if (allBlocks.length > maxBlocks) {
94
- console.log(`\u26A0\uFE0F Limiting to ${maxBlocks} blocks (sorted by size) to prevent memory issues`);
95
- console.log(` Use --max-blocks to increase limit or --min-lines to filter smaller blocks`);
96
- allBlocks = allBlocks.sort((a, b) => b.linesOfCode - a.linesOfCode).slice(0, maxBlocks);
97
- }
98
- const totalComparisons = allBlocks.length * (allBlocks.length - 1) / 2;
99
- console.log(`Processing ${totalComparisons.toLocaleString()} comparisons in batches...`);
100
- let comparisonsProcessed = 0;
101
- const startTime = Date.now();
102
- for (let i = 0; i < allBlocks.length; i++) {
103
- if (i % batchSize === 0 && i > 0) {
104
- const progress = (comparisonsProcessed / totalComparisons * 100).toFixed(1);
105
- const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
106
- console.log(` ${progress}% complete (${comparisonsProcessed.toLocaleString()}/${totalComparisons.toLocaleString()} comparisons, ${elapsed}s elapsed)`);
107
- await new Promise((resolve) => setImmediate(resolve));
108
- }
109
- for (let j = i + 1; j < allBlocks.length; j++) {
110
- comparisonsProcessed++;
111
- const block1 = allBlocks[i];
112
- const block2 = allBlocks[j];
113
- if (block1.file === block2.file) continue;
114
- const similarity = calculateSimilarity(block1.content, block2.content);
115
- if (similarity >= minSimilarity) {
116
- duplicates.push({
117
- file1: block1.file,
118
- file2: block2.file,
119
- line1: block1.startLine,
120
- line2: block2.startLine,
121
- similarity,
122
- snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
123
- patternType: block1.patternType,
124
- tokenCost: block1.tokenCost + block2.tokenCost,
125
- linesOfCode: block1.linesOfCode
126
- });
127
- }
128
- }
129
- }
130
- return duplicates.sort(
131
- (a, b) => b.similarity - a.similarity || b.tokenCost - a.tokenCost
132
- );
133
- }
134
-
135
- // src/index.ts
136
- function getRefactoringSuggestion(patternType, similarity) {
137
- const baseMessages = {
138
- "api-handler": "Extract common middleware or create a base handler class",
139
- validator: "Consolidate validation logic into shared schema validators (Zod/Yup)",
140
- utility: "Move to a shared utilities file and reuse across modules",
141
- "class-method": "Consider inheritance or composition to share behavior",
142
- component: "Extract shared logic into a custom hook or HOC",
143
- function: "Extract into a shared helper function",
144
- unknown: "Extract common logic into a reusable module"
145
- };
146
- const urgency = similarity > 0.95 ? " (CRITICAL: Nearly identical code)" : similarity > 0.9 ? " (HIGH: Very similar, refactor soon)" : "";
147
- return baseMessages[patternType] + urgency;
148
- }
149
- async function analyzePatterns(options) {
150
- const { minSimilarity = 0.85, minLines = 5, maxBlocks = 500, batchSize = 100, ...scanOptions } = options;
151
- const files = await scanFiles(scanOptions);
152
- const results = [];
153
- const fileContents = await Promise.all(
154
- files.map(async (file) => ({
155
- file,
156
- content: await readFileContent(file)
157
- }))
158
- );
159
- const duplicates = await detectDuplicatePatterns(fileContents, {
160
- minSimilarity,
161
- minLines,
162
- maxBlocks,
163
- batchSize
164
- });
165
- for (const file of files) {
166
- const fileDuplicates = duplicates.filter(
167
- (dup) => dup.file1 === file || dup.file2 === file
168
- );
169
- const issues = fileDuplicates.map((dup) => {
170
- const otherFile = dup.file1 === file ? dup.file2 : dup.file1;
171
- const severity = dup.similarity > 0.95 ? "critical" : dup.similarity > 0.9 ? "major" : "minor";
172
- return {
173
- type: "duplicate-pattern",
174
- severity,
175
- message: `${dup.patternType} pattern ${Math.round(dup.similarity * 100)}% similar to ${otherFile} (${dup.tokenCost} tokens wasted)`,
176
- location: {
177
- file,
178
- line: dup.file1 === file ? dup.line1 : dup.line2
179
- },
180
- suggestion: getRefactoringSuggestion(dup.patternType, dup.similarity)
181
- };
182
- });
183
- const totalTokenCost = fileDuplicates.reduce(
184
- (sum, dup) => sum + dup.tokenCost,
185
- 0
186
- );
187
- results.push({
188
- fileName: file,
189
- issues,
190
- metrics: {
191
- tokenCost: totalTokenCost,
192
- consistencyScore: Math.max(0, 1 - fileDuplicates.length * 0.1)
193
- }
194
- });
195
- }
196
- return results;
197
- }
198
- function generateSummary(results) {
199
- const allIssues = results.flatMap((r) => r.issues);
200
- const totalTokenCost = results.reduce(
201
- (sum, r) => sum + (r.metrics.tokenCost || 0),
202
- 0
203
- );
204
- const patternsByType = {
205
- "api-handler": 0,
206
- validator: 0,
207
- utility: 0,
208
- "class-method": 0,
209
- component: 0,
210
- function: 0,
211
- unknown: 0
212
- };
213
- allIssues.forEach((issue) => {
214
- const match = issue.message.match(/^(\S+(?:-\S+)*) pattern/);
215
- if (match) {
216
- const type = match[1];
217
- patternsByType[type] = (patternsByType[type] || 0) + 1;
218
- }
219
- });
220
- const topDuplicates = allIssues.slice(0, 10).map((issue) => {
221
- const similarityMatch = issue.message.match(/(\d+)% similar/);
222
- const tokenMatch = issue.message.match(/\((\d+) tokens/);
223
- const typeMatch = issue.message.match(/^(\S+(?:-\S+)*) pattern/);
224
- const fileMatch = issue.message.match(/similar to (.+?) \(/);
225
- return {
226
- file1: issue.location.file,
227
- file2: fileMatch?.[1] || "unknown",
228
- similarity: similarityMatch ? parseInt(similarityMatch[1]) / 100 : 0,
229
- patternType: typeMatch?.[1] || "unknown",
230
- tokenCost: tokenMatch ? parseInt(tokenMatch[1]) : 0
231
- };
232
- });
233
- return {
234
- totalPatterns: allIssues.length,
235
- totalTokenCost,
236
- patternsByType,
237
- topDuplicates
238
- };
239
- }
240
-
241
- export {
242
- detectDuplicatePatterns,
243
- analyzePatterns,
244
- generateSummary
245
- };