@aiready/pattern-detect 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +214 -3
- package/dist/chunk-AXHGYYYZ.mjs +404 -0
- package/dist/chunk-JKVKOXYR.mjs +407 -0
- package/dist/chunk-OFGMDX66.mjs +402 -0
- package/dist/chunk-QE4E3F7C.mjs +410 -0
- package/dist/chunk-TXWPOVYU.mjs +402 -0
- package/dist/cli.js +265 -65
- package/dist/cli.mjs +52 -32
- package/dist/index.d.mts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +214 -34
- package/dist/index.mjs +1 -1
- package/package.json +11 -11
- package/dist/chunk-K5O2HVB5.mjs +0 -114
- package/dist/chunk-RLWJXASG.mjs +0 -227
package/README.md
CHANGED
|
@@ -24,6 +24,19 @@ AI coding assistants (GitHub Copilot, ChatGPT, Claude) generate functionally sim
|
|
|
24
24
|
| Refactoring Suggestions | ❌ Generic | ✅ Specific to pattern type |
|
|
25
25
|
| Output Formats | Text/JSON | Console/JSON/HTML with rich formatting |
|
|
26
26
|
|
|
27
|
+
#### How We Differ (and When to Use Each)
|
|
28
|
+
|
|
29
|
+
- **Semantic intent vs exact clones**: jscpd flags copy-paste or near-duplicates; we detect functionally similar code even when structure differs (e.g., two API handlers with different frameworks).
|
|
30
|
+
- **Pattern typing**: We classify duplicates into `api-handler`, `validator`, `utility`, `component`, etc., so teams can prioritize coherent refactors.
|
|
31
|
+
- **AI context cost**: We estimate tokens wasted to quantify impact on AI tools (larger context, higher cost, more confusion).
|
|
32
|
+
- **Refactoring guidance**: We propose targeted fixes per pattern type (e.g., extract middleware or create base handler).
|
|
33
|
+
- **Performance profile**: We use Jaccard similarity with candidate filtering; ~2–3s for ~500 blocks on medium repos.
|
|
34
|
+
|
|
35
|
+
Recommended workflow:
|
|
36
|
+
- Run **jscpd** in CI to enforce low clone percentage (blocking).
|
|
37
|
+
- Run **@aiready/pattern-detect** to surface semantic duplicates and token waste (advisory), feeding a refactoring backlog.
|
|
38
|
+
- Use both for comprehensive hygiene: jscpd for exact clones; AIReady for intent-level duplication that AI tends to reintroduce.
|
|
39
|
+
|
|
27
40
|
## 🚀 Installation
|
|
28
41
|
|
|
29
42
|
```bash
|
|
@@ -47,6 +60,9 @@ aiready-patterns ./src --similarity 0.9
|
|
|
47
60
|
# Only look at larger patterns
|
|
48
61
|
aiready-patterns ./src --min-lines 10
|
|
49
62
|
|
|
63
|
+
# Memory optimization for large codebases
|
|
64
|
+
aiready-patterns ./src --max-blocks 1000 --batch-size 200
|
|
65
|
+
|
|
50
66
|
# Export to JSON
|
|
51
67
|
aiready-patterns ./src --output json --output-file report.json
|
|
52
68
|
|
|
@@ -54,6 +70,33 @@ aiready-patterns ./src --output json --output-file report.json
|
|
|
54
70
|
aiready-patterns ./src --output html
|
|
55
71
|
```
|
|
56
72
|
|
|
73
|
+
#### Presets (quick copy/paste)
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Speed-first (large repos)
|
|
77
|
+
aiready-patterns ./src \
|
|
78
|
+
--min-shared-tokens 12 \
|
|
79
|
+
--max-candidates 60 \
|
|
80
|
+
--max-blocks 300
|
|
81
|
+
|
|
82
|
+
# Coverage-first (more findings)
|
|
83
|
+
aiready-patterns ./src \
|
|
84
|
+
--min-shared-tokens 6 \
|
|
85
|
+
--max-candidates 150
|
|
86
|
+
|
|
87
|
+
# Short-block focus (helpers/utilities)
|
|
88
|
+
aiready-patterns ./src \
|
|
89
|
+
--min-lines 5 \
|
|
90
|
+
--min-shared-tokens 6 \
|
|
91
|
+
--max-candidates 120 \
|
|
92
|
+
--exclude "**/test/**"
|
|
93
|
+
|
|
94
|
+
# Deep dive with streaming (comprehensive detection)
|
|
95
|
+
aiready-patterns ./src \
|
|
96
|
+
--no-approx \
|
|
97
|
+
--stream-results
|
|
98
|
+
```
|
|
99
|
+
|
|
57
100
|
### Programmatic API
|
|
58
101
|
|
|
59
102
|
```typescript
|
|
@@ -165,12 +208,38 @@ router.get('/posts/:id', createResourceHandler('Post', database.posts.findOne));
|
|
|
165
208
|
|
|
166
209
|
## ⚙️ Configuration
|
|
167
210
|
|
|
211
|
+
### Common Options
|
|
212
|
+
|
|
168
213
|
| Option | Description | Default |
|
|
169
214
|
|--------|-------------|---------|
|
|
170
|
-
| `minSimilarity` | Similarity threshold (0-1) | `0.
|
|
215
|
+
| `minSimilarity` | Similarity threshold (0-1). Default `0.40` (Jaccard). Raise for only obvious duplicates; lower to catch more | `0.40` |
|
|
216
|
+
|
|
171
217
|
| `minLines` | Minimum lines to consider a pattern | `5` |
|
|
172
|
-
| `
|
|
173
|
-
| `
|
|
218
|
+
| `maxBlocks` | Maximum code blocks to analyze (prevents OOM) | `500` |
|
|
219
|
+
| `include` | File patterns to include | `['**/*.{ts,tsx,js,jsx,py,java}']` |
|
|
220
|
+
| `exclude` | File patterns to exclude | See below |
|
|
221
|
+
|
|
222
|
+
### Exclude Patterns (Default)
|
|
223
|
+
|
|
224
|
+
By default, these patterns are excluded:
|
|
225
|
+
```bash
|
|
226
|
+
**/node_modules/**
|
|
227
|
+
**/dist/**
|
|
228
|
+
**/build/**
|
|
229
|
+
**/.git/**
|
|
230
|
+
**/coverage/**
|
|
231
|
+
**/*.min.js
|
|
232
|
+
**/*.bundle.js
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Override with `--exclude` flag:
|
|
236
|
+
```bash
|
|
237
|
+
# Exclude test files and generated code
|
|
238
|
+
aiready-patterns ./src --exclude "**/test/**,**/generated/**,**/__snapshots__/**"
|
|
239
|
+
|
|
240
|
+
# Add to defaults (comma-separated)
|
|
241
|
+
aiready-patterns ./src --exclude "**/node_modules/**,**/dist/**,**/build/**,**/*.spec.ts"
|
|
242
|
+
```
|
|
174
243
|
|
|
175
244
|
## 📈 Understanding the Output
|
|
176
245
|
|
|
@@ -205,6 +274,148 @@ Estimated tokens wasted when AI tools process duplicate code:
|
|
|
205
274
|
4. **Use pattern types**: Prioritize refactoring by category (API handlers → validators → utilities)
|
|
206
275
|
5. **Export reports**: Generate HTML reports for team reviews
|
|
207
276
|
|
|
277
|
+
## ⚠️ Performance & Memory
|
|
278
|
+
|
|
279
|
+
### Algorithm Complexity
|
|
280
|
+
|
|
281
|
+
**Jaccard Similarity**: **O(B × C × T)** where:
|
|
282
|
+
- B = number of blocks
|
|
283
|
+
- C = average candidates per block (~100)
|
|
284
|
+
- T = average tokens per block (~50)
|
|
285
|
+
- **O(T) per comparison** instead of O(N²)
|
|
286
|
+
- **Default threshold: 0.40** (comprehensive detection including tests and helpers)
|
|
287
|
+
|
|
288
|
+
### Performance Benchmarks
|
|
289
|
+
|
|
290
|
+
| Repo Size | Blocks | Analysis Time |
|
|
291
|
+
|-----------|--------|--------------|
|
|
292
|
+
| Small (<100 files) | ~50 | <1s |
|
|
293
|
+
| Medium (100-500 files) | ~500 | ~2s |
|
|
294
|
+
| Large (500+ files) | ~500 (capped) | ~2s |
|
|
295
|
+
|
|
296
|
+
**Example:** 828 code blocks → limited to 500 → **2.4s** analysis time
|
|
297
|
+
|
|
298
|
+
### Tuning Options
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
# Default (40% threshold - comprehensive detection)
|
|
302
|
+
aiready-patterns ./src
|
|
303
|
+
|
|
304
|
+
# Higher threshold for only obvious duplicates
|
|
305
|
+
aiready-patterns ./src --similarity 0.65
|
|
306
|
+
|
|
307
|
+
# Lower threshold for more potential duplicates
|
|
308
|
+
aiready-patterns ./src --similarity 0.55
|
|
309
|
+
|
|
310
|
+
# Approximate mode is default (fast, with candidate filtering)
|
|
311
|
+
aiready-patterns ./src
|
|
312
|
+
|
|
313
|
+
# Exact mode with progress tracking (shows % and ETA)
|
|
314
|
+
aiready-patterns ./src --no-approx --stream-results
|
|
315
|
+
|
|
316
|
+
# Maximum speed (aggressive filtering)
|
|
317
|
+
aiready-patterns ./src --min-shared-tokens 12 --min-lines 10
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
## 🎛️ Tuning Playbook
|
|
321
|
+
|
|
322
|
+
Use these presets to quickly balance precision, recall, and runtime:
|
|
323
|
+
|
|
324
|
+
- Speed-first (large repos):
|
|
325
|
+
- `aiready-patterns ./src --min-shared-tokens 12 --max-candidates 60 --max-blocks 300`
|
|
326
|
+
- Cuts weak candidates early; best for fast, iterative scans.
|
|
327
|
+
|
|
328
|
+
- Coverage-first (more findings):
|
|
329
|
+
- `aiready-patterns ./src --min-shared-tokens 6 --max-candidates 150`
|
|
330
|
+
- Expands candidate pool; expect more results and longer runtime.
|
|
331
|
+
|
|
332
|
+
- Short-block focus (helpers/utilities):
|
|
333
|
+
- `aiready-patterns ./src --min-lines 5 --min-shared-tokens 6 --max-candidates 120`
|
|
334
|
+
- Better recall for small functions; consider `--exclude "**/test/**"` to reduce noise.
|
|
335
|
+
|
|
336
|
+
### Minimum Lines vs Min Shared Tokens
|
|
337
|
+
|
|
338
|
+
- `minLines` filters which blocks are extracted; lower values include smaller functions that have fewer tokens overall.
|
|
339
|
+
- Smaller blocks naturally share fewer tokens; to avoid missing true matches when `minLines` is low (≤5–6), consider lowering `minSharedTokens` by 1–2.
|
|
340
|
+
- Recommended pairs:
|
|
341
|
+
- `minLines 5–6` → `minSharedTokens 6–8` (recall-friendly; watch noise)
|
|
342
|
+
- `minLines 8–10` → `minSharedTokens 8–10` (precision-first)
|
|
343
|
+
- Default balance: `minLines=5`, `minSharedTokens=8` works well for most repos. Reduce `minSharedTokens` only when you specifically want to catch more short helpers.
|
|
344
|
+
|
|
345
|
+
**CLI Options:**
|
|
346
|
+
- `--stream-results` - Output duplicates as found (enabled by default)
|
|
347
|
+
- `--no-approx` - Disable approximate mode (slower, O(B²) complexity, use with caution)
|
|
348
|
+
- `--min-lines N` - Filter blocks smaller than N lines (default 5)
|
|
349
|
+
|
|
350
|
+
### Controlling Analysis Scope
|
|
351
|
+
|
|
352
|
+
The tool analyzes **all extracted code blocks** by default. Control scope using:
|
|
353
|
+
|
|
354
|
+
**1. `--min-lines` (primary filter):**
|
|
355
|
+
- Filters blocks during extraction (most efficient)
|
|
356
|
+
- Higher values = focus on substantial functions
|
|
357
|
+
- Lower values = catch smaller utility duplicates
|
|
358
|
+
|
|
359
|
+
**2. `--no-approx` mode (use with caution):**
|
|
360
|
+
- Disables approximate mode (candidate pre-filtering)
|
|
361
|
+
- O(B²) complexity - compares every block to every other block
|
|
362
|
+
- **Automatic safety limit:** 500K comparisons (~1000 blocks max)
|
|
363
|
+
- Shows warning when used with >500 blocks
|
|
364
|
+
- Approximate mode (default) is recommended for all use cases
|
|
365
|
+
|
|
366
|
+
**Examples:**
|
|
367
|
+
```bash
|
|
368
|
+
# Focus on substantial functions only
|
|
369
|
+
aiready-patterns ./src --min-lines 15
|
|
370
|
+
|
|
371
|
+
# Comprehensive scan of all functions (recommended)
|
|
372
|
+
aiready-patterns ./src --min-lines 5
|
|
373
|
+
|
|
374
|
+
# Quick scan of major duplicates
|
|
375
|
+
aiready-patterns ./src --min-lines 20
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
**Recommendations by codebase size:**
|
|
379
|
+
|
|
380
|
+
| Repo Size | Files | Strategy | Expected Time |
|
|
381
|
+
|-----------|-------|----------|---------------|
|
|
382
|
+
| **Small** | <100 | Use defaults | <1s ✅ |
|
|
383
|
+
| **Medium** | 100-500 | Use defaults | 1-5s ✅ |
|
|
384
|
+
| **Large** | 500-1,000 | Use defaults or `--min-lines 10` | 3-10s ✅ |
|
|
385
|
+
| **Very Large** | 1,000-5,000 | `--min-lines 15` or analyze by module | 5-20s ⚠️ |
|
|
386
|
+
| **Super Large** | 5,000+ | **Analyze by module** (see below) | 10-60s per module ⚠️ |
|
|
387
|
+
|
|
388
|
+
### Analyzing Very Large Repositories
|
|
389
|
+
|
|
390
|
+
For repos with 1,000+ files, use modular analysis:
|
|
391
|
+
|
|
392
|
+
```bash
|
|
393
|
+
# Analyze by top-level directory
|
|
394
|
+
for dir in src/*/; do
|
|
395
|
+
echo "Analyzing $dir"
|
|
396
|
+
aiready-patterns "$dir" --min-lines 10
|
|
397
|
+
done
|
|
398
|
+
|
|
399
|
+
# Or focus on specific high-value areas
|
|
400
|
+
aiready-patterns ./src/api --min-lines 10
|
|
401
|
+
aiready-patterns ./src/core --min-lines 10
|
|
402
|
+
aiready-patterns ./src/services --min-lines 10
|
|
403
|
+
|
|
404
|
+
# For super large repos (5K+ files), increase thresholds
|
|
405
|
+
aiready-patterns ./src/backend --min-lines 20 --similarity 0.50
|
|
406
|
+
```
|
|
407
|
+
|
|
408
|
+
**Why modular analysis?**
|
|
409
|
+
- Ensures comprehensive coverage (100% of each module)
|
|
410
|
+
- Avoids hitting comparison budget limits
|
|
411
|
+
- Provides focused, actionable results per module
|
|
412
|
+
- Better for CI/CD integration (parallel jobs)
|
|
413
|
+
|
|
414
|
+
**Progress Indicators:**
|
|
415
|
+
- **Approx mode**: Shows blocks processed + duplicates found
|
|
416
|
+
- **Exact mode**: Shows % complete, ETA, and comparisons processed
|
|
417
|
+
- **Stream mode**: Prints each duplicate immediately when found (enabled by default)
|
|
418
|
+
|
|
208
419
|
## 🔧 CI/CD Integration
|
|
209
420
|
|
|
210
421
|
### GitHub Actions
|
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
// src/index.ts
|
|
2
|
+
import { scanFiles, readFileContent } from "@aiready/core";
|
|
3
|
+
|
|
4
|
+
// src/detector.ts
|
|
5
|
+
import { estimateTokens } from "@aiready/core";
|
|
6
|
+
/**
 * Heuristically classify a code snippet into a pattern category.
 *
 * Checks run in a fixed order from most to least specific; the first
 * matching rule wins. Matching is done on the lower-cased source text.
 *
 * @param {string} code - Raw source text of a code block.
 * @returns {string} One of "api-handler" | "validator" | "component" |
 *   "class-method" | "utility" | "function" | "unknown".
 */
function categorizePattern(code) {
  const lower = code.toLowerCase();
  const has = (needle) => lower.includes(needle);

  // HTTP / framework markers (request+response pair, router/app calls, Koa ctx).
  if ((has("request") && has("response")) || has("router.") || has("app.get") || has("app.post") || has("express") || has("ctx.body")) {
    return "api-handler";
  }
  // Validation libraries or the classic "if (...) throw" guard shape.
  if (has("validate") || has("schema") || has("zod") || has("yup") || (has("if") && has("throw"))) {
    return "validator";
  }
  // JSX / React-flavored markers.
  if (has("return (") || has("jsx") || has("component") || has("props")) {
    return "component";
  }
  if (has("class ") || has("this.")) {
    return "class-method";
  }
  // A plain value-returning function with no instance state or construction.
  if (has("return ") && !has("this") && !has("new ")) {
    return "utility";
  }
  if (has("function") || has("=>")) {
    return "function";
  }
  return "unknown";
}
|
|
28
|
+
/**
 * Split a file's text into function-like code blocks.
 *
 * Line-by-line scanner with a running brace counter: a block opens at the
 * first line that looks like a function definition (contains "function ",
 * "=>", "async ", or matches the exported-function / arrow-assignment
 * regexes) and closes when the running brace depth returns to zero.
 *
 * @param {string} content  Full text of one source file.
 * @param {number} minLines Minimum raw line count for a block to be kept.
 * @returns {Array<{content: string, startLine: number, endLine: number,
 *   patternType: string, linesOfCode: number}>} Blocks with 1-based line ranges.
 */
function extractCodeBlocks(content, minLines) {
  const lines = content.split("\n");
  const blocks = [];
  let currentBlock = [];
  let blockStart = 0;
  // Running { / } balance across the WHOLE file, not reset per block.
  // NOTE(review): braces inside string literals and comments are counted
  // too, so unusual content can skew block boundaries — confirm acceptable.
  let braceDepth = 0;
  let inFunction = false;
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const trimmed = line.trim();
    // Open a new block when a line looks like a function start.
    if (!inFunction && (trimmed.includes("function ") || trimmed.includes("=>") || trimmed.includes("async ") || /^(export\s+)?(async\s+)?function\s+/.test(trimmed) || /^(export\s+)?const\s+\w+\s*=\s*(async\s*)?\(/.test(trimmed))) {
      inFunction = true;
      blockStart = i;
    }
    // Update the brace balance for every line, inside a block or not.
    for (const char of line) {
      if (char === "{") braceDepth++;
      if (char === "}") braceDepth--;
    }
    if (inFunction) {
      currentBlock.push(line);
    }
    // Depth back to zero: the block is complete. Keep it only when it has
    // at least minLines raw lines; otherwise discard and reset state.
    if (inFunction && braceDepth === 0 && currentBlock.length >= minLines) {
      const blockContent = currentBlock.join("\n");
      // linesOfCode counts non-blank lines that are not "//" comments.
      const linesOfCode = currentBlock.filter(
        (l) => l.trim() && !l.trim().startsWith("//")
      ).length;
      blocks.push({
        content: blockContent,
        startLine: blockStart + 1, // convert 0-based index to 1-based line
        endLine: i + 1,
        patternType: categorizePattern(blockContent),
        linesOfCode
      });
      currentBlock = [];
      inFunction = false;
    } else if (inFunction && braceDepth === 0) {
      // Closed but shorter than minLines: drop the accumulated lines.
      currentBlock = [];
      inFunction = false;
    }
  }
  return blocks;
}
|
|
70
|
+
/**
 * Normalize source text for similarity comparison.
 *
 * Strips line and block comments, collapses string literals to STR
 * placeholders (preserving the quote style), replaces standalone integers
 * with NUM, and squeezes all whitespace runs to single spaces.
 *
 * @param {string} code - Raw source text.
 * @returns {string} Normalized, trimmed text.
 */
function normalizeCode(code) {
  const withoutLineComments = code.replace(/\/\/.*$/gm, "");
  const withoutBlockComments = withoutLineComments.replace(/\/\*[\s\S]*?\*\//g, "");
  const withPlaceholders = withoutBlockComments
    .replace(/"[^"]*"/g, '"STR"')
    .replace(/'[^']*'/g, "'STR'")
    .replace(/`[^`]*`/g, "`STR`")
    .replace(/\b\d+\b/g, "NUM");
  return withPlaceholders.replace(/\s+/g, " ").trim();
}
|
|
73
|
+
/**
 * Jaccard similarity of two token lists: |A ∩ B| / |A ∪ B|.
 *
 * Duplicate tokens within one list are ignored (set semantics).
 *
 * @param {string[]} tokens1
 * @param {string[]} tokens2
 * @returns {number} Value in [0, 1]; 0 when both sets are empty.
 */
function jaccardSimilarity(tokens1, tokens2) {
  const left = new Set(tokens1);
  const right = new Set(tokens2);
  const shared = [...left].filter((token) => right.has(token)).length;
  // Inclusion–exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
  const unionSize = left.size + right.size - shared;
  return unionSize === 0 ? 0 : shared / unionSize;
}
|
|
83
|
+
/**
 * Detect near-duplicate code blocks across a set of files.
 *
 * Blocks are extracted per file, normalized, and tokenized; pairs are scored
 * with Jaccard similarity over their token sets. In approximate mode
 * (default) an inverted token index pre-selects candidate pairs; in exact
 * mode every later block is compared against the current one (O(B²)).
 *
 * @param {Array<{file: string, content: string}>} files - File paths with contents.
 * @param {object} options - See the destructuring below for keys and defaults.
 * @returns {Promise<Array<object>>} Duplicates sorted by similarity desc,
 *   then combined token cost desc.
 */
async function detectDuplicatePatterns(files, options) {
  const {
    minSimilarity,
    minLines,
    maxBlocks = 500,
    batchSize = 100,
    approx = true,
    minSharedTokens = 8,
    maxCandidatesPerBlock = 100,
    maxComparisons = 5e4,
    // Cap at 50K comparisons by default
    streamResults = false
  } = options;
  const duplicates = [];

  // Extract and annotate every candidate block from every file.
  let allBlocks = files.flatMap(
    (file) => extractCodeBlocks(file.content, minLines).map((block) => ({
      content: block.content,
      startLine: block.startLine,
      endLine: block.endLine,
      file: file.file,
      normalized: normalizeCode(block.content),
      patternType: block.patternType,
      tokenCost: estimateTokens(block.content),
      linesOfCode: block.linesOfCode
    }))
  );
  console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
  if (allBlocks.length > maxBlocks) {
    console.log(`\u26A0\uFE0F Limiting to ${maxBlocks} blocks (sorted by size) to prevent memory issues`);
    console.log(`   Use --max-blocks to increase limit or --min-lines to filter smaller blocks`);
    // Keep the largest blocks — they account for the most wasted tokens.
    allBlocks = allBlocks.sort((a, b) => b.linesOfCode - a.linesOfCode).slice(0, maxBlocks);
  }

  // Tokens shorter than 3 chars and common keywords carry no signal.
  const stopwords = new Set([
    "return", "const", "let", "var", "function", "class", "new", "if",
    "else", "for", "while", "async", "await", "try", "catch", "switch",
    "case", "default", "import", "export", "from", "true", "false",
    "null", "undefined", "this"
  ]);
  const tokenize = (norm) => norm.split(/[\s(){}\[\];,\.]+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
  const blockTokens = allBlocks.map((b) => tokenize(b.normalized));

  // Approximate mode: inverted index token -> indices of blocks containing it.
  const invertedIndex = new Map();
  if (approx) {
    for (let i = 0; i < blockTokens.length; i++) {
      for (const tok of blockTokens[i]) {
        let arr = invertedIndex.get(tok);
        if (!arr) {
          arr = [];
          invertedIndex.set(tok, arr);
        }
        arr.push(i);
      }
    }
  }

  const totalComparisons = approx ? void 0 : allBlocks.length * (allBlocks.length - 1) / 2;
  if (totalComparisons !== void 0) {
    console.log(`Processing ${totalComparisons.toLocaleString()} comparisons in batches...`);
  } else {
    console.log(`Using approximate candidate selection to reduce comparisons...`);
  }
  let comparisonsProcessed = 0;
  let comparisonsBudgetExhausted = false;
  const startTime = Date.now();

  // Score one pair; record it (and optionally stream it to the console) when
  // it clears the similarity threshold. This logic was previously duplicated
  // verbatim in both the approximate and exact branches.
  const compareAndRecord = (i, j) => {
    const block1 = allBlocks[i];
    const block2 = allBlocks[j];
    const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
    if (similarity < minSimilarity) return;
    const duplicate = {
      file1: block1.file,
      file2: block2.file,
      line1: block1.startLine,
      line2: block2.startLine,
      endLine1: block1.endLine,
      endLine2: block2.endLine,
      similarity,
      snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
      patternType: block1.patternType,
      tokenCost: block1.tokenCost + block2.tokenCost,
      linesOfCode: block1.linesOfCode
    };
    duplicates.push(duplicate);
    if (streamResults) {
      console.log(`\n\u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
      console.log(`   ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
      console.log(`   Token cost: ${duplicate.tokenCost.toLocaleString()}`);
    }
  };

  for (let i = 0; i < allBlocks.length; i++) {
    // NOTE: the exhausted flag is only set here at the top of the outer loop
    // (not inside the inner loops), matching the original behavior.
    if (maxComparisons && comparisonsProcessed >= maxComparisons) {
      comparisonsBudgetExhausted = true;
      break;
    }
    // Periodically report progress and yield to the event loop so a long
    // analysis does not starve other tasks.
    if (i % batchSize === 0 && i > 0) {
      const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
      const duplicatesFound = duplicates.length;
      if (totalComparisons !== void 0) {
        const progress = (comparisonsProcessed / totalComparisons * 100).toFixed(1);
        const remaining = totalComparisons - comparisonsProcessed;
        const rate = comparisonsProcessed / parseFloat(elapsed);
        const eta = remaining > 0 ? (remaining / rate).toFixed(0) : 0;
        console.log(`  ${progress}% (${comparisonsProcessed.toLocaleString()}/${totalComparisons.toLocaleString()} comparisons, ${elapsed}s elapsed, ~${eta}s remaining, ${duplicatesFound} duplicates)`);
      } else {
        console.log(`  Processed ${i.toLocaleString()}/${allBlocks.length} blocks (${elapsed}s elapsed, ${duplicatesFound} duplicates)`);
      }
      await new Promise((resolve) => setImmediate(resolve));
    }
    const block1 = allBlocks[i];
    if (approx) {
      // Count shared tokens with every later, other-file block via the index.
      const counts = new Map();
      for (const tok of blockTokens[i]) {
        const ids = invertedIndex.get(tok);
        if (!ids) continue;
        for (const j of ids) {
          if (j <= i) continue; // each unordered pair is considered once
          if (allBlocks[j].file === block1.file) continue; // skip same-file pairs
          counts.set(j, (counts.get(j) || 0) + 1);
        }
      }
      const candidates = Array.from(counts.entries())
        .filter(([, shared]) => shared >= minSharedTokens)
        .sort((a, b) => b[1] - a[1])
        .slice(0, maxCandidatesPerBlock)
        .map(([j]) => j);
      for (const j of candidates) {
        if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
        comparisonsProcessed++;
        compareAndRecord(i, j);
      }
    } else {
      // Exact mode: brute-force over all later blocks. Same-file pairs are
      // skipped AFTER consuming budget, as in the original implementation.
      for (let j = i + 1; j < allBlocks.length; j++) {
        if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
        comparisonsProcessed++;
        if (block1.file === allBlocks[j].file) continue;
        compareAndRecord(i, j);
      }
    }
  }
  if (comparisonsBudgetExhausted) {
    console.log(`\u26A0\uFE0F Comparison budget exhausted (${maxComparisons.toLocaleString()} comparisons). Use --max-comparisons to increase.`);
  }
  return duplicates.sort(
    (a, b) => b.similarity - a.similarity || b.tokenCost - a.tokenCost
  );
}
|
|
269
|
+
|
|
270
|
+
// src/index.ts
|
|
271
|
+
/**
 * Map a detected pattern type to a human-readable refactoring suggestion,
 * appending an urgency suffix for very high similarity scores.
 *
 * @param {string} patternType - e.g. "api-handler", "validator", "utility".
 * @param {number} similarity - Jaccard similarity in [0, 1].
 * @returns {string} Suggestion text, possibly suffixed with an urgency note.
 */
function getRefactoringSuggestion(patternType, similarity) {
  const baseMessages = {
    "api-handler": "Extract common middleware or create a base handler class",
    validator: "Consolidate validation logic into shared schema validators (Zod/Yup)",
    utility: "Move to a shared utilities file and reuse across modules",
    "class-method": "Consider inheritance or composition to share behavior",
    component: "Extract shared logic into a custom hook or HOC",
    function: "Extract into a shared helper function",
    unknown: "Extract common logic into a reusable module"
  };
  const urgency = similarity > 0.95 ? " (CRITICAL: Nearly identical code)" : similarity > 0.9 ? " (HIGH: Very similar, refactor soon)" : "";
  // Fall back to the generic "unknown" message for unrecognized pattern
  // types instead of producing "undefined..." (the previous behavior when
  // patternType was not a key of baseMessages).
  return (baseMessages[patternType] ?? baseMessages.unknown) + urgency;
}
|
|
284
|
+
/**
 * Run duplicate-pattern analysis over a scanned file set and produce one
 * result entry per file, each with its duplicate-pattern issues and metrics.
 *
 * @param {object} options - Detection options (similarity, limits, etc.)
 *   merged with scan options; unrecognized keys are forwarded to scanFiles.
 * @returns {Promise<Array<{fileName: string, issues: Array<object>,
 *   metrics: {tokenCost: number, consistencyScore: number}}>>}
 */
async function analyzePatterns(options) {
  const {
    minSimilarity = 0.4,
    // Jaccard similarity default (40% threshold)
    minLines = 5,
    maxBlocks = 500,
    batchSize = 100,
    approx = true,
    minSharedTokens = 8,
    maxCandidatesPerBlock = 100,
    maxComparisons = 5e4,
    streamResults = false,
    ...scanOptions
  } = options;
  const files = await scanFiles(scanOptions);
  // Read every file up front so the detector works on in-memory contents.
  const fileContents = await Promise.all(
    files.map(async (file) => ({
      file,
      content: await readFileContent(file)
    }))
  );
  const duplicates = await detectDuplicatePatterns(fileContents, {
    minSimilarity,
    minLines,
    maxBlocks,
    batchSize,
    approx,
    minSharedTokens,
    maxCandidatesPerBlock,
    maxComparisons,
    streamResults
  });
  // Build one issue for a duplicate as seen from a particular file's side.
  // NOTE: generateSummary parses this message format with regexes, so the
  // wording must not change.
  const toIssue = (file) => (dup) => {
    const otherFile = dup.file1 === file ? dup.file2 : dup.file1;
    const severity = dup.similarity > 0.95 ? "critical" : dup.similarity > 0.9 ? "major" : "minor";
    return {
      type: "duplicate-pattern",
      severity,
      message: `${dup.patternType} pattern ${Math.round(dup.similarity * 100)}% similar to ${otherFile} (${dup.tokenCost} tokens wasted)`,
      location: {
        file,
        line: dup.file1 === file ? dup.line1 : dup.line2
      },
      suggestion: getRefactoringSuggestion(dup.patternType, dup.similarity)
    };
  };
  // One result entry per scanned file; a duplicate appears under both of
  // its files.
  return files.map((file) => {
    const fileDuplicates = duplicates.filter(
      (dup) => dup.file1 === file || dup.file2 === file
    );
    const wastedTokens = fileDuplicates.reduce(
      (sum, dup) => sum + dup.tokenCost,
      0
    );
    return {
      fileName: file,
      issues: fileDuplicates.map(toIssue(file)),
      metrics: {
        tokenCost: wastedTokens,
        // Each duplicate knocks 10% off the score, floored at zero.
        consistencyScore: Math.max(0, 1 - fileDuplicates.length * 0.1)
      }
    };
  });
}
|
|
350
|
+
/**
 * Aggregate per-file analysis results into a report summary.
 *
 * Pattern type, similarity, token cost, and the partner file are recovered
 * by parsing the issue message strings produced by analyzePatterns, so the
 * regexes here must stay in sync with that message format.
 *
 * @param {Array<{issues: Array<object>, metrics: {tokenCost?: number}}>} results
 * @returns {{totalPatterns: number, totalTokenCost: number,
 *   patternsByType: Record<string, number>, topDuplicates: Array<object>}}
 */
function generateSummary(results) {
  const allIssues = results.flatMap((r) => r.issues);
  const totalTokenCost = results.reduce(
    (sum, r) => sum + (r.metrics.tokenCost || 0),
    0
  );
  // All known categories start at zero so the report always shows them.
  const patternsByType = {
    "api-handler": 0,
    validator: 0,
    utility: 0,
    "class-method": 0,
    component: 0,
    function: 0,
    unknown: 0
  };
  allIssues.forEach((issue) => {
    const match = issue.message.match(/^(\S+(?:-\S+)*) pattern/);
    if (match) {
      const type = match[1];
      patternsByType[type] = (patternsByType[type] || 0) + 1;
    }
  });
  // The first ten issues (in encounter order) become the headline list.
  const topDuplicates = allIssues.slice(0, 10).map((issue) => {
    const similarityMatch = issue.message.match(/(\d+)% similar/);
    const tokenMatch = issue.message.match(/\((\d+) tokens/);
    const typeMatch = issue.message.match(/^(\S+(?:-\S+)*) pattern/);
    const fileMatch = issue.message.match(/similar to (.+?) \(/);
    return {
      file1: issue.location.file,
      file2: fileMatch?.[1] || "unknown",
      line1: issue.location.line,
      line2: 0,
      // Not available from Issue
      endLine1: 0,
      // Not available from Issue
      endLine2: 0,
      // Not available from Issue
      // Use an explicit radix; the bare parseInt calls here previously
      // relied on the default.
      similarity: similarityMatch ? Number.parseInt(similarityMatch[1], 10) / 100 : 0,
      patternType: typeMatch?.[1] || "unknown",
      tokenCost: tokenMatch ? Number.parseInt(tokenMatch[1], 10) : 0
    };
  });
  return {
    totalPatterns: allIssues.length,
    totalTokenCost,
    patternsByType,
    topDuplicates
  };
}
|
|
399
|
+
|
|
400
|
+
// Public entry points of this chunk.
export {
  detectDuplicatePatterns,
  analyzePatterns,
  generateSummary
};
|