@aiready/pattern-detect 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +149 -38
- package/dist/{chunk-N5DE7IYX.mjs → chunk-AXHGYYYZ.mjs} +5 -17
- package/dist/{chunk-57O7FEEM.mjs → chunk-JKVKOXYR.mjs} +43 -36
- package/dist/{chunk-DNI7S33V.mjs → chunk-OFGMDX66.mjs} +34 -31
- package/dist/{chunk-4CZGZIDL.mjs → chunk-QE4E3F7C.mjs} +20 -19
- package/dist/{chunk-ZNZ5O435.mjs → chunk-TXWPOVYU.mjs} +37 -35
- package/dist/cli.js +45 -55
- package/dist/cli.mjs +17 -18
- package/dist/index.d.mts +5 -10
- package/dist/index.d.ts +5 -10
- package/dist/index.js +29 -38
- package/dist/index.mjs +1 -1
- package/package.json +11 -11
- package/dist/chunk-6VQTQRDW.mjs +0 -245
- package/dist/chunk-JTJXOIO2.mjs +0 -378
- package/dist/chunk-K5O2HVB5.mjs +0 -114
- package/dist/chunk-RLWJXASG.mjs +0 -227
- package/dist/chunk-YA3N6EC5.mjs +0 -351
package/README.md
CHANGED
|
@@ -24,6 +24,19 @@ AI coding assistants (GitHub Copilot, ChatGPT, Claude) generate functionally sim
|
|
|
24
24
|
| Refactoring Suggestions | ❌ Generic | ✅ Specific to pattern type |
|
|
25
25
|
| Output Formats | Text/JSON | Console/JSON/HTML with rich formatting |
|
|
26
26
|
|
|
27
|
+
#### How We Differ (and When to Use Each)
|
|
28
|
+
|
|
29
|
+
- **Semantic intent vs exact clones**: jscpd flags copy-paste or near-duplicates; we detect functionally similar code even when structure differs (e.g., two API handlers written with different frameworks).
|
|
30
|
+
- **Pattern typing**: We classify duplicates into `api-handler`, `validator`, `utility`, `component`, etc., so teams can prioritize coherent refactors.
|
|
31
|
+
- **AI context cost**: We estimate tokens wasted to quantify impact on AI tools (larger context, higher cost, more confusion).
|
|
32
|
+
- **Refactoring guidance**: We propose targeted fixes per pattern type (e.g., extract middleware or create base handler).
|
|
33
|
+
- **Performance profile**: We use Jaccard similarity with candidate filtering; ~2–3s for ~500 blocks on medium repos.
|
|
34
|
+
|
|
35
|
+
Recommended workflow:
|
|
36
|
+
- Run **jscpd** in CI to enforce low clone percentage (blocking).
|
|
37
|
+
- Run **@aiready/pattern-detect** to surface semantic duplicates and token waste (advisory), feeding a refactoring backlog.
|
|
38
|
+
- Use both for comprehensive hygiene: jscpd for exact clones; AIReady for intent-level duplication that AI tends to reintroduce.
|
|
39
|
+
|
|
27
40
|
## 🚀 Installation
|
|
28
41
|
|
|
29
42
|
```bash
|
|
@@ -57,6 +70,33 @@ aiready-patterns ./src --output json --output-file report.json
|
|
|
57
70
|
aiready-patterns ./src --output html
|
|
58
71
|
```
|
|
59
72
|
|
|
73
|
+
#### Presets (quick copy/paste)
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Speed-first (large repos)
|
|
77
|
+
aiready-patterns ./src \
|
|
78
|
+
--min-shared-tokens 12 \
|
|
79
|
+
--max-candidates 60 \
|
|
80
|
+
--max-blocks 300
|
|
81
|
+
|
|
82
|
+
# Coverage-first (more findings)
|
|
83
|
+
aiready-patterns ./src \
|
|
84
|
+
--min-shared-tokens 6 \
|
|
85
|
+
--max-candidates 150
|
|
86
|
+
|
|
87
|
+
# Short-block focus (helpers/utilities)
|
|
88
|
+
aiready-patterns ./src \
|
|
89
|
+
--min-lines 5 \
|
|
90
|
+
--min-shared-tokens 6 \
|
|
91
|
+
--max-candidates 120 \
|
|
92
|
+
--exclude "**/test/**"
|
|
93
|
+
|
|
94
|
+
# Deep dive with streaming (comprehensive detection)
|
|
95
|
+
aiready-patterns ./src \
|
|
96
|
+
--no-approx \
|
|
97
|
+
--stream-results
|
|
98
|
+
```
|
|
99
|
+
|
|
60
100
|
### Programmatic API
|
|
61
101
|
|
|
62
102
|
```typescript
|
|
@@ -172,7 +212,8 @@ router.get('/posts/:id', createResourceHandler('Post', database.posts.findOne));
|
|
|
172
212
|
|
|
173
213
|
| Option | Description | Default |
|
|
174
214
|
|--------|-------------|---------|
|
|
175
|
-
| `minSimilarity` | Similarity threshold (0-1).
|
|
215
|
+
| `minSimilarity` | Similarity threshold (0-1). Default `0.40` (Jaccard). Raise for only obvious duplicates; lower to catch more | `0.40` |
|
|
216
|
+
| `minSimilarity` | Similarity threshold (0-1). Default `0.40` (Jaccard). Raise for only obvious duplicates; lower to catch more | `0.40` |
|
|
176
217
|
| `minLines` | Minimum lines to consider a pattern | `5` |
|
|
177
218
|
| `maxBlocks` | Maximum code blocks to analyze (prevents OOM) | `500` |
|
|
178
219
|
| `include` | File patterns to include | `['**/*.{ts,tsx,js,jsx,py,java}']` |
|
|
@@ -237,34 +278,27 @@ Estimated tokens wasted when AI tools process duplicate code:
|
|
|
237
278
|
|
|
238
279
|
### Algorithm Complexity
|
|
239
280
|
|
|
240
|
-
**
|
|
281
|
+
**Jaccard Similarity**: **O(B × C × T)** where:
|
|
241
282
|
- B = number of blocks
|
|
242
283
|
- C = average candidates per block (~100)
|
|
243
284
|
- T = average tokens per block (~50)
|
|
244
|
-
- **
|
|
285
|
+
- **O(T) per comparison** instead of the O(N²) of character-level Levenshtein comparison (N = average characters per block)
|
|
245
286
|
- **Default threshold: 0.40** (comprehensive detection including tests and helpers)
|
|
246
287
|
|
|
247
|
-
**Exact Mode** (`--no-approx --no-fast-mode`): **O(B² × N²)** where:
|
|
248
|
-
- B = number of blocks
|
|
249
|
-
- N = average characters per block
|
|
250
|
-
- **Levenshtein similarity** - more accurate, much slower
|
|
251
|
-
- **Recommended threshold: 0.85+**
|
|
252
|
-
- **Not recommended for >100 files**
|
|
253
|
-
|
|
254
288
|
### Performance Benchmarks
|
|
255
289
|
|
|
256
|
-
| Repo Size | Blocks |
|
|
257
|
-
|
|
258
|
-
| Small (<100 files) | ~50 | <1s |
|
|
259
|
-
| Medium (100-500 files) | ~500 | ~2s |
|
|
260
|
-
| Large (500+ files) | ~500 (capped) | ~2s |
|
|
290
|
+
| Repo Size | Blocks | Analysis Time |
|
|
291
|
+
|-----------|--------|--------------|
|
|
292
|
+
| Small (<100 files) | ~50 | <1s |
|
|
293
|
+
| Medium (100-500 files) | ~500 | ~2s |
|
|
294
|
+
| Large (500+ files) | ~500 (capped) | ~2s |
|
|
261
295
|
|
|
262
|
-
**Example:** 828 code blocks → limited to 500 → **2.4s**
|
|
296
|
+
**Example:** 828 code blocks → limited to 500 → **2.4s** analysis time
|
|
263
297
|
|
|
264
298
|
### Tuning Options
|
|
265
299
|
|
|
266
300
|
```bash
|
|
267
|
-
# Default (
|
|
301
|
+
# Default (40% threshold - comprehensive detection)
|
|
268
302
|
aiready-patterns ./src
|
|
269
303
|
|
|
270
304
|
# Higher threshold for only obvious duplicates
|
|
@@ -273,37 +307,114 @@ aiready-patterns ./src --similarity 0.65
|
|
|
273
307
|
# Lower threshold for more potential duplicates
|
|
274
308
|
aiready-patterns ./src --similarity 0.30
|
|
275
309
|
|
|
276
|
-
#
|
|
277
|
-
aiready-patterns ./src
|
|
310
|
+
# Approximate mode is default (fast, with candidate filtering)
|
|
311
|
+
aiready-patterns ./src
|
|
278
312
|
|
|
279
|
-
# Exact mode with progress tracking (
|
|
280
|
-
aiready-patterns ./src --no-approx --
|
|
313
|
+
# Exact mode with progress tracking (shows % and ETA)
|
|
314
|
+
aiready-patterns ./src --no-approx --stream-results
|
|
281
315
|
|
|
282
316
|
# Maximum speed (aggressive filtering)
|
|
283
|
-
aiready-patterns ./src --
|
|
284
|
-
|
|
285
|
-
# Exact mode (slowest, most accurate)
|
|
286
|
-
aiready-patterns ./src --no-approx --no-fast-mode --max-comparisons 500000
|
|
317
|
+
aiready-patterns ./src --min-shared-tokens 12 --min-lines 10
|
|
287
318
|
```
|
|
288
319
|
|
|
320
|
+
## 🎛️ Tuning Playbook
|
|
321
|
+
|
|
322
|
+
Use these presets to quickly balance precision, recall, and runtime:
|
|
323
|
+
|
|
324
|
+
- Speed-first (large repos):
|
|
325
|
+
- `aiready-patterns ./src --min-shared-tokens 12 --max-candidates 60 --max-blocks 300`
|
|
326
|
+
- Cuts weak candidates early; best for fast, iterative scans.
|
|
327
|
+
|
|
328
|
+
- Coverage-first (more findings):
|
|
329
|
+
- `aiready-patterns ./src --min-shared-tokens 6 --max-candidates 150`
|
|
330
|
+
- Expands candidate pool; expect more results and longer runtime.
|
|
331
|
+
|
|
332
|
+
- Short-block focus (helpers/utilities):
|
|
333
|
+
- `aiready-patterns ./src --min-lines 5 --min-shared-tokens 6 --max-candidates 120`
|
|
334
|
+
- Better recall for small functions; consider `--exclude "**/test/**"` to reduce noise.
|
|
335
|
+
|
|
336
|
+
### Minimum Lines vs Min Shared Tokens
|
|
337
|
+
|
|
338
|
+
- `minLines` filters which blocks are extracted; lower values include smaller functions that have fewer tokens overall.
|
|
339
|
+
- Smaller blocks naturally share fewer tokens; to avoid missing true matches when `minLines` is low (≤5–6), consider lowering `minSharedTokens` by 1–2.
|
|
340
|
+
- Recommended pairs:
|
|
341
|
+
- `minLines 5–6` → `minSharedTokens 6–8` (recall-friendly; watch noise)
|
|
342
|
+
- `minLines 8–10` → `minSharedTokens 8–10` (precision-first)
|
|
343
|
+
- Default balance: `minLines=5`, `minSharedTokens=8` works well for most repos. Reduce `minSharedTokens` only when you specifically want to catch more short helpers.
|
|
344
|
+
|
|
289
345
|
**CLI Options:**
|
|
290
|
-
- `--stream-results` - Output duplicates as found (
|
|
291
|
-
- `--no-
|
|
292
|
-
- `--
|
|
293
|
-
|
|
294
|
-
|
|
346
|
+
- `--stream-results` - Output duplicates as found (enabled by default)
|
|
347
|
+
- `--no-approx` - Disable approximate mode (slower, O(B²) complexity, use with caution)
|
|
348
|
+
- `--min-lines N` - Filter blocks smaller than N lines (default 5)
|
|
349
|
+
|
|
350
|
+
### Controlling Analysis Scope
|
|
351
|
+
|
|
352
|
+
The tool analyzes **all extracted code blocks** by default. Control scope using:
|
|
353
|
+
|
|
354
|
+
**1. `--min-lines` (primary filter):**
|
|
355
|
+
- Filters blocks during extraction (most efficient)
|
|
356
|
+
- Higher values = focus on substantial functions
|
|
357
|
+
- Lower values = catch smaller utility duplicates
|
|
358
|
+
|
|
359
|
+
**2. `--no-approx` mode (use with caution):**
|
|
360
|
+
- Disables approximate mode (candidate pre-filtering)
|
|
361
|
+
- O(B²) complexity - compares every block to every other block
|
|
362
|
+
- **Automatic safety limit:** 500K comparisons (~1000 blocks max)
|
|
363
|
+
- Shows a warning when used with >500 blocks
|
|
364
|
+
- Approximate mode (default) is recommended for all use cases
|
|
365
|
+
|
|
366
|
+
**Examples:**
|
|
367
|
+
```bash
|
|
368
|
+
# Focus on substantial functions only
|
|
369
|
+
aiready-patterns ./src --min-lines 15
|
|
370
|
+
|
|
371
|
+
# Comprehensive scan of all functions (recommended)
|
|
372
|
+
aiready-patterns ./src --min-lines 5
|
|
373
|
+
|
|
374
|
+
# Quick scan of major duplicates
|
|
375
|
+
aiready-patterns ./src --min-lines 20
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
**Recommendations by codebase size:**
|
|
379
|
+
|
|
380
|
+
| Repo Size | Files | Strategy | Expected Time |
|
|
381
|
+
|-----------|-------|----------|---------------|
|
|
382
|
+
| **Small** | <100 | Use defaults | <1s ✅ |
|
|
383
|
+
| **Medium** | 100-500 | Use defaults | 1-5s ✅ |
|
|
384
|
+
| **Large** | 500-1,000 | Use defaults or `--min-lines 10` | 3-10s ✅ |
|
|
385
|
+
| **Very Large** | 1,000-5,000 | `--min-lines 15` or analyze by module | 5-20s ⚠️ |
|
|
386
|
+
| **Super Large** | 5,000+ | **Analyze by module** (see below) | 10-60s per module ⚠️ |
|
|
387
|
+
|
|
388
|
+
### Analyzing Very Large Repositories
|
|
389
|
+
|
|
390
|
+
For repos with 1,000+ files, use modular analysis:
|
|
391
|
+
|
|
392
|
+
```bash
|
|
393
|
+
# Analyze by top-level directory
|
|
394
|
+
for dir in src/*/; do
|
|
395
|
+
echo "Analyzing $dir"
|
|
396
|
+
aiready-patterns "$dir" --min-lines 10
|
|
397
|
+
done
|
|
398
|
+
|
|
399
|
+
# Or focus on specific high-value areas
|
|
400
|
+
aiready-patterns ./src/api --min-lines 10
|
|
401
|
+
aiready-patterns ./src/core --min-lines 10
|
|
402
|
+
aiready-patterns ./src/services --min-lines 10
|
|
403
|
+
|
|
404
|
+
# For super large repos (5K+ files), increase thresholds
|
|
405
|
+
aiready-patterns ./src/backend --min-lines 20 --similarity 0.50
|
|
406
|
+
```
|
|
407
|
+
|
|
408
|
+
**Why modular analysis?**
|
|
409
|
+
- Ensures comprehensive coverage (100% of each module)
|
|
410
|
+
- Avoids hitting comparison budget limits
|
|
411
|
+
- Provides focused, actionable results per module
|
|
412
|
+
- Better for CI/CD integration (parallel jobs)
|
|
295
413
|
|
|
296
414
|
**Progress Indicators:**
|
|
297
415
|
- **Approx mode**: Shows blocks processed + duplicates found
|
|
298
416
|
- **Exact mode**: Shows % complete, ETA, and comparisons processed
|
|
299
|
-
- **Stream mode**: Prints each duplicate immediately when found
|
|
300
|
-
|
|
301
|
-
**Recommendations:**
|
|
302
|
-
- **< 100 files**: Use defaults, or try `--no-fast-mode` for higher accuracy
|
|
303
|
-
- **100-500 files**: Use defaults with fast mode (2-5s typical)
|
|
304
|
-
- **500-1000 files**: Use `--max-blocks 500 --min-lines 10` (~3-10s)
|
|
305
|
-
- **1000+ files**: Use `--max-blocks 300 --min-lines 15` or analyze by module
|
|
306
|
-
- **Slow analysis**: Add `--stream-results` to see progress in real-time
|
|
417
|
+
- **Stream mode**: Prints each duplicate immediately when found (enabled by default)
|
|
307
418
|
|
|
308
419
|
## 🔧 CI/CD Integration
|
|
309
420
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { scanFiles, readFileContent } from "@aiready/core";
|
|
3
3
|
|
|
4
4
|
// src/detector.ts
|
|
5
|
-
import {
|
|
5
|
+
import { estimateTokens } from "@aiready/core";
|
|
6
6
|
function categorizePattern(code) {
|
|
7
7
|
const lower = code.toLowerCase();
|
|
8
8
|
if (lower.includes("request") && lower.includes("response") || lower.includes("router.") || lower.includes("app.get") || lower.includes("app.post") || lower.includes("express") || lower.includes("ctx.body")) {
|
|
@@ -80,15 +80,6 @@ function jaccardSimilarity(tokens1, tokens2) {
|
|
|
80
80
|
const union = set1.size + set2.size - intersection;
|
|
81
81
|
return union === 0 ? 0 : intersection / union;
|
|
82
82
|
}
|
|
83
|
-
function calculateSimilarity(block1, block2) {
|
|
84
|
-
const norm1 = normalizeCode(block1);
|
|
85
|
-
const norm2 = normalizeCode(block2);
|
|
86
|
-
const baseSimilarity = similarityScore(norm1, norm2);
|
|
87
|
-
const tokens1 = norm1.split(/[\s(){}[\];,]+/).filter(Boolean);
|
|
88
|
-
const tokens2 = norm2.split(/[\s(){}[\];,]+/).filter(Boolean);
|
|
89
|
-
const tokenSimilarity = similarityScore(tokens1.join(" "), tokens2.join(" "));
|
|
90
|
-
return baseSimilarity * 0.4 + tokenSimilarity * 0.6;
|
|
91
|
-
}
|
|
92
83
|
async function detectDuplicatePatterns(files, options) {
|
|
93
84
|
const {
|
|
94
85
|
minSimilarity,
|
|
@@ -98,7 +89,6 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
98
89
|
approx = true,
|
|
99
90
|
minSharedTokens = 8,
|
|
100
91
|
maxCandidatesPerBlock = 100,
|
|
101
|
-
fastMode = true,
|
|
102
92
|
maxComparisons = 5e4,
|
|
103
93
|
// Cap at 50K comparisons by default
|
|
104
94
|
streamResults = false
|
|
@@ -213,7 +203,7 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
213
203
|
if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
|
|
214
204
|
comparisonsProcessed++;
|
|
215
205
|
const block2 = allBlocks[j];
|
|
216
|
-
const similarity =
|
|
206
|
+
const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
|
|
217
207
|
if (similarity >= minSimilarity) {
|
|
218
208
|
const duplicate = {
|
|
219
209
|
file1: block1.file,
|
|
@@ -243,7 +233,7 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
243
233
|
comparisonsProcessed++;
|
|
244
234
|
const block2 = allBlocks[j];
|
|
245
235
|
if (block1.file === block2.file) continue;
|
|
246
|
-
const similarity =
|
|
236
|
+
const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
|
|
247
237
|
if (similarity >= minSimilarity) {
|
|
248
238
|
const duplicate = {
|
|
249
239
|
file1: block1.file,
|
|
@@ -293,15 +283,14 @@ function getRefactoringSuggestion(patternType, similarity) {
|
|
|
293
283
|
}
|
|
294
284
|
async function analyzePatterns(options) {
|
|
295
285
|
const {
|
|
296
|
-
minSimilarity = 0.
|
|
297
|
-
//
|
|
286
|
+
minSimilarity = 0.4,
|
|
287
|
+
// Jaccard similarity default (40% threshold)
|
|
298
288
|
minLines = 5,
|
|
299
289
|
maxBlocks = 500,
|
|
300
290
|
batchSize = 100,
|
|
301
291
|
approx = true,
|
|
302
292
|
minSharedTokens = 8,
|
|
303
293
|
maxCandidatesPerBlock = 100,
|
|
304
|
-
fastMode = true,
|
|
305
294
|
maxComparisons = 5e4,
|
|
306
295
|
streamResults = false,
|
|
307
296
|
...scanOptions
|
|
@@ -322,7 +311,6 @@ async function analyzePatterns(options) {
|
|
|
322
311
|
approx,
|
|
323
312
|
minSharedTokens,
|
|
324
313
|
maxCandidatesPerBlock,
|
|
325
|
-
fastMode,
|
|
326
314
|
maxComparisons,
|
|
327
315
|
streamResults
|
|
328
316
|
});
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { scanFiles, readFileContent } from "@aiready/core";
|
|
3
3
|
|
|
4
4
|
// src/detector.ts
|
|
5
|
-
import {
|
|
5
|
+
import { estimateTokens } from "@aiready/core";
|
|
6
6
|
function categorizePattern(code) {
|
|
7
7
|
const lower = code.toLowerCase();
|
|
8
8
|
if (lower.includes("request") && lower.includes("response") || lower.includes("router.") || lower.includes("app.get") || lower.includes("app.post") || lower.includes("express") || lower.includes("ctx.body")) {
|
|
@@ -54,6 +54,7 @@ function extractCodeBlocks(content, minLines) {
|
|
|
54
54
|
blocks.push({
|
|
55
55
|
content: blockContent,
|
|
56
56
|
startLine: blockStart + 1,
|
|
57
|
+
endLine: i + 1,
|
|
57
58
|
patternType: categorizePattern(blockContent),
|
|
58
59
|
linesOfCode
|
|
59
60
|
});
|
|
@@ -79,43 +80,34 @@ function jaccardSimilarity(tokens1, tokens2) {
|
|
|
79
80
|
const union = set1.size + set2.size - intersection;
|
|
80
81
|
return union === 0 ? 0 : intersection / union;
|
|
81
82
|
}
|
|
82
|
-
function calculateSimilarity(block1, block2) {
|
|
83
|
-
const norm1 = normalizeCode(block1);
|
|
84
|
-
const norm2 = normalizeCode(block2);
|
|
85
|
-
const baseSimilarity = similarityScore(norm1, norm2);
|
|
86
|
-
const tokens1 = norm1.split(/[\s(){}[\];,]+/).filter(Boolean);
|
|
87
|
-
const tokens2 = norm2.split(/[\s(){}[\];,]+/).filter(Boolean);
|
|
88
|
-
const tokenSimilarity = similarityScore(tokens1.join(" "), tokens2.join(" "));
|
|
89
|
-
return baseSimilarity * 0.4 + tokenSimilarity * 0.6;
|
|
90
|
-
}
|
|
91
83
|
async function detectDuplicatePatterns(files, options) {
|
|
92
84
|
const {
|
|
93
85
|
minSimilarity,
|
|
94
86
|
minLines,
|
|
95
|
-
maxBlocks = 500,
|
|
96
87
|
batchSize = 100,
|
|
97
88
|
approx = true,
|
|
98
89
|
minSharedTokens = 8,
|
|
99
90
|
maxCandidatesPerBlock = 100,
|
|
100
|
-
fastMode = true,
|
|
101
|
-
maxComparisons = 5e4,
|
|
102
|
-
// Cap at 50K comparisons by default
|
|
103
91
|
streamResults = false
|
|
104
92
|
} = options;
|
|
105
93
|
const duplicates = [];
|
|
106
|
-
|
|
94
|
+
const maxComparisons = approx ? Infinity : 5e5;
|
|
95
|
+
const allBlocks = files.flatMap(
|
|
107
96
|
(file) => extractCodeBlocks(file.content, minLines).map((block) => ({
|
|
108
|
-
|
|
97
|
+
content: block.content,
|
|
98
|
+
startLine: block.startLine,
|
|
99
|
+
endLine: block.endLine,
|
|
109
100
|
file: file.file,
|
|
110
101
|
normalized: normalizeCode(block.content),
|
|
111
|
-
|
|
102
|
+
patternType: block.patternType,
|
|
103
|
+
tokenCost: estimateTokens(block.content),
|
|
104
|
+
linesOfCode: block.linesOfCode
|
|
112
105
|
}))
|
|
113
106
|
);
|
|
114
107
|
console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
|
|
115
|
-
if (allBlocks.length >
|
|
116
|
-
console.log(`\u26A0\uFE0F
|
|
117
|
-
console.log(`
|
|
118
|
-
allBlocks = allBlocks.sort((a, b) => b.linesOfCode - a.linesOfCode).slice(0, maxBlocks);
|
|
108
|
+
if (!approx && allBlocks.length > 500) {
|
|
109
|
+
console.log(`\u26A0\uFE0F Using --no-approx mode with ${allBlocks.length} blocks may be slow (O(B\xB2) complexity).`);
|
|
110
|
+
console.log(` Consider using approximate mode (default) for better performance.`);
|
|
119
111
|
}
|
|
120
112
|
const stopwords = /* @__PURE__ */ new Set([
|
|
121
113
|
"return",
|
|
@@ -205,16 +197,22 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
205
197
|
}
|
|
206
198
|
if (approx && candidates) {
|
|
207
199
|
for (const { j } of candidates) {
|
|
208
|
-
if (maxComparisons && comparisonsProcessed >= maxComparisons)
|
|
200
|
+
if (!approx && maxComparisons !== Infinity && comparisonsProcessed >= maxComparisons) {
|
|
201
|
+
console.log(`\u26A0\uFE0F Comparison safety limit reached (${maxComparisons.toLocaleString()} comparisons in --no-approx mode).`);
|
|
202
|
+
console.log(` This prevents excessive runtime on large repos. Consider using approximate mode (default) or --min-lines to reduce blocks.`);
|
|
203
|
+
break;
|
|
204
|
+
}
|
|
209
205
|
comparisonsProcessed++;
|
|
210
206
|
const block2 = allBlocks[j];
|
|
211
|
-
const similarity =
|
|
207
|
+
const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
|
|
212
208
|
if (similarity >= minSimilarity) {
|
|
213
209
|
const duplicate = {
|
|
214
210
|
file1: block1.file,
|
|
215
211
|
file2: block2.file,
|
|
216
212
|
line1: block1.startLine,
|
|
217
213
|
line2: block2.startLine,
|
|
214
|
+
endLine1: block1.endLine,
|
|
215
|
+
endLine2: block2.endLine,
|
|
218
216
|
similarity,
|
|
219
217
|
snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
|
|
220
218
|
patternType: block1.patternType,
|
|
@@ -225,7 +223,7 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
225
223
|
if (streamResults) {
|
|
226
224
|
console.log(`
|
|
227
225
|
\u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
|
|
228
|
-
console.log(` ${duplicate.file1}:${duplicate.line1} \u21D4 ${duplicate.file2}:${duplicate.line2}`);
|
|
226
|
+
console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
|
|
229
227
|
console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
|
|
230
228
|
}
|
|
231
229
|
}
|
|
@@ -236,13 +234,15 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
236
234
|
comparisonsProcessed++;
|
|
237
235
|
const block2 = allBlocks[j];
|
|
238
236
|
if (block1.file === block2.file) continue;
|
|
239
|
-
const similarity =
|
|
237
|
+
const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
|
|
240
238
|
if (similarity >= minSimilarity) {
|
|
241
239
|
const duplicate = {
|
|
242
240
|
file1: block1.file,
|
|
243
241
|
file2: block2.file,
|
|
244
242
|
line1: block1.startLine,
|
|
245
243
|
line2: block2.startLine,
|
|
244
|
+
endLine1: block1.endLine,
|
|
245
|
+
endLine2: block2.endLine,
|
|
246
246
|
similarity,
|
|
247
247
|
snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
|
|
248
248
|
patternType: block1.patternType,
|
|
@@ -253,7 +253,7 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
253
253
|
if (streamResults) {
|
|
254
254
|
console.log(`
|
|
255
255
|
\u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
|
|
256
|
-
console.log(` ${duplicate.file1}:${duplicate.line1} \u21D4 ${duplicate.file2}:${duplicate.line2}`);
|
|
256
|
+
console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
|
|
257
257
|
console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
|
|
258
258
|
}
|
|
259
259
|
}
|
|
@@ -284,16 +284,13 @@ function getRefactoringSuggestion(patternType, similarity) {
|
|
|
284
284
|
}
|
|
285
285
|
async function analyzePatterns(options) {
|
|
286
286
|
const {
|
|
287
|
-
minSimilarity = 0.
|
|
288
|
-
//
|
|
287
|
+
minSimilarity = 0.4,
|
|
288
|
+
// Jaccard similarity default (40% threshold)
|
|
289
289
|
minLines = 5,
|
|
290
|
-
maxBlocks = 500,
|
|
291
290
|
batchSize = 100,
|
|
292
291
|
approx = true,
|
|
293
292
|
minSharedTokens = 8,
|
|
294
293
|
maxCandidatesPerBlock = 100,
|
|
295
|
-
fastMode = true,
|
|
296
|
-
maxComparisons = 5e4,
|
|
297
294
|
streamResults = false,
|
|
298
295
|
...scanOptions
|
|
299
296
|
} = options;
|
|
@@ -308,13 +305,10 @@ async function analyzePatterns(options) {
|
|
|
308
305
|
const duplicates = await detectDuplicatePatterns(fileContents, {
|
|
309
306
|
minSimilarity,
|
|
310
307
|
minLines,
|
|
311
|
-
maxBlocks,
|
|
312
308
|
batchSize,
|
|
313
309
|
approx,
|
|
314
310
|
minSharedTokens,
|
|
315
311
|
maxCandidatesPerBlock,
|
|
316
|
-
fastMode,
|
|
317
|
-
maxComparisons,
|
|
318
312
|
streamResults
|
|
319
313
|
});
|
|
320
314
|
for (const file of files) {
|
|
@@ -378,8 +372,21 @@ function generateSummary(results) {
|
|
|
378
372
|
const typeMatch = issue.message.match(/^(\S+(?:-\S+)*) pattern/);
|
|
379
373
|
const fileMatch = issue.message.match(/similar to (.+?) \(/);
|
|
380
374
|
return {
|
|
381
|
-
|
|
382
|
-
|
|
375
|
+
files: [
|
|
376
|
+
{
|
|
377
|
+
path: issue.location.file,
|
|
378
|
+
startLine: issue.location.line,
|
|
379
|
+
endLine: 0
|
|
380
|
+
// Not available from Issue
|
|
381
|
+
},
|
|
382
|
+
{
|
|
383
|
+
path: fileMatch?.[1] || "unknown",
|
|
384
|
+
startLine: 0,
|
|
385
|
+
// Not available from Issue
|
|
386
|
+
endLine: 0
|
|
387
|
+
// Not available from Issue
|
|
388
|
+
}
|
|
389
|
+
],
|
|
383
390
|
similarity: similarityMatch ? parseInt(similarityMatch[1]) / 100 : 0,
|
|
384
391
|
patternType: typeMatch?.[1] || "unknown",
|
|
385
392
|
tokenCost: tokenMatch ? parseInt(tokenMatch[1]) : 0
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { scanFiles, readFileContent } from "@aiready/core";
|
|
3
3
|
|
|
4
4
|
// src/detector.ts
|
|
5
|
-
import {
|
|
5
|
+
import { estimateTokens } from "@aiready/core";
|
|
6
6
|
function categorizePattern(code) {
|
|
7
7
|
const lower = code.toLowerCase();
|
|
8
8
|
if (lower.includes("request") && lower.includes("response") || lower.includes("router.") || lower.includes("app.get") || lower.includes("app.post") || lower.includes("express") || lower.includes("ctx.body")) {
|
|
@@ -54,6 +54,7 @@ function extractCodeBlocks(content, minLines) {
|
|
|
54
54
|
blocks.push({
|
|
55
55
|
content: blockContent,
|
|
56
56
|
startLine: blockStart + 1,
|
|
57
|
+
endLine: i + 1,
|
|
57
58
|
patternType: categorizePattern(blockContent),
|
|
58
59
|
linesOfCode
|
|
59
60
|
});
|
|
@@ -79,44 +80,32 @@ function jaccardSimilarity(tokens1, tokens2) {
|
|
|
79
80
|
const union = set1.size + set2.size - intersection;
|
|
80
81
|
return union === 0 ? 0 : intersection / union;
|
|
81
82
|
}
|
|
82
|
-
function calculateSimilarity(block1, block2) {
|
|
83
|
-
const norm1 = normalizeCode(block1);
|
|
84
|
-
const norm2 = normalizeCode(block2);
|
|
85
|
-
const baseSimilarity = similarityScore(norm1, norm2);
|
|
86
|
-
const tokens1 = norm1.split(/[\s(){}[\];,]+/).filter(Boolean);
|
|
87
|
-
const tokens2 = norm2.split(/[\s(){}[\];,]+/).filter(Boolean);
|
|
88
|
-
const tokenSimilarity = similarityScore(tokens1.join(" "), tokens2.join(" "));
|
|
89
|
-
return baseSimilarity * 0.4 + tokenSimilarity * 0.6;
|
|
90
|
-
}
|
|
91
83
|
async function detectDuplicatePatterns(files, options) {
|
|
92
84
|
const {
|
|
93
85
|
minSimilarity,
|
|
94
86
|
minLines,
|
|
95
|
-
maxBlocks = 500,
|
|
96
87
|
batchSize = 100,
|
|
97
88
|
approx = true,
|
|
98
89
|
minSharedTokens = 8,
|
|
99
90
|
maxCandidatesPerBlock = 100,
|
|
100
|
-
fastMode = true,
|
|
101
91
|
maxComparisons = 5e4,
|
|
102
92
|
// Cap at 50K comparisons by default
|
|
103
93
|
streamResults = false
|
|
104
94
|
} = options;
|
|
105
95
|
const duplicates = [];
|
|
106
|
-
|
|
96
|
+
const allBlocks = files.flatMap(
|
|
107
97
|
(file) => extractCodeBlocks(file.content, minLines).map((block) => ({
|
|
108
|
-
|
|
98
|
+
content: block.content,
|
|
99
|
+
startLine: block.startLine,
|
|
100
|
+
endLine: block.endLine,
|
|
109
101
|
file: file.file,
|
|
110
102
|
normalized: normalizeCode(block.content),
|
|
111
|
-
|
|
103
|
+
patternType: block.patternType,
|
|
104
|
+
tokenCost: estimateTokens(block.content),
|
|
105
|
+
linesOfCode: block.linesOfCode
|
|
112
106
|
}))
|
|
113
107
|
);
|
|
114
108
|
console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
|
|
115
|
-
if (allBlocks.length > maxBlocks) {
|
|
116
|
-
console.log(`\u26A0\uFE0F Limiting to ${maxBlocks} blocks (sorted by size) to prevent memory issues`);
|
|
117
|
-
console.log(` Use --max-blocks to increase limit or --min-lines to filter smaller blocks`);
|
|
118
|
-
allBlocks = allBlocks.sort((a, b) => b.linesOfCode - a.linesOfCode).slice(0, maxBlocks);
|
|
119
|
-
}
|
|
120
109
|
const stopwords = /* @__PURE__ */ new Set([
|
|
121
110
|
"return",
|
|
122
111
|
"const",
|
|
@@ -208,13 +197,15 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
208
197
|
if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
|
|
209
198
|
comparisonsProcessed++;
|
|
210
199
|
const block2 = allBlocks[j];
|
|
211
|
-
const similarity =
|
|
200
|
+
const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
|
|
212
201
|
if (similarity >= minSimilarity) {
|
|
213
202
|
const duplicate = {
|
|
214
203
|
file1: block1.file,
|
|
215
204
|
file2: block2.file,
|
|
216
205
|
line1: block1.startLine,
|
|
217
206
|
line2: block2.startLine,
|
|
207
|
+
endLine1: block1.endLine,
|
|
208
|
+
endLine2: block2.endLine,
|
|
218
209
|
similarity,
|
|
219
210
|
snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
|
|
220
211
|
patternType: block1.patternType,
|
|
@@ -225,7 +216,7 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
225
216
|
if (streamResults) {
|
|
226
217
|
console.log(`
|
|
227
218
|
\u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
|
|
228
|
-
console.log(` ${duplicate.file1}:${duplicate.line1} \u21D4 ${duplicate.file2}:${duplicate.line2}`);
|
|
219
|
+
console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
|
|
229
220
|
console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
|
|
230
221
|
}
|
|
231
222
|
}
|
|
@@ -236,13 +227,15 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
236
227
|
comparisonsProcessed++;
|
|
237
228
|
const block2 = allBlocks[j];
|
|
238
229
|
if (block1.file === block2.file) continue;
|
|
239
|
-
const similarity =
|
|
230
|
+
const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
|
|
240
231
|
if (similarity >= minSimilarity) {
|
|
241
232
|
const duplicate = {
|
|
242
233
|
file1: block1.file,
|
|
243
234
|
file2: block2.file,
|
|
244
235
|
line1: block1.startLine,
|
|
245
236
|
line2: block2.startLine,
|
|
237
|
+
endLine1: block1.endLine,
|
|
238
|
+
endLine2: block2.endLine,
|
|
246
239
|
similarity,
|
|
247
240
|
snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
|
|
248
241
|
patternType: block1.patternType,
|
|
@@ -253,7 +246,7 @@ async function detectDuplicatePatterns(files, options) {
|
|
|
253
246
|
if (streamResults) {
|
|
254
247
|
console.log(`
|
|
255
248
|
\u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`);
|
|
256
|
-
console.log(` ${duplicate.file1}:${duplicate.line1} \u21D4 ${duplicate.file2}:${duplicate.line2}`);
|
|
249
|
+
console.log(` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`);
|
|
257
250
|
console.log(` Token cost: ${duplicate.tokenCost.toLocaleString()}`);
|
|
258
251
|
}
|
|
259
252
|
}
|
|
@@ -284,14 +277,13 @@ function getRefactoringSuggestion(patternType, similarity) {
|
|
|
284
277
|
}
|
|
285
278
|
async function analyzePatterns(options) {
|
|
286
279
|
const {
|
|
287
|
-
minSimilarity = 0.
|
|
280
|
+
minSimilarity = 0.4,
|
|
281
|
+
// Jaccard similarity default (40% threshold)
|
|
288
282
|
minLines = 5,
|
|
289
|
-
maxBlocks = 500,
|
|
290
283
|
batchSize = 100,
|
|
291
284
|
approx = true,
|
|
292
285
|
minSharedTokens = 8,
|
|
293
286
|
maxCandidatesPerBlock = 100,
|
|
294
|
-
fastMode = true,
|
|
295
287
|
maxComparisons = 5e4,
|
|
296
288
|
streamResults = false,
|
|
297
289
|
...scanOptions
|
|
@@ -307,12 +299,10 @@ async function analyzePatterns(options) {
|
|
|
307
299
|
const duplicates = await detectDuplicatePatterns(fileContents, {
|
|
308
300
|
minSimilarity,
|
|
309
301
|
minLines,
|
|
310
|
-
maxBlocks,
|
|
311
302
|
batchSize,
|
|
312
303
|
approx,
|
|
313
304
|
minSharedTokens,
|
|
314
305
|
maxCandidatesPerBlock,
|
|
315
|
-
fastMode,
|
|
316
306
|
maxComparisons,
|
|
317
307
|
streamResults
|
|
318
308
|
});
|
|
@@ -377,8 +367,21 @@ function generateSummary(results) {
|
|
|
377
367
|
const typeMatch = issue.message.match(/^(\S+(?:-\S+)*) pattern/);
|
|
378
368
|
const fileMatch = issue.message.match(/similar to (.+?) \(/);
|
|
379
369
|
return {
|
|
380
|
-
|
|
381
|
-
|
|
370
|
+
files: [
|
|
371
|
+
{
|
|
372
|
+
path: issue.location.file,
|
|
373
|
+
startLine: issue.location.line,
|
|
374
|
+
endLine: 0
|
|
375
|
+
// Not available from Issue
|
|
376
|
+
},
|
|
377
|
+
{
|
|
378
|
+
path: fileMatch?.[1] || "unknown",
|
|
379
|
+
startLine: 0,
|
|
380
|
+
// Not available from Issue
|
|
381
|
+
endLine: 0
|
|
382
|
+
// Not available from Issue
|
|
383
|
+
}
|
|
384
|
+
],
|
|
382
385
|
similarity: similarityMatch ? parseInt(similarityMatch[1]) / 100 : 0,
|
|
383
386
|
patternType: typeMatch?.[1] || "unknown",
|
|
384
387
|
tokenCost: tokenMatch ? parseInt(tokenMatch[1]) : 0
|