@aiready/pattern-detect 0.11.37 → 0.11.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -323,7 +323,7 @@ function filterBySeverity(duplicates, minSeverity) {
323
323
  });
324
324
  }
325
325
 
326
- // src/detector.ts
326
+ // src/core/extractor.ts
327
327
  function categorizePattern(code) {
328
328
  const lower = code.toLowerCase();
329
329
  if (lower.includes("request") && lower.includes("response") || lower.includes("router.") || lower.includes("app.get") || lower.includes("app.post") || lower.includes("express") || lower.includes("ctx.body")) {
@@ -368,17 +368,19 @@ function extractCodeBlocks(content, minLines) {
368
368
  currentBlock.push(line);
369
369
  }
370
370
  if (inFunction && braceDepth === 0 && currentBlock.length >= minLines) {
371
- const blockContent = currentBlock.join("\n");
372
- const linesOfCode = currentBlock.filter(
373
- (l) => l.trim() && !l.trim().startsWith("//")
374
- ).length;
375
- blocks.push({
376
- content: blockContent,
377
- startLine: blockStart + 1,
378
- endLine: i + 1,
379
- patternType: categorizePattern(blockContent),
380
- linesOfCode
381
- });
371
+ const blockContent = currentBlock.join("\n").trim();
372
+ if (blockContent) {
373
+ const loc = currentBlock.filter(
374
+ (l) => l.trim() && !l.trim().startsWith("//")
375
+ ).length;
376
+ blocks.push({
377
+ content: blockContent,
378
+ startLine: blockStart + 1,
379
+ endLine: i + 1,
380
+ patternType: categorizePattern(blockContent),
381
+ linesOfCode: loc
382
+ });
383
+ }
382
384
  currentBlock = [];
383
385
  inFunction = false;
384
386
  } else if (inFunction && braceDepth === 0) {
@@ -388,15 +390,51 @@ function extractCodeBlocks(content, minLines) {
388
390
  }
389
391
  return blocks;
390
392
  }
393
+
394
+ // src/core/normalizer.ts
391
395
  function normalizeCode(code) {
392
- if (!code) {
393
- return "";
394
- }
396
+ if (!code) return "";
395
397
  return code.replace(/\/\/.*$/gm, "").replace(/\/\*[\s\S]*?\*\//g, "").replace(/"[^"]*"/g, '"STR"').replace(/'[^']*'/g, "'STR'").replace(/`[^`]*`/g, "`STR`").replace(/\b\d+\b/g, "NUM").replace(/\s+/g, " ").trim();
396
398
  }
399
+ var stopwords = /* @__PURE__ */ new Set([
400
+ "return",
401
+ "const",
402
+ "let",
403
+ "var",
404
+ "function",
405
+ "class",
406
+ "new",
407
+ "if",
408
+ "else",
409
+ "for",
410
+ "while",
411
+ "async",
412
+ "await",
413
+ "try",
414
+ "catch",
415
+ "switch",
416
+ "case",
417
+ "default",
418
+ "import",
419
+ "export",
420
+ "from",
421
+ "true",
422
+ "false",
423
+ "null",
424
+ "undefined",
425
+ "this"
426
+ ]);
427
+ function tokenize(norm) {
428
+ const punctuation = "(){}[];.,";
429
+ const cleaned = norm.split("").map((ch) => punctuation.includes(ch) ? " " : ch).join("");
430
+ return cleaned.split(/\s+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
431
+ }
432
+
433
+ // src/core/similarity.ts
397
434
  function jaccardSimilarity(tokens1, tokens2) {
398
435
  const set1 = new Set(tokens1);
399
436
  const set2 = new Set(tokens2);
437
+ if (set1.size === 0 && set2.size === 0) return 0;
400
438
  let intersection = 0;
401
439
  for (const token of set1) {
402
440
  if (set2.has(token)) intersection++;
@@ -404,6 +442,53 @@ function jaccardSimilarity(tokens1, tokens2) {
404
442
  const union = set1.size + set2.size - intersection;
405
443
  return union === 0 ? 0 : intersection / union;
406
444
  }
445
+
446
+ // src/core/approx-engine.ts
447
+ var ApproxEngine = class {
448
+ constructor(allBlocks, blockTokens) {
449
+ this.invertedIndex = /* @__PURE__ */ new Map();
450
+ this.allBlocks = allBlocks;
451
+ this.blockTokens = blockTokens;
452
+ this.buildIndex();
453
+ }
454
+ buildIndex() {
455
+ for (let i = 0; i < this.blockTokens.length; i++) {
456
+ for (const tok of this.blockTokens[i]) {
457
+ let arr = this.invertedIndex.get(tok);
458
+ if (!arr) {
459
+ arr = [];
460
+ this.invertedIndex.set(tok, arr);
461
+ }
462
+ arr.push(i);
463
+ }
464
+ }
465
+ }
466
+ findCandidates(blockIdx, minSharedTokens, maxCandidates) {
467
+ const block1 = this.allBlocks[blockIdx];
468
+ const block1Tokens = this.blockTokens[blockIdx];
469
+ const counts = /* @__PURE__ */ new Map();
470
+ const rareTokens = block1Tokens.filter((tok) => {
471
+ const freq = this.invertedIndex.get(tok)?.length || 0;
472
+ return freq < this.allBlocks.length * 0.1;
473
+ });
474
+ for (const tok of rareTokens) {
475
+ const ids = this.invertedIndex.get(tok);
476
+ if (!ids) continue;
477
+ for (const j of ids) {
478
+ if (j <= blockIdx) continue;
479
+ if (this.allBlocks[j].file === block1.file) continue;
480
+ counts.set(j, (counts.get(j) || 0) + 1);
481
+ }
482
+ }
483
+ return Array.from(counts.entries()).filter(([j, shared]) => {
484
+ const block2Size = this.blockTokens[j].length;
485
+ const minSize = Math.min(block1Tokens.length, block2Size);
486
+ return shared >= minSharedTokens && shared / minSize >= 0.3;
487
+ }).sort((a, b) => b[1] - a[1]).slice(0, maxCandidates).map(([j, shared]) => ({ j, shared }));
488
+ }
489
+ };
490
+
491
+ // src/detector.ts
407
492
  async function detectDuplicatePatterns(files, options) {
408
493
  const {
409
494
  minSimilarity,
@@ -417,274 +502,92 @@ async function detectDuplicatePatterns(files, options) {
417
502
  const duplicates = [];
418
503
  const maxComparisons = approx ? Infinity : 5e5;
419
504
  const allBlocks = files.flatMap(
420
- (file) => extractCodeBlocks(file.content, minLines).filter((block) => block.content && block.content.trim().length > 0).map((block) => ({
421
- content: block.content,
422
- startLine: block.startLine,
423
- endLine: block.endLine,
505
+ (file) => extractCodeBlocks(file.content, minLines).filter(
506
+ (block) => block && block.content && block.content.trim().length > 0
507
+ ).map((block) => ({
508
+ ...block,
424
509
  file: file.file,
425
510
  normalized: normalizeCode(block.content),
426
- patternType: block.patternType,
427
- tokenCost: (0, import_core2.estimateTokens)(block.content),
428
- linesOfCode: block.linesOfCode
511
+ tokenCost: block.content ? (0, import_core2.estimateTokens)(block.content) : 0
429
512
  }))
430
513
  );
431
- if (!options.onProgress) {
432
- console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
433
- }
434
- const pythonFiles = files.filter((f) => f.file.toLowerCase().endsWith(".py"));
514
+ const pythonFiles = files.filter((f) => f.file.endsWith(".py"));
435
515
  if (pythonFiles.length > 0) {
436
516
  const { extractPythonPatterns: extractPythonPatterns2 } = await Promise.resolve().then(() => (init_python_extractor(), python_extractor_exports));
437
- const patterns = await extractPythonPatterns2(
517
+ const pythonPatterns = await extractPythonPatterns2(
438
518
  pythonFiles.map((f) => f.file)
439
519
  );
440
- const pythonBlocks = patterns.filter((p) => p.code && p.code.trim().length > 0).map((p) => ({
441
- content: p.code,
442
- startLine: p.startLine,
443
- endLine: p.endLine,
444
- file: p.file,
445
- normalized: normalizeCode(p.code),
446
- patternType: p.type,
447
- tokenCost: (0, import_core2.estimateTokens)(p.code),
448
- linesOfCode: p.endLine - p.startLine + 1
449
- }));
450
- allBlocks.push(...pythonBlocks);
451
- if (!options.onProgress) {
452
- console.log(`Added ${pythonBlocks.length} Python patterns`);
453
- }
454
- }
455
- if (!approx && allBlocks.length > 500) {
456
- console.log(
457
- `\u26A0\uFE0F Using --no-approx mode with ${allBlocks.length} blocks may be slow (O(B\xB2) complexity).`
458
- );
459
- console.log(
460
- ` Consider using approximate mode (default) for better performance.`
520
+ allBlocks.push(
521
+ ...pythonPatterns.map((p) => ({
522
+ content: p.code,
523
+ startLine: p.startLine,
524
+ endLine: p.endLine,
525
+ file: p.file,
526
+ normalized: normalizeCode(p.code),
527
+ patternType: p.type,
528
+ tokenCost: p.code ? (0, import_core2.estimateTokens)(p.code) : 0,
529
+ linesOfCode: p.endLine - p.startLine + 1
530
+ }))
461
531
  );
462
532
  }
463
- const stopwords = /* @__PURE__ */ new Set([
464
- "return",
465
- "const",
466
- "let",
467
- "var",
468
- "function",
469
- "class",
470
- "new",
471
- "if",
472
- "else",
473
- "for",
474
- "while",
475
- "async",
476
- "await",
477
- "try",
478
- "catch",
479
- "switch",
480
- "case",
481
- "default",
482
- "import",
483
- "export",
484
- "from",
485
- "true",
486
- "false",
487
- "null",
488
- "undefined",
489
- "this"
490
- ]);
491
- const tokenize = (norm) => {
492
- const punctuation = "(){}[];.,";
493
- const cleaned = norm.split("").map((ch) => punctuation.includes(ch) ? " " : ch).join("");
494
- return cleaned.split(/\s+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
495
- };
496
533
  const blockTokens = allBlocks.map((b) => tokenize(b.normalized));
497
- const invertedIndex = /* @__PURE__ */ new Map();
498
- if (approx) {
499
- for (let i = 0; i < blockTokens.length; i++) {
500
- for (const tok of blockTokens[i]) {
501
- let arr = invertedIndex.get(tok);
502
- if (!arr) {
503
- arr = [];
504
- invertedIndex.set(tok, arr);
505
- }
506
- arr.push(i);
507
- }
508
- }
509
- }
510
- const totalComparisons = approx ? void 0 : allBlocks.length * (allBlocks.length - 1) / 2;
511
- if (totalComparisons !== void 0) {
512
- console.log(
513
- `Processing ${totalComparisons.toLocaleString()} comparisons in batches...`
514
- );
515
- } else {
516
- console.log(
517
- `Using approximate candidate selection to reduce comparisons...`
518
- );
519
- }
534
+ const engine = approx ? new ApproxEngine(allBlocks, blockTokens) : null;
520
535
  let comparisonsProcessed = 0;
521
- let comparisonsBudgetExhausted = false;
522
536
  const startTime = Date.now();
523
537
  for (let i = 0; i < allBlocks.length; i++) {
524
- if (maxComparisons && comparisonsProcessed >= maxComparisons) {
525
- comparisonsBudgetExhausted = true;
526
- break;
527
- }
538
+ if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
528
539
  if (i % batchSize === 0 && i > 0) {
529
540
  if (options.onProgress) {
530
- options.onProgress(i, allBlocks.length, `pattern-detect: analyzing blocks`);
541
+ options.onProgress(i, allBlocks.length, "Analyzing patterns");
531
542
  } else {
532
- const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
533
- const duplicatesFound = duplicates.length;
534
- if (totalComparisons !== void 0) {
535
- const progress = (comparisonsProcessed / totalComparisons * 100).toFixed(1);
536
- const remaining = totalComparisons - comparisonsProcessed;
537
- const rate = comparisonsProcessed / parseFloat(elapsed);
538
- const eta = remaining > 0 ? (remaining / rate).toFixed(0) : 0;
539
- console.log(
540
- ` ${progress}% (${comparisonsProcessed.toLocaleString()}/${totalComparisons.toLocaleString()} comparisons, ${elapsed}s elapsed, ~${eta}s remaining, ${duplicatesFound} duplicates)`
541
- );
542
- } else {
543
- console.log(
544
- ` Processed ${i.toLocaleString()}/${allBlocks.length} blocks (${elapsed}s elapsed, ${duplicatesFound} duplicates)`
545
- );
546
- }
543
+ const elapsed = (Date.now() - startTime) / 1e3;
544
+ console.log(
545
+ ` Processed ${i}/${allBlocks.length} blocks (${elapsed.toFixed(1)}s, ${duplicates.length} duplicates)`
546
+ );
547
547
  }
548
- await new Promise((resolve) => setImmediate(resolve));
548
+ await new Promise((r) => setImmediate((resolve) => r(resolve)));
549
549
  }
550
550
  const block1 = allBlocks[i];
551
- let candidates = null;
552
- if (approx) {
553
- const counts = /* @__PURE__ */ new Map();
554
- const block1Tokens = new Set(blockTokens[i]);
555
- const block1Size = block1Tokens.size;
556
- const rareTokens = blockTokens[i].filter((tok) => {
557
- const blocksWithToken = invertedIndex.get(tok)?.length || 0;
558
- return blocksWithToken < allBlocks.length * 0.1;
559
- });
560
- for (const tok of rareTokens) {
561
- const ids = invertedIndex.get(tok);
562
- if (!ids) continue;
563
- for (const j of ids) {
564
- if (j <= i) continue;
565
- if (allBlocks[j].file === block1.file) continue;
566
- counts.set(j, (counts.get(j) || 0) + 1);
567
- }
568
- }
569
- candidates = Array.from(counts.entries()).filter(([j, shared]) => {
570
- const block2Tokens = blockTokens[j];
571
- const block2Size = block2Tokens.length;
572
- const minSize = Math.min(block1Size, block2Size);
573
- const sharedPercentage = shared / minSize;
574
- return shared >= minSharedTokens && sharedPercentage >= 0.3;
575
- }).sort((a, b) => b[1] - a[1]).slice(0, Math.min(maxCandidatesPerBlock, 5)).map(([j, shared]) => ({ j, shared }));
576
- }
577
- if (approx && candidates) {
578
- for (const { j } of candidates) {
579
- if (!approx && maxComparisons !== Infinity && comparisonsProcessed >= maxComparisons) {
580
- console.log(
581
- `\u26A0\uFE0F Comparison safety limit reached (${maxComparisons.toLocaleString()} comparisons in --no-approx mode).`
582
- );
551
+ const candidates = engine ? engine.findCandidates(i, minSharedTokens, maxCandidatesPerBlock) : allBlocks.slice(i + 1).map((_, idx) => ({ j: i + 1 + idx, shared: 0 }));
552
+ for (const { j } of candidates) {
553
+ if (!approx && comparisonsProcessed >= maxComparisons) break;
554
+ comparisonsProcessed++;
555
+ const block2 = allBlocks[j];
556
+ if (block1.file === block2.file) continue;
557
+ const sim = jaccardSimilarity(blockTokens[i], blockTokens[j]);
558
+ if (sim >= minSimilarity) {
559
+ const severity = calculateSeverity(
560
+ block1.file,
561
+ block2.file,
562
+ block1.content,
563
+ sim,
564
+ block1.linesOfCode
565
+ );
566
+ const dup = {
567
+ file1: block1.file,
568
+ file2: block2.file,
569
+ line1: block1.startLine,
570
+ line2: block2.startLine,
571
+ endLine1: block1.endLine,
572
+ endLine2: block2.endLine,
573
+ similarity: sim,
574
+ snippet: block1.content.substring(0, 200),
575
+ patternType: block1.patternType,
576
+ tokenCost: block1.tokenCost,
577
+ linesOfCode: block1.linesOfCode,
578
+ severity: severity.severity,
579
+ reason: severity.reason,
580
+ suggestion: severity.suggestion
581
+ };
582
+ duplicates.push(dup);
583
+ if (streamResults)
583
584
  console.log(
584
- ` This prevents excessive runtime on large repos. Consider using approximate mode (default) or --min-lines to reduce blocks.`
585
- );
586
- break;
587
- }
588
- comparisonsProcessed++;
589
- const block2 = allBlocks[j];
590
- const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
591
- if (similarity >= minSimilarity) {
592
- const { severity, reason, suggestion, matchedRule } = calculateSeverity(
593
- block1.file,
594
- block2.file,
595
- block1.content,
596
- similarity,
597
- block1.linesOfCode
585
+ `[DUPLICATE] ${dup.file1}:${dup.line1} <-> ${dup.file2}:${dup.line2} (${Math.round(sim * 100)}%)`
598
586
  );
599
- const duplicate = {
600
- file1: block1.file,
601
- file2: block2.file,
602
- line1: block1.startLine,
603
- line2: block2.startLine,
604
- endLine1: block1.endLine,
605
- endLine2: block2.endLine,
606
- similarity,
607
- snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
608
- patternType: block1.patternType,
609
- tokenCost: block1.tokenCost + block2.tokenCost,
610
- linesOfCode: block1.linesOfCode,
611
- severity,
612
- reason,
613
- suggestion,
614
- matchedRule
615
- };
616
- duplicates.push(duplicate);
617
- if (streamResults) {
618
- console.log(
619
- `
620
- \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`
621
- );
622
- console.log(
623
- ` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`
624
- );
625
- console.log(
626
- ` Token cost: ${duplicate.tokenCost.toLocaleString()}`
627
- );
628
- }
629
- }
630
- }
631
- } else {
632
- for (let j = i + 1; j < allBlocks.length; j++) {
633
- if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
634
- comparisonsProcessed++;
635
- const block2 = allBlocks[j];
636
- if (block1.file === block2.file) continue;
637
- const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
638
- if (similarity >= minSimilarity) {
639
- const { severity, reason, suggestion, matchedRule } = calculateSeverity(
640
- block1.file,
641
- block2.file,
642
- block1.content,
643
- similarity,
644
- block1.linesOfCode
645
- );
646
- const duplicate = {
647
- file1: block1.file,
648
- file2: block2.file,
649
- line1: block1.startLine,
650
- line2: block2.startLine,
651
- endLine1: block1.endLine,
652
- endLine2: block2.endLine,
653
- similarity,
654
- snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
655
- patternType: block1.patternType,
656
- tokenCost: block1.tokenCost + block2.tokenCost,
657
- linesOfCode: block1.linesOfCode,
658
- severity,
659
- reason,
660
- suggestion,
661
- matchedRule
662
- };
663
- duplicates.push(duplicate);
664
- if (streamResults) {
665
- console.log(
666
- `
667
- \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`
668
- );
669
- console.log(
670
- ` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`
671
- );
672
- console.log(
673
- ` Token cost: ${duplicate.tokenCost.toLocaleString()}`
674
- );
675
- }
676
- }
677
587
  }
678
588
  }
679
589
  }
680
- if (comparisonsBudgetExhausted) {
681
- console.log(
682
- `\u26A0\uFE0F Comparison budget exhausted (${maxComparisons.toLocaleString()} comparisons). Use --max-comparisons to increase.`
683
- );
684
- }
685
- return duplicates.sort(
686
- (a, b) => b.similarity - a.similarity || b.tokenCost - a.tokenCost
687
- );
590
+ return duplicates;
688
591
  }
689
592
 
690
593
  // src/grouping.ts
package/dist/cli.mjs CHANGED
@@ -3,7 +3,7 @@ import {
3
3
  analyzePatterns,
4
4
  filterBySeverity,
5
5
  generateSummary
6
- } from "./chunk-YSDOUNJJ.mjs";
6
+ } from "./chunk-6OEHUI5J.mjs";
7
7
 
8
8
  // src/cli.ts
9
9
  import { Command } from "commander";
package/dist/index.d.mts CHANGED
@@ -25,6 +25,7 @@ declare function filterBySeverity<T extends {
25
25
  severity: Severity;
26
26
  }>(duplicates: T[], minSeverity: Severity): T[];
27
27
 
28
+ type PatternType = 'function' | 'class-method' | 'api-handler' | 'validator' | 'utility' | 'component' | 'unknown';
28
29
  interface DuplicatePattern {
29
30
  file1: string;
30
31
  file2: string;
@@ -42,7 +43,6 @@ interface DuplicatePattern {
42
43
  suggestion?: string;
43
44
  matchedRule?: string;
44
45
  }
45
- type PatternType = 'function' | 'class-method' | 'api-handler' | 'validator' | 'utility' | 'component' | 'unknown';
46
46
  interface FileContent {
47
47
  file: string;
48
48
  content: string;
@@ -50,7 +50,6 @@ interface FileContent {
50
50
  interface DetectionOptions {
51
51
  minSimilarity: number;
52
52
  minLines: number;
53
- maxBlocks?: number;
54
53
  batchSize?: number;
55
54
  approx?: boolean;
56
55
  minSharedTokens?: number;
@@ -59,6 +58,7 @@ interface DetectionOptions {
59
58
  streamResults?: boolean;
60
59
  onProgress?: (processed: number, total: number, message: string) => void;
61
60
  }
61
+
62
62
  /**
63
63
  * Detect duplicate patterns across files with enhanced analysis
64
64
  */
package/dist/index.d.ts CHANGED
@@ -25,6 +25,7 @@ declare function filterBySeverity<T extends {
25
25
  severity: Severity;
26
26
  }>(duplicates: T[], minSeverity: Severity): T[];
27
27
 
28
+ type PatternType = 'function' | 'class-method' | 'api-handler' | 'validator' | 'utility' | 'component' | 'unknown';
28
29
  interface DuplicatePattern {
29
30
  file1: string;
30
31
  file2: string;
@@ -42,7 +43,6 @@ interface DuplicatePattern {
42
43
  suggestion?: string;
43
44
  matchedRule?: string;
44
45
  }
45
- type PatternType = 'function' | 'class-method' | 'api-handler' | 'validator' | 'utility' | 'component' | 'unknown';
46
46
  interface FileContent {
47
47
  file: string;
48
48
  content: string;
@@ -50,7 +50,6 @@ interface FileContent {
50
50
  interface DetectionOptions {
51
51
  minSimilarity: number;
52
52
  minLines: number;
53
- maxBlocks?: number;
54
53
  batchSize?: number;
55
54
  approx?: boolean;
56
55
  minSharedTokens?: number;
@@ -59,6 +58,7 @@ interface DetectionOptions {
59
58
  streamResults?: boolean;
60
59
  onProgress?: (processed: number, total: number, message: string) => void;
61
60
  }
61
+
62
62
  /**
63
63
  * Detect duplicate patterns across files with enhanced analysis
64
64
  */