@aiready/pattern-detect 0.11.36 → 0.11.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -341,7 +341,7 @@ function filterBySeverity(duplicates, minSeverity) {
341
341
  });
342
342
  }
343
343
 
344
- // src/detector.ts
344
+ // src/core/extractor.ts
345
345
  function categorizePattern(code) {
346
346
  const lower = code.toLowerCase();
347
347
  if (lower.includes("request") && lower.includes("response") || lower.includes("router.") || lower.includes("app.get") || lower.includes("app.post") || lower.includes("express") || lower.includes("ctx.body")) {
@@ -386,17 +386,19 @@ function extractCodeBlocks(content, minLines) {
386
386
  currentBlock.push(line);
387
387
  }
388
388
  if (inFunction && braceDepth === 0 && currentBlock.length >= minLines) {
389
- const blockContent = currentBlock.join("\n");
390
- const linesOfCode = currentBlock.filter(
391
- (l) => l.trim() && !l.trim().startsWith("//")
392
- ).length;
393
- blocks.push({
394
- content: blockContent,
395
- startLine: blockStart + 1,
396
- endLine: i + 1,
397
- patternType: categorizePattern(blockContent),
398
- linesOfCode
399
- });
389
+ const blockContent = currentBlock.join("\n").trim();
390
+ if (blockContent) {
391
+ const loc = currentBlock.filter(
392
+ (l) => l.trim() && !l.trim().startsWith("//")
393
+ ).length;
394
+ blocks.push({
395
+ content: blockContent,
396
+ startLine: blockStart + 1,
397
+ endLine: i + 1,
398
+ patternType: categorizePattern(blockContent),
399
+ linesOfCode: loc
400
+ });
401
+ }
400
402
  currentBlock = [];
401
403
  inFunction = false;
402
404
  } else if (inFunction && braceDepth === 0) {
@@ -406,15 +408,51 @@ function extractCodeBlocks(content, minLines) {
406
408
  }
407
409
  return blocks;
408
410
  }
411
+
412
+ // src/core/normalizer.ts
409
413
  function normalizeCode(code) {
410
- if (!code) {
411
- return "";
412
- }
414
+ if (!code) return "";
413
415
  return code.replace(/\/\/.*$/gm, "").replace(/\/\*[\s\S]*?\*\//g, "").replace(/"[^"]*"/g, '"STR"').replace(/'[^']*'/g, "'STR'").replace(/`[^`]*`/g, "`STR`").replace(/\b\d+\b/g, "NUM").replace(/\s+/g, " ").trim();
414
416
  }
417
+ var stopwords = /* @__PURE__ */ new Set([
418
+ "return",
419
+ "const",
420
+ "let",
421
+ "var",
422
+ "function",
423
+ "class",
424
+ "new",
425
+ "if",
426
+ "else",
427
+ "for",
428
+ "while",
429
+ "async",
430
+ "await",
431
+ "try",
432
+ "catch",
433
+ "switch",
434
+ "case",
435
+ "default",
436
+ "import",
437
+ "export",
438
+ "from",
439
+ "true",
440
+ "false",
441
+ "null",
442
+ "undefined",
443
+ "this"
444
+ ]);
445
+ function tokenize(norm) {
446
+ const punctuation = "(){}[];.,";
447
+ const cleaned = norm.split("").map((ch) => punctuation.includes(ch) ? " " : ch).join("");
448
+ return cleaned.split(/\s+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
449
+ }
450
+
451
+ // src/core/similarity.ts
415
452
  function jaccardSimilarity(tokens1, tokens2) {
416
453
  const set1 = new Set(tokens1);
417
454
  const set2 = new Set(tokens2);
455
+ if (set1.size === 0 && set2.size === 0) return 0;
418
456
  let intersection = 0;
419
457
  for (const token of set1) {
420
458
  if (set2.has(token)) intersection++;
@@ -422,6 +460,53 @@ function jaccardSimilarity(tokens1, tokens2) {
422
460
  const union = set1.size + set2.size - intersection;
423
461
  return union === 0 ? 0 : intersection / union;
424
462
  }
463
+
464
+ // src/core/approx-engine.ts
465
+ var ApproxEngine = class {
466
+ constructor(allBlocks, blockTokens) {
467
+ this.invertedIndex = /* @__PURE__ */ new Map();
468
+ this.allBlocks = allBlocks;
469
+ this.blockTokens = blockTokens;
470
+ this.buildIndex();
471
+ }
472
+ buildIndex() {
473
+ for (let i = 0; i < this.blockTokens.length; i++) {
474
+ for (const tok of this.blockTokens[i]) {
475
+ let arr = this.invertedIndex.get(tok);
476
+ if (!arr) {
477
+ arr = [];
478
+ this.invertedIndex.set(tok, arr);
479
+ }
480
+ arr.push(i);
481
+ }
482
+ }
483
+ }
484
+ findCandidates(blockIdx, minSharedTokens, maxCandidates) {
485
+ const block1 = this.allBlocks[blockIdx];
486
+ const block1Tokens = this.blockTokens[blockIdx];
487
+ const counts = /* @__PURE__ */ new Map();
488
+ const rareTokens = block1Tokens.filter((tok) => {
489
+ const freq = this.invertedIndex.get(tok)?.length || 0;
490
+ return freq < this.allBlocks.length * 0.1;
491
+ });
492
+ for (const tok of rareTokens) {
493
+ const ids = this.invertedIndex.get(tok);
494
+ if (!ids) continue;
495
+ for (const j of ids) {
496
+ if (j <= blockIdx) continue;
497
+ if (this.allBlocks[j].file === block1.file) continue;
498
+ counts.set(j, (counts.get(j) || 0) + 1);
499
+ }
500
+ }
501
+ return Array.from(counts.entries()).filter(([j, shared]) => {
502
+ const block2Size = this.blockTokens[j].length;
503
+ const minSize = Math.min(block1Tokens.length, block2Size);
504
+ return shared >= minSharedTokens && shared / minSize >= 0.3;
505
+ }).sort((a, b) => b[1] - a[1]).slice(0, maxCandidates).map(([j, shared]) => ({ j, shared }));
506
+ }
507
+ };
508
+
509
+ // src/detector.ts
425
510
  async function detectDuplicatePatterns(files, options) {
426
511
  const {
427
512
  minSimilarity,
@@ -435,274 +520,92 @@ async function detectDuplicatePatterns(files, options) {
435
520
  const duplicates = [];
436
521
  const maxComparisons = approx ? Infinity : 5e5;
437
522
  const allBlocks = files.flatMap(
438
- (file) => extractCodeBlocks(file.content, minLines).filter((block) => block.content && block.content.trim().length > 0).map((block) => ({
439
- content: block.content,
440
- startLine: block.startLine,
441
- endLine: block.endLine,
523
+ (file) => extractCodeBlocks(file.content, minLines).filter(
524
+ (block) => block && block.content && block.content.trim().length > 0
525
+ ).map((block) => ({
526
+ ...block,
442
527
  file: file.file,
443
528
  normalized: normalizeCode(block.content),
444
- patternType: block.patternType,
445
- tokenCost: (0, import_core2.estimateTokens)(block.content),
446
- linesOfCode: block.linesOfCode
529
+ tokenCost: block.content ? (0, import_core2.estimateTokens)(block.content) : 0
447
530
  }))
448
531
  );
449
- if (!options.onProgress) {
450
- console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
451
- }
452
- const pythonFiles = files.filter((f) => f.file.toLowerCase().endsWith(".py"));
532
+ const pythonFiles = files.filter((f) => f.file.endsWith(".py"));
453
533
  if (pythonFiles.length > 0) {
454
534
  const { extractPythonPatterns: extractPythonPatterns2 } = await Promise.resolve().then(() => (init_python_extractor(), python_extractor_exports));
455
- const patterns = await extractPythonPatterns2(
535
+ const pythonPatterns = await extractPythonPatterns2(
456
536
  pythonFiles.map((f) => f.file)
457
537
  );
458
- const pythonBlocks = patterns.filter((p) => p.code && p.code.trim().length > 0).map((p) => ({
459
- content: p.code,
460
- startLine: p.startLine,
461
- endLine: p.endLine,
462
- file: p.file,
463
- normalized: normalizeCode(p.code),
464
- patternType: p.type,
465
- tokenCost: (0, import_core2.estimateTokens)(p.code),
466
- linesOfCode: p.endLine - p.startLine + 1
467
- }));
468
- allBlocks.push(...pythonBlocks);
469
- if (!options.onProgress) {
470
- console.log(`Added ${pythonBlocks.length} Python patterns`);
471
- }
472
- }
473
- if (!approx && allBlocks.length > 500) {
474
- console.log(
475
- `\u26A0\uFE0F Using --no-approx mode with ${allBlocks.length} blocks may be slow (O(B\xB2) complexity).`
476
- );
477
- console.log(
478
- ` Consider using approximate mode (default) for better performance.`
538
+ allBlocks.push(
539
+ ...pythonPatterns.map((p) => ({
540
+ content: p.code,
541
+ startLine: p.startLine,
542
+ endLine: p.endLine,
543
+ file: p.file,
544
+ normalized: normalizeCode(p.code),
545
+ patternType: p.type,
546
+ tokenCost: p.code ? (0, import_core2.estimateTokens)(p.code) : 0,
547
+ linesOfCode: p.endLine - p.startLine + 1
548
+ }))
479
549
  );
480
550
  }
481
- const stopwords = /* @__PURE__ */ new Set([
482
- "return",
483
- "const",
484
- "let",
485
- "var",
486
- "function",
487
- "class",
488
- "new",
489
- "if",
490
- "else",
491
- "for",
492
- "while",
493
- "async",
494
- "await",
495
- "try",
496
- "catch",
497
- "switch",
498
- "case",
499
- "default",
500
- "import",
501
- "export",
502
- "from",
503
- "true",
504
- "false",
505
- "null",
506
- "undefined",
507
- "this"
508
- ]);
509
- const tokenize = (norm) => {
510
- const punctuation = "(){}[];.,";
511
- const cleaned = norm.split("").map((ch) => punctuation.includes(ch) ? " " : ch).join("");
512
- return cleaned.split(/\s+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
513
- };
514
551
  const blockTokens = allBlocks.map((b) => tokenize(b.normalized));
515
- const invertedIndex = /* @__PURE__ */ new Map();
516
- if (approx) {
517
- for (let i = 0; i < blockTokens.length; i++) {
518
- for (const tok of blockTokens[i]) {
519
- let arr = invertedIndex.get(tok);
520
- if (!arr) {
521
- arr = [];
522
- invertedIndex.set(tok, arr);
523
- }
524
- arr.push(i);
525
- }
526
- }
527
- }
528
- const totalComparisons = approx ? void 0 : allBlocks.length * (allBlocks.length - 1) / 2;
529
- if (totalComparisons !== void 0) {
530
- console.log(
531
- `Processing ${totalComparisons.toLocaleString()} comparisons in batches...`
532
- );
533
- } else {
534
- console.log(
535
- `Using approximate candidate selection to reduce comparisons...`
536
- );
537
- }
552
+ const engine = approx ? new ApproxEngine(allBlocks, blockTokens) : null;
538
553
  let comparisonsProcessed = 0;
539
- let comparisonsBudgetExhausted = false;
540
554
  const startTime = Date.now();
541
555
  for (let i = 0; i < allBlocks.length; i++) {
542
- if (maxComparisons && comparisonsProcessed >= maxComparisons) {
543
- comparisonsBudgetExhausted = true;
544
- break;
545
- }
556
+ if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
546
557
  if (i % batchSize === 0 && i > 0) {
547
558
  if (options.onProgress) {
548
- options.onProgress(i, allBlocks.length, `pattern-detect: analyzing blocks`);
559
+ options.onProgress(i, allBlocks.length, "Analyzing patterns");
549
560
  } else {
550
- const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
551
- const duplicatesFound = duplicates.length;
552
- if (totalComparisons !== void 0) {
553
- const progress = (comparisonsProcessed / totalComparisons * 100).toFixed(1);
554
- const remaining = totalComparisons - comparisonsProcessed;
555
- const rate = comparisonsProcessed / parseFloat(elapsed);
556
- const eta = remaining > 0 ? (remaining / rate).toFixed(0) : 0;
557
- console.log(
558
- ` ${progress}% (${comparisonsProcessed.toLocaleString()}/${totalComparisons.toLocaleString()} comparisons, ${elapsed}s elapsed, ~${eta}s remaining, ${duplicatesFound} duplicates)`
559
- );
560
- } else {
561
- console.log(
562
- ` Processed ${i.toLocaleString()}/${allBlocks.length} blocks (${elapsed}s elapsed, ${duplicatesFound} duplicates)`
563
- );
564
- }
561
+ const elapsed = (Date.now() - startTime) / 1e3;
562
+ console.log(
563
+ ` Processed ${i}/${allBlocks.length} blocks (${elapsed.toFixed(1)}s, ${duplicates.length} duplicates)`
564
+ );
565
565
  }
566
- await new Promise((resolve) => setImmediate(resolve));
566
+ await new Promise((r) => setImmediate((resolve) => r(resolve)));
567
567
  }
568
568
  const block1 = allBlocks[i];
569
- let candidates = null;
570
- if (approx) {
571
- const counts = /* @__PURE__ */ new Map();
572
- const block1Tokens = new Set(blockTokens[i]);
573
- const block1Size = block1Tokens.size;
574
- const rareTokens = blockTokens[i].filter((tok) => {
575
- const blocksWithToken = invertedIndex.get(tok)?.length || 0;
576
- return blocksWithToken < allBlocks.length * 0.1;
577
- });
578
- for (const tok of rareTokens) {
579
- const ids = invertedIndex.get(tok);
580
- if (!ids) continue;
581
- for (const j of ids) {
582
- if (j <= i) continue;
583
- if (allBlocks[j].file === block1.file) continue;
584
- counts.set(j, (counts.get(j) || 0) + 1);
585
- }
586
- }
587
- candidates = Array.from(counts.entries()).filter(([j, shared]) => {
588
- const block2Tokens = blockTokens[j];
589
- const block2Size = block2Tokens.length;
590
- const minSize = Math.min(block1Size, block2Size);
591
- const sharedPercentage = shared / minSize;
592
- return shared >= minSharedTokens && sharedPercentage >= 0.3;
593
- }).sort((a, b) => b[1] - a[1]).slice(0, Math.min(maxCandidatesPerBlock, 5)).map(([j, shared]) => ({ j, shared }));
594
- }
595
- if (approx && candidates) {
596
- for (const { j } of candidates) {
597
- if (!approx && maxComparisons !== Infinity && comparisonsProcessed >= maxComparisons) {
598
- console.log(
599
- `\u26A0\uFE0F Comparison safety limit reached (${maxComparisons.toLocaleString()} comparisons in --no-approx mode).`
600
- );
569
+ const candidates = engine ? engine.findCandidates(i, minSharedTokens, maxCandidatesPerBlock) : allBlocks.slice(i + 1).map((_, idx) => ({ j: i + 1 + idx, shared: 0 }));
570
+ for (const { j } of candidates) {
571
+ if (!approx && comparisonsProcessed >= maxComparisons) break;
572
+ comparisonsProcessed++;
573
+ const block2 = allBlocks[j];
574
+ if (block1.file === block2.file) continue;
575
+ const sim = jaccardSimilarity(blockTokens[i], blockTokens[j]);
576
+ if (sim >= minSimilarity) {
577
+ const severity = calculateSeverity(
578
+ block1.file,
579
+ block2.file,
580
+ block1.content,
581
+ sim,
582
+ block1.linesOfCode
583
+ );
584
+ const dup = {
585
+ file1: block1.file,
586
+ file2: block2.file,
587
+ line1: block1.startLine,
588
+ line2: block2.startLine,
589
+ endLine1: block1.endLine,
590
+ endLine2: block2.endLine,
591
+ similarity: sim,
592
+ snippet: block1.content.substring(0, 200),
593
+ patternType: block1.patternType,
594
+ tokenCost: block1.tokenCost,
595
+ linesOfCode: block1.linesOfCode,
596
+ severity: severity.severity,
597
+ reason: severity.reason,
598
+ suggestion: severity.suggestion
599
+ };
600
+ duplicates.push(dup);
601
+ if (streamResults)
601
602
  console.log(
602
- ` This prevents excessive runtime on large repos. Consider using approximate mode (default) or --min-lines to reduce blocks.`
603
- );
604
- break;
605
- }
606
- comparisonsProcessed++;
607
- const block2 = allBlocks[j];
608
- const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
609
- if (similarity >= minSimilarity) {
610
- const { severity, reason, suggestion, matchedRule } = calculateSeverity(
611
- block1.file,
612
- block2.file,
613
- block1.content,
614
- similarity,
615
- block1.linesOfCode
616
- );
617
- const duplicate = {
618
- file1: block1.file,
619
- file2: block2.file,
620
- line1: block1.startLine,
621
- line2: block2.startLine,
622
- endLine1: block1.endLine,
623
- endLine2: block2.endLine,
624
- similarity,
625
- snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
626
- patternType: block1.patternType,
627
- tokenCost: block1.tokenCost + block2.tokenCost,
628
- linesOfCode: block1.linesOfCode,
629
- severity,
630
- reason,
631
- suggestion,
632
- matchedRule
633
- };
634
- duplicates.push(duplicate);
635
- if (streamResults) {
636
- console.log(
637
- `
638
- \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`
639
- );
640
- console.log(
641
- ` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`
642
- );
643
- console.log(
644
- ` Token cost: ${duplicate.tokenCost.toLocaleString()}`
645
- );
646
- }
647
- }
648
- }
649
- } else {
650
- for (let j = i + 1; j < allBlocks.length; j++) {
651
- if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
652
- comparisonsProcessed++;
653
- const block2 = allBlocks[j];
654
- if (block1.file === block2.file) continue;
655
- const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
656
- if (similarity >= minSimilarity) {
657
- const { severity, reason, suggestion, matchedRule } = calculateSeverity(
658
- block1.file,
659
- block2.file,
660
- block1.content,
661
- similarity,
662
- block1.linesOfCode
603
+ `[DUPLICATE] ${dup.file1}:${dup.line1} <-> ${dup.file2}:${dup.line2} (${Math.round(sim * 100)}%)`
663
604
  );
664
- const duplicate = {
665
- file1: block1.file,
666
- file2: block2.file,
667
- line1: block1.startLine,
668
- line2: block2.startLine,
669
- endLine1: block1.endLine,
670
- endLine2: block2.endLine,
671
- similarity,
672
- snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
673
- patternType: block1.patternType,
674
- tokenCost: block1.tokenCost + block2.tokenCost,
675
- linesOfCode: block1.linesOfCode,
676
- severity,
677
- reason,
678
- suggestion,
679
- matchedRule
680
- };
681
- duplicates.push(duplicate);
682
- if (streamResults) {
683
- console.log(
684
- `
685
- \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`
686
- );
687
- console.log(
688
- ` ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`
689
- );
690
- console.log(
691
- ` Token cost: ${duplicate.tokenCost.toLocaleString()}`
692
- );
693
- }
694
- }
695
605
  }
696
606
  }
697
607
  }
698
- if (comparisonsBudgetExhausted) {
699
- console.log(
700
- `\u26A0\uFE0F Comparison budget exhausted (${maxComparisons.toLocaleString()} comparisons). Use --max-comparisons to increase.`
701
- );
702
- }
703
- return duplicates.sort(
704
- (a, b) => b.similarity - a.similarity || b.tokenCost - a.tokenCost
705
- );
608
+ return duplicates;
706
609
  }
707
610
 
708
611
  // src/grouping.ts
package/dist/index.mjs CHANGED
@@ -7,7 +7,7 @@ import {
7
7
  generateSummary,
8
8
  getSeverityLabel,
9
9
  getSmartDefaults
10
- } from "./chunk-YSDOUNJJ.mjs";
10
+ } from "./chunk-6OEHUI5J.mjs";
11
11
  export {
12
12
  analyzePatterns,
13
13
  calculatePatternScore,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aiready/pattern-detect",
3
- "version": "0.11.36",
3
+ "version": "0.11.38",
4
4
  "description": "Semantic duplicate pattern detection for AI-generated code - finds similar implementations that waste AI context tokens",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",
@@ -45,7 +45,7 @@
45
45
  "dependencies": {
46
46
  "commander": "^14.0.0",
47
47
  "chalk": "^5.3.0",
48
- "@aiready/core": "0.9.37"
48
+ "@aiready/core": "0.9.39"
49
49
  },
50
50
  "devDependencies": {
51
51
  "tsup": "^8.3.5",