npm - @aiready/pattern-detect - Versions diffs - 0.11.36 → 0.11.38 - Mend

@aiready/pattern-detect 0.11.36 → 0.11.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/chunk-4BPRGZRG.mjs +1041 -0
package/dist/chunk-6OEHUI5J.mjs +1045 -0
package/dist/chunk-CTDBJP25.mjs +1043 -0
package/dist/chunk-DGAKXYIP.mjs +1041 -0
package/dist/chunk-P7B6Z4I2.mjs +1043 -0
package/dist/chunk-QEP76HGK.mjs +1039 -0
package/dist/cli.js +161 -258
package/dist/cli.mjs +1 -1
package/dist/index.d.mts +2 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +161 -258
package/dist/index.mjs +1 -1
package/package.json +2 -2

package/dist/index.js CHANGED Viewed

@@ -341,7 +341,7 @@ function filterBySeverity(duplicates, minSeverity) {
   });
 }
-// src/detector.ts
+// src/core/extractor.ts
 function categorizePattern(code) {
   const lower = code.toLowerCase();
   if (lower.includes("request") && lower.includes("response") || lower.includes("router.") || lower.includes("app.get") || lower.includes("app.post") || lower.includes("express") || lower.includes("ctx.body")) {
@@ -386,17 +386,19 @@ function extractCodeBlocks(content, minLines) {
       currentBlock.push(line);
     }
     if (inFunction && braceDepth === 0 && currentBlock.length >= minLines) {
-      const blockContent = currentBlock.join("\n");
-      const linesOfCode = currentBlock.filter(
-        (l) => l.trim() && !l.trim().startsWith("//")
-      ).length;
-      blocks.push({
-        content: blockContent,
-        startLine: blockStart + 1,
-        endLine: i + 1,
-        patternType: categorizePattern(blockContent),
-        linesOfCode
-      });
+      const blockContent = currentBlock.join("\n").trim();
+      if (blockContent) {
+        const loc = currentBlock.filter(
+          (l) => l.trim() && !l.trim().startsWith("//")
+        ).length;
+        blocks.push({
+          content: blockContent,
+          startLine: blockStart + 1,
+          endLine: i + 1,
+          patternType: categorizePattern(blockContent),
+          linesOfCode: loc
+        });
+      }
       currentBlock = [];
       inFunction = false;
     } else if (inFunction && braceDepth === 0) {
@@ -406,15 +408,51 @@ function extractCodeBlocks(content, minLines) {
   }
   return blocks;
 }
+// src/core/normalizer.ts
 function normalizeCode(code) {
-  if (!code) {
-    return "";
-  }
+  if (!code) return "";
   return code.replace(/\/\/.*$/gm, "").replace(/\/\*[\s\S]*?\*\//g, "").replace(/"[^"]*"/g, '"STR"').replace(/'[^']*'/g, "'STR'").replace(/`[^`]*`/g, "`STR`").replace(/\b\d+\b/g, "NUM").replace(/\s+/g, " ").trim();
 }
+var stopwords = /* @__PURE__ */ new Set([
+  "return",
+  "const",
+  "let",
+  "var",
+  "function",
+  "class",
+  "new",
+  "if",
+  "else",
+  "for",
+  "while",
+  "async",
+  "await",
+  "try",
+  "catch",
+  "switch",
+  "case",
+  "default",
+  "import",
+  "export",
+  "from",
+  "true",
+  "false",
+  "null",
+  "undefined",
+  "this"
+]);
+function tokenize(norm) {
+  const punctuation = "(){}[];.,";
+  const cleaned = norm.split("").map((ch) => punctuation.includes(ch) ? " " : ch).join("");
+  return cleaned.split(/\s+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
+}
+// src/core/similarity.ts
 function jaccardSimilarity(tokens1, tokens2) {
   const set1 = new Set(tokens1);
   const set2 = new Set(tokens2);
+  if (set1.size === 0 && set2.size === 0) return 0;
   let intersection = 0;
   for (const token of set1) {
     if (set2.has(token)) intersection++;
@@ -422,6 +460,53 @@ function jaccardSimilarity(tokens1, tokens2) {
   const union = set1.size + set2.size - intersection;
   return union === 0 ? 0 : intersection / union;
 }
+// src/core/approx-engine.ts
+var ApproxEngine = class {
+  constructor(allBlocks, blockTokens) {
+    this.invertedIndex = /* @__PURE__ */ new Map();
+    this.allBlocks = allBlocks;
+    this.blockTokens = blockTokens;
+    this.buildIndex();
+  }
+  buildIndex() {
+    for (let i = 0; i < this.blockTokens.length; i++) {
+      for (const tok of this.blockTokens[i]) {
+        let arr = this.invertedIndex.get(tok);
+        if (!arr) {
+          arr = [];
+          this.invertedIndex.set(tok, arr);
+        }
+        arr.push(i);
+      }
+    }
+  }
+  findCandidates(blockIdx, minSharedTokens, maxCandidates) {
+    const block1 = this.allBlocks[blockIdx];
+    const block1Tokens = this.blockTokens[blockIdx];
+    const counts = /* @__PURE__ */ new Map();
+    const rareTokens = block1Tokens.filter((tok) => {
+      const freq = this.invertedIndex.get(tok)?.length || 0;
+      return freq < this.allBlocks.length * 0.1;
+    });
+    for (const tok of rareTokens) {
+      const ids = this.invertedIndex.get(tok);
+      if (!ids) continue;
+      for (const j of ids) {
+        if (j <= blockIdx) continue;
+        if (this.allBlocks[j].file === block1.file) continue;
+        counts.set(j, (counts.get(j) || 0) + 1);
+      }
+    }
+    return Array.from(counts.entries()).filter(([j, shared]) => {
+      const block2Size = this.blockTokens[j].length;
+      const minSize = Math.min(block1Tokens.length, block2Size);
+      return shared >= minSharedTokens && shared / minSize >= 0.3;
+    }).sort((a, b) => b[1] - a[1]).slice(0, maxCandidates).map(([j, shared]) => ({ j, shared }));
+  }
+};
+// src/detector.ts
 async function detectDuplicatePatterns(files, options) {
   const {
     minSimilarity,
@@ -435,274 +520,92 @@ async function detectDuplicatePatterns(files, options) {
   const duplicates = [];
   const maxComparisons = approx ? Infinity : 5e5;
   const allBlocks = files.flatMap(
-    (file) => extractCodeBlocks(file.content, minLines).filter((block) => block.content && block.content.trim().length > 0).map((block) => ({
-      content: block.content,
-      startLine: block.startLine,
-      endLine: block.endLine,
+    (file) => extractCodeBlocks(file.content, minLines).filter(
+      (block) => block && block.content && block.content.trim().length > 0
+    ).map((block) => ({
+      ...block,
       file: file.file,
       normalized: normalizeCode(block.content),
-      patternType: block.patternType,
-      tokenCost: (0, import_core2.estimateTokens)(block.content),
-      linesOfCode: block.linesOfCode
+      tokenCost: block.content ? (0, import_core2.estimateTokens)(block.content) : 0
     }))
   );
-  if (!options.onProgress) {
-    console.log(`Extracted ${allBlocks.length} code blocks for analysis`);
-  }
-  const pythonFiles = files.filter((f) => f.file.toLowerCase().endsWith(".py"));
+  const pythonFiles = files.filter((f) => f.file.endsWith(".py"));
   if (pythonFiles.length > 0) {
     const { extractPythonPatterns: extractPythonPatterns2 } = await Promise.resolve().then(() => (init_python_extractor(), python_extractor_exports));
-    const patterns = await extractPythonPatterns2(
+    const pythonPatterns = await extractPythonPatterns2(
       pythonFiles.map((f) => f.file)
     );
-    const pythonBlocks = patterns.filter((p) => p.code && p.code.trim().length > 0).map((p) => ({
-      content: p.code,
-      startLine: p.startLine,
-      endLine: p.endLine,
-      file: p.file,
-      normalized: normalizeCode(p.code),
-      patternType: p.type,
-      tokenCost: (0, import_core2.estimateTokens)(p.code),
-      linesOfCode: p.endLine - p.startLine + 1
-    }));
-    allBlocks.push(...pythonBlocks);
-    if (!options.onProgress) {
-      console.log(`Added ${pythonBlocks.length} Python patterns`);
-    }
-  }
-  if (!approx && allBlocks.length > 500) {
-    console.log(
-      `\u26A0\uFE0F  Using --no-approx mode with ${allBlocks.length} blocks may be slow (O(B\xB2) complexity).`
-    );
-    console.log(
-      `   Consider using approximate mode (default) for better performance.`
+    allBlocks.push(
+      ...pythonPatterns.map((p) => ({
+        content: p.code,
+        startLine: p.startLine,
+        endLine: p.endLine,
+        file: p.file,
+        normalized: normalizeCode(p.code),
+        patternType: p.type,
+        tokenCost: p.code ? (0, import_core2.estimateTokens)(p.code) : 0,
+        linesOfCode: p.endLine - p.startLine + 1
+      }))
     );
   }
-  const stopwords = /* @__PURE__ */ new Set([
-    "return",
-    "const",
-    "let",
-    "var",
-    "function",
-    "class",
-    "new",
-    "if",
-    "else",
-    "for",
-    "while",
-    "async",
-    "await",
-    "try",
-    "catch",
-    "switch",
-    "case",
-    "default",
-    "import",
-    "export",
-    "from",
-    "true",
-    "false",
-    "null",
-    "undefined",
-    "this"
-  ]);
-  const tokenize = (norm) => {
-    const punctuation = "(){}[];.,";
-    const cleaned = norm.split("").map((ch) => punctuation.includes(ch) ? " " : ch).join("");
-    return cleaned.split(/\s+/).filter((t) => t && t.length >= 3 && !stopwords.has(t.toLowerCase()));
-  };
   const blockTokens = allBlocks.map((b) => tokenize(b.normalized));
-  const invertedIndex = /* @__PURE__ */ new Map();
-  if (approx) {
-    for (let i = 0; i < blockTokens.length; i++) {
-      for (const tok of blockTokens[i]) {
-        let arr = invertedIndex.get(tok);
-        if (!arr) {
-          arr = [];
-          invertedIndex.set(tok, arr);
-        }
-        arr.push(i);
-      }
-    }
-  }
-  const totalComparisons = approx ? void 0 : allBlocks.length * (allBlocks.length - 1) / 2;
-  if (totalComparisons !== void 0) {
-    console.log(
-      `Processing ${totalComparisons.toLocaleString()} comparisons in batches...`
-    );
-  } else {
-    console.log(
-      `Using approximate candidate selection to reduce comparisons...`
-    );
-  }
+  const engine = approx ? new ApproxEngine(allBlocks, blockTokens) : null;
   let comparisonsProcessed = 0;
-  let comparisonsBudgetExhausted = false;
   const startTime = Date.now();
   for (let i = 0; i < allBlocks.length; i++) {
-    if (maxComparisons && comparisonsProcessed >= maxComparisons) {
-      comparisonsBudgetExhausted = true;
-      break;
-    }
+    if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
     if (i % batchSize === 0 && i > 0) {
       if (options.onProgress) {
-        options.onProgress(i, allBlocks.length, `pattern-detect: analyzing blocks`);
+        options.onProgress(i, allBlocks.length, "Analyzing patterns");
       } else {
-        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
-        const duplicatesFound = duplicates.length;
-        if (totalComparisons !== void 0) {
-          const progress = (comparisonsProcessed / totalComparisons * 100).toFixed(1);
-          const remaining = totalComparisons - comparisonsProcessed;
-          const rate = comparisonsProcessed / parseFloat(elapsed);
-          const eta = remaining > 0 ? (remaining / rate).toFixed(0) : 0;
-          console.log(
-            `   ${progress}% (${comparisonsProcessed.toLocaleString()}/${totalComparisons.toLocaleString()} comparisons, ${elapsed}s elapsed, ~${eta}s remaining, ${duplicatesFound} duplicates)`
-          );
-        } else {
-          console.log(
-            `   Processed ${i.toLocaleString()}/${allBlocks.length} blocks (${elapsed}s elapsed, ${duplicatesFound} duplicates)`
-          );
-        }
+        const elapsed = (Date.now() - startTime) / 1e3;
+        console.log(
+          `   Processed ${i}/${allBlocks.length} blocks (${elapsed.toFixed(1)}s, ${duplicates.length} duplicates)`
+        );
       }
-      await new Promise((resolve) => setImmediate(resolve));
+      await new Promise((r) => setImmediate((resolve) => r(resolve)));
     }
     const block1 = allBlocks[i];
-    let candidates = null;
-    if (approx) {
-      const counts = /* @__PURE__ */ new Map();
-      const block1Tokens = new Set(blockTokens[i]);
-      const block1Size = block1Tokens.size;
-      const rareTokens = blockTokens[i].filter((tok) => {
-        const blocksWithToken = invertedIndex.get(tok)?.length || 0;
-        return blocksWithToken < allBlocks.length * 0.1;
-      });
-      for (const tok of rareTokens) {
-        const ids = invertedIndex.get(tok);
-        if (!ids) continue;
-        for (const j of ids) {
-          if (j <= i) continue;
-          if (allBlocks[j].file === block1.file) continue;
-          counts.set(j, (counts.get(j) || 0) + 1);
-        }
-      }
-      candidates = Array.from(counts.entries()).filter(([j, shared]) => {
-        const block2Tokens = blockTokens[j];
-        const block2Size = block2Tokens.length;
-        const minSize = Math.min(block1Size, block2Size);
-        const sharedPercentage = shared / minSize;
-        return shared >= minSharedTokens && sharedPercentage >= 0.3;
-      }).sort((a, b) => b[1] - a[1]).slice(0, Math.min(maxCandidatesPerBlock, 5)).map(([j, shared]) => ({ j, shared }));
-    }
-    if (approx && candidates) {
-      for (const { j } of candidates) {
-        if (!approx && maxComparisons !== Infinity && comparisonsProcessed >= maxComparisons) {
-          console.log(
-            `\u26A0\uFE0F  Comparison safety limit reached (${maxComparisons.toLocaleString()} comparisons in --no-approx mode).`
-          );
+    const candidates = engine ? engine.findCandidates(i, minSharedTokens, maxCandidatesPerBlock) : allBlocks.slice(i + 1).map((_, idx) => ({ j: i + 1 + idx, shared: 0 }));
+    for (const { j } of candidates) {
+      if (!approx && comparisonsProcessed >= maxComparisons) break;
+      comparisonsProcessed++;
+      const block2 = allBlocks[j];
+      if (block1.file === block2.file) continue;
+      const sim = jaccardSimilarity(blockTokens[i], blockTokens[j]);
+      if (sim >= minSimilarity) {
+        const severity = calculateSeverity(
+          block1.file,
+          block2.file,
+          block1.content,
+          sim,
+          block1.linesOfCode
+        );
+        const dup = {
+          file1: block1.file,
+          file2: block2.file,
+          line1: block1.startLine,
+          line2: block2.startLine,
+          endLine1: block1.endLine,
+          endLine2: block2.endLine,
+          similarity: sim,
+          snippet: block1.content.substring(0, 200),
+          patternType: block1.patternType,
+          tokenCost: block1.tokenCost,
+          linesOfCode: block1.linesOfCode,
+          severity: severity.severity,
+          reason: severity.reason,
+          suggestion: severity.suggestion
+        };
+        duplicates.push(dup);
+        if (streamResults)
           console.log(
-            `   This prevents excessive runtime on large repos. Consider using approximate mode (default) or --min-lines to reduce blocks.`
-          );
-          break;
-        }
-        comparisonsProcessed++;
-        const block2 = allBlocks[j];
-        const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
-        if (similarity >= minSimilarity) {
-          const { severity, reason, suggestion, matchedRule } = calculateSeverity(
-            block1.file,
-            block2.file,
-            block1.content,
-            similarity,
-            block1.linesOfCode
-          );
-          const duplicate = {
-            file1: block1.file,
-            file2: block2.file,
-            line1: block1.startLine,
-            line2: block2.startLine,
-            endLine1: block1.endLine,
-            endLine2: block2.endLine,
-            similarity,
-            snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
-            patternType: block1.patternType,
-            tokenCost: block1.tokenCost + block2.tokenCost,
-            linesOfCode: block1.linesOfCode,
-            severity,
-            reason,
-            suggestion,
-            matchedRule
-          };
-          duplicates.push(duplicate);
-          if (streamResults) {
-            console.log(
-              `
-   \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`
-            );
-            console.log(
-              `      ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`
-            );
-            console.log(
-              `      Token cost: ${duplicate.tokenCost.toLocaleString()}`
-            );
-          }
-        }
-      }
-    } else {
-      for (let j = i + 1; j < allBlocks.length; j++) {
-        if (maxComparisons && comparisonsProcessed >= maxComparisons) break;
-        comparisonsProcessed++;
-        const block2 = allBlocks[j];
-        if (block1.file === block2.file) continue;
-        const similarity = jaccardSimilarity(blockTokens[i], blockTokens[j]);
-        if (similarity >= minSimilarity) {
-          const { severity, reason, suggestion, matchedRule } = calculateSeverity(
-            block1.file,
-            block2.file,
-            block1.content,
-            similarity,
-            block1.linesOfCode
+            `[DUPLICATE] ${dup.file1}:${dup.line1} <-> ${dup.file2}:${dup.line2} (${Math.round(sim * 100)}%)`
           );
-          const duplicate = {
-            file1: block1.file,
-            file2: block2.file,
-            line1: block1.startLine,
-            line2: block2.startLine,
-            endLine1: block1.endLine,
-            endLine2: block2.endLine,
-            similarity,
-            snippet: block1.content.split("\n").slice(0, 5).join("\n") + "\n...",
-            patternType: block1.patternType,
-            tokenCost: block1.tokenCost + block2.tokenCost,
-            linesOfCode: block1.linesOfCode,
-            severity,
-            reason,
-            suggestion,
-            matchedRule
-          };
-          duplicates.push(duplicate);
-          if (streamResults) {
-            console.log(
-              `
-   \u2705 Found: ${duplicate.patternType} ${Math.round(similarity * 100)}% similar`
-            );
-            console.log(
-              `      ${duplicate.file1}:${duplicate.line1}-${duplicate.endLine1} \u21D4 ${duplicate.file2}:${duplicate.line2}-${duplicate.endLine2}`
-            );
-            console.log(
-              `      Token cost: ${duplicate.tokenCost.toLocaleString()}`
-            );
-          }
-        }
       }
     }
   }
-  if (comparisonsBudgetExhausted) {
-    console.log(
-      `\u26A0\uFE0F  Comparison budget exhausted (${maxComparisons.toLocaleString()} comparisons). Use --max-comparisons to increase.`
-    );
-  }
-  return duplicates.sort(
-    (a, b) => b.similarity - a.similarity || b.tokenCost - a.tokenCost
-  );
+  return duplicates;
 }
 // src/grouping.ts

package/dist/index.mjs CHANGED Viewed

@@ -7,7 +7,7 @@ import {
   generateSummary,
   getSeverityLabel,
   getSmartDefaults
-} from "./chunk-YSDOUNJJ.mjs";
+} from "./chunk-6OEHUI5J.mjs";
 export {
   analyzePatterns,
   calculatePatternScore,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aiready/pattern-detect",
-  "version": "0.11.36",
+  "version": "0.11.38",
   "description": "Semantic duplicate pattern detection for AI-generated code - finds similar implementations that waste AI context tokens",
   "main": "./dist/index.js",
   "module": "./dist/index.mjs",
@@ -45,7 +45,7 @@
   "dependencies": {
     "commander": "^14.0.0",
     "chalk": "^5.3.0",
-    "@aiready/core": "0.9.37"
+    "@aiready/core": "0.9.39"
   },
   "devDependencies": {
     "tsup": "^8.3.5",