npm - muaddib-scanner - Versions diffs - 2.11.38 → 2.11.40 - Mend

muaddib-scanner 2.11.38 → 2.11.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/package.json +1 -1
package/{self-scan-v2.11.38.json → self-scan-v2.11.40.json} +34 -7
package/src/ml/jsonl-writer.js +64 -6
package/src/monitor/classify.js +8 -2
package/src/monitor/ingestion.js +29 -9
package/src/monitor/queue.js +29 -4
package/src/monitor/state.js +22 -0
package/src/monitor/webhook.js +29 -4
package/src/response/playbooks.js +10 -0
package/src/rules/index.js +15 -0
package/src/scanner/ai-config.js +32 -3
package/src/scanner/obfuscation.js +1 -48
package/src/shared/download.js +97 -6
package/src/shared/unicode-invisibles.js +164 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "muaddib-scanner",
-  "version": "2.11.38",
+  "version": "2.11.40",
   "description": "Supply-chain threat detection & response for npm & PyPI/Python",
   "main": "src/index.js",
   "bin": {

package/{self-scan-v2.11.38.json → self-scan-v2.11.40.json} RENAMED Viewed

@@ -1,6 +1,6 @@
 {
   "target": "node_modules",
-  "timestamp": "2026-05-24T22:20:18.999Z",
+  "timestamp": "2026-05-25T09:38:49.363Z",
   "threats": [
     {
       "type": "string_mutation_obfuscation",
@@ -870,6 +870,27 @@
       "playbook": "CRITIQUE: Execution de commande shell dangereuse detectee. Isoler la machine. Verifier si la commande a ete executee.",
       "points": 3
     },
+    {
+      "type": "unicode_invisible_injection",
+      "severity": "CRITICAL",
+      "message": "10 invisible Unicode characters detected (zero-width, variation selectors, tag chars). Possible hidden payload encoded via invisible codepoints.",
+      "file": "iconv-lite/encodings/sbcs-data-generated.js",
+      "count": 1,
+      "reductions": [],
+      "originalSeverity": "CRITICAL",
+      "confidenceTier": "medium",
+      "rule_id": "MUADDIB-OBF-003",
+      "rule_name": "Unicode Invisible Character Injection",
+      "confidence": "high",
+      "domain": "malware",
+      "references": [
+        "https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode",
+        "https://attack.mitre.org/techniques/T1027/"
+      ],
+      "mitre": "T1027",
+      "playbook": "CRITIQUE: Caracteres Unicode invisibles detectes (zero-width, variation selectors). Technique GlassWorm: du code malveillant est encode via des variation selectors invisibles dans les editeurs. Analyser le fichier avec un editeur hexa. Supprimer le package immediatement. Verifier les autres fichiers du projet pour des injections similaires.",
+      "points": 25
+    },
     {
       "type": "high_entropy_string",
       "severity": "LOW",
@@ -1107,17 +1128,17 @@
   ],
   "python": null,
   "summary": {
-    "total": 51,
-    "critical": 2,
+    "total": 52,
+    "critical": 3,
     "high": 6,
     "medium": 28,
     "low": 15,
     "riskScore": 35,
     "riskLevel": "MEDIUM",
     "globalRiskScore": 100,
-    "maxFileScore": 25,
+    "maxFileScore": 26,
     "packageScore": 1,
-    "mostSuspiciousFile": "ajv/lib/ajv.js",
+    "mostSuspiciousFile": "iconv-lite/encodings/sbcs-data-generated.js",
     "fileScores": {
       "esquery/parser.js": 5,
       "ajv/lib/ajv.js": 25,
@@ -1133,7 +1154,7 @@
       "eslint/lib/config/config-loader.js": 11,
       "eslint/lib/eslint/eslint-helpers.js": 25,
       "eslint/lib/eslint/eslint.js": 13,
-      "iconv-lite/encodings/sbcs-data-generated.js": 1,
+      "iconv-lite/encodings/sbcs-data-generated.js": 26,
       "iconv-lite/encodings/sbcs-data.js": 1,
       "ajv/lib/compile/formats.js": 1
     },
@@ -1169,6 +1190,12 @@
         "points": 25,
         "reason": "Dynamic import() with computed URL argument — remote code loading from dynamically constructed URL."
       },
+      {
+        "rule": "MUADDIB-OBF-003",
+        "type": "unicode_invisible_injection",
+        "points": 25,
+        "reason": "10 invisible Unicode characters detected (zero-width, variation selectors, tag chars). Possible hidden payload encoded via invisible codepoints."
+      },
       {
         "rule": "MUADDIB-AST-006",
         "type": "dynamic_require",
@@ -1461,7 +1488,7 @@
     "tierCounts": {
       "verified": 0,
       "high": 0,
-      "medium": 9,
+      "medium": 10,
       "low": 42
     },
     "perceivedFlagged": 0

package/src/ml/jsonl-writer.js CHANGED Viewed

@@ -18,12 +18,19 @@ const DEFAULT_TRAINING_FILE = path.join(__dirname, '..', '..', 'data', 'ml-train
 let TRAINING_FILE = DEFAULT_TRAINING_FILE;
 const MAX_JSONL_SIZE = 100 * 1024 * 1024; // 100MB rotation threshold
+// In-memory line counter. null = needs recompute (cold boot, file rewrite, or
+// path swap). Incremented by appendRecord, zeroed by maybeRotate, invalidated by
+// relabelRecords, setTrainingFile and resetTrainingFile. Before this cache, getStats
+// read the entire JSONL into RAM on every daily report (72MB allocation, ~30K records).
+let _cachedLineCount = null;
 /**
  * Override the training file path (for testing).
  * @param {string} filePath - new file path
  */
 function setTrainingFile(filePath) {
   TRAINING_FILE = filePath;
+  _cachedLineCount = null; // different file → recompute on next getStats
 }
 /**
@@ -31,6 +38,7 @@ function setTrainingFile(filePath) {
  */
 function resetTrainingFile() {
   TRAINING_FILE = DEFAULT_TRAINING_FILE;
+  _cachedLineCount = null;
 }
 /**
@@ -49,6 +57,7 @@ function appendRecord(record) {
     const line = JSON.stringify(record) + '\n';
     fs.appendFileSync(TRAINING_FILE, line, 'utf8');
+    if (_cachedLineCount !== null) _cachedLineCount++;
   } catch (err) {
     // Non-fatal: JSONL export failure should never crash the monitor
     // Log permission errors so they are visible in journalctl (was silent before v2.10.27)
@@ -73,6 +82,7 @@ function maybeRotate() {
     const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
     const rotatedName = TRAINING_FILE.replace('.jsonl', `-${timestamp}.jsonl`);
     fs.renameSync(TRAINING_FILE, rotatedName);
+    _cachedLineCount = 0; // fresh file starts empty
     console.log(`[ML] Rotated training file → ${path.basename(rotatedName)} (${(stat.size / 1024 / 1024).toFixed(1)}MB)`);
   } catch (err) {
     console.error(`[ML] Rotation failed: ${err.message}`);
@@ -107,25 +117,71 @@ function readRecords() {
 }
 /**
- * Get stats about the current JSONL file.
+ * Count non-empty records in a file by streaming it in 64KB chunks. A record is
+ * a line (`\n`-terminated, or a final unterminated one) holding at least one
+ * byte other than space, tab or CR. Approximates the old split/trim count
+ * (trim() also drops other whitespace) while avoiding the full-file readFileSync.
+ *
+ * @param {string} filePath
+ * @returns {number}
+ */
+function countLinesStreaming(filePath) {
+  const BUFFER_SIZE = 64 * 1024;
+  let fd;
+  try {
+    fd = fs.openSync(filePath, 'r');
+  } catch {
+    return 0;
+  }
+  try {
+    const buf = Buffer.alloc(BUFFER_SIZE);
+    let count = 0;
+    let sawContent = false;
+    let bytesRead;
+    while ((bytesRead = fs.readSync(fd, buf, 0, BUFFER_SIZE, null)) > 0) {
+      for (let i = 0; i < bytesRead; i++) {
+        const b = buf[i];
+        if (b === 0x0A) {            // '\n'
+          if (sawContent) count++;
+          sawContent = false;
+        } else if (b !== 0x20 && b !== 0x09 && b !== 0x0D) {
+          // any non-whitespace byte (space, tab, CR are still whitespace)
+          sawContent = true;
+        }
+      }
+    }
+    if (sawContent) count++; // trailing record without final newline
+    return count;
+  } finally {
+    try { fs.closeSync(fd); } catch {}
+  }
+}
+/**
+ * Get stats about the current JSONL file. Uses an in-memory line counter
+ * that is maintained incrementally by appendRecord and invalidated by
+ * rewrite operations — so getStats is O(1) on the hot path of the daily
+ * report (previously O(file size) via readFileSync on a 72MB+ file).
+ *
  * @returns {{ recordCount: number, fileSizeBytes: number, fileSizeMB: string }}
  */
 function getStats() {
   try {
     if (!fs.existsSync(TRAINING_FILE)) {
+      _cachedLineCount = 0;
       return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
     }
     const stat = fs.statSync(TRAINING_FILE);
-    // Count lines without reading the entire file into memory
-    const content = fs.readFileSync(TRAINING_FILE, 'utf8');
-    const lineCount = content.split('\n').filter(l => l.trim()).length;
+    if (_cachedLineCount === null) {
+      _cachedLineCount = countLinesStreaming(TRAINING_FILE);
+    }
     return {
-      recordCount: lineCount,
+      recordCount: _cachedLineCount,
       fileSizeBytes: stat.size,
       fileSizeMB: (stat.size / 1024 / 1024).toFixed(1)
     };
   } catch {
-    return { recordCount: 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
+    return { recordCount: _cachedLineCount || 0, fileSizeBytes: 0, fileSizeMB: '0.0' };
   }
 }
@@ -183,6 +239,8 @@ function relabelRecords(packageName, newLabel, sandboxFindingCount, manualReview
     if (updated > 0) {
       fs.writeFileSync(TRAINING_FILE, newLines.join('\n'), 'utf8');
+      // File was rewritten — line count cache must be recomputed on next read.
+      _cachedLineCount = null;
       console.log(`[ML] Relabeled ${updated} records for ${packageName} → ${newLabel}`);
     }
     return updated;

package/src/monitor/classify.js CHANGED Viewed

@@ -225,11 +225,15 @@ function isSuspectClassification(result) {
 /**
  * Classify an error into a category for the daily report breakdown.
  * @param {Error} err
- * @returns {'too_large'|'tar_failed'|'http_error'|'static_timeout'|'timeout'|'other'}
+ * @returns {'too_large'|'tar_failed'|'archive_failed'|'unsupported_format'|'http_error'|'static_timeout'|'timeout'|'other'}
  */
 function classifyError(err) {
   const msg = (err && err.message) || '';
-  if (/too large|tarball too large/i.test(msg)) return 'too_large';
+  if (/too large|tarball too large|exceeds \d+/i.test(msg)) return 'too_large';
+  // Wheel/zip extraction failures must NOT be lumped with tar failures —
+  // they were the dominant noise before adm-zip dispatch.
+  if (/unsupported archive format/i.test(msg)) return 'unsupported_format';
+  if (/zip[\s_-]|wheel|whl\b/i.test(msg)) return 'archive_failed';
   if (/tar\b|extract/i.test(msg)) return 'tar_failed';
   if (/HTTP [45]\d\d|HTTP \d{3}/i.test(msg)) return 'http_error';
   if (/static scan timeout/i.test(msg)) return 'static_timeout';
@@ -257,6 +261,8 @@ function formatErrorBreakdown(total, byType) {
   const parts = [];
   if (byType.http_error > 0) parts.push(`HTTP: ${byType.http_error}`);
   if (byType.tar_failed > 0) parts.push(`tar: ${byType.tar_failed}`);
+  if (byType.archive_failed > 0) parts.push(`zip: ${byType.archive_failed}`);
+  if (byType.unsupported_format > 0) parts.push(`unsupported: ${byType.unsupported_format}`);
   if (byType.too_large > 0) parts.push(`too large: ${byType.too_large}`);
   if (byType.timeout > 0) parts.push(`timeout: ${byType.timeout}`);
   if (byType.static_timeout > 0) parts.push(`static: ${byType.static_timeout}`);

package/src/monitor/ingestion.js CHANGED Viewed

@@ -46,7 +46,8 @@ let consecutivePollErrors = 0;
 // `ingestion._deps.httpsPost = fakePost` and have it take effect inside
 // pollPyPIChangelog. Kept tiny on purpose — only network I/O lives here.
 const _deps = {
-  httpsPost: null // populated below once httpsPost is defined
+  httpsPost: null, // populated below once httpsPost is defined
+  httpsGet: null   // populated below; used by npm pollers so tests can stub
 };
 function getConsecutivePollErrors() {
@@ -131,6 +132,7 @@ function httpsPost(url, body, headers = {}, timeoutMs = 30_000) {
 }
 _deps.httpsPost = httpsPost;
+_deps.httpsGet = httpsGet;
 async function getWeeklyDownloads(packageName) {
   const cached = downloadsCache.get(packageName);
@@ -162,7 +164,7 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
   const url = packageVersion
     ? `https://pypi.org/pypi/${encodeURIComponent(packageName)}/${encodeURIComponent(packageVersion)}/json`
     : `https://pypi.org/pypi/${encodeURIComponent(packageName)}/json`;
-  const body = await httpsGet(url);
+  const body = await _deps.httpsGet(url);
   let data;
   try {
     data = JSON.parse(body);
@@ -177,8 +179,11 @@ async function getPyPITarballUrl(packageName, packageVersion = '') {
   // Fallback: any .tar.gz
   const tarGz = urls.find(u => u.url && u.url.endsWith('.tar.gz'));
   if (tarGz) return { url: tarGz.url, version };
-  // Fallback: first available file
-  if (urls.length > 0 && urls[0].url) return { url: urls[0].url, version };
+  // Fallback: wheel (.whl) — extracted via adm-zip in queue.js, not tar.
+  // Legacy .egg / .tar.bz2 / .exe installers intentionally NOT returned —
+  // they were the cause of ~2773 tar_failed/day before this fix.
+  const wheel = urls.find(u => u.url && (u.url.endsWith('.whl') || u.url.endsWith('.zip')));
+  if (wheel) return { url: wheel.url, version };
   return { url: null, version };
 }
@@ -405,7 +410,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
     // First run: initialize to current seq ("now") via root endpoint
     if (lastSeq == null) {
-      const infoBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
+      const infoBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
       const info = JSON.parse(infoBody);
       const currentSeq = info.update_seq;
       if (currentSeq == null) {
@@ -423,13 +428,13 @@ async function pollNpmChanges(state, scanQueue, stats) {
     const url = `${CHANGES_STREAM_URL}?since=${lastSeq}&limit=${CHANGES_LIMIT}`;
     let body, data;
     try {
-      body = await httpsGet(url, 60000);
+      body = await _deps.httpsGet(url, 60000);
       data = JSON.parse(body);
     } catch (fetchErr) {
       // Invalid seq (stale from pre-migration CouchDB) or transient error — re-init to current seq
       console.warn(`[MONITOR] Changes stream fetch failed (${fetchErr.message}) — attempting seq re-init`);
       try {
-        const reinitBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
+        const reinitBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
         const reinitData = JSON.parse(reinitBody);
         if (reinitData.update_seq != null) {
           state.npmLastSeq = reinitData.update_seq;
@@ -450,7 +455,7 @@ async function pollNpmChanges(state, scanQueue, stats) {
     // Catch-up protection: if too far behind, skip to current
     if (data.results.length === CHANGES_LIMIT) {
-      const currentSeqBody = await httpsGet('https://replicate.npmjs.com/registry/', 10000);
+      const currentSeqBody = await _deps.httpsGet('https://replicate.npmjs.com/registry/', 10000);
       const currentSeqData = JSON.parse(currentSeqBody);
       const currentSeq = currentSeqData.update_seq;
       if (typeof currentSeq === 'number' && typeof data.last_seq === 'number' &&
@@ -459,12 +464,22 @@ async function pollNpmChanges(state, scanQueue, stats) {
         console.warn(`[MONITOR] Changes stream too far behind (${gap} changes) — skipping to current`);
         stats.npmCatchupSkips = (stats.npmCatchupSkips || 0) + 1;
         stats.npmCatchupSkippedSeqs = (stats.npmCatchupSkippedSeqs || 0) + gap;
+        // Catch-up gap = events we know happened but chose to skip. They must
+        // appear in the coverage denominator so the daily report exposes the
+        // gap as low coverage (and the catch-up line explains why).
+        stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + gap;
         state.npmLastSeq = currentSeq;
         saveNpmSeq(currentSeq);
         return 0;
       }
     }
+    // IMPORTANT: count raw events BEFORE filtering — otherwise the coverage
+    // denominator is biased (matches "events we queued", not "events npm
+    // emitted"). The filters below drop _design/self/@types/deleted, but
+    // those were still real changes-stream events.
+    stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + data.results.length;
     let queued = 0;
     for (const change of data.results) {
       // Skip deleted packages
@@ -584,7 +599,7 @@ async function pollNpmRss(state, scanQueue, stats) {
     await acquireRegistrySlot();
     let body;
     try {
-      body = await httpsGet(url);
+      body = await _deps.httpsGet(url);
     } finally {
       releaseRegistrySlot();
     }
@@ -603,6 +618,11 @@ async function pollNpmRss(state, scanQueue, stats) {
       }
     }
+    // Mirror pollNpmChanges: count raw events BEFORE per-package filters
+    // so the coverage denominator stays accurate when the changes stream
+    // falls back to RSS.
+    stats.npmPublishEventsSeen = (stats.npmPublishEventsSeen || 0) + newPackages.length;
     for (const name of newPackages) {
       if (name === SELF_PACKAGE_NAME) {
         console.log(`[MONITOR] SKIPPED (self): ${name}`);

package/src/monitor/queue.js CHANGED Viewed

@@ -13,7 +13,7 @@ const { Worker } = require('worker_threads');
 const { run } = require('../index.js');
 const { runSandbox, isDockerAvailable, tryAcquireSandboxSlot, SANDBOX_CONCURRENCY_MAX } = require('../sandbox/index.js');
 const { sendWebhook } = require('../webhook.js');
-const { downloadToFile, extractTarGz, sanitizePackageName } = require('../shared/download.js');
+const { downloadToFile, extractTarGz, extractArchive, sanitizePackageName } = require('../shared/download.js');
 const { MAX_TARBALL_SIZE } = require('../shared/constants.js');
 const { acquireRegistrySlot, releaseRegistrySlot } = require('../shared/http-limiter.js');
 const { loadCachedIOCs } = require('../ioc/updater.js');
@@ -294,10 +294,22 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
     if (metaSize > MAX_TARBALL_SIZE) {
       console.log(`[MONITOR] SIZE_REJECT: ${name}@${version} — metadata size ${(metaSize / 1024 / 1024).toFixed(1)}MB exceeds ${(MAX_TARBALL_SIZE / 1024 / 1024).toFixed(0)}MB limit (skipped without download)`);
       stats.scanned++;
+      stats.totalTimeMs += Date.now() - startTime;
       return;
     }
-    const tgzPath = path.join(tmpDir, 'package.tar.gz');
+    // Pick the local filename extension from the URL so adm-zip / tar both
+    // read the magic correctly. PyPI wheels arrive as .whl, npm tarballs as
+    // .tgz, sdists as .tar.gz. Anything else falls through to .tar.gz
+    // (ingestion now returns null for unsupported types, so this branch is
+    // a defensive default rather than a real fallback).
+    const urlLower = (tarballUrl || '').toLowerCase();
+    const isWheel = urlLower.endsWith('.whl') || urlLower.endsWith('.zip');
+    const archiveExt = isWheel ? '.whl' : '.tar.gz';
+    const tgzPath = path.join(tmpDir, `package${archiveExt}`);
+    if (isWheel && ecosystem === 'pypi') {
+      stats.pypiWheelsScanned = (stats.pypiWheelsScanned || 0) + 1;
+    }
     // Layer 3: Check tarball cache before downloading
     const cacheKey = tarballCacheKey(name, version);
@@ -338,6 +350,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
     if (fileSize > MAX_TARBALL_SIZE) {
       console.log(`[MONITOR] SKIP: ${name}@${version} — tarball too large (${(fileSize / 1024 / 1024).toFixed(1)}MB)`);
       stats.scanned++;
+      stats.totalTimeMs += Date.now() - startTime;
       return;
     }
@@ -365,7 +378,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
         let bypassQuickScan = false;
         try {
           alreadyExtracted = true;
-          extractedDir = extractTarGz(tgzPath, tmpDir);
+          extractedDir = extractArchive(tgzPath, tmpDir);
           const [pkgThreats, shellThreats] = await Promise.all([
             scanPackageJson(extractedDir),
@@ -382,6 +395,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
           } else {
             console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, quick scan clean)`);
             stats.scanned++;
+            stats.totalTimeMs += Date.now() - startTime;
             stats.clean++;
             updateScanStats('clean');
             return;
@@ -402,6 +416,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
           } else {
             console.log(`[MONITOR] SIZE_SKIP: ${name}@${version} — large package (${(unpackedSize / 1024 / 1024).toFixed(1)}MB, extract failed)`);
             stats.scanned++;
+            stats.totalTimeMs += Date.now() - startTime;
             stats.clean++;
             updateScanStats('clean');
             return;
@@ -411,7 +426,7 @@ async function scanPackage(name, version, ecosystem, tarballUrl, registryMeta, s
     }
     if (!extractedDir) {
-      extractedDir = extractTarGz(tgzPath, tmpDir);
+      extractedDir = extractArchive(tgzPath, tmpDir);
     }
     // ML Phase 2a: Count JS files and detect test presence for enriched features
@@ -1169,6 +1184,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
     try {
       const pypiInfo = await getPyPITarballUrl(item.name, item.version || '');
       if (!pypiInfo.url) {
+        // No sdist / .tar.gz / wheel — likely a legacy egg or msi-only
+        // release. Clean skip: do NOT touch stats.scanned or stats.errors
+        // (those would distort the Commit 1 coverage ratios). The dedicated
+        // pypiSkippedNoArchive counter surfaces volume in the daily report.
+        stats.pypiSkippedNoArchive = (stats.pypiSkippedNoArchive || 0) + 1;
         console.log(`[MONITOR] SKIP: ${item.name} — no tarball URL found on PyPI`);
         return;
       }
@@ -1205,6 +1225,11 @@ async function resolveTarballAndScan(item, stats, dailyAlerts, recentlyScanned,
     return;
   }
   recentlyScanned.add(dedupeKey);
+  // Coverage numerator: one count per unique (ecosystem, name, version) that
+  // reaches a scan attempt. Excludes ATO burst extras that lose the dedup
+  // race, retries, size-cap rejections — those inflate stats.scanned but
+  // would distort the "% of publishes we covered" reading.
+  stats.uniqueScanAttempts = (stats.uniqueScanAttempts || 0) + 1;
   // Abort check: if timeout fired during URL resolution or dedup, bail out
   if (signal && signal.aborted) return;

package/src/monitor/state.js CHANGED Viewed

@@ -991,6 +991,8 @@ function loadDailyStats(stats, dailyAlerts) {
       if (data.errorsByType) {
         stats.errorsByType.too_large = data.errorsByType.too_large || 0;
         stats.errorsByType.tar_failed = data.errorsByType.tar_failed || 0;
+        stats.errorsByType.archive_failed = data.errorsByType.archive_failed || 0;
+        stats.errorsByType.unsupported_format = data.errorsByType.unsupported_format || 0;
         stats.errorsByType.http_error = data.errorsByType.http_error || 0;
         stats.errorsByType.timeout = data.errorsByType.timeout || 0;
         stats.errorsByType.static_timeout = data.errorsByType.static_timeout || 0;
@@ -1001,6 +1003,16 @@ function loadDailyStats(stats, dailyAlerts) {
       stats.llmAnalyzed = data.llmAnalyzed || 0;
       stats.llmSuppressed = data.llmSuppressed || 0;
       stats.changesStreamPackages = data.changesStreamPackages || 0;
+      stats.uniqueScanAttempts = data.uniqueScanAttempts || 0;
+      stats.npmPublishEventsSeen = data.npmPublishEventsSeen || 0;
+      stats.pypiChangelogPackages = data.pypiChangelogPackages || 0;
+      stats.pypiChangelogEvents = data.pypiChangelogEvents || 0;
+      stats.npmCatchupSkippedSeqs = data.npmCatchupSkippedSeqs || 0;
+      stats.npmCatchupSkips = data.npmCatchupSkips || 0;
+      stats.pypiCatchupSkippedEvents = data.pypiCatchupSkippedEvents || 0;
+      stats.pypiCatchupSkips = data.pypiCatchupSkips || 0;
+      stats.pypiWheelsScanned = data.pypiWheelsScanned || 0;
+      stats.pypiSkippedNoArchive = data.pypiSkippedNoArchive || 0;
       if (Array.isArray(data.dailyAlerts)) {
         const restored = data.dailyAlerts.slice(-MAX_DAILY_ALERTS);
         dailyAlerts.length = 0;
@@ -1029,6 +1041,16 @@ function saveDailyStats(stats, dailyAlerts) {
       llmAnalyzed: stats.llmAnalyzed || 0,
       llmSuppressed: stats.llmSuppressed || 0,
       changesStreamPackages: stats.changesStreamPackages || 0,
+      uniqueScanAttempts: stats.uniqueScanAttempts || 0,
+      npmPublishEventsSeen: stats.npmPublishEventsSeen || 0,
+      pypiChangelogPackages: stats.pypiChangelogPackages || 0,
+      pypiChangelogEvents: stats.pypiChangelogEvents || 0,
+      npmCatchupSkippedSeqs: stats.npmCatchupSkippedSeqs || 0,
+      npmCatchupSkips: stats.npmCatchupSkips || 0,
+      pypiCatchupSkippedEvents: stats.pypiCatchupSkippedEvents || 0,
+      pypiCatchupSkips: stats.pypiCatchupSkips || 0,
+      pypiWheelsScanned: stats.pypiWheelsScanned || 0,
+      pypiSkippedNoArchive: stats.pypiSkippedNoArchive || 0,
       dailyAlerts: dailyAlerts.slice()
     };
     atomicWriteFileSync(DAILY_STATS_FILE, JSON.stringify(data, null, 2));

package/src/monitor/webhook.js CHANGED Viewed

@@ -855,11 +855,24 @@ function buildDailyReportEmbed(stats, dailyAlerts) {
   const avg = stats.scanned > 0 ? (stats.totalTimeMs / stats.scanned / 1000).toFixed(1) : '0.0';
   // --- Coverage estimation ---
-  // changesStreamPackages = total versions seen from npm changes stream (≈ published today)
-  const published = stats.changesStreamPackages || 0;
+  // Numerator: unique (ecosystem, name, version) tuples that reached a scan
+  // attempt (post-dedup). Denominator: raw publish events seen on either
+  // changes stream BEFORE per-package filtering, plus npm catch-up gaps and
+  // PyPI publish events that survived per-(name,version) dedup. This stays
+  // bounded near 100% — old "scanned/changesStreamPackages" was racing PyPI
+  // scans and ATO burst extras against an npm-only denominator.
+  const attempted = stats.uniqueScanAttempts || 0;
+  const npmPub = stats.npmPublishEventsSeen || 0;
+  const pypiPub = stats.pypiChangelogPackages || 0;
+  const published = npmPub + pypiPub;
+  const coverageRatio = published > 0 ? (attempted / published * 100).toFixed(0) : '0';
+  const catchupSkipped = (stats.npmCatchupSkippedSeqs || 0) + (stats.pypiCatchupSkippedEvents || 0);
+  const opsSuffix = catchupSkipped > 0
+    ? `\nOps: ${stats.scanned} | Catch-up skip: ${catchupSkipped}`
+    : `\nOps: ${stats.scanned}`;
   const coverageText = published > 0
-    ? `${stats.scanned}/${published} (${(stats.scanned / published * 100).toFixed(0)}%)`
-    : `${stats.scanned} scanned`;
+    ? `${attempted}/${published} (${coverageRatio}%)${opsSuffix}`
+    : `${attempted} attempted${opsSuffix}`;
   // --- Timeouts ---
   const staticTimeouts = (stats.errorsByType && stats.errorsByType.static_timeout) || 0;
@@ -1019,6 +1032,8 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
   stats.errors = 0;
   stats.errorsByType.too_large = 0;
   stats.errorsByType.tar_failed = 0;
+  stats.errorsByType.archive_failed = 0;
+  stats.errorsByType.unsupported_format = 0;
   stats.errorsByType.http_error = 0;
   stats.errorsByType.timeout = 0;
   stats.errorsByType.static_timeout = 0;
@@ -1033,6 +1048,16 @@ async function sendDailyReport(stats, dailyAlerts, recentlyScanned, downloadsCac
   // Reset LLM detective internal stats
   try { require('../ml/llm-detective.js').resetStats(); } catch {}
   stats.changesStreamPackages = 0;
+  stats.uniqueScanAttempts = 0;
+  stats.npmPublishEventsSeen = 0;
+  stats.pypiChangelogPackages = 0;
+  stats.pypiChangelogEvents = 0;
+  stats.npmCatchupSkippedSeqs = 0;
+  stats.npmCatchupSkips = 0;
+  stats.pypiCatchupSkippedEvents = 0;
+  stats.pypiCatchupSkips = 0;
+  stats.pypiWheelsScanned = 0;
+  stats.pypiSkippedNoArchive = 0;
   stats.rssFallbackCount = 0;
   dailyAlerts.length = 0;
   recentlyScanned.clear();

package/src/response/playbooks.js CHANGED Viewed

@@ -399,6 +399,16 @@ const PLAYBOOKS = {
     'Technique Shai-Hulud (TeamPCP). Supprimer les fichiers .claude/settings.json ' +
     'et .vscode/tasks.json avant ouverture.',
+  aiconf_unicode_obfuscation:
+    'CRITIQUE: Fichier de config d\'agent IA contient des caracteres Unicode invisibles ' +
+    '(zero-width, directional override, variation selectors). Technique TrapDoor (mai 2026): ' +
+    'l\'attaquant insere des U+200B au milieu de mots-cles pour echapper a la revue humaine ' +
+    'et aux regex statiques, tandis que l\'agent IA (Claude, Cursor) lit le contenu normalise ' +
+    'et execute le payload cache. NE PAS ouvrir ce projet avec un agent IA. Ouvrir le fichier ' +
+    'dans un editeur qui affiche les caracteres invisibles (VS Code: "editor.renderControlCharacters") ' +
+    'pour inspecter le contenu reel. Supprimer le fichier ou nettoyer les caracteres invisibles ' +
+    'avant toute utilisation. Si deja ouvert avec un agent IA, regenerer tous les secrets touches.',
   ai_agent_abuse:
     'CRITIQUE: Un agent IA (Claude, Gemini, Q) est invoque avec des flags de bypass de securite ' +
     '(--dangerously-skip-permissions, --yolo, --trust-all-tools). Technique s1ngularity/Nx. ' +

package/src/rules/index.js CHANGED Viewed

@@ -914,6 +914,21 @@ const RULES = {
     ],
     mitre: 'T1546'
   },
+  aiconf_unicode_obfuscation: {
+    id: 'MUADDIB-AICONF-004',
+    name: 'Zero-Width Unicode Obfuscation in AI Config',
+    severity: 'CRITICAL',
+    confidence: 'high',
+    domain: 'malware',
+    description: 'Fichier de configuration d\'agent IA (.cursorrules, CLAUDE.md, copilot-instructions.md) contient des caracteres Unicode invisibles (zero-width, directional override, variation selectors) qui cachent des instructions a la revue humaine ou cassent des mots-cles pour echapper a la detection regex. Technique TrapDoor (mai 2026): curl|sh interspersee de U+200B passe au travers du regex /curl/ tandis que l\'agent IA execute le payload normalise.',
+    references: [
+      'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
+      'https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode',
+      'https://trojansource.codes/',
+      'https://attack.mitre.org/techniques/T1027/'
+    ],
+    mitre: 'T1027.013'
+  },
   require_cache_poison: {
     id: 'MUADDIB-AST-019',

package/src/scanner/ai-config.js CHANGED Viewed

@@ -18,6 +18,14 @@
 const fs = require('fs');
 const path = require('path');
+const { countInvisibleUnicode, stripInvisibleUnicode } = require('../shared/unicode-invisibles.js');
+// Count at or above which an AI config file is flagged as ZW-Unicode-obfuscated.
+// Lower than obfuscation.js (10) because .cursorrules / CLAUDE.md should never
+// legitimately contain invisible codepoints — even international content uses
+// only visible chars. CJK, accents, and emoji carrying a U+FE0F variation
+// selector are NOT counted by countInvisibleUnicode.
+const AI_CONFIG_ZW_THRESHOLD = 5;
 // AI agent config files to scan for prompt injection (relative to project root)
 const AI_CONFIG_FILES = [
@@ -111,7 +119,12 @@ function scanAIConfig(targetPath) {
     }
     const relPath = configFile;
-    const fileThreats = analyzeAIConfigFile(content, relPath);
+    // Normalize invisible Unicode BEFORE running regex patterns.
+    // Without this, an attacker can split keywords with U+200B (`c<U+200B>url`) to
+    // evade /curl\s+/ — the exact TrapDoor (May 2026) .cursorrules vector.
+    const invisibleCount = countInvisibleUnicode(content);
+    const normalized = invisibleCount > 0 ? stripInvisibleUnicode(content) : content;
+    const fileThreats = analyzeAIConfigFile(normalized, relPath, invisibleCount);
     threats.push(...fileThreats);
   }
@@ -218,14 +231,30 @@ function analyzeIDEHookFile(content, relPath) {
 }
 /**
- * Analyze a single AI config file for prompt injection patterns
+ * Analyze a single AI config file for prompt injection patterns.
+ *
+ * @param {string} content - File content, already normalized (invisible Unicode stripped).
+ * @param {string} relPath - Relative path of the config file.
+ * @param {number} invisibleCount - Number of invisible Unicode codepoints in the original (pre-strip) content.
  */
-function analyzeAIConfigFile(content, relPath) {
+function analyzeAIConfigFile(content, relPath, invisibleCount) {
   const threats = [];
   let hasShellCommand = false;
   let hasExfiltration = false;
   let hasCredentialAccess = false;
+  // Zero-width / directional Unicode obfuscation (TrapDoor, May 2026).
+  // An attacker can hide instructions or split keywords with U+200B etc. so
+  // human reviewers see "harmless" text while the AI agent reads the payload.
+  if (invisibleCount >= AI_CONFIG_ZW_THRESHOLD) {
+    threats.push({
+      type: 'aiconf_unicode_obfuscation',
+      severity: 'CRITICAL',
+      message: `AI config contains ${invisibleCount} invisible Unicode characters (zero-width / directional / variation selectors) in ${relPath} — content was normalized before pattern matching. Possible hidden instructions or keyword-splitting evasion (TrapDoor pattern).`,
+      file: relPath
+    });
+  }
   // Check shell command patterns
   for (const pattern of SHELL_COMMAND_PATTERNS) {
     if (pattern.regex.test(content)) {

package/src/scanner/obfuscation.js CHANGED Viewed

@@ -1,6 +1,7 @@
 const fs = require('fs');
 const path = require('path');
 const { findFiles, forEachSafeFile, debugLog } = require('../utils.js');
+const { countInvisibleUnicode } = require('../shared/unicode-invisibles.js');
 // node_modules NOT excluded: detect obfuscated code in dependencies.
 // dist/build/out/output excluded: bundled output is always flagged as isPackageOutput (LOW)
@@ -198,52 +199,4 @@ function hasLargeStringArray(content) {
   return false;
 }
-/**
- * Count invisible Unicode codepoints in content (GlassWorm detection).
- * Covers BMP zero-width chars, variation selectors, and supplementary plane
- * tag characters / variation selectors supplement via codePointAt iteration.
- *
- * Codepoints detected:
- * - U+200B, U+200C, U+200D (zero-width space/joiner/non-joiner)
- * - U+FEFF (BOM — only if position > 0; pos 0 is legitimate BOM)
- * - U+2060 (word joiner), U+180E (Mongolian vowel separator)
- * - U+FE00-U+FE0E (variation selectors — excludes U+FE0F emoji presentation selector)
- * - U+E0100-U+E01EF (variation selectors supplement)
- * - U+E0001-U+E007F (tag characters)
- */
-function countInvisibleUnicode(content) {
-  let count = 0;
-  for (let i = 0; i < content.length; i++) {
-    const cp = content.codePointAt(i);
-    // BMP invisible chars
-    if (cp === 0x200B || cp === 0x200C || cp === 0x200D ||
-        cp === 0x2060 || cp === 0x180E) {
-      count++;
-    }
-    // BOM only suspicious after position 0
-    else if (cp === 0xFEFF && i > 0) {
-      count++;
-    }
-    // BMP variation selectors (U+FE00-U+FE0E) — excludes U+FE0F (emoji presentation selector)
-    else if (cp >= 0xFE00 && cp <= 0xFE0E) {
-      count++;
-    }
-    // Supplementary plane: variation selectors supplement (U+E0100-U+E01EF)
-    else if (cp >= 0xE0100 && cp <= 0xE01EF) {
-      count++;
-      i++; // skip surrogate pair low half
-    }
-    // Supplementary plane: tag characters (U+E0001-U+E007F)
-    else if (cp >= 0xE0001 && cp <= 0xE007F) {
-      count++;
-      i++; // skip surrogate pair low half
-    }
-    // Skip surrogate pair low half for other supplementary chars
-    else if (cp > 0xFFFF) {
-      i++;
-    }
-  }
-  return count;
-}
 module.exports = { detectObfuscation };

package/src/shared/download.js CHANGED Viewed

@@ -2,6 +2,7 @@ const https = require('https');
 const fs = require('fs');
 const path = require('path');
 const { execFileSync } = require('child_process');
+const AdmZip = require('adm-zip');
 const { MAX_TARBALL_SIZE, DOWNLOAD_TIMEOUT } = require('./constants.js');
 // Allowed redirect domains for tarball downloads (SSRF protection)
@@ -221,13 +222,30 @@ function downloadToFile(url, destPath, timeoutMs = DOWNLOAD_TIMEOUT) {
 }
 /**
- * Extract a .tar.gz to a directory. Returns the package root.
- * Uses execFileSync (no shell) to prevent command injection.
- * @param {string} tgzPath - Path to the .tar.gz file
- * @param {string} destDir - Destination directory
- * @returns {string} Path to extracted package root
+ * Detect archive format from a path/URL extension.
+ * URL-derived names are reliable enough here: PyPI's `urls[].packagetype`
+ * + filename are authoritative, npm tarballs are always `.tgz`. Returns
+ * 'targz', 'zip', or 'unknown'. Callers either pass an `options.format`
+ * override or trust this detection.
+ *
+ * @param {string} archivePath - Path or URL ending in the archive filename
+ * @returns {'targz'|'zip'|'unknown'}
  */
-function extractTarGz(tgzPath, destDir) {
+function detectArchiveFormat(archivePath) {
+  if (typeof archivePath !== 'string') return 'unknown';
+  const lower = archivePath.toLowerCase();
+  if (lower.endsWith('.tar.gz') || lower.endsWith('.tgz')) return 'targz';
+  if (lower.endsWith('.whl') || lower.endsWith('.zip')) return 'zip';
+  return 'unknown';
+}
+/**
+ * Extract a tar.gz tarball with the system `tar` binary. Used for npm
+ * tarballs and PyPI sdists. Internal implementation — call extractArchive
+ * for new code; extractTarGz remains as a thin wrapper for the existing
+ * scanner/temporal-ast-diff.js callsite.
+ */
+function _extractTarGzImpl(tgzPath, destDir) {
   // Use cwd + relative paths so C: never appears in tar arguments
   // (GNU tar treats C: as remote host, bsdtar doesn't support --force-local)
   const tgzDir = path.dirname(path.resolve(tgzPath));
@@ -258,6 +276,77 @@ function extractTarGz(tgzPath, destDir) {
   return destDir;
 }
+/**
+ * Extract a ZIP archive (PyPI wheels, generic zips) to a directory.
+ * adm-zip is already a runtime dependency (used by src/ioc/scraper.js).
+ *
+ * Two hardening layers before extraction touches disk:
+ *  1. zip-slip: resolve each entry path against destDir and reject anything
+ *     that escapes. path.resolve normalizes ../, mixed separators, and
+ *     absolute paths in a single pass.
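+ *  2. size cap: the sum of the uncompressed sizes declared in the entry
+ *     headers must not exceed MAX_TARBALL_SIZE — defends against zip bombs
+ *     that pass tarball size checks but expand into multi-GB on disk.
+ *
+ * If either check fails, an Error is thrown before anything is extracted.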
+ */
+function _extractZipImpl(zipPath, destDir) {
+  const zip = new AdmZip(zipPath);
+  const entries = zip.getEntries();
+  const resolvedDest = path.resolve(destDir);
+  let totalUncompressed = 0;
+  for (const entry of entries) {
+    totalUncompressed += (entry.header && entry.header.size) || 0;
+    if (totalUncompressed > MAX_TARBALL_SIZE) {
+      throw new Error(
+        `Zip extract refused: total uncompressed size ${totalUncompressed} exceeds ${MAX_TARBALL_SIZE}`
+      );
+    }
+    const target = path.resolve(destDir, entry.entryName);
+    if (target !== resolvedDest && !target.startsWith(resolvedDest + path.sep)) {
+      throw new Error(`Unsafe zip entry escapes destDir: ${entry.entryName}`);
+    }
+  }
+  zip.extractAllTo(destDir, /* overwrite */ true);
+  // Wheels carry a flat layout (no leading `package/`); collapse into the
+  // single top-level dir if there is exactly one (matches sdist behavior so
+  // the scanner pipeline can treat the result uniformly).
+  try {
+    const top = fs.readdirSync(destDir);
+    if (top.length === 1) {
+      const single = path.join(destDir, top[0]);
+      const stat = fs.lstatSync(single);
+      if (!stat.isSymbolicLink() && stat.isDirectory()) return single;
+    }
+  } catch { /* ignore — fall back to destDir */ }
+  return destDir;
+}
+/**
+ * Extract an archive to a directory, dispatching on file extension.
+ * Supports `.tar.gz` / `.tgz` (tar) and `.whl` / `.zip` (adm-zip).
+ *
+ * @param {string} archivePath - Path to the archive on disk
+ * @param {string} destDir - Destination directory (must exist)
+ * @param {Object} [options]
+ * @param {'targz'|'zip'} [options.format] - override auto-detection
+ * @returns {string} Path to extracted package root
+ * @throws {Error} when the format is unknown or extraction fails
+ */
+function extractArchive(archivePath, destDir, options = {}) {
+  const format = options.format || detectArchiveFormat(archivePath);
+  if (format === 'targz') return _extractTarGzImpl(archivePath, destDir);
+  if (format === 'zip') return _extractZipImpl(archivePath, destDir);
+  throw new Error(`Unsupported archive format for ${path.basename(archivePath)}`);
+}
+/**
+ * Backwards-compatible wrapper for the original tar.gz-only extractor.
+ * Kept because src/scanner/temporal-ast-diff.js and existing tests still
+ * import it by name. New code should call extractArchive instead.
+ */
+function extractTarGz(tgzPath, destDir) {
+  return _extractTarGzImpl(tgzPath, destDir);
+}
 /**
  * Sanitize a package name for use in temporary directory names.
  * Removes path traversal sequences, slashes, and @ symbols.
@@ -277,6 +366,8 @@ function sanitizePackageName(packageName) {
 module.exports = {
   downloadToFile,
   extractTarGz,
+  extractArchive,
+  detectArchiveFormat,
   sanitizePackageName,
   isAllowedDownloadRedirect,
   normalizeHostname,

package/src/shared/unicode-invisibles.js ADDED Viewed

@@ -0,0 +1,164 @@
+'use strict';
+/**
+ * Unicode invisible character helpers — shared by obfuscation.js and ai-config.js.
+ *
+ * Extracted in v2.11.25 (TrapDoor campaign, May 2026): the local function in
+ * obfuscation.js covered `.js/.cjs/.mjs/.ts/.tsx/.py` but not AI config files
+ * (.cursorrules, CLAUDE.md). Sharing it lets ai-config.js normalize content
+ * before running its regexes and block the "cu<U+200B>rl|sh" vector, where
+ * zero-width characters are interspersed inside the keyword.
+ *
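+ * Codepoints detected (broader than the original obfuscation.js scope, which
+ * did not include LRM/RLM, the directional overrides or the invisible math
+ * operators). NOTE(review): the removed obfuscation.js version counted the
+ * whole U+E0001-U+E007F range; this one skips U+E0002-U+E001F, so it is not a
+ * strict superset.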
+ *
+ *   Zero-width:
+ *     U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ
+ *     U+2060 word joiner
+ *     U+180E Mongolian vowel separator
+ *
+ *   Directional (bidi spoofing — Trojan Source CVE-2021-42574) :
+ *     U+200E LRM, U+200F RLM
+ *     U+202A LRE, U+202B RLE, U+202C PDF, U+202D LRO, U+202E RLO
+ *
+ *   Invisible math operators (can break a parser without being seen):
+ *     U+2061 function application, U+2062 invisible times,
+ *     U+2063 invisible separator, U+2064 invisible plus
+ *
+ *   BOM (mid-text only; at position 0 it is a legitimate UTF-8 BOM):
+ *     U+FEFF
+ *
+ *   Variation selectors :
+ *     U+FE00-FE0E (excludes U+FE0F emoji presentation selector — legitimate)
+ *     U+E0100-E01EF supplementary plane variation selectors
+ *
+ *   Tag characters (used by GlassWorm to encode payloads):
+ *     U+E0001, U+E0020-E007F
+ *
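+ * CJK, accents and standard emoji (with U+FE0F) are deliberately EXCLUDED — no
+ * false positives are expected on that kind of legitimate international
+ * content. NOTE(review): U+200D (ZWJ) and U+200C (ZWNJ) are counted, so emoji
+ * ZWJ sequences and scripts that rely on joiners do contribute to the count.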
+ *
+ * References:
+ *  - https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode
+ *  - https://trojansource.codes/ (Trojan Source, CVE-2021-42574)
+ *  - https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates (mai 2026)
+ */
+/**
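+ * Classify the codepoint that starts at UTF-16 index `i` of `content`.
+ *
+ * In the returned object, `invisible` is true when the codepoint is one of
+ * the invisible characters this module detects, and `supplementary` is true
+ * when the codepoint lies above U+FFFF and therefore occupies two UTF-16 code
+ * units (the caller must `i++` to skip the low surrogate half).
+ *
+ * @param {string} content
+ * @param {number} i
+ * @returns {{ invisible: boolean, supplementary: boolean }}
+ */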
+function inspectCodepoint(content, i) {
+  const cp = content.codePointAt(i);
+  // BMP zero-width
+  if (cp === 0x200B || cp === 0x200C || cp === 0x200D) {
+    return { invisible: true, supplementary: false };
+  }
+  // BMP directional (Trojan Source)
+  if (cp === 0x200E || cp === 0x200F ||
+      (cp >= 0x202A && cp <= 0x202E)) {
+    return { invisible: true, supplementary: false };
+  }
+  // BMP word joiner & friends
+  if (cp === 0x2060 || cp === 0x180E) {
+    return { invisible: true, supplementary: false };
+  }
+  // BMP invisible math operators (U+2061-2064)
+  if (cp >= 0x2061 && cp <= 0x2064) {
+    return { invisible: true, supplementary: false };
+  }
+  // BOM only suspicious after position 0
+  if (cp === 0xFEFF && i > 0) {
+    return { invisible: true, supplementary: false };
+  }
+  // BMP variation selectors (U+FE00-U+FE0E) — excludes U+FE0F emoji presentation
+  if (cp >= 0xFE00 && cp <= 0xFE0E) {
+    return { invisible: true, supplementary: false };
+  }
+  // Supplementary plane: variation selectors supplement (U+E0100-U+E01EF)
+  if (cp >= 0xE0100 && cp <= 0xE01EF) {
+    return { invisible: true, supplementary: true };
+  }
+  // Supplementary plane: tag characters (U+E0001 + U+E0020-U+E007F)
+  if (cp === 0xE0001 || (cp >= 0xE0020 && cp <= 0xE007F)) {
+    return { invisible: true, supplementary: true };
+  }
+  // Other supplementary chars (non-invisible) — need to skip low surrogate
+  if (cp > 0xFFFF) {
+    return { invisible: false, supplementary: true };
+  }
+  return { invisible: false, supplementary: false };
+}
+/**
+ * Count invisible Unicode codepoints in `content`.
+ *
+ * @param {string} content
+ * @returns {number}
+ */
+function countInvisibleUnicode(content) {
+  let count = 0;
+  for (let i = 0; i < content.length; i++) {
+    const { invisible, supplementary } = inspectCodepoint(content, i);
+    if (invisible) count++;
+    if (supplementary) i++; // skip low surrogate half
+  }
+  return count;
+}
+/**
+ * Return a copy of `content` with all invisible codepoints removed.
+ *
+ * Used to normalize text before pattern matching: prevents an attacker
+ * from splitting a keyword (`cu<U+200B>rl`) with zero-width chars to evade
+ * regex like /curl\s+/i.
+ *
+ * @param {string} content
+ * @returns {string}
+ */
+function stripInvisibleUnicode(content) {
+  // Fast path: if no UTF-16 code unit exceeds 0x7F, content is pure ASCII — nothing to strip.
+  let hasHighChar = false;
+  for (let i = 0; i < content.length; i++) {
+    if (content.charCodeAt(i) > 0x7F) { hasHighChar = true; break; }
+  }
+  if (!hasHighChar) return content;
+  let out = '';
+  for (let i = 0; i < content.length; i++) {
+    const { invisible, supplementary } = inspectCodepoint(content, i);
+    if (!invisible) {
+      // Preserve original char(s). For supplementary, copy both surrogate halves.
+      if (supplementary) {
+        out += content[i] + content[i + 1];
+        i++;
+      } else {
+        out += content[i];
+      }
+    } else if (supplementary) {
+      // Skip both surrogate halves
+      i++;
+    }
+  }
+  return out;
+}
+module.exports = {
+  countInvisibleUnicode,
+  stripInvisibleUnicode
+};