npm - muaddib-scanner - Versions diffs - 2.11.113 → 2.11.115 - Mend

muaddib-scanner 2.11.113 → 2.11.115

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/audit-data/adjudication-2026-06-14.json +56 -0
package/audit-data/fpr-baseline-2026-06-14.json +2648 -0
package/package.json +1 -1
package/{self-scan-v2.11.113.json → self-scan-v2.11.115.json} +18 -13
package/src/intent-graph.js +34 -192
package/src/pipeline/executor.js +5 -1
package/src/pipeline/processor.js +15 -7
package/src/scanner/ast-detectors/handle-post-walk.js +9 -2
package/src/scanner/module-graph/annotate-sinks.js +8 -5
package/src/scanner/module-graph/constants.js +10 -1
package/src/scanner/module-graph/detect-cross-file.js +56 -4
package/src/scanner/module-graph/index.js +2 -2
package/src/scanner/module-graph/parse-utils.js +13 -1
package/src/scoring.js +41 -0
package/src/sdk-destination.js +328 -0

package/src/scoring.js CHANGED Viewed

@@ -1029,6 +1029,28 @@ const FRAMEWORK_PROTO_RE = new RegExp(
   '^(' + FRAMEWORK_PROTOTYPES.join('|') + ')\\.prototype\\.'
 );
+// FPR sink-coupling (workstream 2026-06): independent exfil / remote-exec sink signal types that
+// point at an ANOMALOUS destination. A credential_regex_harvest signal counts as a true positive
+// only when one of these co-occurs in the package. Deliberately EXCLUDES benign network capability:
+// a bare fetch/http.get, remote_code_load to a first-party CDN/model host, a local server, or a
+// native build are NOT sinks. Dataflow-PROVEN harvest signals (intent_credential_exfil,
+// cross_file_dataflow) are included so a genuine read→exfil taint keeps the signal HIGH regardless
+// of host reputation (anti-false-negative floor). Read by _hasExfilSink; gate is in applyFPReductions.
+const EXFIL_SINK_TYPES = new Set([
+  'suspicious_domain', 'direct_ip_exfil', 'ioc_string_match', 'ioc_match',
+  'known_malicious_package', 'pypi_malicious_package', 'shai_hulud_marker',
+  'detached_credential_exfil', 'silent_stealth_process',
+  'curl_pipe_shell', 'curl_env_exfil', 'reverse_shell', 'dns_exfil', 'oast_callback',
+  'function_constructor_require', 'staged_remote_loader', 'staged_eval_decode',
+  'fetch_decrypt_exec', 'download_exec_binary', 'self_destruct_eval',
+  'newsletter_auto_follow', 'cross_file_dataflow', 'intent_credential_exfil',
+  'intent_command_exfil', 'sandbox_known_exfil_domain', 'sandbox_network_after_sensitive_read'
+]);
+function _hasExfilSink(threats) { // True when at least one threat has a type in EXFIL_SINK_TYPES and a severity other than 'LOW'.
+  if (!Array.isArray(threats)) return false; // any non-array input (null, undefined, object) means "no sink"
+  return threats.some(t => EXFIL_SINK_TYPES.has(t.type) && t.severity !== 'LOW'); // LOW-severity sinks are ignored
+}
 function applyFPReductions(threats, reachableFiles, packageName, packageDeps, reachableFunctions) {
   // Initialize reductions audit trail on each threat
   // Store original severity before any FP reductions, so compound
@@ -1174,6 +1196,25 @@ function applyFPReductions(threats, reachableFiles, packageName, packageDeps, re
     }
   }
+  // FPR sink-coupling gate (workstream 2026-06 — FPR-baseline-2026-06-14.md). credential_regex_harvest
+  // is a weak signal alone: a credential-shaped regex co-located with a network call, with NO proof
+  // the matched secret flows out and NO host-reputation check (ast.js:hasCredentialInsideRegex +
+  // hasNetworkCallInFile). The blind FPR baseline measured 94.4% FP on it — it fires on nodemailer
+  // SMTP code, redaction utilities in framework bundles, and SDKs that parse Authorization headers.
+  // It is a real harvester ONLY when an independent exfil sink to an anomalous destination co-occurs
+  // (suspicious_domain / direct_ip / ioc / detached-exfil / staged loader / curl exfil / dataflow-proven
+  // taint ...). When no such sink is present, downgrade HIGH/CRITICAL → LOW. Runs after the dilution
+  // floor so the floor's restored instance is also gated (the floor protects real exfil; with no sink
+  // there is nothing to protect). No GT sample relies on credential_regex_harvest (verified).
+  if (!_hasExfilSink(threats)) {
+    for (const t of threats) {
+      if (t.type === 'credential_regex_harvest' && (t.severity === 'HIGH' || t.severity === 'CRITICAL')) {
+        t.reductions.push({ rule: 'sink_coupling', from: t.severity, to: 'LOW' });
+        t.severity = 'LOW';
+      }
+    }
+  }
   for (const t of threats) {
     // Audit v3 B3: typosquat with LOW confidence → MEDIUM

package/src/sdk-destination.js ADDED Viewed

@@ -0,0 +1,328 @@
+'use strict';
+// ============================================
+// DESTINATION-AWARE SDK DETECTION (shared leaf module)
+// ============================================
+// Extracted from intent-graph.js (v2.11.x) so the same destination logic can gate
+// every credential→network taint detector — intent coherence (intent-graph.js),
+// dataflow (scanner/dataflow.js), cross-file flow (scanner/module-graph) and the
+// detached/uncaught compounds (scanner/ast-detectors). No project dependencies
+// (only the Node stdlib via callers) → safe to require from any scanner, no cycles.
+//
+// Curated allowlist. Each entry pairs envPattern (a regex tested against the env var name) with the
+// domain suffixes that env var may legitimately be sent to; such a flow is SDK usage, not credential
+// exfiltration. Safe-by-default: unknown env vars or unknown domains remain CRITICAL.
+const SDK_ENV_DOMAIN_MAP = [
+  { envPattern: /^AWS_/i, domains: ['amazonaws.com', 'aws.amazon.com'] },
+  { envPattern: /^AZURE_/i, domains: ['azure.com', 'microsoft.com'] },
+  { envPattern: /^GOOGLE_|^GCP_/i, domains: ['googleapis.com', 'google.com'] },
+  { envPattern: /^FIREBASE_/i, domains: ['firebase.com', 'googleapis.com'] },
+  { envPattern: /^SALESFORCE_/i, domains: ['salesforce.com', 'force.com'] },
+  { envPattern: /^SUPABASE_/i, domains: ['supabase.co', 'supabase.com'] },
+  { envPattern: /^MAILGUN_/i, domains: ['mailgun.net', 'mailgun.com'] },
+  { envPattern: /^STRIPE_/i, domains: ['stripe.com'] },
+  { envPattern: /^TWILIO_/i, domains: ['twilio.com'] },
+  { envPattern: /^SENDGRID_/i, domains: ['sendgrid.com', 'sendgrid.net'] },
+  { envPattern: /^DATADOG_/i, domains: ['datadoghq.com'] },
+  { envPattern: /^SENTRY_/i, domains: ['sentry.io'] },
+  { envPattern: /^SLACK_/i, domains: ['slack.com'] },
+  { envPattern: /^GITHUB_/i, domains: ['github.com', 'githubusercontent.com'] },
+  { envPattern: /^GITLAB_/i, domains: ['gitlab.com'] },
+  { envPattern: /^CLOUDFLARE_/i, domains: ['cloudflare.com'] },
+  { envPattern: /^OPENAI_/i, domains: ['openai.com'] },
+  { envPattern: /^ANTHROPIC_/i, domains: ['anthropic.com'] },
+  { envPattern: /^MONGODB_|^MONGO_/i, domains: ['mongodb.com', 'mongodb.net'] },
+  { envPattern: /^AUTH0_/i, domains: ['auth0.com'] },
+  { envPattern: /^HUBSPOT_/i, domains: ['hubspot.com', 'hubapi.com'] },
+  { envPattern: /^CONTENTFUL_/i, domains: ['contentful.com'] },
+];
+// Generic credential words removed from an env var name when extracting its brand keyword (see extractBrandFromEnvVar).
+const ENV_NOISE_TOKENS = new Set([
+  'API', 'KEY', 'SECRET', 'TOKEN', 'PASSWORD', 'CREDENTIAL',
+  'AUTH', 'ACCESS', 'PRIVATE', 'PUBLIC', 'CLIENT', 'ID', 'URL'
+]);
+// Suspicious tunneling/proxy domains — never considered legitimate SDK destinations. Unanchored, case-insensitive substring match; no g flag, so .test() keeps no state between calls.
+const SUSPICIOUS_DOMAIN_PATTERNS = /ngrok|serveo|localtunnel|burpcollaborator|requestbin|pipedream|webhook\.site/i;
+// URL extraction regex (matches http/https URLs in source code). Global flag; this file only uses it through String.prototype.match.
+const URL_EXTRACT_RE = /https?:\/\/[a-zA-Z0-9\-._~:/?#[\]@!$&'()*+,;=%]+/g;
+// Hostname extraction from Node.js request options: hostname: 'domain.com' or host: 'domain.com'. Functions in this file rebuild it from .source before calling exec().
+const HOSTNAME_OPTION_RE = /(?:hostname|host)\s*:\s*['"`]([a-zA-Z0-9\-._]+)['"`]/g;
+/**
+ * Return the first env var name found in the messages of the given source threats.
+ * Messages look like: "process.env.SALESFORCE_API_KEY", "env var MAILGUN_API_KEY accessed"
+ * Threats are scanned in order; a threat without a message is skipped.
+ *
+ * @param {Array<{message?: string}>} sourceThreats - threats whose messages are searched
+ * @returns {string|null} the captured name, or null when no message contains one
+ */
+function extractEnvVarFromMessage(sourceThreats) {
+  for (const t of sourceThreats) {
+    if (!t.message) continue;
+    // Preferred form: process.env.VAR_NAME (case-insensitive, so lower-case names match too)
+    const envMatch = t.message.match(/process\.env\.([A-Z_][A-Z0-9_]*)/i);
+    if (envMatch) return envMatch[1];
+    // Fallback: a standalone upper-case name containing at least one underscore (e.g., "SALESFORCE_API_KEY")
+    const varMatch = t.message.match(/\b([A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+)\b/);
+    if (varMatch) return varMatch[1];
+  }
+  return null;
+}
+/**
+ * Extract the brand keyword from an env var name: upper-case it, split on "_", drop
+ * ENV_NOISE_TOKENS and empty segments, then return the FIRST remaining segment.
+ * MAILGUN_API_KEY → MAILGUN, SALESFORCE_CLIENT_SECRET → SALESFORCE
+ *
+ * @param {string} envVarName - env var name, any case
+ * @returns {string|null} upper-case brand token, or null when every segment is noise
+ */
+function extractBrandFromEnvVar(envVarName) {
+  const parts = envVarName.toUpperCase().split('_');
+  const brandParts = parts.filter(p => !ENV_NOISE_TOKENS.has(p) && p.length > 0);
+  return brandParts.length > 0 ? brandParts[0] : null;
+}
+/**
+ * Extract the host from an http(s) URL string.
+ * Returns the lower-cased run of hostname characters ([a-zA-Z0-9.-]) that directly
+ * follows the scheme, so the port, path and query are excluded.
+ *
+ * NOTE(review): a userinfo component is not skipped. For "https://a.com@b.net/" this
+ * returns "a.com" rather than the real host "b.net"; URL_EXTRACT_RE accepts "@", so such
+ * URLs can reach this function. Confirm whether destination checks need to handle it.
+ *
+ * @param {string} url - text beginning with http:// or https://
+ * @returns {string|null} host, or null when url does not match or is not a string
+ */
+function extractDomain(url) {
+  try {
+    // Capture only valid hostname characters so a path-less URL immediately followed by
+    // a quote/paren (e.g. fetch('https://api.openai.com')) does not absorb the trailing
+    // quote and paren into the host. Matching stops at /, :, ?, #, quotes, parens, etc.
+    const match = url.match(/^https?:\/\/([a-zA-Z0-9.\-]+)/i);
+    return match ? match[1].toLowerCase() : null;
+  } catch {
+    return null;
+  }
+}
+/**
+ * Check if a domain equals, or is a subdomain of, any of the expected domains.
+ * The suffix match is anchored on a "." boundary:
+ * api.mailgun.net matches mailgun.net, sub.api.stripe.com matches stripe.com,
+ * but notstripe.com does not match stripe.com.
+ * Comparison is case-sensitive as written.
+ *
+ * @param {string} domain - host to test
+ * @param {string[]} expectedDomains - accepted registrable domains
+ * @returns {boolean} true on the first exact or dot-anchored suffix match
+ */
+function domainMatchesSuffix(domain, expectedDomains) {
+  for (const expected of expectedDomains) {
+    if (domain === expected || domain.endsWith('.' + expected)) return true;
+  }
+  return false;
+}
+/**
+ * Check if an env var + file content represents a legitimate SDK pattern.
+ *
+ * Hosts are collected from every http(s) URL and every hostname:/host: option literal
+ * in the file. The result is false when no host is found, when any host matches
+ * SUSPICIOUS_DOMAIN_PATTERNS, or when any host is a dotted-quad IPv4 literal (loopback
+ * and private addresses included). Otherwise:
+ * 1. Curated allowlist: the first SDK_ENV_DOMAIN_MAP entry whose envPattern matches
+ *    decides the result — true only if EVERY host matches that entry's domains.
+ * 2. Heuristic fallback (no allowlist entry): the brand keyword (3+ characters) must
+ *    equal a dot-separated label of EVERY host, or — when the env var name ends in a
+ *    credential suffix — of AT LEAST ONE host.
+ *
+ * NOTE(review): the label test accepts any label position, so "acme.evil.net" matches
+ * brand ACME. A null/undefined envVarName throws at toUpperCase() once the allowlist
+ * loop finds no match — confirm callers guard against it.
+ *
+ * @param {string} envVarName - e.g., "SALESFORCE_API_KEY"
+ * @param {string} fileContent - source code of the file
+ * @returns {boolean} true if SDK pattern (should skip intent pair)
+ */
+function isSDKPattern(envVarName, fileContent) {
+  // Extract domains from full URLs (https://api.stripe.com/v1/charges)
+  const urls = fileContent.match(URL_EXTRACT_RE) || [];
+  const domains = urls.map(u => extractDomain(u)).filter(Boolean);
+  // Also extract hostnames from Node.js request options (hostname: 'api.stripe.com')
+  let hostnameMatch;
+  const hostnameRe = new RegExp(HOSTNAME_OPTION_RE.source, 'g');
+  while ((hostnameMatch = hostnameRe.exec(fileContent)) !== null) {
+    const hostname = hostnameMatch[1].toLowerCase();
+    if (hostname && !domains.includes(hostname)) {
+      domains.push(hostname);
+    }
+  }
+  // No URLs found — can't confirm SDK pattern, default to suspicious
+  if (domains.length === 0) return false;
+  // Check for suspicious tunneling domains — immediate fail
+  for (const domain of domains) {
+    if (SUSPICIOUS_DOMAIN_PATTERNS.test(domain)) return false;
+  }
+  // Check for raw IP addresses — immediate fail
+  for (const domain of domains) {
+    if (/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/.test(domain)) return false;
+  }
+  // 1. Try curated allowlist first (strict: ALL domains must match)
+  // Curated allowlist is authoritative — no relaxation here to prevent
+  // attacker injecting a legitimate domain alongside their C2 domain.
+  for (const mapping of SDK_ENV_DOMAIN_MAP) {
+    if (mapping.envPattern.test(envVarName)) {
+      return domains.every(d => domainMatchesSuffix(d, mapping.domains));
+    }
+  }
+  // R2: credential-suffixed env vars get relaxed domain matching (at least ONE match).
+  // SDKs commonly call their own API + CDN/logging/analytics domains.
+  // Safety: suspicious domains and raw IPs are already rejected above.
+  // Only applies to the heuristic fallback — curated allowlist stays strict.
+  const CREDENTIAL_SUFFIXES = ['_API_KEY', '_SECRET', '_TOKEN', '_SECRET_KEY', '_ACCESS_KEY'];
+  const upperName = envVarName.toUpperCase();
+  const hasCredentialSuffix = CREDENTIAL_SUFFIXES.some(s => upperName.endsWith(s));
+  // 2. Heuristic fallback: extract brand keyword and check domain labels
+  const brand = extractBrandFromEnvVar(envVarName);
+  if (!brand || brand.length < 3) return false; // Too short for reliable matching
+  const brandLower = brand.toLowerCase();
+  // 2a. Strict check: every domain matches brand (existing behavior)
+  // e.g., brand "ACME" matches "api.acme.com" (label "acme") but not "api.acmetech.com"
+  if (domains.every(d => {
+    const labels = d.split('.');
+    return labels.some(label => label === brandLower);
+  })) return true;
+  // 2b. R2 relaxed: credential suffix + at least one domain matches brand
+  if (hasCredentialSuffix && domains.some(d => {
+    const labels = d.split('.');
+    return labels.some(label => label === brandLower);
+  })) return true;
+  return false;
+}
+// ============================================
+// DESTINATION-BENIGNNESS GATE (env-var-independent)
+// ============================================
+// isSDKPattern() needs a single extractable env var + matching domain. That model
+// breaks on (a) multi-provider files (a CLI that reads GEMINI_API_KEY *and*
+// ANTHROPIC_API_KEY and calls both), and (b) flows whose credential source has no
+// extractable env var (cross_file_dataflow / detached compounds). For those, judge the
+// DESTINATIONS, not the env var: a credential→network flow is benign iff EVERY network
+// host in scope is provably non-exfil.
+//
+// Benign classes (NOT attacker-spoofable):
+//   - loopback / RFC1918 private / link-local IPs, localhost, *.local        (local IPC)
+//   - reserved test domains (example.com, *.test, *.invalid)                 (RFC 2606/6761)
+//   - curated SaaS/cloud/AI provider API domains                            (cannot echo a
+//     POST body back to a third party — UNLIKE paste sites / bot webhooks, deliberately
+//     EXCLUDED, see SUSPICIOUS_DOMAIN_PATTERNS + the exclusion note below)
+// Deliberately NOT benign: package "own domain" from package.json (attacker writes it),
+// unknown domains, public IPs, suspicious tunnels/paste hosts. Any of those ⇒ keep firing.
+// Safe-by-default: no extractable host ⇒ NOT benign (do not suppress).
+// AI provider domains (2025-26) that are absent from the env→domain map. Bot/messaging/paste
+// channels (telegram, discord webhooks, pastebin, gist, transfer.sh, …) are intentionally left out
+// of this list: they CAN relay an exfil POST to the attacker, so they must keep firing.
+const AI_PROVIDER_DOMAIN_SUFFIXES = [
+  'claude.com', 'openrouter.ai', 'deepseek.com', 'x.ai', 'mistral.ai', 'cohere.ai',
+  'cohere.com', 'huggingface.co', 'perplexity.ai', 'groq.com', 'together.ai',
+  'together.xyz', 'replicate.com', 'fireworks.ai', 'anyscale.com', 'ai21.com',
+  'voyageai.com', 'deepinfra.com',
+];
+// Flat, de-duplicated suffix list = every domain curated in SDK_ENV_DOMAIN_MAP + the AI extras.
+// Derived (not duplicated) so the two stay in sync. Matched via domainMatchesSuffix, which
+// is label-anchored: 'evilx.ai' does NOT match 'x.ai'.
+const PROVIDER_DOMAIN_SUFFIXES = Array.from(new Set([
+  ...SDK_ENV_DOMAIN_MAP.flatMap(m => m.domains), // NOTE(review): this brings in github.com and slack.com, so gist.github.com and hooks.slack.com match as providers — confirm that is intended
+  ...AI_PROVIDER_DOMAIN_SUFFIXES,
+]));
+function stripPort(host) { // Normalise a host string: trim, lower-case, remove IPv6 brackets and a trailing :port. Returns a string.
+  let h = String(host).trim().toLowerCase(); // String() coerces non-string input (null becomes "null")
+  // Bracketed IPv6 with optional port: [::1]:443 / [::1] → ::1
+  const br = h.match(/^\[([^\]]+)\]/);
+  if (br) return br[1]; // whatever follows the closing bracket is discarded
+  // host:port for IPv4 / hostname — only when there's a single colon (bare IPv6 like
+  // ::1 has multiple colons and must NOT be truncated).
+  if ((h.match(/:/g) || []).length === 1) h = h.replace(/:\d+$/, '');
+  return h;
+}
+/**
+ * True when the host is loopback / RFC1918 private / link-local / localhost / a
+ * reserved-test domain. The host is normalised with stripPort first.
+ *
+ * NOTE(review): IPv4 octets are matched as 1-3 digits and only the first two are
+ * compared, so an out-of-range literal such as "10.999.0.1" is classified as private.
+ *
+ * @param {string} host - hostname or IP literal, optionally with port or IPv6 brackets
+ * @returns {boolean} false for an empty host, any other IPv4 literal, or any other name
+ */
+function isLocalOrReservedHost(host) {
+  const h = stripPort(host);
+  if (!h) return false;
+  if (h === 'localhost' || h.endsWith('.localhost') || h.endsWith('.local')) return true;
+  if (h === '::1' || h === '0:0:0:0:0:0:0:1') return true; // IPv6 loopback
+  if (h === 'example.com' || h === 'example.org' || h === 'example.net') return true;
+  if (h.endsWith('.example.com') || h.endsWith('.example.org') || h.endsWith('.example.net')) return true;
+  if (h.endsWith('.example') || h.endsWith('.test') || h.endsWith('.invalid')) return true;
+  const m = h.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
+  if (m) {
+    const a = +m[1], b = +m[2];
+    if (a === 127 || a === 0) return true;            // loopback / this-host
+    if (a === 10) return true;                        // 10.0.0.0/8
+    if (a === 172 && b >= 16 && b <= 31) return true; // 172.16.0.0/12
+    if (a === 192 && b === 168) return true;          // 192.168.0.0/16
+    if (a === 169 && b === 254) return true;          // 169.254.0.0/16 link-local
+    return false;                                     // any other IPv4 literal = public
+  }
+  return false;
+}
+/**
+ * True when the host is a public (non-loopback/private) IPv4 literal — a direct-IP
+ * exfil endpoint (ecto pattern). Hostnames and IPv6 literals always return false.
+ *
+ * @param {string} host - hostname or IP literal, optionally with a port
+ * @returns {boolean} true only for a dotted quad that isLocalOrReservedHost rejects
+ */
+function isPublicIpHost(host) {
+  const h = stripPort(host);
+  if (!/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/.test(h)) return false;
+  return !isLocalOrReservedHost(h);
+}
+/**
+ * Extract every network host referenced in a file, from three sources in order:
+ * http(s) URLs, hostname:/host: option literals, and quoted bare host literals.
+ * Hosts from the last two sources are lower-cased and added only if not already
+ * present; hosts from URLs are not de-duplicated among themselves.
+ *
+ * @param {string} fileContent - source text to scan
+ * @returns {string[]} hosts in discovery order; [] when fileContent is empty or falsy
+ */
+function extractHostsFromContent(fileContent) {
+  if (!fileContent) return [];
+  const urls = fileContent.match(URL_EXTRACT_RE) || [];
+  const hosts = urls.map(u => extractDomain(u)).filter(Boolean);
+  let m;
+  const re = new RegExp(HOSTNAME_OPTION_RE.source, 'g');
+  while ((m = re.exec(fileContent)) !== null) {
+    const h = m[1].toLowerCase();
+    if (h && !hosts.includes(h)) hosts.push(h);
+  }
+  // Bare host literals assigned as defaults, e.g. `process.env.HOST || "127.0.0.1"` then
+  // used as `host: HOST` (common "configurable local collector" shape — the variable host
+  // isn't matched above). Capture quoted IPv4 / localhost / 0.0.0.0 literals. Safe: any
+  // co-present public IP or unknown host still fails the all-benign check downstream, so
+  // this can only RELAX a file whose every literal host is loopback/private.
+  const LITERAL_HOST_RE = /['"`](localhost|0\.0\.0\.0|(?:\d{1,3}\.){3}\d{1,3})['"`]/g;
+  while ((m = LITERAL_HOST_RE.exec(fileContent)) !== null) {
+    const h = m[1].toLowerCase();
+    if (h && !hosts.includes(h)) hosts.push(h);
+  }
+  return hosts;
+}
+/**
+ * Destination-benignness gate for credential→network taint flows whose env var is not
+ * (or need not be) known. Returns true ONLY if EVERY host returned by
+ * extractHostsFromContent is provably non-exfil: local/reserved, or a subdomain of a
+ * PROVIDER_DOMAIN_SUFFIXES entry. Each host is checked in this order: suspicious
+ * pattern, public IPv4 literal, local/reserved, curated provider; the first host that
+ * is suspicious, a public IP, or unrecognised ends the scan with false.
+ * No hosts found ⇒ false (cannot confirm).
+ *
+ * @param {string} fileContent - source of the file containing the network sink
+ * @returns {boolean} true ⇒ first-party/local, safe to downgrade the taint flow
+ */
+function networkDestinationsAllBenign(fileContent) {
+  const hosts = extractHostsFromContent(fileContent);
+  if (hosts.length === 0) return false;
+  for (const h of hosts) {
+    if (SUSPICIOUS_DOMAIN_PATTERNS.test(h)) return false;
+    if (isPublicIpHost(h)) return false;
+    if (isLocalOrReservedHost(h)) continue;
+    if (PROVIDER_DOMAIN_SUFFIXES.some(s => domainMatchesSuffix(h, [s]))) continue;
+    return false; // unknown / unrecognised destination → keep firing
+  }
+  return true;
+}
+module.exports = { // CommonJS exports; AI_PROVIDER_DOMAIN_SUFFIXES is not exported on its own, only as part of PROVIDER_DOMAIN_SUFFIXES
+  SDK_ENV_DOMAIN_MAP,
+  ENV_NOISE_TOKENS,
+  SUSPICIOUS_DOMAIN_PATTERNS,
+  URL_EXTRACT_RE,
+  HOSTNAME_OPTION_RE,
+  PROVIDER_DOMAIN_SUFFIXES,
+  extractEnvVarFromMessage,
+  extractBrandFromEnvVar,
+  extractDomain,
+  domainMatchesSuffix,
+  isSDKPattern,
+  stripPort,
+  isLocalOrReservedHost,
+  isPublicIpHost,
+  extractHostsFromContent,
+  networkDestinationsAllBenign,
+};