muaddib-scanner 2.7.6 → 2.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.7.6",
3
+ "version": "2.7.7",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
package/src/index.js CHANGED
@@ -568,8 +568,9 @@ async function run(targetPath, options = {}) {
568
568
  // A malware package typically has 1-3 occurrences, not dozens.
569
569
  applyFPReductions(deduped, reachableFiles, packageName);
570
570
 
571
- // Intent coherence analysis: detect source→sink pairs across files
572
- const intentResult = buildIntentPairs(deduped);
571
+ // Intent coherence analysis: detect source→sink pairs within files
572
+ // Pass targetPath for destination-aware SDK pattern detection
573
+ const intentResult = buildIntentPairs(deduped, targetPath);
573
574
  // Add intent threats to deduped before enrichment so they get rules/playbooks
574
575
  if (intentResult.intentThreats) {
575
576
  for (const it of intentResult.intentThreats) {
@@ -1,5 +1,8 @@
1
1
  'use strict';
2
2
 
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+
3
6
  // ============================================
4
7
  // INTENT GRAPH — Intra-File Coherence Analysis
5
8
  // ============================================
@@ -14,6 +17,7 @@
14
17
  // 3. Sources = ONLY high-confidence credential access (NOT env_access, NOT suspicious_dataflow)
15
18
  // 4. Sinks = ONLY threats already identified by scanners (NO content-based scanning)
16
19
  // 5. No double-counting — suspicious_dataflow is already a compound detection
20
+ // 6. Destination-aware: SDK patterns (env key matches API domain) are NOT exfiltration
17
21
 
18
22
  // ============================================
19
23
  // SOURCE CLASSIFICATION
@@ -32,6 +36,165 @@ const SOURCE_TYPES = {
32
36
  // Sensitive env var patterns — env_access referencing these is credential theft, not config
33
37
  const SENSITIVE_ENV_PATTERNS = /TOKEN|KEY|SECRET|PASSWORD|CREDENTIAL|API_KEY|AUTH/i;
34
38
 
39
+ // ============================================
40
+ // DESTINATION-AWARE SDK DETECTION
41
+ // ============================================
42
+ // Curated allowlist: when an env var matching the pattern is sent to a matching domain,
43
+ // it is legitimate SDK usage, not credential exfiltration.
44
+ // Safe-by-default: env vars not covered here fall back to the brand-label heuristic in isSDKPattern; anything matching neither (or any unknown domain) remains CRITICAL.
45
+ const SDK_ENV_DOMAIN_MAP = [
46
+ { envPattern: /^AWS_/i, domains: ['amazonaws.com', 'aws.amazon.com'] },
47
+ { envPattern: /^AZURE_/i, domains: ['azure.com', 'microsoft.com'] },
48
+ { envPattern: /^GOOGLE_|^GCP_/i, domains: ['googleapis.com', 'google.com'] },
49
+ { envPattern: /^FIREBASE_/i, domains: ['firebase.com', 'googleapis.com'] },
50
+ { envPattern: /^SALESFORCE_/i, domains: ['salesforce.com', 'force.com'] },
51
+ { envPattern: /^SUPABASE_/i, domains: ['supabase.co', 'supabase.com'] },
52
+ { envPattern: /^MAILGUN_/i, domains: ['mailgun.net', 'mailgun.com'] },
53
+ { envPattern: /^STRIPE_/i, domains: ['stripe.com'] },
54
+ { envPattern: /^TWILIO_/i, domains: ['twilio.com'] },
55
+ { envPattern: /^SENDGRID_/i, domains: ['sendgrid.com', 'sendgrid.net'] },
56
+ { envPattern: /^DATADOG_/i, domains: ['datadoghq.com'] },
57
+ { envPattern: /^SENTRY_/i, domains: ['sentry.io'] },
58
+ { envPattern: /^SLACK_/i, domains: ['slack.com'] },
59
+ { envPattern: /^GITHUB_/i, domains: ['github.com', 'githubusercontent.com'] },
60
+ { envPattern: /^GITLAB_/i, domains: ['gitlab.com'] },
61
+ { envPattern: /^CLOUDFLARE_/i, domains: ['cloudflare.com'] },
62
+ { envPattern: /^OPENAI_/i, domains: ['openai.com'] },
63
+ { envPattern: /^ANTHROPIC_/i, domains: ['anthropic.com'] },
64
+ { envPattern: /^MONGODB_|^MONGO_/i, domains: ['mongodb.com', 'mongodb.net'] },
65
+ { envPattern: /^AUTH0_/i, domains: ['auth0.com'] },
66
+ { envPattern: /^HUBSPOT_/i, domains: ['hubspot.com', 'hubapi.com'] },
67
+ { envPattern: /^CONTENTFUL_/i, domains: ['contentful.com'] },
68
+ ];
69
+
70
+ // Tokens stripped when extracting brand keyword from env var name
71
+ const ENV_NOISE_TOKENS = new Set([
72
+ 'API', 'KEY', 'SECRET', 'TOKEN', 'PASSWORD', 'CREDENTIAL',
73
+ 'AUTH', 'ACCESS', 'PRIVATE', 'PUBLIC', 'CLIENT', 'ID', 'URL'
74
+ ]);
75
+
76
+ // Suspicious tunneling/proxy domains — never considered legitimate SDK destinations
77
+ const SUSPICIOUS_DOMAIN_PATTERNS = /ngrok|serveo|localtunnel|burpcollaborator|requestbin|pipedream|webhook\.site/i;
78
+
79
+ // URL extraction regex (matches http/https URLs in source code)
80
+ const URL_EXTRACT_RE = /https?:\/\/[a-zA-Z0-9\-._~:/?#[\]@!$&'()*+,;=%]+/g;
81
+
82
+ // Hostname extraction from Node.js request options: hostname: 'domain.com' or host: 'domain.com'
83
+ const HOSTNAME_OPTION_RE = /(?:hostname|host)\s*:\s*['"`]([a-zA-Z0-9\-._]+)['"`]/g;
84
+
/**
 * Find the first environment variable name mentioned in a list of source threats.
 *
 * Each threat's `message` is inspected in order. Within a message, an explicit
 * `process.env.NAME` reference wins over a bare identifier; a bare identifier
 * must be upper-case and contain at least one underscore (e.g. "MAILGUN_API_KEY").
 *
 * @param {Array<{message?: string}>} sourceThreats - threats classified as credential sources
 * @returns {string|null} the captured variable name, or null when no message yields one
 */
function extractEnvVarFromMessage(sourceThreats) {
  const processEnvRe = /process\.env\.([A-Z_][A-Z0-9_]*)/i;
  const bareNameRe = /\b([A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+)\b/;

  for (const threat of sourceThreats) {
    const text = threat.message;
    if (!text) continue;

    // The process.env form is tried first; the bare form is only a fallback.
    const hit = text.match(processEnvRe) || text.match(bareNameRe);
    if (hit) return hit[1];
  }

  return null;
}
101
+
/**
 * Derive the brand keyword from an env var name.
 *
 * The name is upper-cased and split on "_"; the first segment that is neither
 * empty nor listed in ENV_NOISE_TOKENS is returned.
 * MAILGUN_API_KEY → MAILGUN, SALESFORCE_CLIENT_SECRET → SALESFORCE
 *
 * @param {string} envVarName - e.g. "MAILGUN_API_KEY"
 * @returns {string|null} upper-case brand segment, or null when every segment is noise
 */
function extractBrandFromEnvVar(envVarName) {
  for (const segment of envVarName.toUpperCase().split('_')) {
    if (segment.length > 0 && !ENV_NOISE_TOKENS.has(segment)) {
      return segment;
    }
  }
  return null;
}
111
+
/**
 * Extract the host from an http(s) URL string.
 *
 * The authority component is parsed explicitly so that userinfo cannot
 * masquerade as the host: in "https://api.stripe.com:443@evil.com/" the
 * real host is "evil.com", not "api.stripe.com". Cutting at the first ":"
 * would return the spoofed name and let it pass a domain allowlist.
 *
 * @param {string} url - candidate URL; non-string input yields null
 * @returns {string|null} lower-cased host without userinfo or port
 *   (IPv6 literals keep their brackets), or null when no host is present
 */
function extractDomain(url) {
  if (typeof url !== 'string') return null;

  // Authority runs up to the first "/", "?", "#" or "\" (URL parsers treat
  // "\" as a path separator for http/https).
  const authorityMatch = url.match(/^https?:\/\/([^/?#\\]+)/i);
  if (!authorityMatch) return null;

  // Everything up to the LAST "@" is userinfo and must be discarded.
  const authority = authorityMatch[1];
  const hostAndPort = authority.slice(authority.lastIndexOf('@') + 1);

  let host;
  if (hostAndPort.startsWith('[')) {
    // IPv6 literal: colons inside the brackets are part of the host.
    const closing = hostAndPort.indexOf(']');
    host = closing === -1 ? hostAndPort : hostAndPort.slice(0, closing + 1);
  } else {
    host = hostAndPort.split(':')[0];
  }

  return host ? host.toLowerCase() : null;
}
124
+
/**
 * Report whether a host equals, or is a subdomain of, one of the expected domains.
 * api.mailgun.net matches mailgun.net; sub.api.stripe.com matches stripe.com;
 * evilstripe.com does NOT match stripe.com (a dot boundary is required).
 *
 * @param {string} domain - host to test
 * @param {Iterable<string>} expectedDomains - accepted registrable domains
 * @returns {boolean} true when at least one expected domain matches
 */
function domainMatchesSuffix(domain, expectedDomains) {
  return [...expectedDomains].some(
    (suffix) => domain === suffix || domain.endsWith('.' + suffix)
  );
}
135
+
/**
 * Decide whether an env var + file content represents a legitimate SDK pattern.
 *
 * Returns true ONLY if:
 *   1. At least one destination (URL host or `hostname:`/`host:` option) is found
 *   2. No destination is a tunneling/proxy domain or a raw IPv4 address
 *   3. EVERY destination matches the env var, either via the curated
 *      SDK_ENV_DOMAIN_MAP entry or, when no entry applies, via the brand heuristic
 *
 * Brand heuristic: the brand keyword must be the label directly before the
 * TLD ("api.acme.com" for brand ACME). Accepting the brand as ANY label would
 * let an attacker-controlled subdomain such as "acme.attacker.io" pass.
 * Hosts under multi-part public suffixes (e.g. "acme.co.uk") are therefore
 * not recognised and stay suspicious, which is the safe default.
 *
 * @param {string} envVarName - e.g., "SALESFORCE_API_KEY"
 * @param {string} fileContent - source code of the file
 * @returns {boolean} true if SDK pattern (caller should skip the intent pair)
 */
function isSDKPattern(envVarName, fileContent) {
  const domains = new Set();

  // Hosts from full URLs (https://api.stripe.com/v1/charges)
  for (const url of fileContent.match(URL_EXTRACT_RE) || []) {
    const domain = extractDomain(url);
    if (domain) domains.add(domain);
  }

  // Hosts from Node.js request options (hostname: 'api.stripe.com').
  // A fresh RegExp is used because a shared /g regex keeps lastIndex state.
  const hostnameRe = new RegExp(HOSTNAME_OPTION_RE.source, 'g');
  let hostnameMatch;
  while ((hostnameMatch = hostnameRe.exec(fileContent)) !== null) {
    const hostname = hostnameMatch[1].toLowerCase();
    if (hostname) domains.add(hostname);
  }

  // No destination found — can't confirm SDK pattern, default to suspicious
  if (domains.size === 0) return false;

  const destinations = [...domains];

  // Tunneling/proxy domains and raw IPv4 addresses are never SDK endpoints
  const rawIPv4 = /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/;
  if (destinations.some((d) => SUSPICIOUS_DOMAIN_PATTERNS.test(d) || rawIPv4.test(d))) {
    return false;
  }

  // 1. Curated allowlist: the first matching entry decides the outcome
  const mapping = SDK_ENV_DOMAIN_MAP.find((entry) => entry.envPattern.test(envVarName));
  if (mapping) {
    return destinations.every((d) => domainMatchesSuffix(d, mapping.domains));
  }

  // 2. Heuristic fallback: brand keyword must be the registrable label
  const brand = extractBrandFromEnvVar(envVarName);
  if (!brand || brand.length < 3) return false; // Too short for reliable matching

  const brandLower = brand.toLowerCase();
  return destinations.every((d) => {
    const labels = d.split('.');
    // "api.acme.com" → label before TLD is "acme"; "acme.attacker.io" → "attacker"
    return labels.length >= 2 && labels[labels.length - 2] === brandLower;
  });
}
196
+
197
+
35
198
  // ============================================
36
199
  // SINK CLASSIFICATION (from existing threats only)
37
200
  // ============================================
@@ -149,9 +312,10 @@ function classifySink(threat) {
149
312
  * Cross-file detection is handled by module-graph.js (cross_file_dataflow).
150
313
  *
151
314
  * @param {Array} threats - deduplicated threat array
315
+ * @param {string} [targetPath] - root path for reading source files (SDK pattern detection)
152
316
  * @returns {Object} { pairs, intentScore, intentThreats }
153
317
  */
154
- function buildIntentPairs(threats) {
318
+ function buildIntentPairs(threats, targetPath) {
155
319
  // Only consider MEDIUM+ threats. LOW severity means applyFPReductions already
156
320
  // determined this is noise (bundler artifact, dist/ file, count threshold exceeded).
157
321
  // Re-elevating LOW threats via intent pairing would undo FP reductions.
@@ -169,15 +333,23 @@ function buildIntentPairs(threats) {
169
333
  const pairs = [];
170
334
  let intentScore = 0;
171
335
 
336
+ // Cache file contents for SDK pattern checks (lazy, per file)
337
+ const fileContentCache = new Map();
338
+
172
339
  // Only pair sources and sinks within the SAME file
173
340
  for (const [file, fileThreats] of byFile) {
174
341
  const sources = [];
175
342
  const sinks = [];
343
+ // Track which threats are credential sources (for env var extraction)
344
+ const sourceThreats = [];
176
345
 
177
346
  for (const t of fileThreats) {
178
347
  const srcType = classifySource(t);
179
348
  const sinkType = classifySink(t);
180
- if (srcType) sources.push(srcType);
349
+ if (srcType) {
350
+ sources.push(srcType);
351
+ sourceThreats.push(t);
352
+ }
181
353
  if (sinkType) sinks.push(sinkType);
182
354
  }
183
355
 
@@ -194,6 +366,30 @@ function buildIntentPairs(threats) {
194
366
 
195
367
  const pairKey = `${srcType}:${sinkType}:${file}`;
196
368
  if (pairSet.has(pairKey)) continue;
369
+
370
+ // Destination-aware SDK check: credential_read → network_external
371
+ // If the env var matches the API domain, this is legitimate SDK usage
372
+ if (srcType === 'credential_read' && sinkType === 'network_external' && targetPath) {
373
+ const envVarName = extractEnvVarFromMessage(sourceThreats);
374
+ if (envVarName) {
375
+ try {
376
+ let content = fileContentCache.get(file);
377
+ if (content === undefined) {
378
+ const filePath = path.join(targetPath, file);
379
+ content = fs.readFileSync(filePath, 'utf8');
380
+ fileContentCache.set(file, content);
381
+ }
382
+ if (isSDKPattern(envVarName, content)) {
383
+ // SDK pattern confirmed — skip this pair
384
+ pairSet.add(pairKey); // Mark as seen to avoid re-checking
385
+ continue;
386
+ }
387
+ } catch {
388
+ // File read error — default to suspicious (CRITICAL)
389
+ }
390
+ }
391
+ }
392
+
197
393
  pairSet.add(pairKey);
198
394
 
199
395
  pairs.push({
@@ -235,5 +431,9 @@ module.exports = {
235
431
  classifySource,
236
432
  classifySink,
237
433
  buildIntentPairs,
238
- COHERENCE_MATRIX
434
+ COHERENCE_MATRIX,
435
+ isSDKPattern,
436
+ extractEnvVarFromMessage,
437
+ extractBrandFromEnvVar,
438
+ SDK_ENV_DOMAIN_MAP
239
439
  };