muaddib-scanner 2.10.95 → 2.10.97

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.10.95",
3
+ "version": "2.10.97",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -76,6 +76,504 @@ const TOP_THREAT_TYPES = [
76
76
 
77
77
  const TOP_THREAT_TYPES_SET = new Set(TOP_THREAT_TYPES);
78
78
 
79
+ // --- Cluster FP contextual feature helpers (v2.10.96) ---
80
+ //
81
+ // Target: P1 CRITICAL webhook suppression (score >= 75). The four helpers
82
+ // below encode the four FP clusters identified in the v2.10.9x weekly FP
83
+ // review: Cluster A (native binary installers via GitHub releases),
84
+ // Cluster B (minified bundles w/o install scripts), Cluster C (dev tooling
85
+ // writing git hooks from local files), Cluster E (first-party SDKs exfil
86
+ // pattern on their own API).
87
+ //
88
+ // These features intentionally operate on scan-result signals ONLY so they
89
+ // can be recomputed on historical JSONL records without re-scanning.
90
+
91
+ // Threats whose presence implies the package performs a network call.
92
+ const NETWORK_ADJACENT_TYPES = new Set([
93
+ 'suspicious_dataflow',
94
+ 'network_require',
95
+ 'remote_code_load',
96
+ 'curl_exec',
97
+ 'intent_credential_exfil',
98
+ 'intent_command_exfil',
99
+ 'dangerous_call_fetch',
100
+ 'external_tarball_dep',
101
+ 'dependency_url_suspicious'
102
+ ]);
103
+
104
+ // Package-scope -> first-party domain mapping for well-known SDK publishers.
105
+ // Keys are lowercase npm scope names (without '@'). Used by
106
+ // `network_destination_first_party` when the package is scoped.
107
+ const SCOPE_FIRST_PARTY_DOMAINS = {
108
+ 'anthropic-ai': ['anthropic.com'],
109
+ 'openai': ['openai.com'],
110
+ 'google-cloud': ['googleapis.com', 'google.com'],
111
+ 'google-ai': ['googleapis.com', 'google.com'],
112
+ 'aws-sdk': ['amazonaws.com', 'aws.amazon.com'],
113
+ 'aws-amplify': ['amazonaws.com'],
114
+ 'azure': ['azure.com', 'microsoft.com'],
115
+ 'microsoft': ['microsoft.com', 'azure.com'],
116
+ 'supabase': ['supabase.co', 'supabase.com'],
117
+ 'stripe': ['stripe.com'],
118
+ 'twilio': ['twilio.com'],
119
+ 'sendgrid': ['sendgrid.com', 'sendgrid.net'],
120
+ 'datadog': ['datadoghq.com'],
121
+ 'sentry': ['sentry.io'],
122
+ 'slack': ['slack.com'],
123
+ 'octokit': ['github.com', 'githubusercontent.com'],
124
+ 'cloudflare': ['cloudflare.com'],
125
+ 'auth0': ['auth0.com'],
126
+ 'hubspot': ['hubspot.com', 'hubapi.com'],
127
+ 'contentful': ['contentful.com'],
128
+ 'mongodb': ['mongodb.com', 'mongodb.net'],
129
+ 'mailgun': ['mailgun.net', 'mailgun.com'],
130
+ 'vercel': ['vercel.com', 'vercel.app'],
131
+ 'netlify': ['netlify.com', 'netlify.app'],
132
+ 'pinecone-database': ['pinecone.io'],
133
+ 'langchain': ['langchain.com']
134
+ };
135
+
136
+ // GitHub release hosts (install_url_github_releases).
137
+ const GITHUB_RELEASE_HOSTS = ['github.com', 'objects.githubusercontent.com', 'raw.githubusercontent.com'];
138
+
139
+ // Bundle file-shape patterns. Conservative: only flag paths that clearly
140
+ // correspond to build output, so the feature stays specific to Cluster B.
141
+ const BUNDLE_PATH_RE = /(?:^|[\\/])(?:dist|build|lib|out|umd|esm|cjs|bundle|_next[\\/]static|\.next[\\/]static|public[\\/]static|webpack|rollup)[\\/]/i;
142
+ const BUNDLE_FILE_RE = /\.(?:min|bundle|prod|umd|iife|esm|cjs)\.(?:m?js|cjs)$|\.min\.js$|chunk-[0-9a-f]+\.js$|vendors?~?.*\.js$/i;
143
+
144
+ // Threat types that indicate remote content fetch in a file (for
145
+ // `git_hook_source_local` heuristic: absence => local source).
146
+ const REMOTE_FETCH_TYPES = new Set([
147
+ 'remote_code_load',
148
+ 'network_require',
149
+ 'curl_exec',
150
+ 'suspicious_dataflow',
151
+ 'suspicious_domain',
152
+ 'dangerous_call_fetch',
153
+ 'external_tarball_dep',
154
+ 'dependency_url_suspicious',
155
+ 'binary_dropper',
156
+ 'download_exec_binary'
157
+ ]);
158
+
159
+ // Match URLs inside threat message strings (legacy fallback when threats
160
+ // predate v2.10.96 URL enrichment — historical JSONL scan results).
161
+ const MESSAGE_URL_RE = /https?:\/\/([a-zA-Z0-9._-]+)(?:[:/?#][^\s'"`)<>]*)?/g;
162
+
163
/**
 * Extract the lowercase host from an http(s) URL string.
 *
 * The authority section ends at the first `/`, `\`, `?`, `#`, whitespace or
 * quoting character. Any `userinfo@` prefix and `:port` suffix are dropped so
 * the returned value is the host the request is actually sent to, e.g.
 * `https://github.com:x@evil.example/` yields `evil.example`, not `github.com`.
 *
 * @param {*} url - Candidate URL; anything that is not a string yields null.
 * @returns {string|null} Lowercase host, or null when `url` is not an
 *   http(s) URL or carries an empty host.
 */
function hostFromUrl(url) {
  if (typeof url !== 'string') return null;
  // `\` terminates the authority as well: URL parsers treat it like `/` for
  // http(s), so `https://evil.example\@github.com/` targets evil.example.
  const m = url.match(/^https?:\/\/([^/\\?#\s'"`)<>]+)/i);
  if (!m) return null;
  const authority = m[1];
  // Everything up to the LAST `@` is userinfo, never part of the host.
  const hostPort = authority.slice(authority.lastIndexOf('@') + 1);
  const host = hostPort.split(':')[0].toLowerCase();
  return host.length > 0 ? host : null;
}
168
+
169
/**
 * Collect the distinct lowercase hosts referenced by a list of threats.
 *
 * Structured `threat.urls` arrays are preferred. Only when no threat carries
 * a non-empty `urls` array does the function fall back to scanning
 * `threat.message` strings with MESSAGE_URL_RE, so that older scan records
 * without URL enrichment can still be evaluated.
 *
 * @param {Iterable<object>} threats - Threat records; falsy entries are skipped.
 * @returns {Set<string>} Hosts found (possibly empty).
 */
function extractHostsFromThreats(threats) {
  const found = new Set();
  let structuredSeen = false;

  for (const threat of threats) {
    const urls = threat && threat.urls;
    if (!Array.isArray(urls) || urls.length === 0) continue;
    structuredSeen = true;
    for (const url of urls) {
      const host = hostFromUrl(url);
      if (host) found.add(host);
    }
  }
  // Structured data wins even if none of its URLs produced a usable host.
  if (structuredSeen) return found;

  for (const threat of threats) {
    const text = threat && threat.message;
    if (typeof text !== 'string' || text === '') continue;
    // MESSAGE_URL_RE is a shared /g regex: rewind it before each message.
    MESSAGE_URL_RE.lastIndex = 0;
    for (let hit = MESSAGE_URL_RE.exec(text); hit !== null; hit = MESSAGE_URL_RE.exec(text)) {
      if (hit[1]) found.add(hit[1].toLowerCase());
    }
  }
  return found;
}
196
+
197
/**
 * Tell whether `host` equals one of `candidates` or is a subdomain of one.
 *
 * @param {string} host - Lowercase host to test.
 * @param {Iterable<string>} candidates - Registrable domains to match against.
 * @returns {boolean} True on an exact match or a `.<candidate>` suffix match.
 */
function hostMatchesSuffix(host, candidates) {
  for (const domain of candidates) {
    const isExact = host === domain;
    // The leading dot keeps `notgithub.com` from matching `github.com`.
    if (isExact || host.endsWith(`.${domain}`)) return true;
  }
  return false;
}
203
+
204
/**
 * Return the lowercase npm scope of a package name, without the leading `@`.
 *
 * @param {*} name - Package name such as `@scope/pkg`.
 * @returns {string|null} The scope, or null for unscoped / non-string names.
 */
function getPackageScope(name) {
  if (typeof name !== 'string' || name === '') return null;
  const scoped = /^@([^/]+)\//.exec(name);
  return scoped === null ? null : scoped[1].toLowerCase();
}
209
+
210
/**
 * Return the lowercase host of the package homepage.
 *
 * Looks at `meta.homepage`, then `meta.registryMeta.homepage`, then
 * `meta.npmRegistryMeta.homepage`, and uses the first value that is a
 * non-empty string starting with `http://` or `https://`.
 *
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {string|null} Homepage host, or null when none is usable.
 */
function getHomepageHost(meta) {
  if (!meta) return null;
  for (const source of [meta, meta.registryMeta, meta.npmRegistryMeta]) {
    const raw = source && source.homepage;
    if (typeof raw !== 'string' || raw === '') continue;
    const hit = /^https?:\/\/([^/:?#]+)/i.exec(raw);
    if (hit) return hit[1].toLowerCase();
  }
  return null;
}
224
+
225
/**
 * Feature 1 — TRUE iff the package carries a network-adjacent threat AND
 * every destination host extracted from its threats is a first-party host.
 *
 * First-party hosts are the SCOPE_FIRST_PARTY_DOMAINS entry for the package
 * scope (or, for unscoped packages, for the package name itself) plus the
 * package homepage host.
 *
 * Targets Cluster E: Claude Code / OpenAI / Anthropic SDK wrappers that
 * read API keys from env and POST them to their legitimate vendor API.
 *
 * @param {object} result - Scan result; `result.threats` is read.
 * @param {object} meta - Package metadata; `meta.name` and homepage are read.
 * @returns {boolean} False when there is no network threat, no first-party
 *   domain is known, or any extracted host is not first-party.
 */
function networkDestinationFirstParty(result, meta) {
  const threats = (result && result.threats) || [];
  const hasNetwork = threats.some(t => t && NETWORK_ADJACENT_TYPES.has(t.type));
  if (!hasNetwork) return false;

  // Own-property lookup only. The key comes from the scanned package's name,
  // so a plain `table[key]` would resolve `constructor` / `__proto__` to
  // inherited values and make the spread below throw a TypeError.
  const lookupDomains = key => (
    Object.prototype.hasOwnProperty.call(SCOPE_FIRST_PARTY_DOMAINS, key)
      ? SCOPE_FIRST_PARTY_DOMAINS[key]
      : null
  );

  const firstParty = [];
  const scope = getPackageScope(meta && meta.name);
  const scopeDomains = scope ? lookupDomains(scope) : null;
  if (scopeDomains) firstParty.push(...scopeDomains);

  // Unscoped packages: accept exact-name match against the scope table for
  // packages whose own identifier IS the publisher (e.g., `stripe`, `twilio`).
  const baseName = (meta && meta.name && String(meta.name).replace(/^@[^/]+\//, '').toLowerCase()) || '';
  const nameDomains = scope ? null : lookupDomains(baseName);
  if (nameDomains) firstParty.push(...nameDomains);

  const homepageHost = getHomepageHost(meta);
  if (homepageHost) firstParty.push(homepageHost);
  if (firstParty.length === 0) return false;

  const hosts = extractHostsFromThreats(threats);
  // No destination host was observable (scanner saw the network sink but
  // no URL was recorded). Accept as first-party only when the package
  // identity alone is a strong signal: a scope listed in the table.
  if (hosts.size === 0) return scopeDomains !== null;

  for (const h of hosts) {
    if (!hostMatchesSuffix(h, firstParty)) return false;
  }
  return true;
}
264
+
265
/**
 * Feature 2 — TRUE iff the package behaves as a native-binary installer
 * (`binary_dropper` or `download_exec_binary` threat), no
 * `suspicious_domain` threat is present, at least one destination host can
 * be extracted, and EVERY extracted host matches GITHUB_RELEASE_HOSTS.
 *
 * The check is host-level only: URL paths are not inspected, so any URL on
 * a listed host (or a subdomain of one) qualifies.
 *
 * Targets Cluster A: esbuild / swc / prisma style platform binary drops.
 *
 * @param {object} result - Scan result; `result.threats` is read.
 * @returns {boolean} Whether the installer only talks to GitHub hosts.
 */
function installUrlGithubReleases(result) {
  const threats = (result && result.threats) || [];
  const hasInstaller = threats.some(
    t => t && (t.type === 'binary_dropper' || t.type === 'download_exec_binary')
  );
  if (!hasInstaller) return false;
  // Any known-suspicious destination present => not a github-only installer.
  if (threats.some(t => t && t.type === 'suspicious_domain')) return false;

  const hosts = extractHostsFromThreats(threats);
  if (hosts.size === 0) return false;
  for (const h of hosts) {
    if (!hostMatchesSuffix(h, GITHUB_RELEASE_HOSTS)) return false;
  }
  // The set is non-empty and every member matched, so at least one GitHub
  // host was seen; no second pass is needed.
  return true;
}
291
+
292
/**
 * Tell whether a file path looks like build output, judged by its directory
 * (BUNDLE_PATH_RE) or by its file name (BUNDLE_FILE_RE).
 *
 * @param {*} file - Relative or absolute file path.
 * @returns {boolean} False for non-strings and empty strings.
 */
function hasBundlePath(file) {
  if (typeof file !== 'string' || file === '') return false;
  if (BUNDLE_PATH_RE.test(file)) return true;
  return BUNDLE_FILE_RE.test(file);
}
296
+
297
/**
 * Tell whether the package declares a non-blank `preinstall`, `install` or
 * `postinstall` script.
 *
 * Only `meta.registryMeta.scripts` is consulted; a `scripts` object placed
 * anywhere else on `meta` is not seen.
 *
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {boolean} True when at least one install hook has content.
 */
function hasLifecycleScripts(meta) {
  const scripts = (meta && meta.registryMeta && meta.registryMeta.scripts) || null;
  if (scripts === null || typeof scripts !== 'object') return false;
  return ['preinstall', 'install', 'postinstall'].some(hook => {
    const command = scripts[hook];
    return typeof command === 'string' && command.trim().length > 0;
  });
}
306
+
307
+ // Threshold derived from the v2.10.9x FP review of minified bundles:
308
+ // Cluster B FPs all ship at least one > 100KB file (typical webpack chunk
309
+ // is 200-800KB). 100KB is low enough to catch small bundlers yet high
310
+ // enough to exclude hand-written source.
311
+ const BUNDLE_FILE_MIN_BYTES = 100 * 1024;
312
+
313
/**
 * Feature 3 — TRUE iff the package declares no install lifecycle script AND
 * every file named by a threat is a large file (at least
 * BUNDLE_FILE_MIN_BYTES). Targets Cluster B: minified webpack/rollup output
 * triggering eval / obfuscation heuristics without any install vector.
 *
 * Size source: `result.summary.fileSizes`. When that map is absent or empty
 * (historical records), each threat file must instead look like build
 * output according to `hasBundlePath`.
 *
 * Threats without a `file` are ignored; if no threat names a file the
 * result is FALSE.
 *
 * `meta.registryMeta.scripts` is REQUIRED: when it is undefined the result
 * is always FALSE, because "no install hook" cannot be claimed when the
 * scripts were never looked at.
 *
 * @param {object} result - Scan result (`threats`, `summary.fileSizes`).
 * @param {object} meta - Package metadata (`registryMeta.scripts`).
 * @returns {boolean} Whether the package matches the bundle-only shape.
 */
function bundleWithoutInstallScripts(result, meta) {
  const scriptsKnown = Boolean(meta && meta.registryMeta) && meta.registryMeta.scripts !== undefined;
  if (!scriptsKnown) return false;
  if (hasLifecycleScripts(meta)) return false;

  const threats = (result && result.threats) || [];
  const threatFiles = new Set();
  for (const threat of threats) {
    if (threat.file) threatFiles.add(threat.file);
  }
  if (threatFiles.size === 0) return false;

  const files = [...threatFiles];
  const fileSizes = ((result && result.summary) || {}).fileSizes;
  const haveSizes = Boolean(fileSizes) && typeof fileSizes === 'object' && Object.keys(fileSizes).length > 0;

  if (haveSizes) {
    // A file with no recorded numeric size, or one below the threshold,
    // disqualifies the package.
    return files.every(file => {
      const size = fileSizes[file];
      return typeof size === 'number' && !(size < BUNDLE_FILE_MIN_BYTES);
    });
  }

  // Legacy proxy: no file sizes available, fall back to path shape.
  return files.every(file => hasBundlePath(file));
}
362
+
363
/**
 * Feature 4 — TRUE iff at least one `git_hooks_injection` threat is present
 * AND none of the files carrying such a threat also carries a threat whose
 * type is in REMOTE_FETCH_TYPES. Proxy for "hook body was read from a local
 * source file", i.e. dev tooling like husky / simple-git-hooks installing
 * its own canned hook.
 *
 * Hook threats without a `file` never disqualify the package.
 *
 * @param {object} result - Scan result; `result.threats` is read.
 * @returns {boolean} Whether all hook-writing files are free of remote-fetch signals.
 */
function gitHookSourceLocal(result) {
  const threats = (result && result.threats) || [];
  const hooks = threats.filter(t => t.type === 'git_hooks_injection');
  if (hooks.length === 0) return false;

  const remoteFiles = new Set();
  for (const threat of threats) {
    if (threat.file && REMOTE_FETCH_TYPES.has(threat.type)) remoteFiles.add(threat.file);
  }
  return !hooks.some(hook => hook.file && remoteFiles.has(hook.file));
}
384
+
385
+ // --- v2.10.96 extended FP features (F5-F8, VPS review 2026-04-18) ---
386
+ //
387
+ // Covers an additional 319 FP (15.2%) on top of F1-F4; combined F1-F8
388
+ // cover 2069/2104 reviewed FP = 98.3%.
389
+
390
+ // Obfuscation-shape threats used by Feature 6.
391
+ const OBFUSCATION_TYPES = new Set([
392
+ 'obfuscation_detected',
393
+ 'js_obfuscation_pattern',
394
+ 'high_entropy_string',
395
+ 'unicode_invisible_injection'
396
+ ]);
397
+
398
+ // Threat types that indicate a runtime vector (install, env, network).
399
+ // Their presence disqualifies Feature 6 (obfuscation-without-vector).
400
+ const VECTOR_TYPES = new Set([
401
+ // install / lifecycle
402
+ 'lifecycle_script',
403
+ 'lifecycle_shell_pipe',
404
+ // env read (credential source)
405
+ 'env_access',
406
+ 'env_charcode_reconstruction',
407
+ 'credential_regex_harvest',
408
+ // network / exec / dynamic code
409
+ 'suspicious_dataflow',
410
+ 'network_require',
411
+ 'remote_code_load',
412
+ 'curl_exec',
413
+ 'intent_credential_exfil',
414
+ 'intent_command_exfil',
415
+ 'dangerous_call_fetch',
416
+ 'external_tarball_dep',
417
+ 'dependency_url_suspicious',
418
+ 'dangerous_exec',
419
+ 'dangerous_call_eval',
420
+ 'dangerous_call_exec',
421
+ 'dangerous_call_function',
422
+ 'module_compile',
423
+ 'binary_dropper',
424
+ 'download_exec_binary',
425
+ 'fetch_decrypt_exec',
426
+ 'suspicious_domain',
427
+ 'reverse_shell'
428
+ ]);
429
+
430
+ // Threats that indicate a network egress capability somewhere in the
431
+ // package. Broader than NETWORK_ADJACENT_TYPES: includes domain literals,
432
+ // drop-exec pairs, and suspicious dataflows. Used by Feature 8.
433
+ const EGRESS_TYPES = new Set([
434
+ 'suspicious_dataflow',
435
+ 'network_require',
436
+ 'remote_code_load',
437
+ 'curl_exec',
438
+ 'intent_credential_exfil',
439
+ 'intent_command_exfil',
440
+ 'dangerous_call_fetch',
441
+ 'external_tarball_dep',
442
+ 'dependency_url_suspicious',
443
+ 'suspicious_domain',
444
+ 'binary_dropper',
445
+ 'download_exec_binary',
446
+ 'fetch_decrypt_exec',
447
+ 'reverse_shell'
448
+ ]);
449
+
450
+ // Dep-confusion / defensive-placeholder phrases matched against the
451
+ // package description. Case-insensitive. Phrases are not anchored on word
452
+ // boundaries, so they may also match inside a longer token. The list is deliberately conservative —
453
+ // a real README that happens to mention "dependency confusion" once
454
+ // still needs to look like a placeholder in every other dimension
455
+ // (see `placeholderAntiDepConfusion`).
456
+ const PLACEHOLDER_DESCRIPTION_RE = new RegExp([
457
+ 'dependency[- ]?confusion',
458
+ 'dep[- ]?confusion',
459
+ 'namespace[- ]?squatt?ing',
460
+ 'name[- ]?squatt?ing',
461
+ 'squatting[- ]?prevention',
462
+ 'defensive[- ]?(?:registration|publish|package|placeholder)',
463
+ 'placeholder[- ]?(?:package|to[- ]?reserve|for[- ]?the[- ]?name)',
464
+ 'reserv(?:e|ing|ation)[- ]?(?:this[- ]?)?(?:name|package|namespace)',
465
+ 'prevents?[- ]+(?:malicious[- ]+)?dependency[- ]+confusion',
466
+ 'blocks?[- ]+(?:malicious[- ]+)?dependency[- ]+confusion',
467
+ 'reserved[- ]+by[- ]+.*?(?:to[- ]+prevent|against)'
468
+ ].join('|'), 'i');
469
+
470
+ // Alias — same semantics as hasLifecycleScripts (used by F3), just named
471
+ // from the perspective of F7/F8 which reason about install vectors.
472
+ const hasInstallScript = hasLifecycleScripts;
473
+
474
/**
 * Return the package description.
 *
 * Checks `meta.description`, then `meta.registryMeta.description`, then
 * `meta.npmRegistryMeta.description`, and returns the first non-empty string.
 *
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {string} The description, or '' when none is available.
 */
function getDescription(meta) {
  if (!meta) return '';
  for (const source of [meta, meta.registryMeta, meta.npmRegistryMeta]) {
    const text = source && source.description;
    if (typeof text === 'string' && text.length > 0) return text;
  }
  return '';
}
486
+
487
/**
 * Feature 5 — TRUE iff a `typosquat_detected` or `pypi_typosquat_detected`
 * threat fires on a package whose name starts with `@` and contains `/`
 * (i.e. `@scope/name`).
 *
 * Rationale: the typosquat rule computes edit distance on the bare name
 * (`@vendor/client-foo` -> `client-foo`) and will sometimes treat
 * `@scope/adapter-rubrik` as a typosquat of the unscoped `rubrik`. Scoping
 * implies a separate namespace, so the collision is structurally false.
 *
 * Covers 52 FP (2.5%) on the VPS extended corpus.
 *
 * @param {object} result - Scan result; `result.threats` is read.
 * @param {object} meta - Package metadata; `meta.name` is read.
 * @returns {boolean} Whether a typosquat finding sits on a scoped package.
 */
function typosquatScopedPackage(result, meta) {
  const threats = (result && result.threats) || [];
  const typosquatTypes = ['typosquat_detected', 'pypi_typosquat_detected'];
  if (!threats.some(t => typosquatTypes.includes(t.type))) return false;

  const name = (meta && meta.name && String(meta.name)) || '';
  return name[0] === '@' && name.indexOf('/') !== -1;
}
506
+
507
/**
 * Feature 6 — TRUE iff the package has at least one obfuscation-shape
 * finding (a type in OBFUSCATION_TYPES) AND no finding whose type is in
 * VECTOR_TYPES (install / env / network / exec). Findings that belong to
 * neither set do not affect the outcome.
 *
 * This is the commercial-obfuscator pattern: webpack output or a hardening
 * vendor (jsjiami, obfuscator.io) trips heuristics but the scan shows no
 * runtime vector.
 *
 * NOTE(review): F8 keys off declared install scripts in metadata rather
 * than a `lifecycle_script` threat, so F6 and F8 exclude each other only if
 * the scanner emits that threat for every declared install script — confirm.
 *
 * Covers 33 FP (1.6%).
 *
 * @param {object} result - Scan result; `result.threats` is read.
 * @returns {boolean} Whether obfuscation findings appear without any vector.
 */
function obfuscationWithoutVector(result) {
  const threats = (result && result.threats) || [];
  const isObfuscation = t => OBFUSCATION_TYPES.has(t.type);
  const isVector = t => !isObfuscation(t) && VECTOR_TYPES.has(t.type);

  if (threats.some(isVector)) return false;
  return threats.some(isObfuscation);
}
530
+
531
/**
 * Feature 7 — TRUE iff the package description matches
 * PLACEHOLDER_DESCRIPTION_RE (defensive / placeholder / dependency-confusion
 * prevention wording) AND the package declares no install script AND no
 * threat has severity CRITICAL or HIGH.
 *
 * These are namespace reservations published by vendors to block attackers
 * from squatting internal package names.
 *
 * Covers 15 FP (0.7%). The extra conditions protect against real packages
 * whose description merely mentions dep-confusion as a discussed topic.
 *
 * @param {object} result - Scan result; `result.threats` is read.
 * @param {object} meta - Package metadata (description, install scripts).
 * @returns {boolean} Whether the package looks like a defensive placeholder.
 */
function placeholderAntiDepConfusion(result, meta) {
  const description = getDescription(meta);
  const declaresPlaceholder = description !== '' && PLACEHOLDER_DESCRIPTION_RE.test(description);
  if (!declaresPlaceholder) return false;
  if (hasInstallScript(meta)) return false;

  // Real placeholder packages should not carry any CRITICAL/HIGH static
  // finding — empty by construction.
  const threats = (result && result.threats) || [];
  return !threats.some(t => t.severity === 'CRITICAL' || t.severity === 'HIGH');
}
554
+
555
/**
 * Feature 8 — TRUE iff the package declares at least one install lifecycle
 * script AND no threat has a type in EGRESS_TYPES.
 *
 * Install scripts that only do `echo`, `mkdir`, `chmod`, `npm run build`,
 * or call a local node script without network access cannot exfiltrate
 * data — the 219 FP this covers are almost entirely build helpers and
 * version/engine gates.
 *
 * Mutually exclusive with F1 (requires a network-adjacent threat, and every
 * such type is also an egress type), F2 (requires a binary downloader,
 * hence network egress) and F3 (requires that no install script is declared).
 *
 * @param {object} result - Scan result; `result.threats` is read.
 * @param {object} meta - Package metadata (install scripts).
 * @returns {boolean} Whether an install script exists with no egress signal.
 */
function installScriptNoNetworkEgress(result, meta) {
  if (!hasInstallScript(meta)) return false;
  const threats = (result && result.threats) || [];
  return threats.every(t => !EGRESS_TYPES.has(t.type));
}
576
+
79
577
  /**
80
578
  * Extract ML features from a scan result object.
81
579
  *
@@ -190,6 +688,20 @@ function extractFeatures(result, meta) {
190
688
  ? Math.round((features.count_total / features.file_count_with_threats) * 100) / 100
191
689
  : 0;
192
690
 
691
+ // --- Cluster FP contextual features (v2.10.96) ---
692
+ features.network_destination_first_party = networkDestinationFirstParty(result, meta) ? 1 : 0;
693
+ features.install_url_github_releases = installUrlGithubReleases(result) ? 1 : 0;
694
+ features.bundle_without_install_scripts = bundleWithoutInstallScripts(result, meta) ? 1 : 0;
695
+ features.git_hook_source_local = gitHookSourceLocal(result) ? 1 : 0;
696
+ features.typosquat_scoped_package = typosquatScopedPackage(result, meta) ? 1 : 0;
697
+ features.obfuscation_without_vector = obfuscationWithoutVector(result) ? 1 : 0;
698
+ features.placeholder_anti_dep_confusion = placeholderAntiDepConfusion(result, meta) ? 1 : 0;
699
+ // F8 disabled for retrain — fires on malware due to incomplete EGRESS_TYPES
700
+ // (missing dangerous_exec, lifecycle_dangerous_exec, node_inline_exec).
701
+ // Still disabled as of v2.10.97; re-enable after EGRESS_TYPES fix + re-validation.
702
+ // See ml-retrain/ml-auc-v2.10.96.md for details.
703
+ features.install_script_no_network_egress = 0; // installScriptNoNetworkEgress(result, meta) ? 1 : 0;
704
+
193
705
  return features;
194
706
  }
195
707
 
@@ -258,5 +770,14 @@ module.exports = {
258
770
  extractFeatures,
259
771
  buildTrainingRecord,
260
772
  TOP_THREAT_TYPES,
261
- TOP_THREAT_TYPES_SET
773
+ TOP_THREAT_TYPES_SET,
774
+ // Exported for direct unit testing of the cluster-FP helpers.
775
+ networkDestinationFirstParty,
776
+ installUrlGithubReleases,
777
+ bundleWithoutInstallScripts,
778
+ gitHookSourceLocal,
779
+ typosquatScopedPackage,
780
+ obfuscationWithoutVector,
781
+ placeholderAntiDepConfusion,
782
+ installScriptNoNetworkEgress
262
783
  };
@@ -282,8 +282,10 @@ function extractTarballFromDoc(doc) {
282
282
  const unpackedSize = (versionData.dist && versionData.dist.unpackedSize) || 0;
283
283
  const version = versionData.version || latestTag;
284
284
  const scripts = versionData.scripts || {};
285
+ const homepage = (typeof versionData.homepage === 'string') ? versionData.homepage : '';
286
+ const description = (typeof versionData.description === 'string') ? versionData.description : '';
285
287
 
286
- return { version, tarball, unpackedSize, scripts };
288
+ return { version, tarball, unpackedSize, scripts, homepage, description };
287
289
  } catch {
288
290
  return null; // Parse failure -> fallback to lazy resolution
289
291
  }
@@ -312,7 +314,9 @@ async function getNpmLatestTarball(packageName) {
312
314
  const tarball = (data.dist && data.dist.tarball) || null;
313
315
  const unpackedSize = (data.dist && data.dist.unpackedSize) || 0;
314
316
  const scripts = (data.scripts) || {};
315
- return { version, tarball, unpackedSize, scripts };
317
+ const homepage = (typeof data.homepage === 'string') ? data.homepage : '';
318
+ const description = (typeof data.description === 'string') ? data.description : '';
319
+ return { version, tarball, unpackedSize, scripts, homepage, description };
316
320
  }
317
321
 
318
322
  // --- npm polling ---
@@ -3,7 +3,7 @@ const path = require('path');
3
3
  const { getRule } = require('../rules/index.js');
4
4
  const { getPlaybook } = require('../response/playbooks.js');
5
5
  const { computeReachableFiles } = require('../scanner/reachability.js');
6
- const { applyFPReductions, applyCompoundBoosts, calculateRiskScore, getSeverityWeights } = require('../scoring.js');
6
+ const { applyFPReductions, applyCompoundBoosts, calculateRiskScore, getSeverityWeights, applyContextualFPCaps } = require('../scoring.js');
7
7
  const { buildIntentPairs } = require('../intent-graph.js');
8
8
  const { debugLog } = require('../utils.js');
9
9
 
@@ -100,12 +100,21 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
100
100
  // Read package name and dependencies for FP reduction heuristics
101
101
  let packageName = null;
102
102
  let packageDeps = null;
103
+ let _pkgMeta = null; // v2.10.97: full pkg metadata for contextual FP caps
103
104
  try {
104
105
  const pkgPath = path.join(targetPath, 'package.json');
105
106
  if (fs.existsSync(pkgPath)) {
106
107
  const pkgData = JSON.parse(fs.readFileSync(pkgPath, 'utf8'));
107
108
  packageName = pkgData.name || null;
108
109
  packageDeps = pkgData.dependencies || null;
110
+ _pkgMeta = {
111
+ name: pkgData.name,
112
+ scripts: pkgData.scripts || {},
113
+ description: pkgData.description || '',
114
+ homepage: pkgData.homepage || (typeof pkgData.repository === 'string' ? pkgData.repository : (pkgData.repository && pkgData.repository.url) || ''),
115
+ dependencies: pkgData.dependencies,
116
+ devDependencies: pkgData.devDependencies,
117
+ };
109
118
  }
110
119
  } catch { /* graceful fallback */ }
111
120
 
@@ -251,6 +260,23 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
251
260
  criticalCount, highCount, mediumCount, lowCount
252
261
  } = calculateRiskScore(deduped, intentResult);
253
262
 
263
+ // v2.10.96: stat each file that carries a threat and expose sizes on the
264
+ // scan result. Used by ML cluster-FP features (bundle_without_install_scripts)
265
+ // to replace the bundle-path-shape proxy with a real ">100KB" check.
266
+ // Cost: one statSync per unique threatened file (typically <30); same
267
+ // operation already runs elsewhere in the pipeline (executor.js:251).
268
+ const fileSizes = {};
269
+ for (const rel of Object.keys(fileScores)) {
270
+ if (!rel || rel === '(unknown)' || rel.startsWith('[SANDBOX]')) continue;
271
+ try {
272
+ const abs = path.isAbsolute(rel) ? rel : path.join(targetPath, rel);
273
+ const st = fs.statSync(abs);
274
+ if (st.isFile()) fileSizes[rel] = st.size;
275
+ } catch {
276
+ // File removed between scan and stat, or unreadable: skip silently.
277
+ }
278
+ }
279
+
254
280
  // Python scan metadata
255
281
  const pythonInfo = pythonDeps.length > 0 ? {
256
282
  dependencies: pythonDeps.length,
@@ -276,6 +302,7 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
276
302
  packageScore,
277
303
  mostSuspiciousFile,
278
304
  fileScores,
305
+ fileSizes,
279
306
  breakdown
280
307
  },
281
308
  sandbox: sandboxData,
@@ -283,6 +310,15 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
283
310
  scannerErrors: scannerErrors.length > 0 ? scannerErrors : undefined
284
311
  };
285
312
 
313
+ // v2.10.97: contextual FP post-filter — deterministic score caps for
314
+ // packages matching well-known FP clusters (100% precision, 302 human labels).
315
+ const fpCaps = applyContextualFPCaps(result, _pkgMeta);
316
+ if (fpCaps.length > 0) {
317
+ debugLog('[FP-CAP] ' + (packageName || targetPath) + ': ' +
318
+ fpCaps.map(c => c.feature + (c.cap > 0 ? '→MAX' + c.cap : '→suppress')).join(', ') +
319
+ ' → score=' + result.summary.riskScore);
320
+ }
321
+
286
322
  return {
287
323
  result,
288
324
  deduped,
@@ -90,12 +90,14 @@ function handlePostWalk(ctx) {
90
90
  t.file === ctx.relFile && execTypes.includes(t.type)
91
91
  );
92
92
  if (hasExecInFile) {
93
- ctx.threats.push({
93
+ const t = {
94
94
  type: 'binary_dropper',
95
95
  severity: 'CRITICAL',
96
96
  message: `${ctx.chmodMessage} + exec/spawn in same file — binary dropper pattern.`,
97
97
  file: ctx.relFile
98
- });
98
+ };
99
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
100
+ ctx.threats.push(t);
99
101
  }
100
102
  }
101
103
 
@@ -112,22 +114,26 @@ function handlePostWalk(ctx) {
112
114
  // Remote code loading: fetch + eval/Function in same file = multi-stage payload
113
115
  // Distinct from fetch_decrypt_exec which also requires crypto. This catches SVG/HTML payload extraction.
114
116
  if (ctx.hasRemoteFetch && ctx.hasDynamicExec && !ctx.hasCryptoDecipher) {
115
- ctx.threats.push({
117
+ const t = {
116
118
  type: 'remote_code_load',
117
119
  severity: 'CRITICAL',
118
120
  message: 'Remote code loading: network fetch + dynamic eval/Function in same file — multi-stage payload execution.',
119
121
  file: ctx.relFile
120
- });
122
+ };
123
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
124
+ ctx.threats.push(t);
121
125
  }
122
126
 
123
127
  // Wave 4: Remote fetch + crypto decrypt + dynamic eval = steganographic payload chain
124
128
  if (ctx.hasRemoteFetch && ctx.hasCryptoDecipher && ctx.hasDynamicExec) {
125
- ctx.threats.push({
129
+ const t = {
126
130
  type: 'fetch_decrypt_exec',
127
131
  severity: 'CRITICAL',
128
132
  message: 'Steganographic payload chain: remote fetch + crypto decryption + dynamic execution. No legitimate package uses this pattern.',
129
133
  file: ctx.relFile
130
- });
134
+ };
135
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
136
+ ctx.threats.push(t);
131
137
  }
132
138
 
133
139
  // Wave 4: Download-execute-cleanup — https download + chmod executable + execSync + unlink
@@ -142,13 +148,15 @@ function handlePostWalk(ctx) {
142
148
  // dominated by other CRITICAL rules, so a MEDIUM tier here had 0 FPR impact.
143
149
  // Full validation in data/fp-v2.10.95-validation.md.
144
150
  if (ctx.hasRemoteFetch && ctx.hasChmodExecutable && ctx.hasExecSyncCall) {
145
- ctx.threats.push({
151
+ const t = {
146
152
  type: 'download_exec_binary',
147
153
  severity: ctx.hasHashVerification ? 'HIGH' : 'CRITICAL',
148
154
  message: 'Download-execute pattern: remote fetch + chmod executable + execSync in same file.' +
149
155
  (ctx.hasHashVerification ? ' Hash verification detected — likely legitimate binary installer.' : ' Binary dropper camouflaged as native addon build.'),
150
156
  file: ctx.relFile
151
- });
157
+ };
158
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
159
+ ctx.threats.push(t);
152
160
  }
153
161
 
154
162
  // Wave 4: IDE persistence via content co-occurrence — tasks.json + runOn + writeFileSync
@@ -281,6 +281,10 @@ function analyzeFile(content, filePath, basePath) {
281
281
  })) {
282
282
  ctx.fetchOnlySafeDomains = true;
283
283
  }
284
+ // v2.10.96: retain the URL set on ctx so post-walk detectors can attach
285
+ // it to download/install-shaped threats. Consumed by ML feature
286
+ // install_url_github_releases to avoid regex-on-message proxying.
287
+ ctx.fetchUrls = urlMatches.slice(0, 32);
284
288
  }
285
289
 
286
290
  walk.simple(ast, {
@@ -142,6 +142,8 @@ async function getPackageMetadata(packageName) {
142
142
  const weeklyDownloads = downloadsData?.downloads ?? 0;
143
143
  const authorPackageCount = authorData?.total ?? 0;
144
144
  const versionCount = meta.versions ? Object.keys(meta.versions).length : 0;
145
+ const description = (typeof latestMeta?.description === 'string' ? latestMeta.description
146
+ : (typeof meta.description === 'string' ? meta.description : ''));
145
147
 
146
148
  return {
147
149
  created_at: createdAt,
@@ -151,7 +153,8 @@ async function getPackageMetadata(packageName) {
151
153
  has_readme: hasReadme,
152
154
  has_repository: hasRepository,
153
155
  version_count: versionCount,
154
- readme_size: readmeText.length
156
+ readme_size: readmeText.length,
157
+ description
155
158
  };
156
159
  }
157
160
 
package/src/scoring.js CHANGED
@@ -1011,8 +1011,110 @@ function calculateRiskScore(deduped, intentResult) {
1011
1011
  };
1012
1012
  }
1013
1013
 
1014
+ // ============================================
1015
+ // v2.10.97: CONTEXTUAL FP POST-FILTER
1016
+ // ============================================
1017
+ // Deterministic score caps for packages matching well-known FP clusters.
1018
+ // Each feature has 100% precision on 302 human-reviewed packages (zero
1019
+ // malware misclassified). Applied AFTER calculateRiskScore() so that
1020
+ // compound boosts and lifecycle floors have already had their say.
1021
+ const {
1022
+ bundleWithoutInstallScripts,
1023
+ installUrlGithubReleases,
1024
+ networkDestinationFirstParty,
1025
+ gitHookSourceLocal,
1026
+ typosquatScopedPackage,
1027
+ obfuscationWithoutVector,
1028
+ placeholderAntiDepConfusion,
1029
+ } = require('./ml/feature-extractor.js');
1030
+
1031
+ /**
1032
+ * Apply contextual FP score caps to a scan result.
1033
+ * Mutates result.summary.riskScore / riskLevel in-place.
1034
+ * Returns array of { feature, cap } describing applied caps (empty if none).
1035
+ */
1036
+ function applyContextualFPCaps(result, pkgMeta) {
1037
+ if (!result || !result.summary) return [];
1038
+
1039
+ const meta = {
1040
+ name: pkgMeta && pkgMeta.name,
1041
+ registryMeta: {
1042
+ scripts: (pkgMeta && pkgMeta.scripts) || {},
1043
+ description: (pkgMeta && pkgMeta.description) || '',
1044
+ homepage: (pkgMeta && pkgMeta.homepage) || '',
1045
+ dependencies: (pkgMeta && pkgMeta.dependencies),
1046
+ devDependencies: (pkgMeta && pkgMeta.devDependencies),
1047
+ },
1048
+ };
1049
+
1050
+ const applied = [];
1051
+
1052
+ // F7: placeholder anti-dep-confusion → MAX 20
1053
+ if (placeholderAntiDepConfusion(result, meta)) {
1054
+ applied.push({ feature: 'placeholder_anti_dep_confusion', cap: 20 });
1055
+ }
1056
+ // F1: minified bundle without install scripts → MAX 30
1057
+ if (bundleWithoutInstallScripts(result, meta)) {
1058
+ applied.push({ feature: 'bundle_without_install_scripts', cap: 30 });
1059
+ }
1060
+ // F3: credential destination first-party API → MAX 30
1061
+ if (networkDestinationFirstParty(result, meta)) {
1062
+ applied.push({ feature: 'network_destination_first_party', cap: 30 });
1063
+ }
1064
+ // F2: binary installer from GitHub Releases → MAX 35
1065
+ if (installUrlGithubReleases(result)) {
1066
+ applied.push({ feature: 'install_url_github_releases', cap: 35 });
1067
+ }
1068
+ // F4: git hooks from local source → MAX 35
1069
+ if (gitHookSourceLocal(result)) {
1070
+ applied.push({ feature: 'git_hook_source_local', cap: 35 });
1071
+ }
1072
+ // F6: commercial obfuscation without attack vector → MAX 35
1073
+ if (obfuscationWithoutVector(result)) {
1074
+ applied.push({ feature: 'obfuscation_without_vector', cap: 35 });
1075
+ }
1076
+ // F5: typosquat on scoped package → suppress typosquat points
1077
+ if (typosquatScopedPackage(result, meta)) {
1078
+ applied.push({ feature: 'typosquat_scoped_package', cap: -1 });
1079
+ }
1080
+
1081
+ if (applied.length === 0) return applied;
1082
+
1083
+ // Apply the tightest (lowest) cap
1084
+ const caps = applied.filter(a => a.cap > 0);
1085
+ const lowestCap = caps.length > 0 ? Math.min(...caps.map(a => a.cap)) : Infinity;
1086
+
1087
+ if (lowestCap < result.summary.riskScore) {
1088
+ result.summary.riskScore = lowestCap;
1089
+ result.summary.riskLevel =
1090
+ lowestCap >= _riskThresholds.CRITICAL ? 'CRITICAL'
1091
+ : lowestCap >= _riskThresholds.HIGH ? 'HIGH'
1092
+ : lowestCap >= _riskThresholds.MEDIUM ? 'MEDIUM'
1093
+ : lowestCap > 0 ? 'LOW' : 'SAFE';
1094
+ }
1095
+
1096
+ // F5: subtract typosquat points from score
1097
+ if (applied.find(a => a.feature === 'typosquat_scoped_package')) {
1098
+ const typoPoints = result.threats
1099
+ .filter(t => t.type === 'typosquat_detected' || t.type === 'lifecycle_typosquat')
1100
+ .reduce((s, t) => s + (t.points || 0), 0);
1101
+ if (typoPoints > 0) {
1102
+ result.summary.riskScore = Math.max(0, result.summary.riskScore - typoPoints);
1103
+ const rs = result.summary.riskScore;
1104
+ result.summary.riskLevel =
1105
+ rs >= _riskThresholds.CRITICAL ? 'CRITICAL'
1106
+ : rs >= _riskThresholds.HIGH ? 'HIGH'
1107
+ : rs >= _riskThresholds.MEDIUM ? 'MEDIUM'
1108
+ : rs > 0 ? 'LOW' : 'SAFE';
1109
+ }
1110
+ }
1111
+
1112
+ return applied;
1113
+ }
1114
+
1014
1115
  module.exports = {
1015
1116
  SEVERITY_WEIGHTS, RISK_THRESHOLDS, MAX_RISK_SCORE, CONFIDENCE_FACTORS,
1016
1117
  isPackageLevelThreat, computeGroupScore, applyFPReductions, applyCompoundBoosts, calculateRiskScore,
1017
- applyConfigOverrides, resetConfigOverrides, getSeverityWeights, getRiskThresholds
1118
+ applyConfigOverrides, resetConfigOverrides, getSeverityWeights, getRiskThresholds,
1119
+ applyContextualFPCaps
1018
1120
  };