muaddib-scanner 2.10.94 → 2.10.96

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.10.94",
3
+ "version": "2.10.96",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -76,6 +76,504 @@ const TOP_THREAT_TYPES = [
76
76
 
77
77
  const TOP_THREAT_TYPES_SET = new Set(TOP_THREAT_TYPES);
78
78
 
79
+ // --- Cluster FP contextual feature helpers (v2.10.96) ---
80
+ //
81
+ // Target: P1 CRITICAL webhook suppression (score >= 75). The four helpers
82
+ // below encode the four FP clusters identified in the v2.10.9x weekly FP
83
+ // review: Cluster A (native binary installers via GitHub releases),
84
+ // Cluster B (minified bundles w/o install scripts), Cluster C (dev tooling
85
+ // writing git hooks from local files), Cluster E (first-party SDKs exfil
86
+ // pattern on their own API).
87
+ //
88
+ // These features intentionally operate on scan-result signals ONLY so they
89
+ // can be recomputed on historical JSONL records without re-scanning.
90
+
91
+ // Threats whose presence implies the package performs a network call.
92
+ const NETWORK_ADJACENT_TYPES = new Set([
93
+ 'suspicious_dataflow',
94
+ 'network_require',
95
+ 'remote_code_load',
96
+ 'curl_exec',
97
+ 'intent_credential_exfil',
98
+ 'intent_command_exfil',
99
+ 'dangerous_call_fetch',
100
+ 'external_tarball_dep',
101
+ 'dependency_url_suspicious'
102
+ ]);
103
+
104
// Package-scope -> first-party domain mapping for well-known SDK publishers.
// Keys are lowercase npm scope names (without '@'). Used by
// `network_destination_first_party` when the package is scoped.
//
// The table is a null-prototype object on purpose: it is indexed with
// package / scope names taken from scanned packages, and on a plain object
// literal a name such as `constructor` would resolve to an inherited member
// (a truthy non-array value) instead of `undefined`.
const SCOPE_FIRST_PARTY_DOMAINS = Object.assign(Object.create(null), {
  'anthropic-ai': ['anthropic.com'],
  'openai': ['openai.com'],
  'google-cloud': ['googleapis.com', 'google.com'],
  'google-ai': ['googleapis.com', 'google.com'],
  'aws-sdk': ['amazonaws.com', 'aws.amazon.com'],
  'aws-amplify': ['amazonaws.com'],
  'azure': ['azure.com', 'microsoft.com'],
  'microsoft': ['microsoft.com', 'azure.com'],
  'supabase': ['supabase.co', 'supabase.com'],
  'stripe': ['stripe.com'],
  'twilio': ['twilio.com'],
  'sendgrid': ['sendgrid.com', 'sendgrid.net'],
  'datadog': ['datadoghq.com'],
  'sentry': ['sentry.io'],
  'slack': ['slack.com'],
  'octokit': ['github.com', 'githubusercontent.com'],
  'cloudflare': ['cloudflare.com'],
  'auth0': ['auth0.com'],
  'hubspot': ['hubspot.com', 'hubapi.com'],
  'contentful': ['contentful.com'],
  'mongodb': ['mongodb.com', 'mongodb.net'],
  'mailgun': ['mailgun.net', 'mailgun.com'],
  'vercel': ['vercel.com', 'vercel.app'],
  'netlify': ['netlify.com', 'netlify.app'],
  'pinecone-database': ['pinecone.io'],
  'langchain': ['langchain.com']
});
135
+
136
+ // GitHub release hosts (install_url_github_releases).
137
+ const GITHUB_RELEASE_HOSTS = ['github.com', 'objects.githubusercontent.com', 'raw.githubusercontent.com'];
138
+
139
+ // Bundle file-shape patterns. Conservative: only flag paths that clearly
140
+ // correspond to build output, so the feature stays specific to Cluster B.
141
+ const BUNDLE_PATH_RE = /(?:^|[\\/])(?:dist|build|lib|out|umd|esm|cjs|bundle|_next[\\/]static|\.next[\\/]static|public[\\/]static|webpack|rollup)[\\/]/i;
142
+ const BUNDLE_FILE_RE = /\.(?:min|bundle|prod|umd|iife|esm|cjs)\.(?:m?js|cjs)$|\.min\.js$|chunk-[0-9a-f]+\.js$|vendors?~?.*\.js$/i;
143
+
144
+ // Threat types that indicate remote content fetch in a file (for
145
+ // `git_hook_source_local` heuristic: absence => local source).
146
+ const REMOTE_FETCH_TYPES = new Set([
147
+ 'remote_code_load',
148
+ 'network_require',
149
+ 'curl_exec',
150
+ 'suspicious_dataflow',
151
+ 'suspicious_domain',
152
+ 'dangerous_call_fetch',
153
+ 'external_tarball_dep',
154
+ 'dependency_url_suspicious',
155
+ 'binary_dropper',
156
+ 'download_exec_binary'
157
+ ]);
158
+
159
// Match URLs inside threat message strings (legacy fallback when threats
// predate v2.10.96 URL enrichment — historical JSONL scan results).
// Capture group 1 is the host. The optional group in front of it consumes a
// userinfo section ("user:pass@"), so for "https://github.com:x@evil.tld/"
// the captured host is "evil.tld" rather than the decoy before the '@'.
const MESSAGE_URL_RE = /https?:\/\/(?:[^\s\/?#\\'"`)<>]*@)?([a-zA-Z0-9._-]+)(?:[:\/?#][^\s'"`)<>]*)?/g;

/**
 * Extract the lowercase host of an http(s) URL string.
 *
 * The authority ends at the first '/', '\', '?' or '#' (a backslash is
 * treated as a path delimiter, as URL parsers do for http/https), any
 * userinfo before the last '@' is dropped, and a trailing ":port" is cut.
 *
 * @param {*} url - Candidate URL; non-strings yield null.
 * @returns {string|null} Lowercase host, or null when none can be extracted.
 */
function hostFromUrl(url) {
  if (typeof url !== 'string') return null;
  const m = url.match(/^https?:\/\/([^\/\\?#\s'"`)<>]+)/i);
  if (!m) return null;
  const authority = m[1];
  // "user:pass@host:port" -> "host:port". lastIndexOf returns -1 when there
  // is no '@', so slice(0) keeps the authority unchanged.
  const hostPort = authority.slice(authority.lastIndexOf('@') + 1);
  const colon = hostPort.indexOf(':');
  const host = colon === -1 ? hostPort : hostPort.slice(0, colon);
  return host ? host.toLowerCase() : null;
}

/**
 * Collect the set of destination hosts referenced by a list of threats.
 *
 * Structured `urls` arrays are authoritative: as soon as one threat carries
 * a non-empty `urls` array, only structured URLs are used. Otherwise hosts
 * are recovered from URL literals found in threat `message` strings.
 *
 * @param {Array<object|null|undefined>} threats - Threat records.
 * @returns {Set<string>} Lowercase hosts (possibly empty).
 */
function extractHostsFromThreats(threats) {
  const hosts = new Set();
  let sawStructured = false;
  for (const t of threats) {
    if (t && Array.isArray(t.urls) && t.urls.length > 0) {
      sawStructured = true;
      for (const u of t.urls) {
        const h = hostFromUrl(u);
        if (h) hosts.add(h);
      }
    }
  }
  // If no threat carries structured URLs, fall back to message-regex so that
  // callers can still reason about old scan records. Once the scan fleet is
  // fully on v2.10.96+ the regex branch becomes dead.
  if (sawStructured) return hosts;
  for (const t of threats) {
    const msg = t && t.message;
    if (!msg || typeof msg !== 'string') continue;
    // The regex is global and shared at module level: reset its cursor so a
    // previous call cannot make this scan start mid-string.
    MESSAGE_URL_RE.lastIndex = 0;
    let m;
    while ((m = MESSAGE_URL_RE.exec(msg)) !== null) {
      if (m[1]) hosts.add(m[1].toLowerCase());
    }
  }
  return hosts;
}
196
+
197
/**
 * Tell whether `host` equals one of `candidates` or is a subdomain of one.
 *
 * @param {string} host - Lowercase host name.
 * @param {Iterable<string>} candidates - Registrable domains to match against.
 * @returns {boolean} True on an exact or dot-delimited suffix match.
 */
function hostMatchesSuffix(host, candidates) {
  for (const domain of candidates) {
    if (host === domain) return true;
    // The leading dot keeps "notgithub.com" from matching "github.com".
    if (host.endsWith('.' + domain)) return true;
  }
  return false;
}
203
+
204
/**
 * Return the lowercase npm scope of a package name, without the '@'.
 *
 * @param {*} name - Package name such as "@scope/pkg".
 * @returns {string|null} The scope, or null for unscoped / invalid names.
 */
function getPackageScope(name) {
  if (typeof name !== 'string' || name.length === 0) return null;
  const scoped = /^@([^/]+)\//.exec(name);
  if (scoped === null) return null;
  return scoped[1].toLowerCase();
}
209
+
210
/**
 * Return the lowercase host of the first usable http(s) homepage URL.
 *
 * Looks at `meta.homepage`, then `meta.registryMeta.homepage`, then
 * `meta.npmRegistryMeta.homepage`; values that are not http(s) URL strings
 * are skipped.
 *
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {string|null} Homepage host, or null when none is found.
 */
function getHomepageHost(meta) {
  if (!meta) return null;
  const sources = [meta, meta.registryMeta, meta.npmRegistryMeta];
  for (const src of sources) {
    const homepage = src && src.homepage;
    if (!homepage || typeof homepage !== 'string') continue;
    const parsed = /^https?:\/\/([^/:?#]+)/i.exec(homepage);
    if (parsed) return parsed[1].toLowerCase();
  }
  return null;
}
224
+
225
/**
 * Feature 1 — TRUE iff the package performs a network call AND every
 * extractable destination is a first-party host of that package.
 * First-party = package-scope SDK publisher or package.homepage host.
 *
 * Targets Cluster E: Claude Code / OpenAI / Anthropic SDK wrappers that
 * read API keys from env and POST them to their legitimate vendor API.
 *
 * @param {object|null|undefined} result - Scan result; `result.threats` is read.
 * @param {object|null|undefined} meta - Package metadata; `meta.name` and the
 *   homepage fields are read.
 * @returns {boolean} Whether all observed destinations are first-party.
 */
function networkDestinationFirstParty(result, meta) {
  const threats = (result && result.threats) || [];
  // Tolerate null entries, as extractHostsFromThreats does.
  const hasNetwork = threats.some(t => t && NETWORK_ADJACENT_TYPES.has(t.type));
  if (!hasNetwork) return false;

  // Own-property lookup only: the key comes from the scanned package's name,
  // and a name such as `constructor` must not resolve to an inherited,
  // non-iterable member (spreading it below would throw).
  const publisherDomains = (key) => {
    if (!key || !Object.prototype.hasOwnProperty.call(SCOPE_FIRST_PARTY_DOMAINS, key)) return null;
    const domains = SCOPE_FIRST_PARTY_DOMAINS[key];
    return Array.isArray(domains) ? domains : null;
  };

  const firstParty = [];
  const scope = getPackageScope(meta && meta.name);
  const scopeDomains = publisherDomains(scope);
  if (scopeDomains) firstParty.push(...scopeDomains);

  // Unscoped packages: accept exact-name match against the scope table for
  // packages whose own identifier IS the publisher (e.g., `stripe`, `twilio`).
  if (!scope) {
    const baseName = (meta && meta.name && String(meta.name).replace(/^@[^/]+\//, '').toLowerCase()) || '';
    const nameDomains = publisherDomains(baseName);
    if (nameDomains) firstParty.push(...nameDomains);
  }

  const homepageHost = getHomepageHost(meta);
  if (homepageHost) firstParty.push(homepageHost);
  if (firstParty.length === 0) return false;

  const hosts = extractHostsFromThreats(threats);
  // No destination host was observable (scanner saw the network sink but
  // no URL literal leaked into threat messages). Accept as first-party only
  // when the package identity alone is a strong signal (scoped SDK).
  if (hosts.size === 0) return scopeDomains !== null;

  for (const h of hosts) {
    if (!hostMatchesSuffix(h, firstParty)) return false;
  }
  return true;
}
264
+
265
/**
 * Feature 2 — TRUE iff the package behaves as a native-binary installer
 * AND every host extracted from its threats is in GITHUB_RELEASE_HOSTS
 * (exact host or subdomain, per hostMatchesSuffix).
 *
 * Targets Cluster A: esbuild / swc / prisma style platform binary drops.
 *
 * Returns FALSE when no host could be extracted at all: an installer whose
 * destination is unknown is never treated as GitHub-only.
 *
 * @param {object|null|undefined} result - Scan result; `result.threats` is read.
 * @returns {boolean} Whether the installer only talks to GitHub hosts.
 */
function installUrlGithubReleases(result) {
  const threats = (result && result.threats) || [];

  let hasInstaller = false;
  for (const t of threats) {
    if (!t) continue;
    // Any known-suspicious destination present => not a github-only installer.
    if (t.type === 'suspicious_domain') return false;
    if (t.type === 'binary_dropper' || t.type === 'download_exec_binary') hasInstaller = true;
  }
  if (!hasInstaller) return false;

  const hosts = extractHostsFromThreats(threats);
  if (hosts.size === 0) return false;

  // hosts is non-empty here, so "every host is a GitHub host" already implies
  // "at least one host is a GitHub host"; no second pass is needed.
  for (const h of hosts) {
    if (!hostMatchesSuffix(h, GITHUB_RELEASE_HOSTS)) return false;
  }
  return true;
}
291
+
292
/**
 * Tell whether a file path looks like build output, by directory shape
 * (BUNDLE_PATH_RE) or by file-name shape (BUNDLE_FILE_RE).
 *
 * @param {*} file - Relative file path; non-strings and '' yield false.
 * @returns {boolean} True when either pattern matches.
 */
function hasBundlePath(file) {
  if (typeof file !== 'string' || file === '') return false;
  if (BUNDLE_PATH_RE.test(file)) return true;
  return BUNDLE_FILE_RE.test(file);
}
296
+
297
/**
 * Tell whether `meta.registryMeta.scripts` declares a non-blank
 * `preinstall`, `install` or `postinstall` command.
 *
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {boolean} True when at least one install hook is declared.
 */
function hasLifecycleScripts(meta) {
  const registryMeta = meta && meta.registryMeta;
  const scripts = registryMeta && registryMeta.scripts;
  if (!scripts || typeof scripts !== 'object') return false;
  return ['preinstall', 'install', 'postinstall'].some((hook) => {
    const command = scripts[hook];
    // Whitespace-only entries do not count as a declared hook.
    return typeof command === 'string' && command.trim().length > 0;
  });
}
306
+
307
+ // Threshold derived from the v2.10.9x FP review of minified bundles:
308
+ // Cluster B FPs all ship at least one > 100KB file (typical webpack chunk
309
+ // is 200-800KB). 100KB is low enough to catch small bundlers yet high
310
+ // enough to exclude hand-written source.
311
+ const BUNDLE_FILE_MIN_BYTES = 100 * 1024;
312
+
313
/**
 * Feature 3 — TRUE iff every file that carries a finding is a large file
 * (size not below BUNDLE_FILE_MIN_BYTES) AND the package declares no
 * install lifecycle script. Targets Cluster B: minified webpack/rollup
 * output triggering eval / obfuscation heuristics without any runtime
 * install vector.
 *
 * Size source: `result.summary.fileSizes` when it is a non-empty object.
 * When it is absent (historical JSONL records), the path-shape proxy
 * (`hasBundlePath`) is applied to every flagged file instead.
 *
 * `meta.registryMeta.scripts` is REQUIRED: when it is undefined the result
 * is always FALSE — we must not claim a package has no install hook when
 * we never looked.
 *
 * @param {object|null|undefined} result - Scan result (`threats`, `summary`).
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {boolean} Whether the findings sit only in bundle-like files.
 */
function bundleWithoutInstallScripts(result, meta) {
  const scriptsKnown = Boolean(meta && meta.registryMeta) && meta.registryMeta.scripts !== undefined;
  if (!scriptsKnown) return false;
  if (hasLifecycleScripts(meta)) return false;

  const threats = (result && result.threats) || [];
  const flaggedFiles = new Set(
    threats.filter((threat) => threat.file).map((threat) => threat.file)
  );
  // Covers both "no threats" and "no threat is attributed to a file".
  if (flaggedFiles.size === 0) return false;

  const sizes = result && result.summary ? result.summary.fileSizes : undefined;
  const sizesAvailable = Boolean(sizes) && typeof sizes === 'object' && Object.keys(sizes).length > 0;

  if (sizesAvailable) {
    return [...flaggedFiles].every((file) => {
      const bytes = sizes[file];
      // A flagged file with no recorded size disqualifies the package.
      return typeof bytes === 'number' && !(bytes < BUNDLE_FILE_MIN_BYTES);
    });
  }

  // Legacy proxy: no file sizes available, fall back to path shape.
  return [...flaggedFiles].every((file) => hasBundlePath(file));
}
362
+
363
/**
 * Feature 4 — TRUE iff the package fires `git_hooks_injection` AND none of
 * the files that triggered it also carries a REMOTE_FETCH_TYPES threat.
 * Proxy for "hook body was read from a local source file", i.e. dev tooling
 * like husky / simple-git-hooks installing its own canned hook.
 *
 * Hook threats without a `file` attribute never disqualify the package.
 *
 * @param {object|null|undefined} result - Scan result; `result.threats` is read.
 * @returns {boolean} Whether the git hook appears to come from a local source.
 */
function gitHookSourceLocal(result) {
  const threats = (result && result.threats) || [];

  const hooks = threats.filter((threat) => threat.type === 'git_hooks_injection');
  if (hooks.length === 0) return false;

  // Files in which at least one remote-fetch signal was raised.
  const remoteFiles = new Set(
    threats
      .filter((threat) => threat.file && REMOTE_FETCH_TYPES.has(threat.type))
      .map((threat) => threat.file)
  );

  const hookSharesFileWithFetch = hooks.some((hook) => hook.file && remoteFiles.has(hook.file));
  return !hookSharesFileWithFetch;
}
384
+
385
+ // --- v2.10.96 extended FP features (F5-F8, VPS review 2026-04-18) ---
386
+ //
387
+ // Covers an additional 319 FP (15.2%) on top of F1-F4; combined F1-F8
388
+ // cover 2069/2104 reviewed FP = 98.3%.
389
+
390
+ // Obfuscation-shape threats used by Feature 6.
391
+ const OBFUSCATION_TYPES = new Set([
392
+ 'obfuscation_detected',
393
+ 'js_obfuscation_pattern',
394
+ 'high_entropy_string',
395
+ 'unicode_invisible_injection'
396
+ ]);
397
+
398
+ // Threat types that indicate a runtime vector (install, env, network).
399
+ // Their presence disqualifies Feature 6 (obfuscation-without-vector).
400
+ const VECTOR_TYPES = new Set([
401
+ // install / lifecycle
402
+ 'lifecycle_script',
403
+ 'lifecycle_shell_pipe',
404
+ // env read (credential source)
405
+ 'env_access',
406
+ 'env_charcode_reconstruction',
407
+ 'credential_regex_harvest',
408
+ // network / exec / dynamic code
409
+ 'suspicious_dataflow',
410
+ 'network_require',
411
+ 'remote_code_load',
412
+ 'curl_exec',
413
+ 'intent_credential_exfil',
414
+ 'intent_command_exfil',
415
+ 'dangerous_call_fetch',
416
+ 'external_tarball_dep',
417
+ 'dependency_url_suspicious',
418
+ 'dangerous_exec',
419
+ 'dangerous_call_eval',
420
+ 'dangerous_call_exec',
421
+ 'dangerous_call_function',
422
+ 'module_compile',
423
+ 'binary_dropper',
424
+ 'download_exec_binary',
425
+ 'fetch_decrypt_exec',
426
+ 'suspicious_domain',
427
+ 'reverse_shell'
428
+ ]);
429
+
430
+ // Threats that indicate a network egress capability somewhere in the
431
+ // package. Broader than NETWORK_ADJACENT_TYPES: includes domain literals,
432
+ // drop-exec pairs, and suspicious dataflows. Used by Feature 8.
433
+ const EGRESS_TYPES = new Set([
434
+ 'suspicious_dataflow',
435
+ 'network_require',
436
+ 'remote_code_load',
437
+ 'curl_exec',
438
+ 'intent_credential_exfil',
439
+ 'intent_command_exfil',
440
+ 'dangerous_call_fetch',
441
+ 'external_tarball_dep',
442
+ 'dependency_url_suspicious',
443
+ 'suspicious_domain',
444
+ 'binary_dropper',
445
+ 'download_exec_binary',
446
+ 'fetch_decrypt_exec',
447
+ 'reverse_shell'
448
+ ]);
449
+
450
// Dep-confusion / defensive-placeholder phrases matched against the
// package description. Case-insensitive. Every phrase must START on a word
// boundary, so it cannot begin inside an unrelated word ("Preserve package
// layout" must not match `reserve package`). The end is left open so that
// plurals and compounds ("reserve names", "reserving this namespace") still
// match. The list is deliberately conservative — a real README that happens
// to mention "dependency confusion" once still needs to look like a
// placeholder in every other dimension (see `placeholderAntiDepConfusion`).
const PLACEHOLDER_DESCRIPTION_RE = new RegExp('\\b(?:' + [
  'dependency[- ]?confusion',
  'dep[- ]?confusion',
  'namespace[- ]?squatt?ing',
  'name[- ]?squatt?ing',
  'squatting[- ]?prevention',
  'defensive[- ]?(?:registration|publish|package|placeholder)',
  'placeholder[- ]?(?:package|to[- ]?reserve|for[- ]?the[- ]?name)',
  'reserv(?:e|ing|ation)[- ]?(?:this[- ]?)?(?:name|package|namespace)',
  'prevents?[- ]+(?:malicious[- ]+)?dependency[- ]+confusion',
  'blocks?[- ]+(?:malicious[- ]+)?dependency[- ]+confusion',
  'reserved[- ]+by[- ]+.*?(?:to[- ]+prevent|against)'
].join('|') + ')', 'i');
469
+
470
+ // Alias — same semantics as hasLifecycleScripts (used by F3), just named
471
+ // from the perspective of F7/F8 which reason about install vectors.
472
+ const hasInstallScript = hasLifecycleScripts;
473
+
474
/**
 * Return the first non-empty description string found on `meta`,
 * `meta.registryMeta` or `meta.npmRegistryMeta` (in that order).
 *
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {string} The description, or '' when none is available.
 */
function getDescription(meta) {
  if (!meta) return '';
  const description = [meta, meta.registryMeta, meta.npmRegistryMeta]
    .map((source) => source && source.description)
    .find((text) => typeof text === 'string' && text.length > 0);
  return description === undefined ? '' : description;
}
486
+
487
/**
 * Feature 5 — TRUE iff a `typosquat_detected` or `pypi_typosquat_detected`
 * threat fires on a package whose name starts with '@' and contains a '/'
 * (`@scope/name`). Rationale: the typosquat rule computes edit distance on
 * the bare name (`@vendor/client-foo` -> `client-foo`) and will sometimes
 * treat `@scope/adapter-rubrik` as a typosquat of the unscoped `rubrik`.
 * Scoping implies a separate namespace, so the collision is structurally
 * false.
 *
 * Covers 52 FP (2.5%) on the VPS extended corpus.
 *
 * @param {object|null|undefined} result - Scan result; `result.threats` is read.
 * @param {object|null|undefined} meta - Package metadata; `meta.name` is read.
 * @returns {boolean} Whether a typosquat finding sits on a scoped name.
 */
function typosquatScopedPackage(result, meta) {
  const threats = (result && result.threats) || [];
  const typosquatTypes = ['typosquat_detected', 'pypi_typosquat_detected'];
  const flagged = threats.some((threat) => typosquatTypes.includes(threat.type));
  if (!flagged) return false;

  const name = meta && meta.name ? String(meta.name) : '';
  return name.charAt(0) === '@' && name.indexOf('/') !== -1;
}
506
+
507
/**
 * Feature 6 — TRUE iff the package shows at least one obfuscation-shape
 * finding (OBFUSCATION_TYPES: obfuscation_detected, js_obfuscation_pattern,
 * high_entropy_string, unicode_invisible_injection) AND carries no
 * install / env / network vector threat (VECTOR_TYPES). Findings that are
 * in neither set do not affect the outcome. This is the
 * commercial-obfuscator pattern: webpack output or a hardening vendor
 * (jsjiami, obfuscator.io) trips heuristics but the package has no runtime
 * capability to exfiltrate anything.
 *
 * NOTE(review): exclusivity with F8 only holds if the scanner emits a
 * `lifecycle_script` threat whenever registry metadata declares an install
 * script — F8 itself reads the metadata, not the threat list. Confirm.
 *
 * Covers 33 FP (1.6%).
 *
 * @param {object|null|undefined} result - Scan result; `result.threats` is read.
 * @returns {boolean} Whether obfuscation is present without any vector.
 */
function obfuscationWithoutVector(result) {
  const threats = (result && result.threats) || [];
  if (threats.length === 0) return false;

  const isObfuscation = (threat) => OBFUSCATION_TYPES.has(threat.type);
  // Obfuscation findings are classified first and never count as vectors.
  const hasVector = threats.some(
    (threat) => !isObfuscation(threat) && VECTOR_TYPES.has(threat.type)
  );
  if (hasVector) return false;

  return threats.some(isObfuscation);
}
530
+
531
/**
 * Feature 7 — TRUE iff the package description explicitly declares a
 * defensive / placeholder / dependency-confusion-prevention purpose
 * (PLACEHOLDER_DESCRIPTION_RE), the package declares no install script,
 * AND no finding has severity CRITICAL or HIGH. These are namespace
 * reservations published by vendors to block attackers from squatting
 * internal package names.
 *
 * Covers 15 FP (0.7%). The severity check is the stand-in for "the package
 * body is effectively empty"; it protects against real packages whose
 * description merely mentions dep-confusion as a discussed topic.
 *
 * @param {object|null|undefined} result - Scan result; `result.threats` is read.
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {boolean} Whether the package looks like a defensive placeholder.
 */
function placeholderAntiDepConfusion(result, meta) {
  const description = getDescription(meta);
  const declaresPlaceholder = Boolean(description) && PLACEHOLDER_DESCRIPTION_RE.test(description);
  if (!declaresPlaceholder) return false;
  if (hasInstallScript(meta)) return false;

  // Real placeholder packages should not carry any CRITICAL/HIGH static
  // finding — empty by construction.
  const threats = (result && result.threats) || [];
  return threats.every(
    (threat) => threat.severity !== 'CRITICAL' && threat.severity !== 'HIGH'
  );
}
554
+
555
/**
 * Feature 8 — TRUE iff the package declares at least one install
 * lifecycle script AND the scan shows no threat whose type is in
 * EGRESS_TYPES (no fetch/curl/suspicious dataflow/drop-exec, etc.).
 *
 * Install scripts that only do `echo`, `mkdir`, `chmod`, `npm run
 * build`, or call a local node script without network access cannot
 * exfiltrate data — the 219 FP this covers are almost entirely build
 * helpers and version/engine gates.
 *
 * Mutually exclusive with F1 (which requires a NETWORK_ADJACENT_TYPES
 * threat, all of which are also EGRESS_TYPES) and with F2 (which requires
 * a binary downloader threat, also an EGRESS_TYPE).
 *
 * @param {object|null|undefined} result - Scan result; `result.threats` is read.
 * @param {object|null|undefined} meta - Package metadata.
 * @returns {boolean} Whether an install script exists with no egress signal.
 */
function installScriptNoNetworkEgress(result, meta) {
  if (!hasInstallScript(meta)) return false;
  const threats = (result && result.threats) || [];
  const hasEgress = threats.some((threat) => EGRESS_TYPES.has(threat.type));
  return !hasEgress;
}
576
+
79
577
  /**
80
578
  * Extract ML features from a scan result object.
81
579
  *
@@ -190,6 +688,16 @@ function extractFeatures(result, meta) {
190
688
  ? Math.round((features.count_total / features.file_count_with_threats) * 100) / 100
191
689
  : 0;
192
690
 
691
+ // --- Cluster FP contextual features (v2.10.96) ---
692
+ features.network_destination_first_party = networkDestinationFirstParty(result, meta) ? 1 : 0;
693
+ features.install_url_github_releases = installUrlGithubReleases(result) ? 1 : 0;
694
+ features.bundle_without_install_scripts = bundleWithoutInstallScripts(result, meta) ? 1 : 0;
695
+ features.git_hook_source_local = gitHookSourceLocal(result) ? 1 : 0;
696
+ features.typosquat_scoped_package = typosquatScopedPackage(result, meta) ? 1 : 0;
697
+ features.obfuscation_without_vector = obfuscationWithoutVector(result) ? 1 : 0;
698
+ features.placeholder_anti_dep_confusion = placeholderAntiDepConfusion(result, meta) ? 1 : 0;
699
+ features.install_script_no_network_egress = installScriptNoNetworkEgress(result, meta) ? 1 : 0;
700
+
193
701
  return features;
194
702
  }
195
703
 
@@ -258,5 +766,14 @@ module.exports = {
258
766
  extractFeatures,
259
767
  buildTrainingRecord,
260
768
  TOP_THREAT_TYPES,
261
- TOP_THREAT_TYPES_SET
769
+ TOP_THREAT_TYPES_SET,
770
+ // Exported for direct unit testing of the cluster-FP helpers.
771
+ networkDestinationFirstParty,
772
+ installUrlGithubReleases,
773
+ bundleWithoutInstallScripts,
774
+ gitHookSourceLocal,
775
+ typosquatScopedPackage,
776
+ obfuscationWithoutVector,
777
+ placeholderAntiDepConfusion,
778
+ installScriptNoNetworkEgress
262
779
  };
@@ -282,8 +282,10 @@ function extractTarballFromDoc(doc) {
282
282
  const unpackedSize = (versionData.dist && versionData.dist.unpackedSize) || 0;
283
283
  const version = versionData.version || latestTag;
284
284
  const scripts = versionData.scripts || {};
285
+ const homepage = (typeof versionData.homepage === 'string') ? versionData.homepage : '';
286
+ const description = (typeof versionData.description === 'string') ? versionData.description : '';
285
287
 
286
- return { version, tarball, unpackedSize, scripts };
288
+ return { version, tarball, unpackedSize, scripts, homepage, description };
287
289
  } catch {
288
290
  return null; // Parse failure -> fallback to lazy resolution
289
291
  }
@@ -312,7 +314,9 @@ async function getNpmLatestTarball(packageName) {
312
314
  const tarball = (data.dist && data.dist.tarball) || null;
313
315
  const unpackedSize = (data.dist && data.dist.unpackedSize) || 0;
314
316
  const scripts = (data.scripts) || {};
315
- return { version, tarball, unpackedSize, scripts };
317
+ const homepage = (typeof data.homepage === 'string') ? data.homepage : '';
318
+ const description = (typeof data.description === 'string') ? data.description : '';
319
+ return { version, tarball, unpackedSize, scripts, homepage, description };
316
320
  }
317
321
 
318
322
  // --- npm polling ---
@@ -251,6 +251,23 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
251
251
  criticalCount, highCount, mediumCount, lowCount
252
252
  } = calculateRiskScore(deduped, intentResult);
253
253
 
254
+ // v2.10.96: stat each file that carries a threat and expose sizes on the
255
+ // scan result. Used by ML cluster-FP features (bundle_without_install_scripts)
256
+ // to replace the bundle-path-shape proxy with a real ">100KB" check.
257
+ // Cost: one statSync per unique threatened file (typically <30); same
258
+ // operation already runs elsewhere in the pipeline (executor.js:251).
259
+ const fileSizes = {};
260
+ for (const rel of Object.keys(fileScores)) {
261
+ if (!rel || rel === '(unknown)' || rel.startsWith('[SANDBOX]')) continue;
262
+ try {
263
+ const abs = path.isAbsolute(rel) ? rel : path.join(targetPath, rel);
264
+ const st = fs.statSync(abs);
265
+ if (st.isFile()) fileSizes[rel] = st.size;
266
+ } catch {
267
+ // File removed between scan and stat, or unreadable: skip silently.
268
+ }
269
+ }
270
+
254
271
  // Python scan metadata
255
272
  const pythonInfo = pythonDeps.length > 0 ? {
256
273
  dependencies: pythonDeps.length,
@@ -276,6 +293,7 @@ async function process(threats, targetPath, options, pythonDeps, warnings, scann
276
293
  packageScore,
277
294
  mostSuspiciousFile,
278
295
  fileScores,
296
+ fileSizes,
279
297
  breakdown
280
298
  },
281
299
  sandbox: sandboxData,
@@ -90,12 +90,14 @@ function handlePostWalk(ctx) {
90
90
  t.file === ctx.relFile && execTypes.includes(t.type)
91
91
  );
92
92
  if (hasExecInFile) {
93
- ctx.threats.push({
93
+ const t = {
94
94
  type: 'binary_dropper',
95
95
  severity: 'CRITICAL',
96
96
  message: `${ctx.chmodMessage} + exec/spawn in same file — binary dropper pattern.`,
97
97
  file: ctx.relFile
98
- });
98
+ };
99
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
100
+ ctx.threats.push(t);
99
101
  }
100
102
  }
101
103
 
@@ -112,22 +114,26 @@ function handlePostWalk(ctx) {
112
114
  // Remote code loading: fetch + eval/Function in same file = multi-stage payload
113
115
  // Distinct from fetch_decrypt_exec which also requires crypto. This catches SVG/HTML payload extraction.
114
116
  if (ctx.hasRemoteFetch && ctx.hasDynamicExec && !ctx.hasCryptoDecipher) {
115
- ctx.threats.push({
117
+ const t = {
116
118
  type: 'remote_code_load',
117
119
  severity: 'CRITICAL',
118
120
  message: 'Remote code loading: network fetch + dynamic eval/Function in same file — multi-stage payload execution.',
119
121
  file: ctx.relFile
120
- });
122
+ };
123
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
124
+ ctx.threats.push(t);
121
125
  }
122
126
 
123
127
  // Wave 4: Remote fetch + crypto decrypt + dynamic eval = steganographic payload chain
124
128
  if (ctx.hasRemoteFetch && ctx.hasCryptoDecipher && ctx.hasDynamicExec) {
125
- ctx.threats.push({
129
+ const t = {
126
130
  type: 'fetch_decrypt_exec',
127
131
  severity: 'CRITICAL',
128
132
  message: 'Steganographic payload chain: remote fetch + crypto decryption + dynamic execution. No legitimate package uses this pattern.',
129
133
  file: ctx.relFile
130
- });
134
+ };
135
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
136
+ ctx.threats.push(t);
131
137
  }
132
138
 
133
139
  // Wave 4: Download-execute-cleanup — https download + chmod executable + execSync + unlink
@@ -135,14 +141,22 @@ function handlePostWalk(ctx) {
135
141
  // B4: removed fetchOnlySafeDomains guard — compound requires fetch+chmod+exec, which is never legitimate
136
142
  // C10: If file also contains hash/checksum verification, downgrade to HIGH — real droppers
137
143
  // don't verify payload integrity; legitimate installers (esbuild, sharp) do.
144
+ // v2.10.95: hasHashVerification is now gated by presence of a comparison operator
145
+ // in the same file (see ast.js:211 — best-effort heuristic). No additional tier
146
+ // added: diagnostic on 545 benign packages showed download_exec_binary fires on
147
+ // only 3 packages (esbuild, yarn, @backstage/create-app) and their final score is
148
+ // dominated by other CRITICAL rules, so a MEDIUM tier here had 0 FPR impact.
149
+ // Full validation in data/fp-v2.10.95-validation.md.
138
150
  if (ctx.hasRemoteFetch && ctx.hasChmodExecutable && ctx.hasExecSyncCall) {
139
- ctx.threats.push({
151
+ const t = {
140
152
  type: 'download_exec_binary',
141
153
  severity: ctx.hasHashVerification ? 'HIGH' : 'CRITICAL',
142
154
  message: 'Download-execute pattern: remote fetch + chmod executable + execSync in same file.' +
143
155
  (ctx.hasHashVerification ? ' Hash verification detected — likely legitimate binary installer.' : ' Binary dropper camouflaged as native addon build.'),
144
156
  file: ctx.relFile
145
- });
157
+ };
158
+ if (ctx.fetchUrls && ctx.fetchUrls.length > 0) t.urls = ctx.fetchUrls.slice();
159
+ ctx.threats.push(t);
146
160
  }
147
161
 
148
162
  // Wave 4: IDE persistence via content co-occurrence — tasks.json + runOn + writeFileSync
@@ -205,10 +205,20 @@ function analyzeFile(content, filePath, basePath) {
205
205
  stringBuildVars: new Set(), // variables assigned from BinaryExpression with '+' (string concat)
206
206
  // Audit v3 B2: Entropy split detection — high-entropy string concat + eval/decode
207
207
  highEntropyConcatFound: false, // set when a concat chain with >=3 leaves and high combined entropy is found
208
- // C10: Hash verification — legitimate binary installers verify checksums
209
- // Requires BOTH createHash() call AND .digest() call false positives from
210
- // standalone mentions of 'sha256' or 'integrity' in comments/descriptions
211
- hasHashVerification: /\bcreateHash\s*\(/.test(content) && /\.digest\s*\(/.test(content),
208
+ // C10: Hash verification — legitimate binary installers verify checksums.
209
+ // v2.10.95: file-level heuristic hardened with a comparison check. Requires
210
+ // createHash+digest AND at least one comparison/assert/throw in the same file.
211
+ // THIS IS NOT A PROOF that the hash is actually verified — a malicious author
212
+ // can include a === or assert elsewhere in the file without comparing the
213
+ // digest result. This gate is best-effort and gains value only through the
214
+ // triple-gate in handle-post-walk.js (requires also fetchOnlySafeDomains).
215
+ // Proper fix would require function-scope AST tracking to confirm the
216
+ // comparison consumes the digest result — deferred until a dedicated
217
+ // taint-tracking PR.
218
+ hasHashVerification:
219
+ /\bcreateHash\s*\(/.test(content) &&
220
+ /\.digest\s*\(/.test(content) &&
221
+ /\b(===|!==|\.equals\s*\(|assert\.(strictEqual|equal|deepEqual|deepStrictEqual)\s*\(|\bthrow\b)/.test(content),
212
222
  // GlassWorm: variation selector decoder pattern (.codePointAt + 0xFE00/0xE0100)
213
223
  hasCodePointAt: false,
214
224
  hasVariationSelectorConst: false,
@@ -271,6 +281,10 @@ function analyzeFile(content, filePath, basePath) {
271
281
  })) {
272
282
  ctx.fetchOnlySafeDomains = true;
273
283
  }
284
+ // v2.10.96: retain the URL set on ctx so post-walk detectors can attach
285
+ // it to download/install-shaped threats. Consumed by ML feature
286
+ // install_url_github_releases to avoid regex-on-message proxying.
287
+ ctx.fetchUrls = urlMatches.slice(0, 32);
274
288
  }
275
289
 
276
290
  walk.simple(ast, {
@@ -142,6 +142,8 @@ async function getPackageMetadata(packageName) {
142
142
  const weeklyDownloads = downloadsData?.downloads ?? 0;
143
143
  const authorPackageCount = authorData?.total ?? 0;
144
144
  const versionCount = meta.versions ? Object.keys(meta.versions).length : 0;
145
+ const description = (typeof latestMeta?.description === 'string' ? latestMeta.description
146
+ : (typeof meta.description === 'string' ? meta.description : ''));
145
147
 
146
148
  return {
147
149
  created_at: createdAt,
@@ -151,7 +153,8 @@ async function getPackageMetadata(packageName) {
151
153
  has_readme: hasReadme,
152
154
  has_repository: hasRepository,
153
155
  version_count: versionCount,
154
- readme_size: readmeText.length
156
+ readme_size: readmeText.length,
157
+ description
155
158
  };
156
159
  }
157
160