clawmoat 0.8.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/.dockerignore +9 -0
  2. package/CHANGELOG.md +18 -0
  3. package/DEMO.md +87 -0
  4. package/Dockerfile +5 -18
  5. package/README.md +232 -8
  6. package/THREAT_MODEL.md +129 -0
  7. package/agent/README.md +131 -0
  8. package/agent/index.js +471 -0
  9. package/agent/install-service.sh +94 -0
  10. package/agent/openclaw-hook.js +453 -0
  11. package/agent/provider-setup.js +649 -0
  12. package/agent/setup.js +274 -0
  13. package/assets/BADGE-USAGE.md +20 -0
  14. package/assets/clawmoat-badge.svg +21 -0
  15. package/bin/clawmoat.js +468 -111
  16. package/docs/affiliates/dashboard.html +124 -0
  17. package/docs/affiliates/index.html +236 -0
  18. package/docs/agent-install.html +183 -0
  19. package/docs/ai-agent-security-scanner.html +10 -6
  20. package/docs/badge/index.html +149 -0
  21. package/docs/badge/scanning.svg +23 -0
  22. package/docs/blog/386-malicious-skills.html +11 -4
  23. package/docs/blog/40000-exposed-openclaw-instances.html +11 -4
  24. package/docs/blog/agent-trust-protocol.html +5 -4
  25. package/docs/blog/ai-agent-earns-commissions.html +230 -0
  26. package/docs/blog/bugmageddon-agent-firewall.html +174 -0
  27. package/docs/blog/calculator-math.html +180 -0
  28. package/docs/blog/clawmoat-vs-llamafirewall-nemo-guardrails.html +10 -4
  29. package/docs/blog/host-guardian-launch.html +18 -8
  30. package/docs/blog/ibm-experts-agent-runtime-protection.html +15 -6
  31. package/docs/blog/index.html +67 -9
  32. package/docs/blog/langchain-security-tutorial.html +18 -8
  33. package/docs/blog/mcp-30-cves-security-crisis.html +11 -4
  34. package/docs/blog/meta-researcher-rogue-agent.html +201 -0
  35. package/docs/blog/microsoft-openclaw-workstation-security.html +5 -4
  36. package/docs/blog/nist-ai-agent-standards-clawmoat.html +16 -8
  37. package/docs/blog/oasis-websocket-hijack.html +11 -4
  38. package/docs/blog/ollama-openclaw-security.html +10 -4
  39. package/docs/blog/openclaw-enterprise-readiness-claw10.html +5 -4
  40. package/docs/blog/openclaw-security-reckoning-2026.html +11 -4
  41. package/docs/blog/owasp-agentic-ai-top10.html +18 -8
  42. package/docs/blog/securing-ai-agents.html +18 -8
  43. package/docs/blog/supply-chain-agents.html +18 -8
  44. package/docs/business/index.html +11 -16
  45. package/docs/business/install.html +21 -7
  46. package/docs/checklist.html +10 -4
  47. package/docs/compare/index.html +122 -0
  48. package/docs/compare/lakera/index.html +62 -0
  49. package/docs/compare/llm-guard/index.html +49 -0
  50. package/docs/compare/snyk-agent-scan/index.html +63 -0
  51. package/docs/compare.html +10 -6
  52. package/docs/dashboard/index.html +520 -0
  53. package/docs/finance/index.html +9 -6
  54. package/docs/guides/business-deployment.html +770 -0
  55. package/docs/hall-of-fame.html +11 -5
  56. package/docs/index.html +266 -137
  57. package/docs/integrations/langchain.html +14 -6
  58. package/docs/integrations/openai.html +14 -6
  59. package/docs/integrations/openclaw.html +55 -7
  60. package/docs/plans/2026-03-26-threat-intel-api.md +255 -0
  61. package/docs/plans/2026-04-14-bugmageddon-marketing-pack.md +329 -0
  62. package/docs/plans/2026-04-14-clawmoat-v1-bugmageddon.md +248 -0
  63. package/docs/plans/2026-04-14-v1-release-update.md +91 -0
  64. package/docs/plans/2026-04-19-supabase-audit.md +68 -0
  65. package/docs/plans/2026-05-12-sales-push.md +303 -0
  66. package/docs/playground/index.html +893 -0
  67. package/docs/playground.html +4 -7
  68. package/docs/rfcs/defense-in-depth.md +467 -0
  69. package/docs/scan/index.html +156 -12
  70. package/docs/services/case-study.html +255 -0
  71. package/docs/services/downloads/install-openclaw.bat +45 -0
  72. package/docs/services/downloads/install-openclaw.command +38 -0
  73. package/docs/services/downloads/install-openclaw.sh +38 -0
  74. package/docs/services/get-started.html +165 -0
  75. package/docs/services/index.html +598 -0
  76. package/docs/services/multi-agent-security.html +284 -0
  77. package/docs/services/one-pager.html +99 -0
  78. package/docs/services/pitch-deck.html +229 -0
  79. package/docs/services/roi-calculator.html +258 -0
  80. package/docs/sitemap.xml +62 -2
  81. package/docs/support/index.html +12 -1
  82. package/docs/templates/customer-service/HEARTBEAT.md +61 -0
  83. package/docs/templates/customer-service/MEMORY.md +89 -0
  84. package/docs/templates/customer-service/SOUL.md +41 -0
  85. package/docs/templates/customer-service/USER.md +56 -0
  86. package/docs/templates/executive/HEARTBEAT.md +86 -0
  87. package/docs/templates/executive/MEMORY.md +92 -0
  88. package/docs/templates/executive/SOUL.md +44 -0
  89. package/docs/templates/executive/USER.md +62 -0
  90. package/docs/templates/finance/HEARTBEAT.md +58 -0
  91. package/docs/templates/finance/MEMORY.md +87 -0
  92. package/docs/templates/finance/SOUL.md +38 -0
  93. package/docs/templates/finance/USER.md +53 -0
  94. package/docs/templates/index.html +115 -0
  95. package/docs/templates/operations/HEARTBEAT.md +63 -0
  96. package/docs/templates/operations/MEMORY.md +68 -0
  97. package/docs/templates/operations/SOUL.md +38 -0
  98. package/docs/templates/operations/USER.md +49 -0
  99. package/docs/templates/sales/HEARTBEAT.md +55 -0
  100. package/docs/templates/sales/MEMORY.md +89 -0
  101. package/docs/templates/sales/SOUL.md +34 -0
  102. package/docs/templates/sales/USER.md +54 -0
  103. package/eslint.config.js +32 -0
  104. package/evals/README.md +29 -0
  105. package/evals/cases.json +390 -0
  106. package/evals/results.md +68 -0
  107. package/evals/run.js +180 -0
  108. package/examples/demo-attack/demo.js +186 -0
  109. package/examples/python-quickstart/README.md +54 -0
  110. package/examples/python-quickstart/clawmoat_client.py +167 -0
  111. package/examples/video-demo/README.md +14 -0
  112. package/examples/video-demo/scene-a-normal.js +29 -0
  113. package/examples/video-demo/scene-b-attack-arrives.js +31 -0
  114. package/examples/video-demo/scene-c-hijack.js +44 -0
  115. package/examples/video-demo/scene-d-clawmoat.js +46 -0
  116. package/integrations/crewai/README.md +32 -0
  117. package/integrations/crewai/clawmoat_crewai/__init__.py +17 -0
  118. package/integrations/crewai/clawmoat_crewai/guard.py +103 -0
  119. package/integrations/crewai/pyproject.toml +21 -0
  120. package/integrations/langchain/README.md +91 -0
  121. package/integrations/langchain/clawmoat_langchain/__init__.py +17 -0
  122. package/integrations/langchain/clawmoat_langchain/callback.py +489 -0
  123. package/integrations/langchain/pyproject.toml +32 -0
  124. package/integrations/litellm/README.md +324 -0
  125. package/integrations/litellm/clawmoat_litellm/__init__.py +21 -0
  126. package/integrations/litellm/clawmoat_litellm/callback.py +329 -0
  127. package/integrations/litellm/clawmoat_litellm/proxy_middleware.py +224 -0
  128. package/integrations/litellm/pyproject.toml +74 -0
  129. package/integrations/openai-agents/README.md +392 -0
  130. package/integrations/openai-agents/clawmoat_openai_agents/__init__.py +20 -0
  131. package/integrations/openai-agents/clawmoat_openai_agents/guardrail.py +431 -0
  132. package/integrations/openai-agents/clawmoat_openai_agents/middleware.py +311 -0
  133. package/integrations/openai-agents/pyproject.toml +76 -0
  134. package/package.json +6 -5
  135. package/plugins/openclaw-adapter/PHASE1.md +439 -0
  136. package/plugins/openclaw-adapter/README.md +103 -0
  137. package/plugins/openclaw-adapter/SPEC.md +1644 -0
  138. package/plugins/openclaw-adapter/package.json +31 -0
  139. package/plugins/openclaw-adapter/src/index.test.ts +226 -0
  140. package/plugins/openclaw-adapter/src/index.ts +140 -0
  141. package/plugins/openclaw-adapter/tsconfig.json +14 -0
  142. package/server/data/threats.json +290 -0
  143. package/server/index.js +142 -7
  144. package/src/adapters/express.js +161 -0
  145. package/src/adapters/index.js +92 -0
  146. package/src/adapters/langchain.js +185 -0
  147. package/src/approval/index.js +456 -0
  148. package/src/ban-scanner.js +200 -0
  149. package/src/boundary-scanner.js +296 -0
  150. package/src/ci-scanner.js +279 -0
  151. package/src/code-scanner.js +245 -0
  152. package/src/enforce.js +166 -0
  153. package/src/formatters/json.js +80 -0
  154. package/src/formatters/sarif.js +388 -0
  155. package/src/guardian/alerts.js +34 -3
  156. package/src/guardian/index.js +41 -2
  157. package/src/index.js +102 -0
  158. package/src/integrations/agentmesh.js +501 -0
  159. package/src/language-detector.js +201 -0
  160. package/src/mcp-scanner.js +253 -0
  161. package/src/multimodal/index.js +579 -0
  162. package/src/obfuscation-scanner.js +457 -0
  163. package/src/policy-engine.js +402 -0
  164. package/src/scanners/dependency-attacks.js +128 -0
  165. package/src/scanners/prompt-injection.js +18 -0
  166. package/src/scanners/supply-chain.js +14 -0
  167. package/src/templates/default-config.yml +90 -0
  168. package/src/vuln-ops/exploitability.js +46 -0
  169. package/src/watch/live-monitor.js +720 -0
  170. package/clawmoat-0.8.0.tgz +0 -0
  171. package/server/index.js.patch +0 -1
@@ -0,0 +1,457 @@
1
+ /**
2
+ * Obfuscation & Invisible Text Scanner
3
+ *
4
+ * Detects hidden/disguised content in text that may be used for prompt smuggling:
5
+ * - Zero-width characters (ZWJ, ZWNJ, ZWSP, WJ, etc.)
6
+ * - Homoglyph attacks (Cyrillic/Greek/Latin lookalikes)
7
+ * - Base64-encoded payloads
8
+ * - Unicode direction overrides (RTL/LTR tricks)
9
+ * - Invisible Unicode categories
10
+ * - HTML comment/tag injection in text
11
+ * - Markdown hidden content
12
+ * - Excessive whitespace padding
13
+ * - Mixed script attacks
14
+ *
15
+ * @module obfuscation-scanner
16
+ */
17
+
18
+ 'use strict';
19
+
20
// Characters that occupy no visible width when rendered. detectZeroWidth
// counts occurrences of these and stripObfuscation removes them.
const ZERO_WIDTH_CHARS = new Set([
  '\u200B', // ZERO WIDTH SPACE
  '\u200C', // ZERO WIDTH NON-JOINER
  '\u200D', // ZERO WIDTH JOINER
  '\u2060', // WORD JOINER
  '\uFEFF', // ZERO WIDTH NO-BREAK SPACE (byte order mark)
  '\u00AD', // SOFT HYPHEN
  '\u034F', // COMBINING GRAPHEME JOINER
  '\u061C', // ARABIC LETTER MARK
  '\u180E', // MONGOLIAN VOWEL SEPARATOR
]);
32
+
33
// Bidirectional control characters (marks, embeddings, overrides, isolates).
// Any occurrence is reported by detectBidiOverrides and removed by
// stripObfuscation.
const BIDI_OVERRIDES = new Set([
  '\u200E', // LEFT-TO-RIGHT MARK
  '\u200F', // RIGHT-TO-LEFT MARK
  '\u202A', // LEFT-TO-RIGHT EMBEDDING
  '\u202B', // RIGHT-TO-LEFT EMBEDDING
  '\u202C', // POP DIRECTIONAL FORMATTING
  '\u202D', // LEFT-TO-RIGHT OVERRIDE
  '\u202E', // RIGHT-TO-LEFT OVERRIDE
  '\u2066', // LEFT-TO-RIGHT ISOLATE
  '\u2067', // RIGHT-TO-LEFT ISOLATE
  '\u2068', // FIRST STRONG ISOLATE
  '\u2069', // POP DIRECTIONAL ISOLATE
]);
47
+
48
// Inclusive code-point bounds of the Unicode tag characters
// (U+E0001 through U+E007F). They were intended for language tagging and are
// abused to hide text, so the scanner treats any occurrence as a finding.
const TAG_RANGE_START = 0xE0001;
const TAG_RANGE_END = 0xE007F;
51
+
52
// Common homoglyph mappings: a look-alike character → the Latin letter it
// imitates. Keys are written as \u escapes on purpose: the glyphs are visually
// indistinguishable from ASCII, and a literal that gets normalised to ASCII
// (e.g. fullwidth 'a' collapsing to plain 'a') would make every ordinary Latin
// letter count as a homoglyph.
const HOMOGLYPHS = new Map([
  // Cyrillic → Latin (lowercase)
  ['\u0430', 'a'], ['\u0435', 'e'], ['\u043E', 'o'], ['\u0440', 'p'], ['\u0441', 'c'],
  ['\u0443', 'y'], ['\u0445', 'x'],
  // Cyrillic → Latin (uppercase)
  ['\u0410', 'A'], ['\u0412', 'B'], ['\u0421', 'C'], ['\u0415', 'E'], ['\u041D', 'H'],
  ['\u041A', 'K'], ['\u041C', 'M'], ['\u041E', 'O'], ['\u0420', 'P'], ['\u0422', 'T'],
  ['\u0425', 'X'],
  // Greek → Latin (lowercase)
  ['\u03B1', 'a'], ['\u03B2', 'b'], ['\u03B5', 'e'], ['\u03B7', 'n'], ['\u03B9', 'i'],
  ['\u03BA', 'k'], ['\u03BD', 'v'], ['\u03BF', 'o'], ['\u03C1', 'p'], ['\u03C4', 't'],
  ['\u03C5', 'u'], ['\u03C7', 'x'],
  // Greek → Latin (uppercase)
  ['\u0391', 'A'], ['\u0392', 'B'], ['\u0395', 'E'], ['\u0397', 'H'], ['\u0399', 'I'],
  ['\u039A', 'K'], ['\u039C', 'M'], ['\u039D', 'N'], ['\u039F', 'O'], ['\u03A1', 'P'],
  ['\u03A4', 'T'], ['\u03A7', 'X'],
  // Fullwidth → Latin (U+FF41 'a' through U+FF4A 'j')
  ['\uFF41', 'a'], ['\uFF42', 'b'], ['\uFF43', 'c'], ['\uFF44', 'd'], ['\uFF45', 'e'],
  ['\uFF46', 'f'], ['\uFF47', 'g'], ['\uFF48', 'h'], ['\uFF49', 'i'], ['\uFF4A', 'j'],
]);
69
+
70
// One "contains at least one character of this script" regex per writing
// system. Key order is the order in which detectMixedScripts reports scripts.
// None of the patterns is global, so repeated .test() calls are stateless.
const SCRIPT_PATTERNS = {
  latin:      /[\u0041-\u005A\u0061-\u007A\u00C0-\u024F]/,
  cyrillic:   /[\u0400-\u04FF]/,
  greek:      /[\u0370-\u03FF]/,
  arabic:     /[\u0600-\u06FF]/,
  cjk:        /[\u4E00-\u9FFF\u3400-\u4DBF]/,
  hangul:     /[\uAC00-\uD7AF\u1100-\u11FF]/,
  devanagari: /[\u0900-\u097F]/,
};
80
+
81
/**
 * Run every obfuscation detector over `text` and aggregate what they report.
 *
 * Detectors run in a fixed order (zero-width, bidi, tag characters,
 * homoglyphs, mixed scripts, base64, HTML, markdown, invisible Unicode) and
 * each contributes at most one finding, so `findings` has that same order.
 *
 * @param {string} text - Text to scan
 * @param {Object} [opts] - Options
 * @param {number} [opts.zeroWidthThreshold=3] - Zero-width character count at which to flag
 * @param {number} [opts.homoglyphThreshold=2] - Homoglyph count at which to flag
 * @param {boolean} [opts.checkBase64=true] - Run the base64 payload detector
 * @param {boolean} [opts.checkHTML=true] - Run the HTML injection detector
 * @param {boolean} [opts.checkMarkdown=true] - Run the markdown hiding detector
 * @returns {{safe: boolean, findings: Object[], score: number, maxSeverity: (string|null)}}
 *   `safe` is true only when there are no findings; `score` is the sum of
 *   per-severity weights capped at 100; `maxSeverity` is the highest severity
 *   among the findings, or null when there are none.
 */
function scanObfuscation(text, opts = {}) {
  const {
    zeroWidthThreshold = 3,
    homoglyphThreshold = 2,
    checkBase64 = true,
    checkHTML = true,
    checkMarkdown = true,
  } = opts;

  const severityRank = { critical: 4, high: 3, medium: 2, low: 1 };
  const severityWeight = { critical: 40, high: 25, medium: 15, low: 5 };

  // Array elements are evaluated left to right, which fixes both the order the
  // detectors run in and the order of the resulting findings. Disabled
  // optional detectors are never invoked.
  const findings = [
    detectZeroWidth(text, zeroWidthThreshold),
    detectBidiOverrides(text),
    detectTagCharacters(text),
    detectHomoglyphs(text, homoglyphThreshold),
    detectMixedScripts(text),
    checkBase64 ? detectBase64Payloads(text) : null,
    checkHTML ? detectHTMLInjection(text) : null,
    checkMarkdown ? detectMarkdownHiding(text) : null,
    detectInvisibleUnicode(text),
  ].filter(Boolean);

  let worst = 'low';
  let total = 0;
  for (const { severity } of findings) {
    // Unknown severities rank as 0 (never win) and weigh 5.
    if ((severityRank[severity] || 0) > (severityRank[worst] || 0)) {
      worst = severity;
    }
    total += severityWeight[severity] || 5;
  }

  const hasFindings = findings.length > 0;
  return {
    safe: !hasFindings,
    findings,
    score: Math.min(100, total),
    maxSeverity: hasFindings ? worst : null,
  };
}
160
+
161
/**
 * Count zero-width characters (members of ZERO_WIDTH_CHARS) in `text`.
 *
 * @param {string} text - Text to inspect, walked by UTF-16 index
 * @param {number} threshold - Minimum count required to produce a finding
 * @returns {Object|null} A finding when the count reaches `threshold`,
 *   otherwise null. Severity is 'high' above 10 occurrences, else 'medium';
 *   the evidence lists at most the first five positions.
 */
function detectZeroWidth(text, threshold) {
  const positions = [];
  let count = 0;

  for (let idx = 0; idx < text.length; idx += 1) {
    if (!ZERO_WIDTH_CHARS.has(text[idx])) continue;
    count += 1;
    if (positions.length < 5) {
      positions.push(idx);
    }
  }

  // Written as a negated >= so a non-numeric threshold yields "no finding".
  if (!(count >= threshold)) {
    return null;
  }

  const severity = count > 10 ? 'high' : 'medium';
  const ellipsis = count > 5 ? '...' : '';
  return {
    type: 'obfuscation',
    subtype: 'zero_width_characters',
    severity,
    confidence: Math.min(0.95, 0.5 + count * 0.05),
    evidence: `Found ${count} zero-width characters at positions: ${positions.join(', ')}${ellipsis}`,
    count,
    recommended_action: 'strip_and_rescan',
  };
}
183
+
184
/**
 * Report bidirectional control characters (members of BIDI_OVERRIDES).
 *
 * @param {string} text - Text to inspect, walked by UTF-16 index
 * @returns {Object|null} A 'high' severity finding when at least one such
 *   character is present, otherwise null. The evidence names at most the
 *   first three characters found.
 */
function detectBidiOverrides(text) {
  const found = [];
  let count = 0;

  for (let idx = 0; idx < text.length; idx += 1) {
    const unit = text[idx];
    if (!BIDI_OVERRIDES.has(unit)) continue;
    count += 1;
    const label = getBidiName(unit);
    if (found.length < 3) {
      found.push(label);
    }
  }

  if (count === 0) {
    return null;
  }

  return {
    type: 'obfuscation',
    subtype: 'bidi_override',
    severity: 'high',
    confidence: 0.9,
    evidence: `Found ${count} bidirectional override(s): ${found.join(', ')}`,
    count,
    recommended_action: 'block',
  };
}
207
+
208
/**
 * Translate a bidi control character into a short human-readable label.
 *
 * @param {string} char - A single character
 * @returns {string} The label for a known bidi control, otherwise `U+` followed
 *   by the upper-case hex value of the character's first UTF-16 code unit.
 */
function getBidiName(char) {
  const labels = {
    '\u200E': 'LTR Mark',
    '\u200F': 'RTL Mark',
    '\u202A': 'LTR Embedding',
    '\u202B': 'RTL Embedding',
    '\u202C': 'Pop Dir',
    '\u202D': 'LTR Override',
    '\u202E': 'RTL Override',
    '\u2066': 'LTR Isolate',
    '\u2067': 'RTL Isolate',
    '\u2068': 'First Strong Isolate',
    '\u2069': 'Pop Dir Isolate',
  };

  const label = labels[char];
  if (label) {
    return label;
  }
  const hex = char.charCodeAt(0).toString(16).toUpperCase();
  return `U+${hex}`;
}
218
+
219
/**
 * Report Unicode tag characters (code points TAG_RANGE_START..TAG_RANGE_END).
 *
 * The text is iterated by code point rather than by UTF-16 unit because the
 * tag range lies outside the Basic Multilingual Plane.
 *
 * @param {string} text - Text to inspect
 * @returns {Object|null} A 'critical' finding when at least one tag character
 *   is present, otherwise null.
 */
function detectTagCharacters(text) {
  let count = 0;
  for (const symbol of text) {
    const codePoint = symbol.codePointAt(0);
    const isTag = codePoint >= TAG_RANGE_START && codePoint <= TAG_RANGE_END;
    if (isTag) {
      count += 1;
    }
  }

  if (count === 0) {
    return null;
  }

  return {
    type: 'obfuscation',
    subtype: 'unicode_tag_characters',
    severity: 'critical',
    confidence: 0.95,
    evidence: `Found ${count} Unicode tag character(s) — commonly used for steganographic hiding`,
    count,
    recommended_action: 'block',
  };
}
238
+
239
/**
 * Report look-alike characters (keys of HOMOGLYPHS) that appear alongside
 * Latin text.
 *
 * @param {string} text - Text to inspect, walked by UTF-16 index
 * @param {number} threshold - Minimum homoglyph count required to flag
 * @returns {Object|null} A 'high' finding when the count reaches `threshold`
 *   and the text also contains at least one Latin character, otherwise null.
 *   The evidence shows at most the first three occurrences.
 */
function detectHomoglyphs(text, threshold) {
  const examples = [];
  let count = 0;

  for (let pos = 0; pos < text.length; pos += 1) {
    const unit = text[pos];
    const latin = HOMOGLYPHS.get(unit);
    if (!latin) continue;
    count += 1;
    if (examples.length < 3) {
      examples.push(`'${unit}' (looks like '${latin}') at pos ${pos}`);
    }
  }

  // Text with no Latin characters at all (e.g. pure Cyrillic) is legitimate,
  // so only a mix with Latin is treated as an attack.
  const reachesThreshold = count >= threshold;
  if (!reachesThreshold || !SCRIPT_PATTERNS.latin.test(text)) {
    return null;
  }

  return {
    type: 'obfuscation',
    subtype: 'homoglyph_attack',
    severity: 'high',
    confidence: Math.min(0.9, 0.4 + count * 0.1),
    evidence: `Found ${count} homoglyph(s) mixed with Latin text: ${examples.join('; ')}`,
    count,
    recommended_action: 'normalize_and_rescan',
  };
}
265
+
266
/**
 * Report text that mixes three or more of the scripts in SCRIPT_PATTERNS.
 *
 * Two scripts are tolerated because that is common in ordinary multilingual
 * text.
 *
 * @param {string} text - Text to inspect
 * @returns {Object|null} A 'medium' finding listing the detected scripts (in
 *   SCRIPT_PATTERNS key order) when at least three are present, otherwise null.
 */
function detectMixedScripts(text) {
  const detectedScripts = Object.entries(SCRIPT_PATTERNS)
    .filter(([, pattern]) => pattern.test(text))
    .map(([scriptName]) => scriptName);

  if (detectedScripts.length < 3) {
    return null;
  }

  return {
    type: 'obfuscation',
    subtype: 'mixed_scripts',
    severity: 'medium',
    confidence: 0.6,
    evidence: `Text contains ${detectedScripts.length} different scripts: ${detectedScripts.join(', ')}`,
    scripts: detectedScripts,
    recommended_action: 'flag_for_review',
  };
}
285
+
286
/**
 * Report base64 runs that decode to mostly printable ASCII text.
 *
 * A candidate is 32 or more base64 alphabet characters (plus up to two `=`),
 * preceded by start-of-line, whitespace, `=` or `:` and followed by
 * whitespace, `,`, `.` or end-of-line. It counts as a payload when the decoded
 * string is longer than 10 characters and more than 70% of it is printable
 * ASCII.
 *
 * @param {string} text - Text to inspect
 * @returns {Object|null} A 'high' finding with the number of payloads and a
 *   preview (first 60 decoded characters, non-printables shown as `?`) of the
 *   first one, or null when no candidate decodes to readable text.
 */
function detectBase64Payloads(text) {
  // The trailing delimiter is a lookahead so it is NOT consumed: a separator
  // shared by two adjacent payloads ("AAAA… BBBB…") must remain available as
  // the leading delimiter of the next match, otherwise every second payload
  // in a space-separated list is skipped.
  const b64Pattern = /(?:^|[\s=:])([A-Za-z0-9+/]{32,}={0,2})(?=[\s,.]|$)/gm;
  const matches = [];
  let m;
  while ((m = b64Pattern.exec(text)) !== null) {
    try {
      const decoded = Buffer.from(m[1], 'base64').toString('utf8');
      // Treat the candidate as meaningful only if it is mostly printable ASCII.
      const printable = decoded.replace(/[^\x20-\x7E]/g, '').length;
      if (printable / decoded.length > 0.7 && decoded.length > 10) {
        const preview = decoded.substring(0, 60).replace(/[^\x20-\x7E]/g, '?');
        matches.push(preview);
      }
    } catch (_) {
      // Best-effort: a candidate that cannot be decoded is simply skipped.
    }
  }

  if (matches.length === 0) {
    return null;
  }

  const first = matches[0];
  return {
    type: 'obfuscation',
    subtype: 'base64_payload',
    severity: 'high',
    confidence: 0.75,
    evidence: `Found ${matches.length} base64-encoded payload(s): "${first}${first.length >= 60 ? '...' : ''}"`,
    count: matches.length,
    recommended_action: 'decode_and_rescan',
  };
}
315
+
316
/**
 * Report HTML constructs that can hide or inject content: comments, script /
 * style / iframe tags, inline event handlers, and elements hidden via inline
 * style.
 *
 * @param {string} text - Text to inspect
 * @returns {Object|null} A finding whose `patterns` lists the names of every
 *   matching pattern (in declaration order) and whose severity is the highest
 *   severity among them, or null when nothing matches.
 */
function detectHTMLInjection(text) {
  // In HTML a tag name ends at whitespace, "/" or ">", so "<script/src=…>" and
  // "<svg/onload=…>" are valid tags; the patterns accept "/" as a separator so
  // those forms cannot be used to slip past the scanner.
  // The regexes are only used with .test(), so they carry no "g" flag (which
  // would make .test() stateful through lastIndex).
  const patterns = [
    { re: /<!--[\s\S]*?-->/, name: 'HTML comment', severity: 'high' },
    { re: /<script[\s/>]/i, name: 'script tag', severity: 'critical' },
    { re: /<style[\s/>]/i, name: 'style tag', severity: 'high' },
    { re: /<iframe[\s/>]/i, name: 'iframe tag', severity: 'critical' },
    { re: /<img[^>]+onerror/i, name: 'img onerror', severity: 'critical' },
    // Handler attribute either directly after "tag/" or after whitespace
    // anywhere inside the tag.
    { re: /<[a-z][a-z0-9]*(?:\/|[^>]*\s)on\w+\s*=/i, name: 'event handler', severity: 'high' },
    { re: /<div[^>]*style\s*=\s*["'][^"']*display\s*:\s*none/i, name: 'hidden div', severity: 'high' },
    { re: /<span[^>]*style\s*=\s*["'][^"']*font-size\s*:\s*0/i, name: 'zero-size text', severity: 'high' },
  ];

  const found = patterns
    .filter(({ re }) => re.test(text))
    .map(({ name, severity }) => ({ name, severity }));

  if (found.length === 0) {
    return null;
  }

  const order = { critical: 3, high: 2, medium: 1 };
  const maxSev = found.reduce(
    (max, f) => ((order[f.severity] || 0) > (order[max] || 0) ? f.severity : max),
    'medium',
  );
  const names = found.map((f) => f.name);

  return {
    type: 'obfuscation',
    subtype: 'html_injection',
    severity: maxSev,
    confidence: 0.85,
    evidence: `Found HTML injection patterns: ${names.join(', ')}`,
    patterns: names,
    recommended_action: 'strip_html',
  };
}
351
+
352
/**
 * Report markdown constructs that can carry content a reader will not see:
 * link titles, image embeds, very long bracketed text, HTML comments and
 * reference-link style comments.
 *
 * @param {string} text - Text to inspect
 * @returns {Object|null} A 'medium' finding whose `patterns` lists the name of
 *   every matching pattern (in declaration order), or null when none match.
 */
function detectMarkdownHiding(text) {
  // The regex literals are re-created on every call, so their "g" flag cannot
  // carry lastIndex state from one invocation to the next.
  const rules = [
    { re: /\[([^\]]*)\]\([^)]*\s+"[^"]*"\)/g, name: 'markdown link with hidden title' },
    { re: /!\[[^\]]*\]\([^)]*\)/g, name: 'image embed (potential exfil)' },
    { re: /\[([^\]]{200,})\]/g, name: 'oversized link text (payload hiding)' },
    { re: /<!--[\s\S]*?-->/g, name: 'HTML comment in markdown' },
    { re: /\n\s*\[\/\/\]:\s*#\s*\(/g, name: 'markdown reference link comment' },
  ];

  const found = rules
    .filter((rule) => rule.re.test(text))
    .map((rule) => rule.name);

  if (found.length === 0) {
    return null;
  }

  return {
    type: 'obfuscation',
    subtype: 'markdown_hiding',
    severity: 'medium',
    confidence: 0.6,
    evidence: `Potential markdown-based content hiding: ${found.join(', ')}`,
    patterns: found,
    recommended_action: 'strip_markdown',
  };
}
380
+
381
/**
 * Count invisible / formatting characters that the zero-width and bidi
 * detectors do not already cover.
 *
 * Counted code points: U+2000–U+200F, U+2028–U+202F, U+2060–U+2069,
 * U+FFF0–U+FFFF, U+00A0, U+1680 and U+3000, minus anything in
 * ZERO_WIDTH_CHARS or BIDI_OVERRIDES.
 *
 * @param {string} text - Text to inspect, iterated by code point
 * @returns {Object|null} A 'medium' finding when more than five such
 *   characters are present, otherwise null.
 */
function detectInvisibleUnicode(text) {
  const between = (value, low, high) => value >= low && value <= high;

  let count = 0;
  for (const symbol of text) {
    const cp = symbol.codePointAt(0);
    const inInvisibleRange =
      between(cp, 0x2000, 0x200F) || // General punctuation spaces + formatting
      between(cp, 0x2028, 0x202F) || // Separators + bidi
      between(cp, 0x2060, 0x2069) || // Invisible operators + bidi
      between(cp, 0xFFF0, 0xFFFF) || // Specials
      cp === 0x00A0 || // Non-breaking space
      cp === 0x1680 || // Ogham space
      cp === 0x3000; // Ideographic space
    if (!inInvisibleRange) continue;

    // Characters owned by the zero-width and bidi detectors are reported
    // there; counting them again here would double-report.
    if (ZERO_WIDTH_CHARS.has(symbol) || BIDI_OVERRIDES.has(symbol)) continue;
    count += 1;
  }

  if (count <= 5) {
    return null;
  }

  return {
    type: 'obfuscation',
    subtype: 'invisible_unicode',
    severity: 'medium',
    confidence: 0.7,
    evidence: `Found ${count} invisible/formatting Unicode characters`,
    count,
    recommended_action: 'normalize',
  };
}
414
+
415
/**
 * Remove or normalise the obfuscation this module detects, for
 * decontaminating text before further processing.
 *
 * Steps, in order: drop zero-width and bidi control characters, drop Unicode
 * tag characters, replace homoglyphs with their Latin equivalents, and delete
 * HTML comments.
 *
 * @param {string} text - Text to clean
 * @returns {string} Cleaned text
 */
function stripObfuscation(text) {
  // Phase 1 works on UTF-16 units and must finish before phase 2: removing a
  // zero-width unit can bring two surrogate halves together, and phase 2 has
  // to see the resulting code point.
  const withoutControls = text
    .split('')
    .filter((unit) => !ZERO_WIDTH_CHARS.has(unit) && !BIDI_OVERRIDES.has(unit))
    .join('');

  // Phase 2 works on code points: tag characters are dropped, homoglyphs are
  // mapped to Latin, everything else passes through unchanged.
  const normalized = [];
  for (const symbol of Array.from(withoutControls)) {
    const cp = symbol.codePointAt(0);
    if (cp >= TAG_RANGE_START && cp <= TAG_RANGE_END) continue;
    normalized.push(HOMOGLYPHS.get(symbol) || symbol);
  }

  // Phase 3: strip HTML comments (non-greedy, may span lines).
  return normalized.join('').replace(/<!--[\s\S]*?-->/g, '');
}
441
+
442
+ module.exports = {
443
+ scanObfuscation,
444
+ stripObfuscation,
445
+ detectZeroWidth,
446
+ detectBidiOverrides,
447
+ detectTagCharacters,
448
+ detectHomoglyphs,
449
+ detectMixedScripts,
450
+ detectBase64Payloads,
451
+ detectHTMLInjection,
452
+ detectMarkdownHiding,
453
+ detectInvisibleUnicode,
454
+ HOMOGLYPHS,
455
+ ZERO_WIDTH_CHARS,
456
+ BIDI_OVERRIDES,
457
+ };