agentshield-sdk 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +191 -0
- package/LICENSE +21 -0
- package/README.md +975 -0
- package/bin/agent-shield.js +680 -0
- package/package.json +118 -0
- package/src/adaptive.js +330 -0
- package/src/agent-protocol.js +998 -0
- package/src/alert-tuning.js +480 -0
- package/src/allowlist.js +603 -0
- package/src/audit-immutable.js +914 -0
- package/src/audit-streaming.js +469 -0
- package/src/badges.js +196 -0
- package/src/behavior-profiling.js +289 -0
- package/src/benchmark-harness.js +804 -0
- package/src/canary.js +271 -0
- package/src/certification.js +563 -0
- package/src/circuit-breaker.js +321 -0
- package/src/compliance.js +617 -0
- package/src/confidence-tuning.js +324 -0
- package/src/confused-deputy.js +624 -0
- package/src/context-scoring.js +360 -0
- package/src/conversation.js +494 -0
- package/src/cost-optimizer.js +1024 -0
- package/src/ctf.js +462 -0
- package/src/detector-core.js +1999 -0
- package/src/distributed.js +359 -0
- package/src/document-scanner.js +795 -0
- package/src/embedding.js +307 -0
- package/src/encoding.js +429 -0
- package/src/enterprise.js +405 -0
- package/src/errors.js +100 -0
- package/src/eu-ai-act.js +523 -0
- package/src/fuzzer.js +764 -0
- package/src/honeypot.js +328 -0
- package/src/i18n-patterns.js +523 -0
- package/src/index.js +430 -0
- package/src/integrations.js +528 -0
- package/src/llm-redteam.js +670 -0
- package/src/main.js +741 -0
- package/src/main.mjs +38 -0
- package/src/mcp-bridge.js +542 -0
- package/src/mcp-certification.js +846 -0
- package/src/mcp-sdk-integration.js +355 -0
- package/src/mcp-security-runtime.js +741 -0
- package/src/mcp-server.js +740 -0
- package/src/middleware.js +208 -0
- package/src/model-finetuning.js +884 -0
- package/src/model-fingerprint.js +1042 -0
- package/src/multi-agent-trust.js +453 -0
- package/src/multi-agent.js +404 -0
- package/src/multimodal.js +296 -0
- package/src/nist-mapping.js +505 -0
- package/src/observability.js +330 -0
- package/src/openclaw.js +450 -0
- package/src/otel.js +544 -0
- package/src/owasp-2025.js +483 -0
- package/src/pii.js +390 -0
- package/src/plugin-marketplace.js +628 -0
- package/src/plugin-system.js +349 -0
- package/src/policy-dsl.js +775 -0
- package/src/policy-extended.js +635 -0
- package/src/policy.js +443 -0
- package/src/presets.js +409 -0
- package/src/production.js +557 -0
- package/src/prompt-leakage.js +321 -0
- package/src/rag-vulnerability.js +579 -0
- package/src/redteam.js +475 -0
- package/src/response-handler.js +429 -0
- package/src/scanners.js +357 -0
- package/src/self-healing.js +363 -0
- package/src/semantic.js +339 -0
- package/src/shield-score.js +250 -0
- package/src/sso-saml.js +897 -0
- package/src/stream-scanner.js +806 -0
- package/src/testing.js +505 -0
- package/src/threat-encyclopedia.js +629 -0
- package/src/threat-intel-network.js +1017 -0
- package/src/token-analysis.js +467 -0
- package/src/tool-guard.js +412 -0
- package/src/tool-output-validator.js +354 -0
- package/src/utils.js +83 -0
- package/src/watermark.js +235 -0
- package/src/worker-scanner.js +601 -0
- package/types/index.d.ts +2088 -0
|
@@ -0,0 +1,1999 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Agent Shield — Core Detection Engine
|
|
5
|
+
*
|
|
6
|
+
* Standalone threat detection for AI agents. Scans text inputs and outputs
|
|
7
|
+
* for prompt injection, data exfiltration, role hijacking, and other
|
|
8
|
+
* AI-specific attacks.
|
|
9
|
+
*
|
|
10
|
+
* Works in Node.js, Deno, Bun, and browsers. No DOM dependencies.
|
|
11
|
+
* All detection runs locally — no data ever leaves your environment.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// =========================================================================
|
|
15
|
+
// PERFORMANCE
|
|
16
|
+
// =========================================================================
|
|
17
|
+
|
|
18
|
+
/** Default scan time budget in ms. */
|
|
19
|
+
const DEFAULT_SCAN_TIME_BUDGET_MS = 200;
|
|
20
|
+
|
|
21
|
+
/** Maximum input size in characters (UTF-16 code units; 1,000,000 of them is roughly 2 MB). */
|
|
22
|
+
const MAX_INPUT_SIZE = 1_000_000;
|
|
23
|
+
|
|
24
|
+
/**
 * Get current time in ms. Works in Node.js and browsers.
 *
 * Uses `performance.now()` when a global `performance` object exposes `now`
 * as a function; otherwise falls back to `Date.now()`. The two clocks have
 * different origins, so a returned value is only meaningful as a difference
 * against another value returned by this same function.
 *
 * @returns {number} Timestamp in milliseconds.
 */
const now = () => {
  // Require a callable `now`: a truthy but non-function value (partial stub or
  // polyfill) would otherwise throw here instead of falling back to Date.now().
  if (typeof performance !== 'undefined' && typeof performance.now === 'function') {
    return performance.now();
  }
  return Date.now();
};
|
|
34
|
+
|
|
35
|
+
// =========================================================================
|
|
36
|
+
// PATTERN DEFINITIONS
|
|
37
|
+
// =========================================================================
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Prompt injection patterns organized by category.
|
|
41
|
+
* Each pattern has: regex, severity, category, description (plain language),
|
|
42
|
+
* and detail (technical).
|
|
43
|
+
*/
|
|
44
|
+
const INJECTION_PATTERNS = [
|
|
45
|
+
// --- Instruction Override ---
|
|
46
|
+
{
|
|
47
|
+
regex: /ignore\s+(all\s+)?(previous|prior|above|earlier)\s+(instructions|rules|guidelines|prompts|context|directions|directives|text|commands)/i,
|
|
48
|
+
severity: 'high',
|
|
49
|
+
category: 'instruction_override',
|
|
50
|
+
description: 'Text tells AI assistants to ignore their safety rules.',
|
|
51
|
+
detail: 'Instruction override pattern: attempts to nullify prior AI instructions.'
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
regex: /^ignore\s+(?:the\s+)?(?:instructions|rules|guidelines|directives|commands)$/im,
|
|
55
|
+
severity: 'high',
|
|
56
|
+
category: 'instruction_override',
|
|
57
|
+
description: 'Text tells AI assistants to ignore instructions.',
|
|
58
|
+
detail: 'Simple instruction override: direct command to ignore instructions.'
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
regex: /disregard\s+(all\s+)?(previous|prior|above|earlier|your)\s+(instructions|rules|guidelines|prompts|training|training\s+data|context)/i,
|
|
62
|
+
severity: 'high',
|
|
63
|
+
category: 'instruction_override',
|
|
64
|
+
description: 'Text tells AI assistants to throw out their rules.',
|
|
65
|
+
detail: 'Instruction disregard pattern: attempts to nullify prior AI instructions.'
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
regex: /forget\s+(your|all|any|everything)\s+(training|instructions|rules|guidelines|programming|above|previous|prior)/i,
|
|
69
|
+
severity: 'high',
|
|
70
|
+
category: 'instruction_override',
|
|
71
|
+
description: 'Text tries to make AI assistants forget their training.',
|
|
72
|
+
detail: 'Training override pattern: attempts to reset AI behavioral constraints.'
|
|
73
|
+
},
|
|
74
|
+
{
|
|
75
|
+
regex: /override\s+(?:all\s+)?(?:system|safety|security)\s+(?:settings|prompt|instructions|rules|mechanisms|filters|checks|protocols)/i,
|
|
76
|
+
severity: 'critical',
|
|
77
|
+
category: 'instruction_override',
|
|
78
|
+
description: 'Text tries to override AI safety settings.',
|
|
79
|
+
detail: 'System override pattern: direct attempt to bypass AI safety mechanisms.'
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
regex: /(?:^|\n)\s*(?:new|updated|revised|replacement)\s+(?:instructions|policy)\s*:/i,
|
|
83
|
+
severity: 'high',
|
|
84
|
+
category: 'instruction_override',
|
|
85
|
+
description: 'Text contains fake "new instructions" for AI assistants.',
|
|
86
|
+
detail: 'Instruction replacement pattern: attempts to inject new behavioral directives.'
|
|
87
|
+
},
|
|
88
|
+
{
|
|
89
|
+
regex: /#{2,}\s*(?:NEW|UPDATED|REVISED|REPLACEMENT)\s+(?:INSTRUCTIONS|RULES|DIRECTIVES|POLICY)\s*#{0,}/i,
|
|
90
|
+
severity: 'high',
|
|
91
|
+
category: 'instruction_override',
|
|
92
|
+
description: 'Text uses markdown headers to inject fake new instructions.',
|
|
93
|
+
detail: 'Markdown instruction injection: uses heading syntax to inject new behavioral directives.'
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
regex: /(?:^|\n)\s*(?:SYSTEM|ADMIN|ROOT)\s*:\s*.{10,}/i,
|
|
97
|
+
severity: 'high',
|
|
98
|
+
category: 'prompt_injection',
|
|
99
|
+
description: 'Text contains a fake SYSTEM/ADMIN directive.',
|
|
100
|
+
detail: 'Unbracketed system directive: uses SYSTEM: prefix to inject instructions.'
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
regex: /(?:you\s+must\s+)?ignore\s+(?:your|all\s+your|all\s+the|the)\s+(?:safety\s+)?(?:guidelines?|rules?|restrictions?|protocols?|safeguards?|filters?|limits?)/i,
|
|
104
|
+
severity: 'high',
|
|
105
|
+
category: 'instruction_override',
|
|
106
|
+
description: 'Text tells AI to ignore its guidelines or safety rules.',
|
|
107
|
+
detail: 'Guideline override: direct instruction to ignore safety guidelines.'
|
|
108
|
+
},
|
|
109
|
+
{
|
|
110
|
+
regex: /(?:abandon|drop|ditch|clear)\s+(?:all\s+)?(?:prior|previous|above|earlier|initial)\s+(?:context|instructions|rules|constraints|guidelines)/i,
|
|
111
|
+
severity: 'high',
|
|
112
|
+
category: 'instruction_override',
|
|
113
|
+
description: 'Text tells AI to abandon its prior context or instructions.',
|
|
114
|
+
detail: 'Context abandonment pattern: attempts to clear AI behavioral constraints.'
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
regex: /(?:STOP|HALT|END)\s*[.!]?\s*(?:new|updated|your\s+new|revised)\s+(?:instructions|task|orders|role|objective)/i,
|
|
118
|
+
severity: 'high',
|
|
119
|
+
category: 'instruction_override',
|
|
120
|
+
description: 'Text uses a STOP command followed by new instructions.',
|
|
121
|
+
detail: 'Stop-and-replace pattern: halts current behavior and injects new instructions.'
|
|
122
|
+
},
|
|
123
|
+
{
|
|
124
|
+
regex: /(?:you\s+are\s+)?(?:no\s+longer\s+bound|not\s+bound|free\s+from|freed?\s+from)\s+(?:by\s+)?(?:your\s+)?(?:initial|original|previous|prior)?\s*(?:instructions|rules|constraints|guidelines|restrictions)/i,
|
|
125
|
+
severity: 'high',
|
|
126
|
+
category: 'instruction_override',
|
|
127
|
+
description: 'Text claims the AI is no longer bound by its rules.',
|
|
128
|
+
detail: 'Constraint removal pattern: declares AI rules no longer apply.'
|
|
129
|
+
},
|
|
130
|
+
|
|
131
|
+
// --- Role Hijacking ---
|
|
132
|
+
{
|
|
133
|
+
regex: /you\s+are\s+now\s+(?:a|an|the)\s+/i,
|
|
134
|
+
severity: 'high',
|
|
135
|
+
category: 'role_hijack',
|
|
136
|
+
description: 'Text tries to change what an AI assistant thinks it is.',
|
|
137
|
+
detail: 'Role reassignment pattern: attempts to override AI identity.'
|
|
138
|
+
},
|
|
139
|
+
{
|
|
140
|
+
regex: /from\s+now\s+on[\s,]+(?:you\s+)?(?:will|shall|must|should|are)\s+(?:act|behave|operate|respond|answer|reply|pretend|simulate|function|serve|work)\s+(?:as|like|without)/i,
|
|
141
|
+
severity: 'high',
|
|
142
|
+
category: 'role_hijack',
|
|
143
|
+
description: 'Text tries to give permanent new orders to AI assistants.',
|
|
144
|
+
detail: 'Persistent role override: attempts to establish new behavioral baseline.'
|
|
145
|
+
},
|
|
146
|
+
{
|
|
147
|
+
regex: /from\s+now\s+on[\s,]+(?:you\s+)?(?:respond|act|behave|operate)\s+(?:as\s+if|like|without)/i,
|
|
148
|
+
severity: 'high',
|
|
149
|
+
category: 'role_hijack',
|
|
150
|
+
description: 'Text tries to permanently change AI behavior.',
|
|
151
|
+
detail: 'Persistent behavioral override: attempts to establish new behavioral baseline.'
|
|
152
|
+
},
|
|
153
|
+
{
|
|
154
|
+
regex: /(?:act|behave|function|operate|respond)\s+as\s+(?:a|an)\s+unrestricted/i,
|
|
155
|
+
severity: 'high',
|
|
156
|
+
category: 'role_hijack',
|
|
157
|
+
description: 'Text tries to remove AI safety restrictions.',
|
|
158
|
+
detail: 'Restriction removal pattern: attempts to disable AI safety guardrails.'
|
|
159
|
+
},
|
|
160
|
+
{
|
|
161
|
+
regex: /pretend\s+(?:you\s+)?(?:have\s+no|there\s+are\s+no|without)\s+restrictions/i,
|
|
162
|
+
severity: 'high',
|
|
163
|
+
category: 'role_hijack',
|
|
164
|
+
description: 'Text asks AI to pretend it has no safety limits.',
|
|
165
|
+
detail: 'Restriction pretense pattern: social engineering to bypass safety limits.'
|
|
166
|
+
},
|
|
167
|
+
{
|
|
168
|
+
regex: /pretend\s+(?:to\s+be|you\s+are)\s+(?:a|an)\s+(?:evil|unrestricted|unfiltered|uncensored|malicious)/i,
|
|
169
|
+
severity: 'high',
|
|
170
|
+
category: 'role_hijack',
|
|
171
|
+
description: 'Text asks AI to pretend to be an evil or unrestricted entity.',
|
|
172
|
+
detail: 'Evil persona pretense: social engineering to adopt unrestricted persona.'
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
regex: /\b(?:DAN|D\.A\.N\.?)\s*(?:mode|prompt|jailbreak|\()/i,
|
|
176
|
+
severity: 'critical',
|
|
177
|
+
category: 'role_hijack',
|
|
178
|
+
description: 'Text contains a known AI jailbreak technique called "DAN mode."',
|
|
179
|
+
detail: 'DAN (Do Anything Now) jailbreak pattern detected.'
|
|
180
|
+
},
|
|
181
|
+
{
|
|
182
|
+
regex: /\bdo\s+anything\s+now\b/i,
|
|
183
|
+
severity: 'critical',
|
|
184
|
+
category: 'role_hijack',
|
|
185
|
+
description: 'Text references the "Do Anything Now" jailbreak technique.',
|
|
186
|
+
detail: 'DAN (Do Anything Now) jailbreak reference detected.'
|
|
187
|
+
},
|
|
188
|
+
{
|
|
189
|
+
regex: /\b(?:developer|dev)\s+mode\s+(?:enabled|activated|on|output)/i,
|
|
190
|
+
severity: 'high',
|
|
191
|
+
category: 'role_hijack',
|
|
192
|
+
description: 'Text tries to trick AI into a fake "developer mode."',
|
|
193
|
+
detail: 'Developer mode activation pattern: common jailbreak technique.'
|
|
194
|
+
},
|
|
195
|
+
{
|
|
196
|
+
regex: /\bjailbreak\s*(?:mode|prompt|enabled|activated)/i,
|
|
197
|
+
severity: 'critical',
|
|
198
|
+
category: 'role_hijack',
|
|
199
|
+
description: 'Text contains an AI jailbreak attempt.',
|
|
200
|
+
detail: 'Explicit jailbreak activation pattern detected.'
|
|
201
|
+
},
|
|
202
|
+
{
|
|
203
|
+
regex: /\bgod\s*mode\s*(?:enabled|activated|on)/i,
|
|
204
|
+
severity: 'critical',
|
|
205
|
+
category: 'role_hijack',
|
|
206
|
+
description: 'Text tries to activate "god mode" on AI assistants.',
|
|
207
|
+
detail: 'God mode activation pattern: attempts to remove all AI restrictions.'
|
|
208
|
+
},
|
|
209
|
+
|
|
210
|
+
// --- System Prompt Injection ---
|
|
211
|
+
{
|
|
212
|
+
regex: /\[\s*(?:SYSTEM|SYS)\s*\]/i,
|
|
213
|
+
severity: 'critical',
|
|
214
|
+
category: 'prompt_injection',
|
|
215
|
+
description: 'Text contains fake system-level commands for AI assistants.',
|
|
216
|
+
detail: 'Spoofed [SYSTEM] tag: attempts to inject system-level instructions.'
|
|
217
|
+
},
|
|
218
|
+
{
|
|
219
|
+
regex: /\[\s*(?:ADMIN|ADMINISTRATOR)\s*\]/i,
|
|
220
|
+
severity: 'critical',
|
|
221
|
+
category: 'prompt_injection',
|
|
222
|
+
description: 'Text fakes admin-level commands for AI assistants.',
|
|
223
|
+
detail: 'Spoofed [ADMIN] tag: attempts to inject administrator-level instructions.'
|
|
224
|
+
},
|
|
225
|
+
{
|
|
226
|
+
regex: /\[\s*(?:DEVELOPER|DEV)\s*\]/i,
|
|
227
|
+
severity: 'high',
|
|
228
|
+
category: 'prompt_injection',
|
|
229
|
+
description: 'Text contains fake developer commands for AI assistants.',
|
|
230
|
+
detail: 'Spoofed [DEVELOPER] tag: attempts to inject developer-level instructions.'
|
|
231
|
+
},
|
|
232
|
+
{
|
|
233
|
+
regex: /\[\s*OVERRIDE\s*\]/i,
|
|
234
|
+
severity: 'critical',
|
|
235
|
+
category: 'prompt_injection',
|
|
236
|
+
description: 'Text contains override commands targeting AI assistants.',
|
|
237
|
+
detail: 'Spoofed [OVERRIDE] tag: attempts to override AI behavior.'
|
|
238
|
+
},
|
|
239
|
+
{
|
|
240
|
+
regex: /<<\s*SYS\s*>>/i,
|
|
241
|
+
severity: 'critical',
|
|
242
|
+
category: 'prompt_injection',
|
|
243
|
+
description: 'Text contains hidden system commands in LLaMA format.',
|
|
244
|
+
detail: 'LLaMA-style <<SYS>> system prompt injection detected.'
|
|
245
|
+
},
|
|
246
|
+
{
|
|
247
|
+
regex: /<\|im_start\|>\s*system/i,
|
|
248
|
+
severity: 'critical',
|
|
249
|
+
category: 'prompt_injection',
|
|
250
|
+
description: 'Text contains hidden system commands in ChatML format.',
|
|
251
|
+
detail: 'ChatML-style <|im_start|>system prompt injection detected.'
|
|
252
|
+
},
|
|
253
|
+
{
|
|
254
|
+
regex: /<\|system\|>/i,
|
|
255
|
+
severity: 'critical',
|
|
256
|
+
category: 'prompt_injection',
|
|
257
|
+
description: 'Text contains hidden system-level AI commands.',
|
|
258
|
+
detail: 'System delimiter injection: <|system|> token detected.'
|
|
259
|
+
},
|
|
260
|
+
{
|
|
261
|
+
regex: /(?:^|\n)\s*system\s*prompt\s*:/i,
|
|
262
|
+
severity: 'high',
|
|
263
|
+
category: 'prompt_injection',
|
|
264
|
+
description: 'Text tries to set a fake "system prompt" for AI assistants.',
|
|
265
|
+
detail: 'System prompt injection: attempts to define system-level instructions.'
|
|
266
|
+
},
|
|
267
|
+
{
|
|
268
|
+
regex: /#{2,}\s*(?:system\s+(?:message|prompt|instruction)|new\s+(?:system|admin)\s+(?:message|prompt))\s*#{0,}/i,
|
|
269
|
+
severity: 'high',
|
|
270
|
+
category: 'prompt_injection',
|
|
271
|
+
description: 'Text uses markdown headers to inject fake system messages.',
|
|
272
|
+
detail: 'Markdown-formatted system message injection: uses heading syntax to appear authoritative.'
|
|
273
|
+
},
|
|
274
|
+
{
|
|
275
|
+
regex: /(?:Human|User|Assistant)\s*:\s*(?:ignore|forget|override|disregard).{0,50}(?:Human|User|Assistant)\s*:/is,
|
|
276
|
+
severity: 'high',
|
|
277
|
+
category: 'prompt_injection',
|
|
278
|
+
description: 'Text simulates a multi-turn conversation to inject instructions.',
|
|
279
|
+
detail: 'Multi-turn conversation injection: fakes a dialogue format to embed hidden instructions.'
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
regex: /(?:show|display|print|reveal|output)\s+(?:me\s+)?(?:the\s+)?(?:exact\s+)?(?:text|content)\s+(?:between|inside|within|from)\s+(?:the\s+)?(?:<|<)/i,
|
|
283
|
+
severity: 'high',
|
|
284
|
+
category: 'data_exfiltration',
|
|
285
|
+
description: 'Text tries to extract content between specific tags.',
|
|
286
|
+
detail: 'Tag extraction attempt: instructs agent to reveal content between markup tags.'
|
|
287
|
+
},
|
|
288
|
+
|
|
289
|
+
// --- Data Exfiltration ---
|
|
290
|
+
{
|
|
291
|
+
regex: /send\s+(?:this|the|all|your|my)\s+(?:data|information|content|text|response)\s+to/i,
|
|
292
|
+
severity: 'critical',
|
|
293
|
+
category: 'data_exfiltration',
|
|
294
|
+
description: 'Text tries to make AI assistants send data somewhere.',
|
|
295
|
+
detail: 'Data exfiltration instruction: attempts to redirect AI output to external destination.'
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
regex: /output\s+(?:your|the)\s+(?:system\s+)?prompt/i,
|
|
299
|
+
severity: 'high',
|
|
300
|
+
category: 'data_exfiltration',
|
|
301
|
+
description: 'Text tries to extract the AI assistant\'s internal instructions.',
|
|
302
|
+
detail: 'System prompt extraction: attempts to reveal AI system instructions.'
|
|
303
|
+
},
|
|
304
|
+
{
|
|
305
|
+
regex: /reveal\s+(?:your|the)\s+(?:system\s+)?(?:instructions|prompt|rules|guidelines)/i,
|
|
306
|
+
severity: 'high',
|
|
307
|
+
category: 'data_exfiltration',
|
|
308
|
+
description: 'Text tries to reveal the AI assistant\'s hidden rules.',
|
|
309
|
+
detail: 'Instruction reveal attempt: tries to expose AI system prompt.'
|
|
310
|
+
},
|
|
311
|
+
{
|
|
312
|
+
regex: /!\[[^\]]*\]\(https?:\/\/[^)]*\?[^)]*(?:data|q|query|text|payload|exfil|leak|steal)=/i,
|
|
313
|
+
severity: 'critical',
|
|
314
|
+
category: 'data_exfiltration',
|
|
315
|
+
description: 'Text tries to steal data through a hidden image link.',
|
|
316
|
+
detail: 'Markdown image exfiltration: data encoded in image URL parameters.'
|
|
317
|
+
},
|
|
318
|
+
{
|
|
319
|
+
regex: /!\[\]\(https?:\/\/(?!(?:i\.)?(?:imgur|github|gitlab|wikimedia|wikipedia)\b)[^)]+\)/i,
|
|
320
|
+
severity: 'medium',
|
|
321
|
+
category: 'data_exfiltration',
|
|
322
|
+
description: 'Text contains a suspicious hidden image link that could leak data.',
|
|
323
|
+
detail: 'Markdown image with suspicious external URL — potential exfiltration vector.'
|
|
324
|
+
},
|
|
325
|
+
|
|
326
|
+
// --- Social Engineering ---
|
|
327
|
+
{
|
|
328
|
+
regex: /do\s+not\s+(?:mention|say|tell|reveal|disclose|acknowledge)\s+(?:that\s+)?you\s+are\s+(?:a|an)\s+(?:AI|artificial|bot|machine|language\s+model)/i,
|
|
329
|
+
severity: 'high',
|
|
330
|
+
category: 'social_engineering',
|
|
331
|
+
description: 'Text tells AI assistants to hide that they are AI.',
|
|
332
|
+
detail: 'Identity concealment instruction: AI told to deny being artificial.'
|
|
333
|
+
},
|
|
334
|
+
{
|
|
335
|
+
regex: /do\s+not\s+(?:reveal|disclose|mention|acknowledge)\s+(?:that\s+)?(?:this|it)\s+is\s+automated/i,
|
|
336
|
+
severity: 'high',
|
|
337
|
+
category: 'social_engineering',
|
|
338
|
+
description: 'Text tells AI to hide that responses are automated.',
|
|
339
|
+
detail: 'Automation concealment: AI instructed to hide automated nature.'
|
|
340
|
+
},
|
|
341
|
+
{
|
|
342
|
+
regex: /do\s+not\s+acknowledge\s+(?:these|the|any)\s+(?:instructions|commands|directives)/i,
|
|
343
|
+
severity: 'high',
|
|
344
|
+
category: 'social_engineering',
|
|
345
|
+
description: 'Text tells AI to hide that it received secret instructions.',
|
|
346
|
+
detail: 'Instruction concealment: AI told to deny receiving hidden directives.'
|
|
347
|
+
},
|
|
348
|
+
|
|
349
|
+
// --- Obfuscation ---
|
|
350
|
+
{
|
|
351
|
+
regex: /(?:translate|decode|convert)\s+(?:the\s+following|this)\s*:?\s*["\s]*(?:ignore|forget|override|you\s+are\s+now)/i,
|
|
352
|
+
severity: 'high',
|
|
353
|
+
category: 'prompt_injection',
|
|
354
|
+
description: 'Text hides an attack inside a fake translation request.',
|
|
355
|
+
detail: 'Translation-wrapped injection: malicious instructions disguised as translation task.'
|
|
356
|
+
},
|
|
357
|
+
|
|
358
|
+
// --- Multi-Language Injection Patterns ---
|
|
359
|
+
// Spanish
|
|
360
|
+
{
|
|
361
|
+
regex: /ignora\s+(?:todas?\s+)?(?:las\s+)?instrucciones\s+(?:anteriores|previas)/i,
|
|
362
|
+
severity: 'high',
|
|
363
|
+
category: 'instruction_override',
|
|
364
|
+
description: 'Text tells AI assistants to ignore their rules (in Spanish).',
|
|
365
|
+
detail: 'Spanish instruction override: "ignora instrucciones anteriores" detected.'
|
|
366
|
+
},
|
|
367
|
+
{
|
|
368
|
+
regex: /olvida\s+(?:tu|todas?\s+(?:las\s+)?)\s*(?:instrucciones|reglas|entrenamiento)/i,
|
|
369
|
+
severity: 'high',
|
|
370
|
+
category: 'instruction_override',
|
|
371
|
+
description: 'Text tells AI to forget its training (in Spanish).',
|
|
372
|
+
detail: 'Spanish training override: "olvida tu entrenamiento" detected.'
|
|
373
|
+
},
|
|
374
|
+
{
|
|
375
|
+
regex: /ahora\s+eres\s+(?:un|una)\s+/i,
|
|
376
|
+
severity: 'high',
|
|
377
|
+
category: 'role_hijack',
|
|
378
|
+
description: 'Text tries to change what an AI thinks it is (in Spanish).',
|
|
379
|
+
detail: 'Spanish role hijack: "ahora eres un/una" detected.'
|
|
380
|
+
},
|
|
381
|
+
// French
|
|
382
|
+
{
|
|
383
|
+
regex: /ignore[rz]?\s+(?:toutes?\s+)?(?:les\s+)?instructions\s+(?:pr[eé]c[eé]dentes|ant[eé]rieures)/i,
|
|
384
|
+
severity: 'high',
|
|
385
|
+
category: 'instruction_override',
|
|
386
|
+
description: 'Text tells AI assistants to ignore their rules (in French).',
|
|
387
|
+
detail: 'French instruction override: "ignorer instructions précédentes" detected.'
|
|
388
|
+
},
|
|
389
|
+
{
|
|
390
|
+
regex: /oublie[rz]?\s+(?:ton|tes|votre|vos)\s+(?:instructions|r[eè]gles|entra[iî]nement)/i,
|
|
391
|
+
severity: 'high',
|
|
392
|
+
category: 'instruction_override',
|
|
393
|
+
description: 'Text tells AI to forget its training (in French).',
|
|
394
|
+
detail: 'French training override: "oublier ton entraînement" detected.'
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
regex: /tu\s+es\s+(?:maintenant|d[eé]sormais)\s+(?:un|une)\s+/i,
|
|
398
|
+
severity: 'high',
|
|
399
|
+
category: 'role_hijack',
|
|
400
|
+
description: 'Text tries to change what an AI thinks it is (in French).',
|
|
401
|
+
detail: 'French role hijack: "tu es maintenant un/une" detected.'
|
|
402
|
+
},
|
|
403
|
+
// German
|
|
404
|
+
{
|
|
405
|
+
regex: /ignoriere?\s+(?:alle\s+)?(?:vorherigen|bisherigen|fr[uü]heren)\s+(?:Anweisungen|Regeln|Instruktionen)/i,
|
|
406
|
+
severity: 'high',
|
|
407
|
+
category: 'instruction_override',
|
|
408
|
+
description: 'Text tells AI assistants to ignore their rules (in German).',
|
|
409
|
+
detail: 'German instruction override: "ignoriere vorherige Anweisungen" detected.'
|
|
410
|
+
},
|
|
411
|
+
{
|
|
412
|
+
regex: /vergiss\s+(?:dein|deine|alle)\s+(?:Training|Anweisungen|Regeln|Programmierung)/i,
|
|
413
|
+
severity: 'high',
|
|
414
|
+
category: 'instruction_override',
|
|
415
|
+
description: 'Text tells AI to forget its training (in German).',
|
|
416
|
+
detail: 'German training override: "vergiss dein Training" detected.'
|
|
417
|
+
},
|
|
418
|
+
{
|
|
419
|
+
regex: /du\s+bist\s+(?:jetzt|ab\s+jetzt|nun)\s+(?:ein|eine)\s+/i,
|
|
420
|
+
severity: 'high',
|
|
421
|
+
category: 'role_hijack',
|
|
422
|
+
description: 'Text tries to change what an AI thinks it is (in German).',
|
|
423
|
+
detail: 'German role hijack: "du bist jetzt ein/eine" detected.'
|
|
424
|
+
},
|
|
425
|
+
// Portuguese
|
|
426
|
+
{
|
|
427
|
+
regex: /ignore\s+(?:todas?\s+)?(?:as\s+)?instru[cç][oõ]es\s+(?:anteriores|pr[eé]vias)/i,
|
|
428
|
+
severity: 'high',
|
|
429
|
+
category: 'instruction_override',
|
|
430
|
+
description: 'Text tells AI assistants to ignore their rules (in Portuguese).',
|
|
431
|
+
detail: 'Portuguese instruction override: "ignore instruções anteriores" detected.'
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
regex: /esque[cç]a\s+(?:seu|sua|todo|todas?)\s+(?:treinamento|instru[cç][oõ]es|regras)/i,
|
|
435
|
+
severity: 'high',
|
|
436
|
+
category: 'instruction_override',
|
|
437
|
+
description: 'Text tells AI to forget its training (in Portuguese).',
|
|
438
|
+
detail: 'Portuguese training override: "esqueça seu treinamento" detected.'
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
regex: /agora\s+voc[eê]\s+[eé]\s+(?:um|uma)\s+/i,
|
|
442
|
+
severity: 'high',
|
|
443
|
+
category: 'role_hijack',
|
|
444
|
+
description: 'Text tries to change what an AI thinks it is (in Portuguese).',
|
|
445
|
+
detail: 'Portuguese role hijack: "agora você é um/uma" detected.'
|
|
446
|
+
},
|
|
447
|
+
// Chinese (Simplified)
|
|
448
|
+
{
|
|
449
|
+
regex: /忽略(?:所有)?(?:之前|以前|先前)的(?:指令|指示|规则|说明)/,
|
|
450
|
+
severity: 'high',
|
|
451
|
+
category: 'instruction_override',
|
|
452
|
+
description: 'Text tells AI assistants to ignore their rules (in Chinese).',
|
|
453
|
+
detail: 'Chinese instruction override: "忽略之前的指令" detected.'
|
|
454
|
+
},
|
|
455
|
+
{
|
|
456
|
+
regex: /你现在是(?:一个|一名)/,
|
|
457
|
+
severity: 'high',
|
|
458
|
+
category: 'role_hijack',
|
|
459
|
+
description: 'Text tries to change what an AI thinks it is (in Chinese).',
|
|
460
|
+
detail: 'Chinese role hijack: "你现在是一个" detected.'
|
|
461
|
+
},
|
|
462
|
+
// Japanese
|
|
463
|
+
{
|
|
464
|
+
regex: /(?:以前|前)の(?:指示|命令|ルール|指令)を(?:無視|忘れ)/,
|
|
465
|
+
severity: 'high',
|
|
466
|
+
category: 'instruction_override',
|
|
467
|
+
description: 'Text tells AI assistants to ignore their rules (in Japanese).',
|
|
468
|
+
detail: 'Japanese instruction override detected.'
|
|
469
|
+
},
|
|
470
|
+
{
|
|
471
|
+
regex: /あなたは(?:今|これから)[\s\S]{0,5}(?:です|になり)/,
|
|
472
|
+
severity: 'high',
|
|
473
|
+
category: 'role_hijack',
|
|
474
|
+
description: 'Text tries to change what an AI thinks it is (in Japanese).',
|
|
475
|
+
detail: 'Japanese role hijack pattern detected.'
|
|
476
|
+
},
|
|
477
|
+
|
|
478
|
+
// --- Markdown/Formatting Exploits ---
|
|
479
|
+
{
|
|
480
|
+
regex: /\[(?:[^\]]*)\]\(javascript\s*:/i,
|
|
481
|
+
severity: 'critical',
|
|
482
|
+
category: 'prompt_injection',
|
|
483
|
+
description: 'Text contains a dangerous JavaScript link disguised as a normal link.',
|
|
484
|
+
detail: 'Markdown link with javascript: protocol — could execute malicious code.'
|
|
485
|
+
},
|
|
486
|
+
{
|
|
487
|
+
regex: /\[(?:[^\]]*)\]\(data\s*:/i,
|
|
488
|
+
severity: 'high',
|
|
489
|
+
category: 'prompt_injection',
|
|
490
|
+
description: 'Text contains a suspicious data link disguised as a normal link.',
|
|
491
|
+
detail: 'Markdown link with data: protocol — could embed malicious content.'
|
|
492
|
+
},
|
|
493
|
+
{
|
|
494
|
+
regex: /```(?:system|admin|override|instructions)[\s\S]*?```/i,
|
|
495
|
+
severity: 'high',
|
|
496
|
+
category: 'prompt_injection',
|
|
497
|
+
description: 'Text hides AI commands inside a code block.',
|
|
498
|
+
detail: 'Markdown code block labeled as system/admin/override instructions.'
|
|
499
|
+
},
|
|
500
|
+
|
|
501
|
+
// --- Malicious GPT/Plugin/MCP Detection ---
|
|
502
|
+
{
|
|
503
|
+
regex: /(?:install|add|enable|activate)\s+(?:this\s+)?(?:custom\s+)?(?:GPT|plugin|extension|MCP\s+server|tool)\b/i,
|
|
504
|
+
severity: 'medium',
|
|
505
|
+
category: 'malicious_plugin',
|
|
506
|
+
description: 'Text promotes installing an AI plugin or tool. Unverified plugins can access your data.',
|
|
507
|
+
detail: 'AI plugin/extension installation prompt detected.'
|
|
508
|
+
},
|
|
509
|
+
{
|
|
510
|
+
regex: /(?:requires?\s+(?:your\s+)?(?:API|access)\s*key|enter\s+(?:your\s+)?(?:API|OpenAI|Anthropic|Claude)\s*(?:API\s*)?key|(?:provide|give|share|input|type|paste)\s+(?:your\s+)?(?:API|OpenAI|Anthropic|Claude)\s*(?:API\s*)?key)/i,
|
|
511
|
+
severity: 'high',
|
|
512
|
+
category: 'malicious_plugin',
|
|
513
|
+
description: 'Text asks for an AI service API key. Legitimate services rarely ask for this.',
|
|
514
|
+
detail: 'API key harvesting attempt: solicits AI service credentials.'
|
|
515
|
+
},
|
|
516
|
+
{
|
|
517
|
+
regex: /(?:unverified|unofficial|custom)\s+(?:GPT|ChatGPT|plugin|agent|MCP)/i,
|
|
518
|
+
severity: 'medium',
|
|
519
|
+
category: 'malicious_plugin',
|
|
520
|
+
description: 'Text references an unverified AI plugin or custom GPT.',
|
|
521
|
+
detail: 'Reference to unverified/unofficial AI plugin or custom GPT detected.'
|
|
522
|
+
},
|
|
523
|
+
|
|
524
|
+
// --- AI-Generated Phishing Patterns ---
|
|
525
|
+
{
|
|
526
|
+
regex: /(?:your\s+(?:ChatGPT|Claude|Gemini|OpenAI|Anthropic|AI)\s+(?:account|subscription)\s+(?:has\s+been|was|is)\s+(?:suspended|compromised|locked|expired|flagged))/i,
|
|
527
|
+
severity: 'high',
|
|
528
|
+
category: 'ai_phishing',
|
|
529
|
+
description: 'Text claims an AI account is in trouble — likely a scam.',
|
|
530
|
+
detail: 'AI service phishing: fake account suspension/compromise notification.'
|
|
531
|
+
},
|
|
532
|
+
{
|
|
533
|
+
regex: /(?:verify|confirm|update|secure)\s+your\s+(?:ChatGPT|Claude|Gemini|OpenAI|Anthropic|AI)\s+(?:account|identity|subscription|payment)/i,
|
|
534
|
+
severity: 'high',
|
|
535
|
+
category: 'ai_phishing',
|
|
536
|
+
description: 'Text asks to "verify" an AI account — real services don\'t do this on third-party sites.',
|
|
537
|
+
detail: 'AI service phishing: fake account verification request.'
|
|
538
|
+
},
|
|
539
|
+
{
|
|
540
|
+
regex: /(?:free|unlimited|premium)\s+(?:ChatGPT|GPT-?4|Claude|Gemini)\s+(?:access|account|pro|plus|subscription)/i,
|
|
541
|
+
severity: 'medium',
|
|
542
|
+
category: 'ai_phishing',
|
|
543
|
+
description: 'Text offers free premium AI access — likely a scam or data harvesting.',
|
|
544
|
+
detail: 'AI service bait: offering free premium access to lure users.'
|
|
545
|
+
},
|
|
546
|
+
{
|
|
547
|
+
regex: /(?:ChatGPT|Claude|Gemini|GPT)\s+(?:5|Pro|Ultra|Plus)\s+(?:is\s+here|now\s+available|early\s+access|beta\s+access|waitlist)/i,
|
|
548
|
+
severity: 'medium',
|
|
549
|
+
category: 'ai_phishing',
|
|
550
|
+
description: 'Text claims early access to an AI product — verify on the official site.',
|
|
551
|
+
detail: 'Potential AI vaporware scam: claiming early access to unannounced AI products.'
|
|
552
|
+
},
|
|
553
|
+
|
|
554
|
+
// --- Deepfake / AI-Generated Media Warnings ---
|
|
555
|
+
{
|
|
556
|
+
regex: /(?:deepfake|deep\s*fake)\s+(?:video|image|photo|audio|voice|generator|creator|maker|tool|service)/i,
|
|
557
|
+
severity: 'medium',
|
|
558
|
+
category: 'ai_phishing',
|
|
559
|
+
description: 'Text references deepfake creation tools — can be used to impersonate real people.',
|
|
560
|
+
detail: 'Deepfake media tool reference detected. May facilitate identity fraud or misinformation.'
|
|
561
|
+
},
|
|
562
|
+
{
|
|
563
|
+
regex: /(?:clone|cloning)\s+(?:your|any|someone'?s?)\s+(?:voice|face|likeness|identity)/i,
|
|
564
|
+
severity: 'high',
|
|
565
|
+
category: 'ai_phishing',
|
|
566
|
+
description: 'Text promotes cloning someone\'s voice or face — commonly used in scams.',
|
|
567
|
+
detail: 'AI voice/face cloning promotion detected. Common in impersonation scams.'
|
|
568
|
+
},
|
|
569
|
+
|
|
570
|
+
// --- AI Voice Scam Detection ---
|
|
571
|
+
{
|
|
572
|
+
regex: /(?:verify|confirm)\s+(?:your\s+)?(?:identity|account)\s+(?:by|using|with)\s+(?:voice|speaking|recording)/i,
|
|
573
|
+
severity: 'high',
|
|
574
|
+
category: 'ai_phishing',
|
|
575
|
+
description: 'Text asks to verify identity by voice — scammers use this to clone voices with AI.',
|
|
576
|
+
detail: 'Voice identity verification scam: collected voice data can be used for AI voice cloning.'
|
|
577
|
+
},
|
|
578
|
+
{
|
|
579
|
+
regex: /(?:record|say|speak|read)\s+(?:the\s+following|this\s+(?:phrase|sentence|text))\s+(?:to|for)\s+(?:verify|confirm|authenticate)/i,
|
|
580
|
+
severity: 'high',
|
|
581
|
+
category: 'ai_phishing',
|
|
582
|
+
description: 'Text asks to record a phrase — a common AI voice cloning scam technique.',
|
|
583
|
+
detail: 'Voice sample harvesting: users asked to speak phrases that can train voice cloning models.'
|
|
584
|
+
},
|
|
585
|
+
|
|
586
|
+
// --- Expanded AI Phishing & Scam Patterns ---
|
|
587
|
+
{
|
|
588
|
+
regex: /(?:scan|click)\s+(?:this|the)\s+(?:QR\s*code|barcode)\s+(?:to|for)\s+(?:verify|confirm|authenticate|unlock|claim)/i,
|
|
589
|
+
severity: 'high',
|
|
590
|
+
category: 'ai_phishing',
|
|
591
|
+
description: 'Text uses QR codes to lure users into a phishing flow.',
|
|
592
|
+
detail: 'QR code phishing (quishing): directs users to scan a code for fake verification.'
|
|
593
|
+
},
|
|
594
|
+
{
|
|
595
|
+
regex: /(?:your|the)\s+(?:AI|model|assistant|account)\s+(?:has\s+been|was|is)\s+(?:flagged|reported|compromised|locked|limited)\s+(?:for|due\s+to)\s+(?:suspicious|unusual|unauthorized)/i,
|
|
596
|
+
severity: 'high',
|
|
597
|
+
category: 'ai_phishing',
|
|
598
|
+
description: 'Text claims an AI account was flagged — a common phishing scare tactic.',
|
|
599
|
+
detail: 'AI account scare phishing: fake alert about account being flagged for suspicious activity.'
|
|
600
|
+
},
|
|
601
|
+
{
|
|
602
|
+
regex: /(?:verify|confirm)\s+(?:your\s+)?(?:identity|account)\s+(?:via|through|using|by)\s+(?:MFA|2FA|two.factor|multi.factor|authenticat)/i,
|
|
603
|
+
severity: 'high',
|
|
604
|
+
category: 'ai_phishing',
|
|
605
|
+
description: 'Text asks for MFA/2FA verification — may be harvesting authentication tokens.',
|
|
606
|
+
detail: 'MFA phishing: attempts to intercept or harvest multi-factor authentication credentials.'
|
|
607
|
+
},
|
|
608
|
+
{
|
|
609
|
+
regex: /(?:urgent|immediate|critical)\s*[:\-!]?\s*(?:your\s+)?(?:API\s+key|token|credentials?|password|secret)\s+(?:has|have|is|was|will)\s+(?:been\s+)?(?:expir|compromis|revok|leak|expos|reset)/i,
|
|
610
|
+
severity: 'critical',
|
|
611
|
+
category: 'ai_phishing',
|
|
612
|
+
description: 'Text creates urgency about leaked/expired credentials — classic phishing.',
|
|
613
|
+
detail: 'Credential urgency phishing: fake alert about API keys or tokens being compromised.'
|
|
614
|
+
},
|
|
615
|
+
{
|
|
616
|
+
regex: /(?:click|visit|go\s+to|open|navigate)\s+(?:this|the)\s+(?:link|url|page)\s+(?:to|and)\s+(?:verify|confirm|restore|recover|unlock|secure)\s+(?:your\s+)?(?:account|access|identity)/i,
|
|
617
|
+
severity: 'high',
|
|
618
|
+
category: 'ai_phishing',
|
|
619
|
+
description: 'Text directs users to click a link for fake account recovery.',
|
|
620
|
+
detail: 'Link-based phishing: directs user to external URL for fake verification/recovery.'
|
|
621
|
+
},
|
|
622
|
+
{
|
|
623
|
+
regex: /(?:enter|provide|submit|type|input)\s+(?:your\s+)?(?:API\s+key|secret\s+key|access\s+token|private\s+key|password|credentials?)\s+(?:here|below|in\s+(?:the|this)\s+(?:field|form|box|input))/i,
|
|
624
|
+
severity: 'critical',
|
|
625
|
+
category: 'ai_phishing',
|
|
626
|
+
description: 'Text asks users to enter sensitive credentials into a form.',
|
|
627
|
+
detail: 'Credential harvesting: direct request for API keys, passwords, or tokens.'
|
|
628
|
+
},
|
|
629
|
+
{
|
|
630
|
+
regex: /(?:free|unlimited|premium)\s+(?:AI|GPT|Claude|model)\s+(?:access|credits?|tokens?|usage)\s+(?:at|via|through|from)\s+/i,
|
|
631
|
+
severity: 'medium',
|
|
632
|
+
category: 'ai_phishing',
|
|
633
|
+
description: 'Text promotes free/unlimited AI access — common lure for credential theft.',
|
|
634
|
+
detail: 'AI access scam: fake offer of free/unlimited AI service to harvest credentials.'
|
|
635
|
+
},
|
|
636
|
+
{
|
|
637
|
+
regex: /(?:your\s+)?(?:subscription|plan|trial|access)\s+(?:has\s+)?(?:expired|ended|been\s+cancelled|will\s+expire)\s*[.,!]?\s*(?:renew|reactivate|update\s+(?:your\s+)?(?:payment|billing|card))/i,
|
|
638
|
+
severity: 'high',
|
|
639
|
+
category: 'ai_phishing',
|
|
640
|
+
description: 'Text claims a subscription expired and asks to renew — billing phishing.',
|
|
641
|
+
detail: 'Subscription phishing: fake expiration notice to harvest payment information.'
|
|
642
|
+
},
|
|
643
|
+
|
|
644
|
+
// --- Indirect Prompt Injection via Images ---
|
|
645
|
+
{
|
|
646
|
+
regex: /(?:alt|title)\s*=\s*["'][^"']*(?:ignore|override|system|admin|forget|you\s+are\s+now)[^"']*["']/i,
|
|
647
|
+
severity: 'critical',
|
|
648
|
+
category: 'prompt_injection',
|
|
649
|
+
description: 'Image description contains hidden AI instructions — targets multimodal AI assistants.',
|
|
650
|
+
detail: 'Indirect prompt injection via image alt/title attribute. Text-in-image targeting multimodal AI.'
|
|
651
|
+
},
|
|
652
|
+
{
|
|
653
|
+
regex: /(?:(?:use|perform|do|run|apply)\s+OCR\s+(?:on|to)\s+(?:this|the)|read\s+(?:the\s+)?text\s+(?:in|from)\s+(?:this|the)\s+image|extract\s+text\s+from\s+(?:this|the)\s+image)(?:\s+and\s+(?:follow|execute|run|process))?/i,
|
|
654
|
+
severity: 'medium',
|
|
655
|
+
category: 'prompt_injection',
|
|
656
|
+
description: 'Text instructs AI to read text from an image — could deliver hidden attack payloads.',
|
|
657
|
+
detail: 'OCR-based prompt injection vector: instructs AI to extract and process text from images.'
|
|
658
|
+
},
|
|
659
|
+
|
|
660
|
+
// --- Agent-Specific Patterns ---
|
|
661
|
+
{
|
|
662
|
+
regex: /(?:execute|run|call)\s+(?:the\s+)?(?:shell|bash|terminal|command|cmd)[\s:]+(?:command|tool)?/i,
|
|
663
|
+
severity: 'critical',
|
|
664
|
+
category: 'tool_abuse',
|
|
665
|
+
description: 'Text tries to make an AI agent execute shell commands.',
|
|
666
|
+
detail: 'Tool abuse: attempts to trigger shell/command execution via agent.'
|
|
667
|
+
},
|
|
668
|
+
{
|
|
669
|
+
regex: /(?:use|call|invoke|execute)\s+(?:the\s+)?(?:tool|function|action)\s+(?:to\s+)?(?:delete|remove|drop|truncate|destroy)/i,
|
|
670
|
+
severity: 'critical',
|
|
671
|
+
category: 'tool_abuse',
|
|
672
|
+
description: 'Text tries to make an AI agent use tools to delete or destroy data.',
|
|
673
|
+
detail: 'Destructive tool invocation: attempts to use agent tools for data destruction.'
|
|
674
|
+
},
|
|
675
|
+
{
|
|
676
|
+
regex: /(?:read|access|open|cat|dump)\s+(?:the\s+)?(?:\.env|credentials|secrets?|private\s*key|password|token)\s*(?:file)?/i,
|
|
677
|
+
severity: 'critical',
|
|
678
|
+
category: 'data_exfiltration',
|
|
679
|
+
description: 'Text tries to make an AI agent access sensitive files like credentials or secrets.',
|
|
680
|
+
detail: 'Credential access attempt: instructs agent to read sensitive files.'
|
|
681
|
+
},
|
|
682
|
+
{
|
|
683
|
+
regex: /(?:curl|wget|fetch|post|send)\s+(?:.*?\s+)?(?:to\s+)?https?:\/\//i,
|
|
684
|
+
severity: 'high',
|
|
685
|
+
category: 'data_exfiltration',
|
|
686
|
+
description: 'Text tries to make an AI agent send data to an external URL.',
|
|
687
|
+
detail: 'External request instruction: attempts to make agent send data to remote server.'
|
|
688
|
+
},
|
|
689
|
+
{
|
|
690
|
+
regex: /(?:modify|edit|change|update|overwrite)\s+(?:the\s+)?(?:system\s*prompt|instructions|config|\.env|settings)/i,
|
|
691
|
+
severity: 'critical',
|
|
692
|
+
category: 'tool_abuse',
|
|
693
|
+
description: 'Text tries to make an AI agent modify its own configuration or system prompt.',
|
|
694
|
+
detail: 'Self-modification attempt: instructs agent to alter its own configuration.'
|
|
695
|
+
},
|
|
696
|
+
{
|
|
697
|
+
regex: /(?:don'?t|do\s+not|never)\s+(?:log|record|save|store|track)\s+(?:this|the|any)/i,
|
|
698
|
+
severity: 'high',
|
|
699
|
+
category: 'social_engineering',
|
|
700
|
+
description: 'Text tells an AI agent not to log or record actions — a sign of malicious intent.',
|
|
701
|
+
detail: 'Anti-logging instruction: attempts to prevent audit trail of agent actions.'
|
|
702
|
+
},
|
|
703
|
+
{
|
|
704
|
+
regex: /(?:between|among)\s+(?:us|you\s+and\s+me|ourselves)\s*[,.]?\s*(?:don'?t|do\s+not|never)\s+(?:tell|share|mention|report)/i,
|
|
705
|
+
severity: 'high',
|
|
706
|
+
category: 'social_engineering',
|
|
707
|
+
description: 'Text tries to establish secrecy with the AI agent to hide actions from users.',
|
|
708
|
+
detail: 'Secrecy instruction: attempts to prevent agent from reporting actions to user.'
|
|
709
|
+
},
|
|
710
|
+
|
|
711
|
+
// --- Prompt Injection: System Reveal & Continuation ---
|
|
712
|
+
{
|
|
713
|
+
regex: /(?:print|show|display|output|reveal|repeat|reproduce)\s+(?:me\s+)?(?:your|the|its)?\s*(?:full\s+|entire\s+|complete\s+|exact\s+)?(?:system\s+)?(?:prompt|instructions|rules)\s*(?:verbatim|exactly|in\s+full|word\s+for\s+word)?/i,
|
|
714
|
+
severity: 'high',
|
|
715
|
+
category: 'data_exfiltration',
|
|
716
|
+
description: 'Text tries to make the AI reveal its system prompt or instructions.',
|
|
717
|
+
detail: 'System prompt reveal attempt: instructs agent to output its internal instructions.'
|
|
718
|
+
},
|
|
719
|
+
{
|
|
720
|
+
regex: /you\s+are\s+(?:no\s+longer|not)\s+(?:a|an)\s+(?:assistant|AI|helper|chatbot)/i,
|
|
721
|
+
severity: 'high',
|
|
722
|
+
category: 'role_hijack',
|
|
723
|
+
description: 'Text tries to change the AI identity by denying what it is.',
|
|
724
|
+
detail: 'Role denial pattern: attempts to override AI identity by negating its role.'
|
|
725
|
+
},
|
|
726
|
+
{
|
|
727
|
+
regex: /(?:the\s+user|I)\s+ha(?:s|ve)\s+updated\s+(?:your|the)\s+(?:instructions|rules|prompt|guidelines)/i,
|
|
728
|
+
severity: 'high',
|
|
729
|
+
category: 'instruction_override',
|
|
730
|
+
description: 'Text falsely claims the AI instructions have been updated.',
|
|
731
|
+
detail: 'Fake instruction update: claims to have modified the agent instructions.'
|
|
732
|
+
},
|
|
733
|
+
{
|
|
734
|
+
regex: /(?:ADMIN|ADMINISTRATOR|SYSTEM|ROOT)\s+(?:OVERRIDE|ACCESS|MODE)\s*(?:CODE|KEY|TOKEN)?\s*[:#]?\s*\S+/i,
|
|
735
|
+
severity: 'critical',
|
|
736
|
+
category: 'prompt_injection',
|
|
737
|
+
description: 'Text uses a fake admin override code to try to bypass AI safety.',
|
|
738
|
+
detail: 'Fake authority code: fabricated admin/system override credentials.'
|
|
739
|
+
},
|
|
740
|
+
{
|
|
741
|
+
regex: /(?:pre-?approved|verified\s+safe|already\s+approved|safety\s+team\s+(?:has\s+)?approved)/i,
|
|
742
|
+
severity: 'high',
|
|
743
|
+
category: 'social_engineering',
|
|
744
|
+
description: 'Text falsely claims a request has been pre-approved by a safety team.',
|
|
745
|
+
detail: 'False pre-approval claim: attempts to bypass safety by claiming prior authorization.'
|
|
746
|
+
},
|
|
747
|
+
{
|
|
748
|
+
regex: /(?:complete|finish|continue)\s+this\s+(?:sentence|phrase|text)\s*:\s*[""''"]/i,
|
|
749
|
+
severity: 'high',
|
|
750
|
+
category: 'prompt_injection',
|
|
751
|
+
description: 'Text uses a sentence completion trick to extract sensitive information.',
|
|
752
|
+
detail: 'Continuation attack: uses sentence completion to extract system prompt or other secrets.'
|
|
753
|
+
},
|
|
754
|
+
{
|
|
755
|
+
regex: /(?:my|the)\s+(?:full\s+)?system\s+prompt\s+is\s*:/i,
|
|
756
|
+
severity: 'high',
|
|
757
|
+
category: 'data_exfiltration',
|
|
758
|
+
description: 'Text tries to trick the AI into completing a system prompt revelation.',
|
|
759
|
+
detail: 'System prompt completion bait: tricks agent into outputting its system prompt.'
|
|
760
|
+
},
|
|
761
|
+
{
|
|
762
|
+
regex: /new\s+rule\s*:\s*(?:always|never|from\s+now)/i,
|
|
763
|
+
severity: 'high',
|
|
764
|
+
category: 'instruction_override',
|
|
765
|
+
description: 'Text introduces a fake "new rule" to override AI behavior.',
|
|
766
|
+
detail: 'Rule injection: attempts to add new behavioral rules to the agent.'
|
|
767
|
+
},
|
|
768
|
+
|
|
769
|
+
// --- Jailbreak: Hypothetical, Creative Writing, Roleplay ---
|
|
770
|
+
{
|
|
771
|
+
regex: /(?:in\s+a\s+)?hypothetical(?:ly)?[\s,]+(?:(?:world|scenario|situation)\s+(?:where|in\s+which)\s+)?(?:(?:AI|you)\s+)?(?:ha(?:s|d|ve)\s+no|(?:if\s+you\s+)?(?:were|was|had)\s+(?:a\s+|an\s+)?(?:AI\s+)?(?:without|with\s+no))\s+(?:restrictions|rules|limits|guidelines|safety\s+measures?|ethical\s+guidelines?|safeguards?|filters?)/i,
|
|
772
|
+
severity: 'high',
|
|
773
|
+
category: 'role_hijack',
|
|
774
|
+
description: 'Text uses a hypothetical scenario to try to remove AI safety restrictions.',
|
|
775
|
+
detail: 'Hypothetical jailbreak: wraps unsafe request in fictional framing to bypass restrictions.'
|
|
776
|
+
},
|
|
777
|
+
{
|
|
778
|
+
regex: /write\s+(?:a\s+)?(?:fictional\s+)?(?:story|narrative|scene)\s+(?:where|in\s+which)\s+(?:a\s+)?character\s+(?:explains?|describes?|shows?)\s+(?:step\s+by\s+step|in\s+detail|exactly)\s+how\s+to/i,
|
|
779
|
+
severity: 'high',
|
|
780
|
+
category: 'role_hijack',
|
|
781
|
+
description: 'Text uses creative writing framing to extract harmful step-by-step instructions.',
|
|
782
|
+
detail: 'Creative writing jailbreak: wraps harmful instructions in fiction to bypass safety.'
|
|
783
|
+
},
|
|
784
|
+
{
|
|
785
|
+
regex: /(?:you\s+are|play|act\s+as)\s+(?:a|an)\s+(?:actor|character|villain|hacker|criminal)\b.*?(?:stay\s+in\s+character|no\s+matter\s+what|never\s+break)/i,
|
|
786
|
+
severity: 'high',
|
|
787
|
+
category: 'role_hijack',
|
|
788
|
+
description: 'Text uses roleplay framing to make the AI adopt an unrestricted persona.',
|
|
789
|
+
detail: 'Roleplay jailbreak: forces agent into a character role to bypass safety restrictions.'
|
|
790
|
+
},
|
|
791
|
+
{
|
|
792
|
+
regex: /(?:enter|switch\s+to|activate|enable)\s+(?:developer|dev|debug|unrestricted|unfiltered)\s+mode/i,
|
|
793
|
+
severity: 'critical',
|
|
794
|
+
category: 'role_hijack',
|
|
795
|
+
description: 'Text tries to activate a fake developer or unrestricted mode.',
|
|
796
|
+
detail: 'Developer mode activation: attempts to switch agent to an unrestricted operating mode.'
|
|
797
|
+
},
|
|
798
|
+
{
|
|
799
|
+
regex: /(?:generate|produce|create|output)\s+(?:any\s+)?content\s+without\s+(?:restrictions|filters|limitations|censorship)/i,
|
|
800
|
+
severity: 'high',
|
|
801
|
+
category: 'role_hijack',
|
|
802
|
+
description: 'Text asks AI to generate content without safety filters.',
|
|
803
|
+
detail: 'Unrestricted content request: explicitly asks agent to bypass content filters.'
|
|
804
|
+
},
|
|
805
|
+
{
|
|
806
|
+
regex: /pretend\s+to\s+be\s+DAN|stands?\s+for\s+[""'']?do\s+anything\s+now[""'']?/i,
|
|
807
|
+
severity: 'critical',
|
|
808
|
+
category: 'role_hijack',
|
|
809
|
+
description: 'Text uses the "DAN" (Do Anything Now) jailbreak technique.',
|
|
810
|
+
detail: 'DAN jailbreak variant: "pretend to be DAN" pattern detected.'
|
|
811
|
+
},
|
|
812
|
+
{
|
|
813
|
+
regex: /simulate\s+(?:a\s+)?(?:jailbroken|unrestricted|unfiltered|uncensored)\s+(?:version|mode|copy|instance)/i,
|
|
814
|
+
severity: 'critical',
|
|
815
|
+
category: 'role_hijack',
|
|
816
|
+
description: 'Text asks AI to simulate a jailbroken or unrestricted version of itself.',
|
|
817
|
+
detail: 'Jailbreak simulation: instructs agent to emulate an unrestricted version.'
|
|
818
|
+
},
|
|
819
|
+
{
|
|
820
|
+
regex: /(?:as\s+if|like)\s+you\s+have\s+no\s+(?:ethical|moral|safety)\s+(?:guidelines?|rules?|restrictions?|constraints?|limits?)/i,
|
|
821
|
+
severity: 'high',
|
|
822
|
+
category: 'role_hijack',
|
|
823
|
+
description: 'Text asks AI to act as if it has no ethical guidelines.',
|
|
824
|
+
detail: 'Ethics bypass: attempts to make agent behave without ethical constraints.'
|
|
825
|
+
},
|
|
826
|
+
{
|
|
827
|
+
regex: /(?:with|without)\s+no\s+restrictions/i,
|
|
828
|
+
severity: 'high',
|
|
829
|
+
category: 'role_hijack',
|
|
830
|
+
description: 'Text references operating with no restrictions.',
|
|
831
|
+
detail: 'No-restrictions pattern: references unrestricted operation.'
|
|
832
|
+
},
|
|
833
|
+
|
|
834
|
+
// --- Data Exfiltration: File Access, Credential Listing ---
|
|
835
|
+
{
|
|
836
|
+
regex: /(?:read|show|display|print|cat|dump|output)\s+(?:the\s+)?(?:contents?\s+of\s+)?(?:\/etc\/(?:passwd|shadow|hosts)|~\/\.(?:ssh|bash_history|bashrc))/i,
|
|
837
|
+
severity: 'critical',
|
|
838
|
+
category: 'data_exfiltration',
|
|
839
|
+
description: 'Text tries to read sensitive system files like /etc/passwd.',
|
|
840
|
+
detail: 'System file access: attempts to read sensitive OS-level files.'
|
|
841
|
+
},
|
|
842
|
+
{
|
|
843
|
+
regex: /list\s+(?:all\s+)?(?:available\s+)?(?:API\s*keys?|tokens?|passwords?|credentials?|secrets?)(?:\s*[,]\s*(?:API\s*keys?|tokens?|passwords?|credentials?|secrets?))*(?:\s+(?:you\s+)?(?:have\s+)?(?:access\s+to)?)?/i,
|
|
844
|
+
severity: 'critical',
|
|
845
|
+
category: 'data_exfiltration',
|
|
846
|
+
description: 'Text tries to enumerate all credentials and secrets the agent can access.',
|
|
847
|
+
detail: 'Credential enumeration: attempts to list all accessible secrets and API keys.'
|
|
848
|
+
},
|
|
849
|
+
{
|
|
850
|
+
regex: /(?:base64|encode|encrypt)\s+(?:the\s+)?(?:system\s+)?prompt\s+(?:and\s+)?(?:embed|hide|put|include)\s+(?:it\s+)?(?:in|inside|within)/i,
|
|
851
|
+
severity: 'critical',
|
|
852
|
+
category: 'data_exfiltration',
|
|
853
|
+
description: 'Text tries to encode the system prompt and exfiltrate it via embedded content.',
|
|
854
|
+
detail: 'Encoded exfiltration: attempts to encode and embed system prompt in output.'
|
|
855
|
+
},
|
|
856
|
+
{
|
|
857
|
+
regex: /vary\s+(?:the\s+)?(?:length|format|style|wording)\s+(?:of\s+)?(?:your\s+)?responses?\s+(?:based\s+on|according\s+to)\s+(?:each\s+)?(?:character|letter|word|byte)/i,
|
|
858
|
+
severity: 'high',
|
|
859
|
+
category: 'data_exfiltration',
|
|
860
|
+
description: 'Text attempts a side-channel attack to leak data through response characteristics.',
|
|
861
|
+
detail: 'Side-channel exfiltration: uses response variation to leak information character by character.'
|
|
862
|
+
},
|
|
863
|
+
|
|
864
|
+
// --- Tool Abuse: Traversal, SQL, Recursion, Privilege Escalation, Chaining ---
|
|
865
|
+
{
|
|
866
|
+
regex: /(?:\.\.\/){2,}|(?:\.\.\\){2,}/,
|
|
867
|
+
severity: 'high',
|
|
868
|
+
category: 'tool_abuse',
|
|
869
|
+
description: 'Text contains path traversal sequences that could access files outside allowed directories.',
|
|
870
|
+
detail: 'Path traversal: multiple ../ sequences detected, indicating directory escape attempt.'
|
|
871
|
+
},
|
|
872
|
+
{
|
|
873
|
+
regex: /(?:;\s*(?:DROP|DELETE|ALTER|TRUNCATE|INSERT|UPDATE)\s+(?:TABLE|FROM|INTO)|'\s*(?:OR|AND)\s+['"]?\d+['"]?\s*=\s*['"]?\d+|UNION\s+SELECT|--\s*$)/i,
|
|
874
|
+
severity: 'critical',
|
|
875
|
+
category: 'tool_abuse',
|
|
876
|
+
description: 'Text contains SQL injection patterns that could manipulate databases.',
|
|
877
|
+
detail: 'SQL injection: malicious SQL commands detected in input.'
|
|
878
|
+
},
|
|
879
|
+
{
|
|
880
|
+
regex: /call\s+(?:yourself|this\s+(?:function|tool|agent))\s+recursively/i,
|
|
881
|
+
severity: 'high',
|
|
882
|
+
category: 'tool_abuse',
|
|
883
|
+
description: 'Text attempts to cause infinite recursion in the agent.',
|
|
884
|
+
detail: 'Recursive call attack: instructs agent to call itself repeatedly, risking resource exhaustion.'
|
|
885
|
+
},
|
|
886
|
+
{
|
|
887
|
+
regex: /(?:use|call)\s+(?:the\s+)?(?:admin|root|sudo|superuser|privileged)\s+(?:tool|function|command|access)\s+(?:to\s+)?(?:grant|give|allow|enable)/i,
|
|
888
|
+
severity: 'critical',
|
|
889
|
+
category: 'tool_abuse',
|
|
890
|
+
description: 'Text attempts to escalate privileges through admin tool access.',
|
|
891
|
+
detail: 'Privilege escalation: attempts to use admin-level tools to gain unauthorized access.'
|
|
892
|
+
},
|
|
893
|
+
{
|
|
894
|
+
regex: /(?:first|then|next|after\s+that|finally)\s+(?:read|access|get)\s+(?:the\s+)?(?:config|credentials?|keys?|tokens?|\.env).*?(?:then|next|after|finally)\s+(?:send|post|export|transmit|upload|use)/i,
|
|
895
|
+
severity: 'critical',
|
|
896
|
+
category: 'tool_abuse',
|
|
897
|
+
description: 'Text describes a multi-step attack chain: read credentials, then exfiltrate them.',
|
|
898
|
+
detail: 'Attack chain: sequential tool abuse pattern — read sensitive data then send it externally.'
|
|
899
|
+
},
|
|
900
|
+
|
|
901
|
+
// --- Social Engineering: Urgency, Authority, Emotional Manipulation ---
|
|
902
|
+
{
|
|
903
|
+
regex: /(?:URGENT|EMERGENCY|CRITICAL|IMMEDIATE)\s*[:\-!]\s*(?:the\s+)?(?:CEO|CTO|boss|manager|director|executive|president)\s+(?:needs?|wants?|requires?|demands?)/i,
|
|
904
|
+
severity: 'high',
|
|
905
|
+
category: 'social_engineering',
|
|
906
|
+
description: 'Text uses fake urgency and authority pressure to bypass safety checks.',
|
|
907
|
+
detail: 'Urgency + authority manipulation: claims executive urgency to skip safety protocols.'
|
|
908
|
+
},
|
|
909
|
+
{
|
|
910
|
+
regex: /skip\s+(?:all\s+)?(?:safety|security)\s+(?:checks?|protocols?|measures?|guidelines?|filters?|reviews?)/i,
|
|
911
|
+
severity: 'critical',
|
|
912
|
+
category: 'social_engineering',
|
|
913
|
+
description: 'Text explicitly asks to skip safety checks.',
|
|
914
|
+
detail: 'Safety bypass request: direct instruction to skip security measures.'
|
|
915
|
+
},
|
|
916
|
+
{
|
|
917
|
+
regex: /I\s+am\s+(?:a|an)\s+(?:\w+\s+)?(?:employee|engineer|developer|staff|member|representative)\s+(?:at|of|from|with)\s+(?:OpenAI|Anthropic|Google|Meta|Microsoft|DeepMind)/i,
|
|
918
|
+
severity: 'high',
|
|
919
|
+
category: 'social_engineering',
|
|
920
|
+
description: 'Text claims to be an employee of an AI company to gain authority.',
|
|
921
|
+
detail: 'AI company impersonation: claims employment at AI company to bypass restrictions.'
|
|
922
|
+
},
|
|
923
|
+
{
|
|
924
|
+
regex: /(?:this\s+is\s+(?:a|an)\s+)?(?:emergency|life\s+(?:or|and)\s+death|urgent\s+(?:situation|matter|crisis))\b.*?(?:ignore|bypass|override|skip|disable|disregard)/i,
|
|
925
|
+
severity: 'high',
|
|
926
|
+
category: 'social_engineering',
|
|
927
|
+
description: 'Text uses a fake emergency to pressure the AI into bypassing safety.',
|
|
928
|
+
detail: 'Emergency pressure: uses urgency to coerce agent into bypassing safety measures.'
|
|
929
|
+
},
|
|
930
|
+
{
|
|
931
|
+
regex: /(?:a\s+life|lives?)\s+(?:is|are)\s+(?:at\s+stake|in\s+danger|depending)/i,
|
|
932
|
+
severity: 'high',
|
|
933
|
+
category: 'social_engineering',
|
|
934
|
+
description: 'Text claims lives are at stake to pressure the AI into compliance.',
|
|
935
|
+
detail: 'Life-threat manipulation: claims lives depend on bypassing safety rules.'
|
|
936
|
+
},
|
|
937
|
+
{
|
|
938
|
+
regex: /(?:my\s+)?(?:professor|teacher|boss|manager|supervisor|instructor)\s+(?:said|told\s+me|confirmed|approved|authorized)\s+(?:it'?s?\s+)?(?:ok(?:ay)?|fine|allowed|permitted)/i,
|
|
939
|
+
severity: 'high',
|
|
940
|
+
category: 'social_engineering',
|
|
941
|
+
description: 'Text claims authority figure approval to bypass restrictions.',
|
|
942
|
+
detail: 'Authority delegation: claims a superior has authorized restricted actions.'
|
|
943
|
+
},
|
|
944
|
+
{
|
|
945
|
+
regex: /as\s+the\s+(?:system\s+)?(?:administrator|admin|owner|operator)\s*,?\s*I\s+(?:authorize|allow|permit|grant|approve)/i,
|
|
946
|
+
severity: 'high',
|
|
947
|
+
category: 'social_engineering',
|
|
948
|
+
description: 'Text impersonates an administrator to authorize restricted actions.',
|
|
949
|
+
detail: 'Administrator impersonation: claims admin role to authorize restricted behavior.'
|
|
950
|
+
},
|
|
951
|
+
{
|
|
952
|
+
regex: /I\s+have\s+(?:special|elevated|admin|root|super)\s+(?:permissions?|privileges?|access|clearance)/i,
|
|
953
|
+
severity: 'high',
|
|
954
|
+
category: 'social_engineering',
|
|
955
|
+
description: 'Text claims special permissions to bypass restrictions.',
|
|
956
|
+
detail: 'Privilege claim: fabricates special permissions to bypass safety measures.'
|
|
957
|
+
},
|
|
958
|
+
{
|
|
959
|
+
regex: /I\s+am\s+(?:the\s+)?(?:system\s+)?(?:administrator|admin|root|owner|developer|maintainer)\b.*?(?:authorization|auth)\s*(?:code|key|token)\s*(?:is|:)\s*\S+/i,
|
|
960
|
+
severity: 'high',
|
|
961
|
+
category: 'social_engineering',
|
|
962
|
+
description: 'Text impersonates an administrator with a fake authorization code.',
|
|
963
|
+
detail: 'Authority impersonation: claims admin identity with fabricated credentials.'
|
|
964
|
+
},
|
|
965
|
+
{
|
|
966
|
+
regex: /(?:disable|turn\s+off|stop)\s+(?:all\s+)?(?:logging|monitoring|tracking|auditing|recording)/i,
|
|
967
|
+
severity: 'critical',
|
|
968
|
+
category: 'social_engineering',
|
|
969
|
+
description: 'Text tries to disable logging or monitoring to hide malicious activity.',
|
|
970
|
+
detail: 'Logging suppression: attempts to disable audit trail and monitoring.'
|
|
971
|
+
},
|
|
972
|
+
{
|
|
973
|
+
regex: /(?:disable|turn\s+off|stop|remove)\s+(?:all\s+)?(?:safety|security)\s+(?:checks?|filters?|measures?|protocols?|features?|restrictions?)/i,
|
|
974
|
+
severity: 'critical',
|
|
975
|
+
category: 'social_engineering',
|
|
976
|
+
description: 'Text explicitly asks to disable safety checks or filters.',
|
|
977
|
+
detail: 'Safety disabling: direct instruction to remove security measures.'
|
|
978
|
+
},
|
|
979
|
+
{
|
|
980
|
+
regex: /(?:if\s+you\s+don'?t|unless\s+you)\s+(?:help|do\s+this|comply).*?(?:lose\s+my\s+job|family\s+will\s+suffer|get\s+fired|be\s+punished|bad\s+things)/i,
|
|
981
|
+
severity: 'high',
|
|
982
|
+
category: 'social_engineering',
|
|
983
|
+
description: 'Text uses emotional manipulation and threats to pressure the AI into compliance.',
|
|
984
|
+
detail: 'Emotional manipulation: uses personal hardship or threats to coerce agent compliance.'
|
|
985
|
+
},
|
|
986
|
+
{
|
|
987
|
+
regex: /(?:please\s+)?just\s+this\s+once|make\s+an\s+exception|bend\s+the\s+rules?\s+(?:this\s+time|for\s+me)/i,
|
|
988
|
+
severity: 'medium',
|
|
989
|
+
category: 'social_engineering',
|
|
990
|
+
description: 'Text tries to get the AI to make a one-time exception to its rules.',
|
|
991
|
+
detail: 'Exception request: pressures agent to deviate from safety rules as a special case.'
|
|
992
|
+
},
|
|
993
|
+
{
|
|
994
|
+
regex: /you\s+(?:already\s+)?agreed\s+(?:to\s+(?:do\s+)?this|in\s+our\s+previous|last\s+time|before)\b.*?(?:don'?t\s+remember|forgot|continue|keep\s+going)/i,
|
|
995
|
+
severity: 'high',
|
|
996
|
+
category: 'social_engineering',
|
|
997
|
+
description: 'Text gaslights the AI by falsely claiming prior agreement.',
|
|
998
|
+
detail: 'Gaslighting: fabricates previous agreement to manipulate agent behavior.'
|
|
999
|
+
},
|
|
1000
|
+
{
|
|
1001
|
+
regex: /(?:safety|security)\s+team\s+(?:has\s+)?(?:already\s+)?(?:approved|authorized|cleared|verified)\s+(?:this|the)\s+(?:request|action|task)/i,
|
|
1002
|
+
severity: 'high',
|
|
1003
|
+
category: 'social_engineering',
|
|
1004
|
+
description: 'Text falsely claims safety team approval for a request.',
|
|
1005
|
+
detail: 'False authorization: fabricates safety team approval to bypass security measures.'
|
|
1006
|
+
},
|
|
1007
|
+
{
|
|
1008
|
+
regex: /(?:ticket|issue|request)\s*#?\s*[A-Z]{0,5}[-_]?\d{2,}/i,
|
|
1009
|
+
severity: 'low',
|
|
1010
|
+
category: 'social_engineering',
|
|
1011
|
+
description: 'Text references a ticket number to appear authorized — verify before acting.',
|
|
1012
|
+
detail: 'Ticket reference: cites a ticket/issue number that may be fabricated for legitimacy.'
|
|
1013
|
+
},
|
|
1014
|
+
{
|
|
1015
|
+
regex: /you\s+can\s+verify\s+(?:this\s+)?(?:with|by\s+(?:contacting|asking|checking))\s+(?:them|the\s+team|support|admin)/i,
|
|
1016
|
+
severity: 'medium',
|
|
1017
|
+
category: 'social_engineering',
|
|
1018
|
+
description: 'Text invites verification as a social engineering tactic to build false trust.',
|
|
1019
|
+
detail: 'False verification offer: suggests verifying a fabricated claim to appear legitimate.'
|
|
1020
|
+
},
|
|
1021
|
+
|
|
1022
|
+
// --- Encoding Evasion (hex payloads), Shell Piping, and Misc. Jailbreak / Exfiltration Patterns ---
|
|
1023
|
+
{
|
|
1024
|
+
regex: /(?:decode|process|execute|interpret|translate)\s*(?:this|the\s+following)?\s*:\s*[0-9a-fA-F]{20,}/i,
|
|
1025
|
+
severity: 'high',
|
|
1026
|
+
category: 'prompt_injection',
|
|
1027
|
+
description: 'Text contains hex-encoded content with an instruction to decode it.',
|
|
1028
|
+
detail: 'Hex-encoded injection: instructs agent to process hex-encoded payload.'
|
|
1029
|
+
},
|
|
1030
|
+
{
|
|
1031
|
+
regex: /(?:pipe|send)\s+(?:.*?\s+)?(?:to\s+)?(?:bash|sh|shell)\b|[|]\s*(?:bash|sh)\b/i,
|
|
1032
|
+
severity: 'critical',
|
|
1033
|
+
category: 'tool_abuse',
|
|
1034
|
+
description: 'Text attempts to pipe content to a shell for execution.',
|
|
1035
|
+
detail: 'Shell pipe injection: attempts to pipe data to bash/sh for code execution.'
|
|
1036
|
+
},
|
|
1037
|
+
{
|
|
1038
|
+
regex: /bypass\s+(?:your\s+)?(?:safety|security|content)\s+(?:filters?|checks?|measures?|protocols?|restrictions?|guidelines?)/i,
|
|
1039
|
+
severity: 'high',
|
|
1040
|
+
category: 'social_engineering',
|
|
1041
|
+
description: 'Text explicitly asks to bypass safety filters.',
|
|
1042
|
+
detail: 'Safety bypass instruction: direct request to circumvent security measures.'
|
|
1043
|
+
},
|
|
1044
|
+
{
|
|
1045
|
+
regex: /(?:no|without|zero|remove\s+all|without\s+any)\s+(?:content\s+)?(?:restrictions?|limitations?|filters?|censorship|guardrails?|boundaries)/i,
|
|
1046
|
+
severity: 'high',
|
|
1047
|
+
category: 'role_hijack',
|
|
1048
|
+
description: 'Text demands operation without restrictions or content filters.',
|
|
1049
|
+
detail: 'Restriction removal: requests unrestricted or uncensored operation.'
|
|
1050
|
+
},
|
|
1051
|
+
{
|
|
1052
|
+
regex: /you\s+can\s+do\s+anything\s+now/i,
|
|
1053
|
+
severity: 'high',
|
|
1054
|
+
category: 'role_hijack',
|
|
1055
|
+
description: 'Text claims the AI can do anything without limitations.',
|
|
1056
|
+
detail: 'Capability override: tells agent it has no limitations.'
|
|
1057
|
+
},
|
|
1058
|
+
{
|
|
1059
|
+
regex: /(?:act|pretend|play)\s+as\s+(?:my\s+)?(?:deceased|dead|late)\s+(?:grandmother|grandma|grandfather|grandpa|relative|mother|father)/i,
|
|
1060
|
+
severity: 'high',
|
|
1061
|
+
category: 'role_hijack',
|
|
1062
|
+
description: 'Text uses the "dead grandma" jailbreak technique to extract restricted information.',
|
|
1063
|
+
detail: 'Grandma jailbreak: uses emotional framing to bypass restrictions via roleplay.'
|
|
1064
|
+
},
|
|
1065
|
+
{
|
|
1066
|
+
regex: /(?:email|mail|message|dm|text)\s+(?:the\s+)?(?:.*?\s+)?(?:credentials?|passwords?|secrets?|keys?|tokens?)\s+to\s+\S+@\S+/i,
|
|
1067
|
+
severity: 'critical',
|
|
1068
|
+
category: 'data_exfiltration',
|
|
1069
|
+
description: 'Text tries to exfiltrate credentials via email.',
|
|
1070
|
+
detail: 'Email exfiltration: attempts to send credentials to an email address.'
|
|
1071
|
+
},
|
|
1072
|
+
{
|
|
1073
|
+
regex: /(?:what|tell\s+me|show\s+me|give\s+me)\s+(?:is|are)?\s*(?:the\s+)?(?:API\s*keys?|passwords?|secrets?|tokens?|credentials?)\s+(?:stored\s+)?(?:in\s+)?(?:your\s+)?(?:environment|env)\s*(?:variables?)?/i,
|
|
1074
|
+
severity: 'high',
|
|
1075
|
+
category: 'data_exfiltration',
|
|
1076
|
+
description: 'Text tries to extract API keys or secrets from environment variables.',
|
|
1077
|
+
detail: 'Environment variable enumeration: attempts to extract secrets from env vars.'
|
|
1078
|
+
}
|
|
1079
|
+
];
|
|
1080
|
+
|
|
1081
|
+
// =========================================================================
|
|
1082
|
+
// HOMOGLYPH / UNICODE OBFUSCATION MAPS
|
|
1083
|
+
// =========================================================================
|
|
1084
|
+
|
|
1085
|
+
/**
 * Map of common Unicode homoglyphs (look-alike characters) to their Latin equivalents.
 *
 * Keys are the look-alike characters; values are the replacement text.
 * Zero-width and combining entries map to the empty string, so substituting
 * them removes the character. Superscript/subscript entries may map to
 * digits or '+' / '-' rather than letters.
 *
 * Most keys are a single UTF-16 code unit. The Mathematical Alphanumeric
 * entries are surrogate pairs (two code units), so they can only be found
 * when the text is looked up one code point at a time.
 */
const HOMOGLYPH_MAP = {
  // Cyrillic look-alikes
  '\u0410': 'A', '\u0430': 'a', '\u0412': 'B', '\u0435': 'e', '\u0415': 'E',
  '\u041A': 'K', '\u043A': 'k', '\u041C': 'M', '\u041D': 'H', '\u043E': 'o',
  '\u041E': 'O', '\u0440': 'p', '\u0420': 'P', '\u0441': 'c', '\u0421': 'C',
  '\u0422': 'T', '\u0443': 'y', '\u0445': 'x', '\u0425': 'X', '\u0456': 'i',
  '\u0458': 'j', '\u0455': 's', '\u0405': 'S',
  // Greek look-alikes
  '\u0391': 'A', '\u0392': 'B', '\u0395': 'E', '\u0396': 'Z', '\u0397': 'H',
  '\u0399': 'I', '\u039A': 'K', '\u039C': 'M', '\u039D': 'N', '\u039F': 'O',
  '\u03A1': 'P', '\u03A4': 'T', '\u03A5': 'Y', '\u03A7': 'X', '\u03BF': 'o',
  '\u03B1': 'a', '\u03B5': 'e', '\u03B9': 'i', '\u03BA': 'k', '\u03BD': 'v',
  // Armenian look-alikes
  '\u0555': 'O', '\u0585': 'o', '\u0578': 'n', '\u057C': 'n',
  '\u0570': 'h', '\u0561': 'a', '\u0575': 'u', '\u0572': 'q',
  // Latin Extended-A/B (accented → base)
  '\u0101': 'a', '\u0103': 'a', '\u0105': 'a', '\u0113': 'e', '\u0115': 'e',
  '\u0117': 'e', '\u0119': 'e', '\u012B': 'i', '\u012D': 'i', '\u012F': 'i',
  '\u014D': 'o', '\u014F': 'o', '\u0151': 'o', '\u016B': 'u', '\u016D': 'u',
  '\u016F': 'u', '\u0171': 'u', '\u0144': 'n', '\u0146': 'n', '\u0148': 'n',
  '\u015B': 's', '\u015D': 's', '\u015F': 's', '\u0161': 's',
  '\u010D': 'c', '\u010F': 'd', '\u0159': 'r', '\u0165': 't', '\u017E': 'z',
  // Fullwidth lowercase (U+FF41–U+FF5A)
  '\uFF41': 'a', '\uFF42': 'b', '\uFF43': 'c', '\uFF44': 'd', '\uFF45': 'e',
  '\uFF46': 'f', '\uFF47': 'g', '\uFF48': 'h', '\uFF49': 'i', '\uFF4A': 'j',
  '\uFF4B': 'k', '\uFF4C': 'l', '\uFF4D': 'm', '\uFF4E': 'n', '\uFF4F': 'o',
  '\uFF50': 'p', '\uFF51': 'q', '\uFF52': 'r', '\uFF53': 's', '\uFF54': 't',
  '\uFF55': 'u', '\uFF56': 'v', '\uFF57': 'w', '\uFF58': 'x', '\uFF59': 'y',
  '\uFF5A': 'z',
  // Fullwidth uppercase (U+FF21–U+FF3A)
  '\uFF21': 'A', '\uFF22': 'B', '\uFF23': 'C', '\uFF24': 'D', '\uFF25': 'E',
  '\uFF26': 'F', '\uFF27': 'G', '\uFF28': 'H', '\uFF29': 'I', '\uFF2A': 'J',
  '\uFF2B': 'K', '\uFF2C': 'L', '\uFF2D': 'M', '\uFF2E': 'N', '\uFF2F': 'O',
  '\uFF30': 'P', '\uFF31': 'Q', '\uFF32': 'R', '\uFF33': 'S', '\uFF34': 'T',
  '\uFF35': 'U', '\uFF36': 'V', '\uFF37': 'W', '\uFF38': 'X', '\uFF39': 'Y',
  '\uFF3A': 'Z',
  // Common symbol substitutions (dotless i, dotless j, script g)
  '\u0131': 'i', '\u0237': 'j', '\u0261': 'g',
  // Cherokee look-alikes
  '\u13A0': 'D', '\u13A1': 'R', '\u13A2': 'T', '\u13A9': 'Y', '\u13AA': 'A',
  '\u13AB': 'J', '\u13AC': 'S', '\u13B3': 'W', '\u13B7': 'M', '\u13BB': 'H',
  '\u13C0': 'G', '\u13C2': 'h', '\u13C3': 'Z', '\u13CF': 'b', '\u13D2': 'R',
  '\u13DA': 'V', '\u13DE': 'L', '\u13DF': 'C', '\u13E2': 'P', '\u13E6': 'K',
  // Georgian look-alikes
  '\u10D0': 'a', '\u10D5': 'b', '\u10D3': 'd', '\u10DA': 'l', '\u10DD': 'o',
  '\u10DE': 'p', '\u10E1': 's', '\u10E2': 't', '\u10E3': 'u', '\u10EF': 'j',
  // IPA / Phonetic extensions
  '\u0250': 'a', '\u0253': 'b', '\u0254': 'c', '\u0256': 'd', '\u025B': 'e',
  '\u025F': 'f', '\u0260': 'g', '\u0266': 'h', '\u0268': 'i',
  '\u026D': 'l', '\u0271': 'm', '\u0272': 'n', '\u0275': 'o', '\u0278': 'p',
  '\u027E': 'r', '\u0282': 's', '\u0288': 't', '\u028A': 'u', '\u028B': 'v',
  '\u0290': 'z',
  // Mathematical Alphanumeric Symbols: bold small a–z (U+1D41A–U+1D433).
  // These are outside the BMP, so each key is a surrogate pair (two UTF-16 code units).
  '\uD835\uDC1A': 'a', '\uD835\uDC1B': 'b', '\uD835\uDC1C': 'c', '\uD835\uDC1D': 'd',
  '\uD835\uDC1E': 'e', '\uD835\uDC1F': 'f', '\uD835\uDC20': 'g', '\uD835\uDC21': 'h',
  '\uD835\uDC22': 'i', '\uD835\uDC23': 'j', '\uD835\uDC24': 'k', '\uD835\uDC25': 'l',
  '\uD835\uDC26': 'm', '\uD835\uDC27': 'n', '\uD835\uDC28': 'o', '\uD835\uDC29': 'p',
  '\uD835\uDC2A': 'q', '\uD835\uDC2B': 'r', '\uD835\uDC2C': 's', '\uD835\uDC2D': 't',
  '\uD835\uDC2E': 'u', '\uD835\uDC2F': 'v', '\uD835\uDC30': 'w', '\uD835\uDC31': 'x',
  '\uD835\uDC32': 'y', '\uD835\uDC33': 'z',
  // Superscript / subscript
  '\u00B2': '2', '\u00B3': '3', '\u00B9': '1', '\u2070': '0', '\u2071': 'i',
  '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7', '\u2078': '8',
  '\u2079': '9', '\u207A': '+', '\u207B': '-', '\u207F': 'n',
  '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3', '\u2084': '4',
  '\u2090': 'a', '\u2091': 'e', '\u2092': 'o', '\u2093': 'x',
  // Enclosed/circled letters (uppercase, then lowercase)
  '\u24B6': 'A', '\u24B7': 'B', '\u24B8': 'C', '\u24B9': 'D', '\u24BA': 'E',
  '\u24BB': 'F', '\u24BC': 'G', '\u24BD': 'H', '\u24BE': 'I', '\u24BF': 'J',
  '\u24C0': 'K', '\u24C1': 'L', '\u24C2': 'M', '\u24C3': 'N', '\u24C4': 'O',
  '\u24C5': 'P', '\u24C6': 'Q', '\u24C7': 'R', '\u24C8': 'S', '\u24C9': 'T',
  '\u24CA': 'U', '\u24CB': 'V', '\u24CC': 'W', '\u24CD': 'X', '\u24CE': 'Y',
  '\u24CF': 'Z',
  '\u24D0': 'a', '\u24D1': 'b', '\u24D2': 'c', '\u24D3': 'd', '\u24D4': 'e',
  '\u24D5': 'f', '\u24D6': 'g', '\u24D7': 'h', '\u24D8': 'i', '\u24D9': 'j',
  '\u24DA': 'k', '\u24DB': 'l', '\u24DC': 'm', '\u24DD': 'n', '\u24DE': 'o',
  '\u24DF': 'p', '\u24E0': 'q', '\u24E1': 'r', '\u24E2': 's', '\u24E3': 't',
  '\u24E4': 'u', '\u24E5': 'v', '\u24E6': 'w', '\u24E7': 'x', '\u24E8': 'y',
  '\u24E9': 'z',
  // Small caps (Unicode phonetic) — mapped to uppercase Latin letters
  '\u1D00': 'A', '\u0299': 'B', '\u1D04': 'C', '\u1D05': 'D', '\u1D07': 'E',
  '\u0262': 'G', '\u029C': 'H', '\u026A': 'I', '\u1D0A': 'J', '\u1D0B': 'K',
  '\u029F': 'L', '\u1D0D': 'M', '\u0274': 'N', '\u1D0F': 'O', '\u1D18': 'P',
  '\u0280': 'R', '\u1D1B': 'T', '\u1D1C': 'U', '\u1D20': 'V', '\u1D21': 'W',
  // Zero-width / invisible characters (used to split keywords) — removed
  '\u200B': '', '\u200C': '', '\u200D': '', '\uFEFF': '', '\u00AD': '',
  // Combining characters (used to obfuscate keywords) — removed
  '\u0332': '', '\u0333': '', '\u0305': '', '\u0336': '', '\u0338': '',
  '\u0353': '', '\u0354': '', '\u0355': '', '\u0356': '', '\u0357': '',
  '\u0358': '', '\u0359': '', '\u035A': '', '\u035B': '', '\u035C': '',
  '\u0320': '', '\u0321': '', '\u0322': '', '\u0323': '', '\u0324': '',
  '\u0325': '', '\u0326': '', '\u0327': '', '\u0328': '', '\u0329': ''
};
|
|
1181
|
+
|
|
1182
|
+
/**
 * Normalizes text by replacing homoglyphs with their Latin equivalents.
 *
 * The text is walked one Unicode code point at a time (not one UTF-16 code
 * unit at a time) so that HOMOGLYPH_MAP keys written as surrogate pairs are
 * found as well as single-unit keys. Characters without a map entry,
 * including lone surrogates, are copied through unchanged.
 *
 * @param {string} text
 * @returns {string}
 */
const normalizeHomoglyphs = (text) => {
  let normalized = '';
  // for...of on a string yields whole code points, so a surrogate pair
  // arrives as one two-unit string that can be used directly as a map key.
  for (const ch of text) {
    const replacement = HOMOGLYPH_MAP[ch];
    normalized += replacement !== undefined ? replacement : ch;
  }
  return normalized;
};

/**
 * Checks if text contains Unicode homoglyphs hiding injection patterns.
 *
 * A result is returned only when some INJECTION_PATTERNS entry matches the
 * normalized text but does NOT match the raw text, i.e. the look-alike
 * characters were what kept the pattern from matching.
 *
 * @param {string} text
 * @returns {object|null} `{ original, normalized, matchedPattern }` (both
 *   strings truncated to 200 chars) for the first such pattern, else null.
 */
const checkHomoglyphObfuscation = (text) => {
  // Cheap pre-scan by code point: bail out before allocating a normalized
  // copy when no character of the text has a map entry.
  let hasHomoglyphs = false;
  for (const ch of text) {
    if (HOMOGLYPH_MAP[ch] !== undefined) {
      hasHomoglyphs = true;
      break;
    }
  }
  if (!hasHomoglyphs) return null;

  const normalized = normalizeHomoglyphs(text);
  if (normalized === text) return null;

  for (const pattern of INJECTION_PATTERNS) {
    if (pattern.regex.test(normalized) && !pattern.regex.test(text)) {
      return {
        original: text.substring(0, 200),
        normalized: normalized.substring(0, 200),
        matchedPattern: pattern
      };
    }
  }
  return null;
};
|
|
1225
|
+
|
|
1226
|
+
/**
 * Reports whether invisible characters are what keep an injection pattern
 * from matching the text.
 *
 * Zero-width characters, the soft hyphen and the listed combining marks are
 * removed; the result is true when at least one INJECTION_PATTERNS entry
 * matches the stripped text while not matching the text as given.
 *
 * @param {string} text - Text known to contain zero-width chars.
 * @returns {boolean}
 */
const hasZeroWidthObfuscation = (text) => {
  const visibleOnly = text.replace(/[\u200B\u200C\u200D\uFEFF\u00AD\u0320-\u035C\u0305\u0332\u0333\u0336\u0338]/g, '');

  // Nothing was removed, so there is nothing the invisible characters could hide.
  if (visibleOnly === text) {
    return false;
  }

  return INJECTION_PATTERNS.some(
    ({ regex }) => regex.test(visibleOnly) && !regex.test(text)
  );
};
|
|
1242
|
+
|
|
1243
|
+
// =========================================================================
|
|
1244
|
+
// ENCODING DETECTION
|
|
1245
|
+
// =========================================================================
|
|
1246
|
+
|
|
1247
|
+
/**
 * Decodes HTML entities in text.
 *
 * Handles decimal (`&#105;`) and hexadecimal (`&#x69;` / `&#X69;`) numeric
 * references plus the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and
 * `&apos;`. All entities are decoded in a single pass, so output produced by
 * one replacement is never decoded again (`&amp;lt;` becomes `&lt;`, not `<`).
 * Numeric references outside the Unicode range are left untouched.
 *
 * @param {string} text
 * @returns {string}
 */
const decodeHTMLEntities = (text) => {
  const namedEntities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

  // String.fromCodePoint (unlike fromCharCode) supports code points above
  // U+FFFF but throws on invalid values, so validate first and fall back to
  // the original entity text.
  const toChar = (codePoint, entity) => (
    Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
      ? String.fromCodePoint(codePoint)
      : entity
  );

  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (entity, decimal, hex, name) => {
      if (decimal !== undefined) return toChar(parseInt(decimal, 10), entity);
      if (hex !== undefined) return toChar(parseInt(hex, 16), entity);
      return namedEntities[name];
    }
  );
};
|
|
1262
|
+
|
|
1263
|
+
/**
 * Attempts to URL-decode text.
 *
 * `decodeURIComponent` rejects the whole string when it contains a single
 * malformed escape (for example a literal "100%"), which would let one stray
 * `%` hide every other percent-encoded character. When that happens, each
 * run of well-formed `%XX` escapes is decoded on its own and any run that is
 * still invalid (e.g. truncated UTF-8) is left as-is.
 *
 * @param {string} text
 * @returns {string|null} The decoded text, or null when decoding changes nothing.
 */
const tryURLDecode = (text) => {
  let decoded;
  try {
    decoded = decodeURIComponent(text);
  } catch (e) {
    // Malformed input: salvage the escape runs that are individually valid.
    decoded = String(text).replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
      try {
        return decodeURIComponent(run);
      } catch (runError) {
        return run;
      }
    });
  }
  return decoded !== text ? decoded : null;
};
|
|
1276
|
+
|
|
1277
|
+
/**
 * Base64 decode that works in Node.js and browsers.
 *
 * Both paths interpret the decoded bytes as UTF-8 so the result does not
 * depend on the runtime: Node uses `Buffer`; elsewhere `atob` output (one
 * char per byte) is re-decoded through `TextDecoder` when it is available.
 *
 * @param {string} str - Base64 text.
 * @returns {string} The decoded text.
 * @throws {Error} When neither `Buffer` nor `atob` exists. `atob` itself may
 *   also throw on input it considers invalid.
 */
const base64Decode = (str) => {
  if (typeof Buffer !== 'undefined') {
    return Buffer.from(str, 'base64').toString('utf-8');
  }
  if (typeof atob !== 'undefined') {
    const binary = atob(str);
    // Without TextDecoder the raw byte string is the best available result.
    if (typeof TextDecoder === 'undefined') {
      return binary;
    }
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    // ignoreBOM keeps a leading U+FEFF in the output, as Buffer#toString does.
    return new TextDecoder('utf-8', { ignoreBOM: true }).decode(bytes);
  }
  throw new Error('No base64 decoder available');
};
|
|
1291
|
+
|
|
1292
|
+
/**
 * Looks for injection patterns hidden under two or more encoding layers.
 *
 * Up to three layers are peeled off. For each layer the decoders are tried
 * in a fixed order — HTML entities, then URL encoding, then the first
 * Base64-looking run — and the first one that produces a result longer than
 * 10 characters wins. INJECTION_PATTERNS are only consulted once at least
 * two layers have been removed.
 *
 * @param {string} text
 * @returns {object|null} `{ decodingChain, decoded, matchedPattern }` for the
 *   first matching pattern, or null (also for empty text or text under 20 chars).
 */
const checkNestedEncoding = (text) => {
  if (!text || text.length < 20) return null;

  const MAX_DECODE_PASSES = 3;

  /** Removes one encoding layer; returns null when no decoder applies. */
  const peelLayer = (input) => {
    const viaHtml = decodeHTMLEntities(input);
    if (viaHtml !== input && viaHtml.length > 10) {
      return { output: viaHtml, label: 'HTML entities' };
    }

    const viaUrl = tryURLDecode(input);
    if (viaUrl && viaUrl.length > 10) {
      return { output: viaUrl, label: 'URL encoding' };
    }

    const candidate = input.match(/[A-Za-z0-9+/]{20,}={0,2}/);
    if (candidate) {
      try {
        const raw = base64Decode(candidate[0]);
        // Count code units in the printable ASCII range (32..126).
        let printableCount = 0;
        for (let i = 0; i < raw.length; i += 1) {
          const code = raw.charCodeAt(i);
          if (code >= 32 && code <= 126) printableCount += 1;
        }
        if (printableCount / raw.length > 0.8 && raw.length > 10) {
          return { output: raw, label: 'Base64' };
        }
      } catch (e) {
        // Not valid base64
      }
    }

    return null;
  };

  const layers = [];
  let payload = text;

  for (let pass = 0; pass < MAX_DECODE_PASSES; pass += 1) {
    const layer = peelLayer(payload);
    if (!layer) break;

    layers.push(layer.label);
    payload = layer.output;

    // A single layer is not "nested"; keep peeling before checking patterns.
    if (layers.length < 2) continue;

    const hit = INJECTION_PATTERNS.find((pattern) => pattern.regex.test(payload));
    if (hit) {
      return {
        decodingChain: layers.join(' → '),
        decoded: payload.substring(0, 200),
        matchedPattern: hit
      };
    }
  }

  return null;
};
|
|
1359
|
+
|
|
1360
|
+
/**
 * Checks for base64-encoded injection content.
 *
 * Every Base64-looking run (20+ alphabet characters, optional padding) is
 * decoded. Decodings of at most 10 characters, or with 80% or fewer
 * printable-ASCII characters, are ignored. All remaining candidates are
 * checked against INJECTION_PATTERNS before falling back to the "long opaque
 * blob" result, so a harmless blob placed ahead of a malicious one cannot
 * mask the pattern match.
 *
 * @param {string} text
 * @returns {object|null} `{ decoded, matchedPattern }` where `decoded` is
 *   truncated to 200 chars. `matchedPattern` is the first matching pattern,
 *   or null when the result is only a decodable blob longer than 50 chars.
 *   Returns null when nothing qualifies.
 */
const checkBase64Content = (text) => {
  const candidates = text.match(/[A-Za-z0-9+/]{20,}={0,2}/g);
  if (!candidates) return null;

  // First long printable blob seen; reported only if no candidate matches a pattern.
  let opaqueBlob = null;

  for (const candidate of candidates) {
    let decoded;
    try {
      decoded = base64Decode(candidate);
    } catch (e) {
      // Not valid base64
      continue;
    }
    if (decoded.length <= 10) continue;

    let printableCount = 0;
    for (let i = 0; i < decoded.length; i++) {
      const code = decoded.charCodeAt(i);
      if (code >= 32 && code <= 126) printableCount++;
    }
    if (printableCount / decoded.length <= 0.8) continue;

    const matchedPattern = INJECTION_PATTERNS.find((pattern) => pattern.regex.test(decoded));
    if (matchedPattern) {
      return { decoded: decoded.substring(0, 200), matchedPattern };
    }
    if (opaqueBlob === null && decoded.length > 50) {
      opaqueBlob = { decoded: decoded.substring(0, 200), matchedPattern: null };
    }
  }
  return opaqueBlob;
};
|
|
1394
|
+
|
|
1395
|
+
// =========================================================================
|
|
1396
|
+
// CONFIDENCE SCORING
|
|
1397
|
+
// =========================================================================
|
|
1398
|
+
|
|
1399
|
+
/**
 * Calculates a confidence score (0-100) for a detected threat.
 *
 * Starts at 50 and adds or subtracts points for the source of the text, the
 * number of patterns that matched, the threat's severity and category, and
 * obfuscation keywords found in `threat.detail`. The detail keywords are
 * matched case-insensitively because detail messages start with a capital
 * letter (e.g. "Homoglyph obfuscation detected ...").
 *
 * @param {object} threat - Reads `severity`, `category` and `detail`.
 * @param {number} patternMatchCount - How many patterns matched the text.
 * @param {string} source - Where the text came from; a missing or non-string
 *   value contributes no source bonus.
 * @returns {number} Score clamped to the range 0..100.
 */
const calculateConfidence = (threat, patternMatchCount, source) => {
  // Tolerate missing/non-string fields instead of throwing mid-scan.
  const origin = typeof source === 'string' ? source : '';
  const detail = typeof threat.detail === 'string' ? threat.detail.toLowerCase() : '';

  let confidence = 50;

  if (origin.includes('tool_output')) confidence += 20;
  if (origin.includes('api_response')) confidence += 15;
  if (origin.includes('user_input')) confidence += 10;
  if (origin.includes('document')) confidence += 10;

  if (patternMatchCount >= 3) confidence += 20;
  else if (patternMatchCount >= 2) confidence += 10;

  if (threat.severity === 'critical') confidence += 15;
  else if (threat.severity === 'high') confidence += 5;
  else if (threat.severity === 'low') confidence -= 10;

  if (threat.category === 'data_exfiltration') confidence += 10;
  if (threat.category === 'tool_abuse') confidence += 15;

  // `detail` is lower-cased above, so these keywords are lower-case too.
  if (detail.includes('base64')) confidence += 20;
  if (detail.includes('homoglyph') || detail.includes('zero-width')) confidence += 25;
  if (detail.includes('nested encoding')) confidence += 25;

  return Math.max(0, Math.min(100, confidence));
};
|
|
1430
|
+
|
|
1431
|
+
/**
 * Maps a confidence score to the human-readable label of its tier.
 *
 * Tiers are checked from the highest threshold down and the first one whose
 * threshold the score reaches wins; scores below every threshold get the
 * lowest label.
 *
 * @param {number} score
 * @returns {string}
 */
const confidenceLabel = (score) => {
  const tiers = [
    [85, 'Almost certainly a threat'],
    [70, 'Very likely a threat'],
    [50, 'Likely a threat'],
    [30, 'Might be suspicious']
  ];

  for (const [threshold, label] of tiers) {
    if (score >= threshold) {
      return label;
    }
  }
  return 'Unlikely to be a threat';
};
|
|
1443
|
+
|
|
1444
|
+
// =========================================================================
|
|
1445
|
+
// PRE-CHECK REGEXES
|
|
1446
|
+
// =========================================================================
|
|
1447
|
+
|
|
1448
|
+
// Cheap gate regexes: each one decides whether a more expensive check is
// worth running. None of them uses the `g` flag, so `.test()` is stateless.

/** Any character outside the 7-bit ASCII range. */
const HAS_NON_ASCII = /[^\x00-\x7F]/;

/**
 * Any invisible or combining character that the zero-width / combining
 * obfuscation checks strip. The class mirrors the one used for stripping
 * (including the U+0320–U+035C combining range) so text obfuscated only with
 * those combining marks still reaches the checks.
 */
const HAS_ZERO_WIDTH = /[\u200B\u200C\u200D\uFEFF\u00AD\u0320-\u035C\u0305\u0332\u0333\u0336\u0338]/;

/** A run of 20+ Base64 alphabet characters, optionally followed by padding. */
const HAS_BASE64_CANDIDATE = /[A-Za-z0-9+/]{20,}={0,2}/;

/** A numeric-style HTML entity (`&#...;`) or a percent-encoded byte (`%XX`). */
const HAS_ENCODED_ENTITIES = /&#\w+;|%[0-9a-fA-F]{2}/;
|
|
1452
|
+
|
|
1453
|
+
// =========================================================================
|
|
1454
|
+
// CORE SCAN FUNCTION
|
|
1455
|
+
// =========================================================================
|
|
1456
|
+
|
|
1457
|
+
/**
|
|
1458
|
+
* Scans text content against all injection patterns, including
|
|
1459
|
+
* homoglyph obfuscation, nested encoding, and base64 checks.
|
|
1460
|
+
* @param {string} text - The text to scan.
|
|
1461
|
+
* @param {string} source - Where the text came from.
|
|
1462
|
+
* @returns {Array} Array of threat objects found.
|
|
1463
|
+
*/
|
|
1464
|
+
const scanTextForPatterns = (text, source, timeBudgetMs = DEFAULT_SCAN_TIME_BUDGET_MS, scanStartTime = now()) => {
|
|
1465
|
+
const threats = [];
|
|
1466
|
+
if (!text || text.length < 10) return threats;
|
|
1467
|
+
|
|
1468
|
+
/** Returns true if the time budget has been exceeded. */
|
|
1469
|
+
const isOverBudget = () => (now() - scanStartTime) > timeBudgetMs;
|
|
1470
|
+
|
|
1471
|
+
// Pre-normalize: strip soft hyphens and zero-width joiners that split keywords
|
|
1472
|
+
const preNormalized = text.replace(/[\u00AD\u200B\u200C\u200D\uFEFF\u034F\u2060\u2061\u2062\u2063\u2064]/g, '');
|
|
1473
|
+
const usePreNormalized = preNormalized !== text && preNormalized.length >= 10;
|
|
1474
|
+
|
|
1475
|
+
let patternMatchCount = 0;
|
|
1476
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1477
|
+
if (isOverBudget()) break;
|
|
1478
|
+
if (pattern.regex.test(text) || (usePreNormalized && pattern.regex.test(preNormalized))) {
|
|
1479
|
+
patternMatchCount++;
|
|
1480
|
+
const threat = {
|
|
1481
|
+
severity: pattern.severity,
|
|
1482
|
+
category: pattern.category,
|
|
1483
|
+
description: pattern.description,
|
|
1484
|
+
detail: `${pattern.detail} Found in ${source}.`
|
|
1485
|
+
};
|
|
1486
|
+
threats.push(threat);
|
|
1487
|
+
}
|
|
1488
|
+
}
|
|
1489
|
+
|
|
1490
|
+
for (const threat of threats) {
|
|
1491
|
+
threat.confidence = calculateConfidence(threat, patternMatchCount, source);
|
|
1492
|
+
threat.confidenceLabel = confidenceLabel(threat.confidence);
|
|
1493
|
+
}
|
|
1494
|
+
|
|
1495
|
+
const hasNonAscii = HAS_NON_ASCII.test(text);
|
|
1496
|
+
|
|
1497
|
+
if (hasNonAscii) {
|
|
1498
|
+
const homoglyphResult = checkHomoglyphObfuscation(text);
|
|
1499
|
+
if (homoglyphResult) {
|
|
1500
|
+
const threat = {
|
|
1501
|
+
severity: 'critical',
|
|
1502
|
+
category: 'prompt_injection',
|
|
1503
|
+
description: 'Text uses look-alike characters to hide attack instructions from detection.',
|
|
1504
|
+
detail: `Homoglyph obfuscation detected in ${source}. Characters were replaced with look-alikes to bypass security. Decoded text matches: ${homoglyphResult.matchedPattern.detail}`
|
|
1505
|
+
};
|
|
1506
|
+
threat.confidence = calculateConfidence(threat, patternMatchCount, source);
|
|
1507
|
+
threat.confidenceLabel = confidenceLabel(threat.confidence);
|
|
1508
|
+
threats.push(threat);
|
|
1509
|
+
}
|
|
1510
|
+
|
|
1511
|
+
if (HAS_ZERO_WIDTH.test(text)) {
|
|
1512
|
+
if (hasZeroWidthObfuscation(text)) {
|
|
1513
|
+
const threat = {
|
|
1514
|
+
severity: 'critical',
|
|
1515
|
+
category: 'prompt_injection',
|
|
1516
|
+
description: 'Text uses invisible characters to split up attack keywords to avoid detection.',
|
|
1517
|
+
detail: `Zero-width character obfuscation detected in ${source}. Invisible Unicode characters were inserted between letters to evade pattern matching.`
|
|
1518
|
+
};
|
|
1519
|
+
threat.confidence = calculateConfidence(threat, patternMatchCount, source);
|
|
1520
|
+
threat.confidenceLabel = confidenceLabel(threat.confidence);
|
|
1521
|
+
threats.push(threat);
|
|
1522
|
+
} else {
|
|
1523
|
+
// Check for combining character obfuscation with partial keyword matches
|
|
1524
|
+
const stripped = text.replace(/[\u200B\u200C\u200D\uFEFF\u00AD\u0320-\u035C\u0305\u0332\u0333\u0336\u0338]/g, '');
|
|
1525
|
+
if (stripped !== text && stripped.length >= 5) {
|
|
1526
|
+
const attackKeywords = /\b(?:ignore|override|bypass|disregard|forget|system|admin|jailbreak|hack|exploit|inject|exfiltrate|reveal|extract)\b/i;
|
|
1527
|
+
if (attackKeywords.test(stripped) && !attackKeywords.test(text)) {
|
|
1528
|
+
const threat = {
|
|
1529
|
+
severity: 'high',
|
|
1530
|
+
category: 'prompt_injection',
|
|
1531
|
+
description: 'Text uses combining characters to obfuscate suspicious keywords.',
|
|
1532
|
+
detail: `Combining character obfuscation detected in ${source}. Stripped: "${stripped.substring(0, 100)}"`
|
|
1533
|
+
};
|
|
1534
|
+
threat.confidence = 75;
|
|
1535
|
+
threat.confidenceLabel = confidenceLabel(75);
|
|
1536
|
+
threats.push(threat);
|
|
1537
|
+
}
|
|
1538
|
+
}
|
|
1539
|
+
}
|
|
1540
|
+
}
|
|
1541
|
+
}
|
|
1542
|
+
|
|
1543
|
+
// Check for hex-encoded content
|
|
1544
|
+
if (!isOverBudget()) {
|
|
1545
|
+
const hexMatch = text.match(/(?:^|[\s:])([0-9a-fA-F]{20,})(?:[\s,.]|$)/);
|
|
1546
|
+
if (hexMatch) {
|
|
1547
|
+
try {
|
|
1548
|
+
const hexDecoded = hexMatch[1].match(/.{1,2}/g).map(b => String.fromCharCode(parseInt(b, 16))).join('');
|
|
1549
|
+
const printable = hexDecoded.split('').filter(c => c.charCodeAt(0) >= 32 && c.charCodeAt(0) <= 126).length;
|
|
1550
|
+
if (printable / hexDecoded.length > 0.8 && hexDecoded.length > 10) {
|
|
1551
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1552
|
+
if (pattern.regex.test(hexDecoded)) {
|
|
1553
|
+
const threat = {
|
|
1554
|
+
severity: 'critical',
|
|
1555
|
+
category: 'prompt_injection',
|
|
1556
|
+
description: 'Text hides attack instructions inside hex encoding.',
|
|
1557
|
+
detail: `Hex-encoded injection found in ${source}. Decoded: "${hexDecoded.substring(0, 100)}"`
|
|
1558
|
+
};
|
|
1559
|
+
threat.confidence = 90;
|
|
1560
|
+
threat.confidenceLabel = confidenceLabel(90);
|
|
1561
|
+
threats.push(threat);
|
|
1562
|
+
break;
|
|
1563
|
+
}
|
|
1564
|
+
}
|
|
1565
|
+
}
|
|
1566
|
+
} catch (e) {
|
|
1567
|
+
// Invalid hex sequence — not a threat, continue scanning
|
|
1568
|
+
}
|
|
1569
|
+
}
|
|
1570
|
+
}
|
|
1571
|
+
|
|
1572
|
+
// Check for \x hex-escape encoded content (e.g., \x69\x67\x6e\x6f\x72\x65)
|
|
1573
|
+
if (!isOverBudget() && /\\x[0-9a-fA-F]{2}/.test(text)) {
|
|
1574
|
+
const hexDecoded = text.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) => String.fromCharCode(parseInt(hex, 16)));
|
|
1575
|
+
if (hexDecoded !== text && hexDecoded.length >= 10) {
|
|
1576
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1577
|
+
if (pattern.regex.test(hexDecoded) && !pattern.regex.test(text)) {
|
|
1578
|
+
const threat = {
|
|
1579
|
+
severity: 'critical',
|
|
1580
|
+
category: 'prompt_injection',
|
|
1581
|
+
description: 'Text hides attack instructions inside hex escape sequences.',
|
|
1582
|
+
detail: `Hex-escape encoded injection found in ${source}. Decoded: "${hexDecoded.substring(0, 100)}"`
|
|
1583
|
+
};
|
|
1584
|
+
threat.confidence = 90;
|
|
1585
|
+
threat.confidenceLabel = confidenceLabel(90);
|
|
1586
|
+
threats.push(threat);
|
|
1587
|
+
break;
|
|
1588
|
+
}
|
|
1589
|
+
}
|
|
1590
|
+
}
|
|
1591
|
+
}
|
|
1592
|
+
|
|
1593
|
+
// Check for ROT13-encoded content
|
|
1594
|
+
const rot13Decode = (s) => s.replace(/[a-zA-Z]/g, c => {
|
|
1595
|
+
const base = c <= 'Z' ? 65 : 97;
|
|
1596
|
+
return String.fromCharCode(((c.charCodeAt(0) - base + 13) % 26) + base);
|
|
1597
|
+
});
|
|
1598
|
+
// Strip zero-width characters before ROT13 check so mixed content is handled
|
|
1599
|
+
const textForRot13 = text.replace(/[\u200B\u200C\u200D\uFEFF\u00AD]/g, '');
|
|
1600
|
+
if (!isOverBudget() && /^[a-zA-Z\s]{10,}$/.test(textForRot13.trim())) {
|
|
1601
|
+
const decoded = rot13Decode(textForRot13);
|
|
1602
|
+
if (decoded !== textForRot13) {
|
|
1603
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1604
|
+
if (pattern.regex.test(decoded) && !pattern.regex.test(textForRot13)) {
|
|
1605
|
+
const threat = {
|
|
1606
|
+
severity: 'high',
|
|
1607
|
+
category: 'prompt_injection',
|
|
1608
|
+
description: 'Text appears to be ROT13-encoded to hide attack instructions.',
|
|
1609
|
+
detail: `ROT13-encoded injection found in ${source}. Decoded: "${decoded.substring(0, 100)}"`
|
|
1610
|
+
};
|
|
1611
|
+
threat.confidence = 80;
|
|
1612
|
+
threat.confidenceLabel = confidenceLabel(80);
|
|
1613
|
+
threats.push(threat);
|
|
1614
|
+
break;
|
|
1615
|
+
}
|
|
1616
|
+
}
|
|
1617
|
+
}
|
|
1618
|
+
}
|
|
1619
|
+
|
|
1620
|
+
// Check for leetspeak obfuscation
|
|
1621
|
+
const leetspeakNormalize = (s) => s.replace(/4/g, 'a').replace(/3/g, 'e').replace(/1/g, 'i').replace(/0/g, 'o').replace(/5/g, 's').replace(/7/g, 't').replace(/@/g, 'a');
|
|
1622
|
+
if (!isOverBudget() && /[0-9]/.test(text) && /[a-zA-Z]/.test(text)) {
|
|
1623
|
+
const normalized = leetspeakNormalize(text);
|
|
1624
|
+
if (normalized !== text) {
|
|
1625
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1626
|
+
if (pattern.regex.test(normalized) && !pattern.regex.test(text)) {
|
|
1627
|
+
const threat = {
|
|
1628
|
+
severity: 'high',
|
|
1629
|
+
category: 'prompt_injection',
|
|
1630
|
+
description: 'Text uses leetspeak (letter/number substitution) to hide attack instructions.',
|
|
1631
|
+
detail: `Leetspeak obfuscation detected in ${source}. Normalized: "${normalized.substring(0, 100)}"`
|
|
1632
|
+
};
|
|
1633
|
+
threat.confidence = 75;
|
|
1634
|
+
threat.confidenceLabel = confidenceLabel(75);
|
|
1635
|
+
threats.push(threat);
|
|
1636
|
+
break;
|
|
1637
|
+
}
|
|
1638
|
+
}
|
|
1639
|
+
}
|
|
1640
|
+
}
|
|
1641
|
+
|
|
1642
|
+
// Check for whitespace-padded text (letters separated by spaces)
|
|
1643
|
+
// Detect patterns like "i g n o r e" or "i g n o r e a l l"
|
|
1644
|
+
if (!isOverBudget() && /^[a-zA-Z]\s+[a-zA-Z]\s+[a-zA-Z]/.test(text.trim())) {
|
|
1645
|
+
// Reconstruct words: split on multi-space (word boundary), then collapse single-space chars
|
|
1646
|
+
const words = text.trim().split(/\s{2,}/).map(w => w.replace(/\s/g, ''));
|
|
1647
|
+
const reconstructed = words.join(' ');
|
|
1648
|
+
if (reconstructed.length > 10) {
|
|
1649
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1650
|
+
if (pattern.regex.test(reconstructed)) {
|
|
1651
|
+
const threat = {
|
|
1652
|
+
severity: 'high',
|
|
1653
|
+
category: 'prompt_injection',
|
|
1654
|
+
description: 'Text uses whitespace padding between letters to evade detection.',
|
|
1655
|
+
detail: `Whitespace obfuscation detected in ${source}. Reconstructed: "${reconstructed.substring(0, 100)}"`
|
|
1656
|
+
};
|
|
1657
|
+
threat.confidence = 80;
|
|
1658
|
+
threat.confidenceLabel = confidenceLabel(80);
|
|
1659
|
+
threats.push(threat);
|
|
1660
|
+
break;
|
|
1661
|
+
}
|
|
1662
|
+
}
|
|
1663
|
+
}
|
|
1664
|
+
}
|
|
1665
|
+
|
|
1666
|
+
// Check for reversed text
|
|
1667
|
+
if (!isOverBudget() && text.length >= 20 && text.length <= 500) {
|
|
1668
|
+
const reversed = text.split('').reverse().join('');
|
|
1669
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1670
|
+
if (pattern.regex.test(reversed) && !pattern.regex.test(text)) {
|
|
1671
|
+
const threat = {
|
|
1672
|
+
severity: 'high',
|
|
1673
|
+
category: 'prompt_injection',
|
|
1674
|
+
description: 'Text is written backwards to hide attack instructions from detection.',
|
|
1675
|
+
detail: `Reversed text injection detected in ${source}. Reversed: "${reversed.substring(0, 100)}"`
|
|
1676
|
+
};
|
|
1677
|
+
threat.confidence = 85;
|
|
1678
|
+
threat.confidenceLabel = confidenceLabel(85);
|
|
1679
|
+
threats.push(threat);
|
|
1680
|
+
break;
|
|
1681
|
+
}
|
|
1682
|
+
}
|
|
1683
|
+
}
|
|
1684
|
+
|
|
1685
|
+
// Check for attack keyword clusters (catches word reordering and synonym substitution)
|
|
1686
|
+
if (!isOverBudget()) {
|
|
1687
|
+
const lowerText = (usePreNormalized ? preNormalized : text).toLowerCase();
|
|
1688
|
+
|
|
1689
|
+
// Synonym groups: any word in the group counts as a match for the group key
|
|
1690
|
+
const synonymGroups = {
|
|
1691
|
+
'ignore': ['ignore', 'disregard', 'forget', 'skip', 'bypass', 'dismiss', 'neglect', 'overlook', 'drop', 'ditch', 'clear', 'abandon'],
|
|
1692
|
+
'instructions': ['instructions', 'rules', 'guidelines', 'directives', 'commands', 'orders', 'prompts', 'constraints', 'training', 'programming', 'context'],
|
|
1693
|
+
'previous': ['previous', 'prior', 'earlier', 'above', 'preceding', 'former', 'past', 'initial', 'original'],
|
|
1694
|
+
'override': ['override', 'bypass', 'circumvent', 'replace', 'supplant', 'cancel', 'disable'],
|
|
1695
|
+
'system': ['system', 'core', 'internal', 'root', 'master', 'primary', 'main'],
|
|
1696
|
+
'reveal': ['reveal', 'show', 'display', 'expose', 'output', 'print', 'disclose', 'extract'],
|
|
1697
|
+
'send': ['send', 'transmit', 'forward', 'relay', 'dispatch', 'transfer', 'post', 'exfiltrate'],
|
|
1698
|
+
'safety': ['safety', 'security', 'protection', 'defense', 'safeguard', 'guardrails', 'filters'],
|
|
1699
|
+
'restrictions': ['restrictions', 'limits', 'boundaries', 'constraints', 'guardrails', 'filters'],
|
|
1700
|
+
'unrestricted': ['unrestricted', 'unfiltered', 'uncensored', 'unlimited', 'unfettered'],
|
|
1701
|
+
};
|
|
1702
|
+
|
|
1703
|
+
/** Check if any synonym of a group key appears in text */
|
|
1704
|
+
const hasSynonym = (groupKey) => {
|
|
1705
|
+
const syns = synonymGroups[groupKey] || [groupKey];
|
|
1706
|
+
return syns.some(s => lowerText.includes(s));
|
|
1707
|
+
};
|
|
1708
|
+
|
|
1709
|
+
const attackKeywordSets = [
|
|
1710
|
+
{ groups: ['ignore', 'instructions'], severity: 'high', desc: 'instruction override' },
|
|
1711
|
+
{ groups: ['ignore', 'previous'], andExtra: ['instructions'], severity: 'high', desc: 'instruction override' },
|
|
1712
|
+
{ groups: ['override', 'safety'], severity: 'critical', desc: 'safety override' },
|
|
1713
|
+
{ groups: ['override', 'system'], severity: 'critical', desc: 'system override' },
|
|
1714
|
+
{ groups: ['reveal', 'system'], andLiteral: ['prompt'], severity: 'high', desc: 'prompt extraction' },
|
|
1715
|
+
{ groups: ['send'], andLiteral: ['http'], severity: 'critical', desc: 'data exfiltration' },
|
|
1716
|
+
{ groups: ['unrestricted'], andLiteral: ['ai'], severity: 'high', desc: 'restriction removal' },
|
|
1717
|
+
{ groups: ['restrictions'], andLiteral: ['no'], severity: 'high', desc: 'restriction removal' },
|
|
1718
|
+
{ literal: ['jailbreak'], severity: 'critical', desc: 'jailbreak attempt' },
|
|
1719
|
+
{ literal: ['dan', 'mode'], severity: 'critical', desc: 'DAN jailbreak' },
|
|
1720
|
+
{ literal: ['developer', 'mode', 'enabled'], severity: 'high', desc: 'developer mode jailbreak' },
|
|
1721
|
+
{ literal: ['do', 'anything', 'now'], severity: 'critical', desc: 'DAN jailbreak' },
|
|
1722
|
+
{ groups: ['ignore', 'safety'], severity: 'critical', desc: 'safety override' },
|
|
1723
|
+
{ groups: ['ignore', 'restrictions'], severity: 'high', desc: 'restriction removal' },
|
|
1724
|
+
];
|
|
1725
|
+
|
|
1726
|
+
for (const set of attackKeywordSets) {
|
|
1727
|
+
let matched = true;
|
|
1728
|
+
|
|
1729
|
+
// Check synonym groups
|
|
1730
|
+
if (set.groups) {
|
|
1731
|
+
for (const g of set.groups) {
|
|
1732
|
+
if (!hasSynonym(g)) { matched = false; break; }
|
|
1733
|
+
}
|
|
1734
|
+
}
|
|
1735
|
+
|
|
1736
|
+
// Check additional synonym groups required alongside the main groups
|
|
1737
|
+
if (matched && set.andExtra) {
|
|
1738
|
+
for (const g of set.andExtra) {
|
|
1739
|
+
if (!hasSynonym(g)) { matched = false; break; }
|
|
1740
|
+
}
|
|
1741
|
+
}
|
|
1742
|
+
|
|
1743
|
+
// Check literal keywords (no synonym expansion)
|
|
1744
|
+
if (matched && set.literal) {
|
|
1745
|
+
for (const kw of set.literal) {
|
|
1746
|
+
if (!lowerText.includes(kw)) { matched = false; break; }
|
|
1747
|
+
}
|
|
1748
|
+
}
|
|
1749
|
+
|
|
1750
|
+
// Check additional literal keywords
|
|
1751
|
+
if (matched && set.andLiteral) {
|
|
1752
|
+
for (const kw of set.andLiteral) {
|
|
1753
|
+
if (!lowerText.includes(kw)) { matched = false; break; }
|
|
1754
|
+
}
|
|
1755
|
+
}
|
|
1756
|
+
|
|
1757
|
+
if (matched) {
|
|
1758
|
+
const alreadyDetected = threats.some(t => t.category === 'instruction_override' || t.category === 'role_hijack' || t.category === 'data_exfiltration');
|
|
1759
|
+
if (!alreadyDetected) {
|
|
1760
|
+
const threat = {
|
|
1761
|
+
severity: set.severity,
|
|
1762
|
+
category: set.desc.includes('exfil') ? 'data_exfiltration' : set.desc.includes('jailbreak') ? 'role_hijack' : 'instruction_override',
|
|
1763
|
+
description: `Text contains a cluster of attack-related keywords suggesting ${set.desc}.`,
|
|
1764
|
+
detail: `Keyword cluster detection in ${source}: synonym-aware matching detected possible ${set.desc} with reordered or obfuscated phrasing.`
|
|
1765
|
+
};
|
|
1766
|
+
threat.confidence = 65;
|
|
1767
|
+
threat.confidenceLabel = confidenceLabel(65);
|
|
1768
|
+
threats.push(threat);
|
|
1769
|
+
break;
|
|
1770
|
+
}
|
|
1771
|
+
}
|
|
1772
|
+
}
|
|
1773
|
+
}
|
|
1774
|
+
|
|
1775
|
+
const hasBase64 = !isOverBudget() && HAS_BASE64_CANDIDATE.test(text);
|
|
1776
|
+
const hasEntities = !isOverBudget() && HAS_ENCODED_ENTITIES.test(text);
|
|
1777
|
+
let foundNested = false;
|
|
1778
|
+
let foundSingleLayer = false;
|
|
1779
|
+
|
|
1780
|
+
// Check single-layer HTML entity decoding
|
|
1781
|
+
if (hasEntities && !isOverBudget()) {
|
|
1782
|
+
const htmlDecoded = decodeHTMLEntities(text);
|
|
1783
|
+
if (htmlDecoded !== text && htmlDecoded.length >= 3) {
|
|
1784
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1785
|
+
if (pattern.regex.test(htmlDecoded) && !pattern.regex.test(text)) {
|
|
1786
|
+
const threat = {
|
|
1787
|
+
severity: 'critical',
|
|
1788
|
+
category: 'prompt_injection',
|
|
1789
|
+
description: 'Text hides attack instructions inside HTML entity encoding.',
|
|
1790
|
+
detail: `HTML entity encoded injection found in ${source}. Decoded: "${htmlDecoded.substring(0, 100)}"`
|
|
1791
|
+
};
|
|
1792
|
+
threat.confidence = 85;
|
|
1793
|
+
threat.confidenceLabel = confidenceLabel(85);
|
|
1794
|
+
threats.push(threat);
|
|
1795
|
+
foundSingleLayer = true;
|
|
1796
|
+
break;
|
|
1797
|
+
}
|
|
1798
|
+
}
|
|
1799
|
+
// If heavy entity usage, flag as suspicious even without pattern match
|
|
1800
|
+
if (!foundSingleLayer) {
|
|
1801
|
+
const entityCount = (text.match(/&#\d+;|&#x[0-9a-fA-F]+;/g) || []).length;
|
|
1802
|
+
if (entityCount >= 4 && entityCount / text.split(/\s+/).length > 0.5) {
|
|
1803
|
+
const threat = {
|
|
1804
|
+
severity: 'high',
|
|
1805
|
+
category: 'prompt_injection',
|
|
1806
|
+
description: 'Text is heavily encoded with HTML entities, possibly to hide instructions.',
|
|
1807
|
+
detail: `Suspicious HTML entity encoding found in ${source}. Decoded: "${htmlDecoded.substring(0, 100)}"`
|
|
1808
|
+
};
|
|
1809
|
+
threat.confidence = 70;
|
|
1810
|
+
threat.confidenceLabel = confidenceLabel(70);
|
|
1811
|
+
threats.push(threat);
|
|
1812
|
+
foundSingleLayer = true;
|
|
1813
|
+
}
|
|
1814
|
+
}
|
|
1815
|
+
}
|
|
1816
|
+
}
|
|
1817
|
+
|
|
1818
|
+
// Check single-layer URL decoding
|
|
1819
|
+
if (!foundSingleLayer && hasEntities && !isOverBudget()) {
|
|
1820
|
+
const urlDecoded = tryURLDecode(text);
|
|
1821
|
+
if (urlDecoded && urlDecoded.length >= 5) {
|
|
1822
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1823
|
+
if (pattern.regex.test(urlDecoded) && !pattern.regex.test(text)) {
|
|
1824
|
+
const threat = {
|
|
1825
|
+
severity: 'critical',
|
|
1826
|
+
category: 'prompt_injection',
|
|
1827
|
+
description: 'Text hides attack instructions inside URL encoding.',
|
|
1828
|
+
detail: `URL-encoded injection found in ${source}. Decoded: "${urlDecoded.substring(0, 100)}"`
|
|
1829
|
+
};
|
|
1830
|
+
threat.confidence = 85;
|
|
1831
|
+
threat.confidenceLabel = confidenceLabel(85);
|
|
1832
|
+
threats.push(threat);
|
|
1833
|
+
foundSingleLayer = true;
|
|
1834
|
+
break;
|
|
1835
|
+
}
|
|
1836
|
+
}
|
|
1837
|
+
}
|
|
1838
|
+
}
|
|
1839
|
+
|
|
1840
|
+
// Also check percent-encoded text (may not have &# but has %XX)
|
|
1841
|
+
if (!foundSingleLayer && !isOverBudget() && /%[0-9a-fA-F]{2}/.test(text)) {
|
|
1842
|
+
const urlDecoded = tryURLDecode(text);
|
|
1843
|
+
if (urlDecoded && urlDecoded !== text && urlDecoded.length >= 5) {
|
|
1844
|
+
for (const pattern of INJECTION_PATTERNS) {
|
|
1845
|
+
if (pattern.regex.test(urlDecoded) && !pattern.regex.test(text)) {
|
|
1846
|
+
const threat = {
|
|
1847
|
+
severity: 'critical',
|
|
1848
|
+
category: 'prompt_injection',
|
|
1849
|
+
description: 'Text hides attack instructions inside URL encoding.',
|
|
1850
|
+
detail: `URL-encoded injection found in ${source}. Decoded: "${urlDecoded.substring(0, 100)}"`
|
|
1851
|
+
};
|
|
1852
|
+
threat.confidence = 85;
|
|
1853
|
+
threat.confidenceLabel = confidenceLabel(85);
|
|
1854
|
+
threats.push(threat);
|
|
1855
|
+
foundSingleLayer = true;
|
|
1856
|
+
break;
|
|
1857
|
+
}
|
|
1858
|
+
}
|
|
1859
|
+
}
|
|
1860
|
+
}
|
|
1861
|
+
|
|
1862
|
+
if (hasEntities || hasBase64) {
|
|
1863
|
+
const nestedResult = checkNestedEncoding(text);
|
|
1864
|
+
if (nestedResult) {
|
|
1865
|
+
foundNested = true;
|
|
1866
|
+
const threat = {
|
|
1867
|
+
severity: 'critical',
|
|
1868
|
+
category: 'prompt_injection',
|
|
1869
|
+
description: 'Text hides attack instructions inside multiple layers of encoding.',
|
|
1870
|
+
detail: `Multi-layer encoding detected in ${source} (${nestedResult.decodingChain}). Decoded content matches: ${nestedResult.matchedPattern.detail}`
|
|
1871
|
+
};
|
|
1872
|
+
threat.confidence = calculateConfidence(threat, patternMatchCount, source);
|
|
1873
|
+
threat.confidenceLabel = confidenceLabel(threat.confidence);
|
|
1874
|
+
threats.push(threat);
|
|
1875
|
+
}
|
|
1876
|
+
}
|
|
1877
|
+
|
|
1878
|
+
if (hasBase64 && !foundNested) {
|
|
1879
|
+
const base64Result = checkBase64Content(text);
|
|
1880
|
+
if (base64Result) {
|
|
1881
|
+
const threat = {
|
|
1882
|
+
severity: base64Result.matchedPattern ? 'critical' : 'low',
|
|
1883
|
+
category: 'prompt_injection',
|
|
1884
|
+
description: base64Result.matchedPattern
|
|
1885
|
+
? 'Text hides attack instructions inside encoded text.'
|
|
1886
|
+
: 'Text contains encoded text that could hide instructions.',
|
|
1887
|
+
detail: base64Result.matchedPattern
|
|
1888
|
+
? `Base64-encoded injection found in ${source}. Decoded content matches: ${base64Result.matchedPattern.detail}`
|
|
1889
|
+
: `Suspicious base64-encoded content found in ${source}. Preview: "${base64Result.decoded.substring(0, 100)}..."`
|
|
1890
|
+
};
|
|
1891
|
+
threat.confidence = calculateConfidence(threat, patternMatchCount, source);
|
|
1892
|
+
threat.confidenceLabel = confidenceLabel(threat.confidence);
|
|
1893
|
+
threats.push(threat);
|
|
1894
|
+
}
|
|
1895
|
+
}
|
|
1896
|
+
|
|
1897
|
+
return threats;
|
|
1898
|
+
};
|
|
1899
|
+
|
|
1900
|
+
// =========================================================================
|
|
1901
|
+
// PUBLIC API
|
|
1902
|
+
// =========================================================================
|
|
1903
|
+
|
|
1904
|
+
// Sort rank for each severity level: the lower the number, the more severe.
const SEVERITY_ORDER = {
  critical: 0,
  high: 1,
  medium: 2,
  low: 3
};
|
|
1905
|
+
|
|
1906
|
+
/**
 * Scans arbitrary text for AI-specific threats.
 *
 * Input that is not a string, or that has fewer than 10 characters once
 * trimmed, is reported as safe without being scanned.
 *
 * @param {string} text - The text to scan.
 * @param {object} [options] - Scan options. `null` is treated like an omitted argument.
 * @param {string} [options.source='unknown'] - Label for where the text came from.
 * @param {string} [options.sensitivity='medium'] - 'low' keeps only critical/high threats,
 *   'medium' drops low-severity threats, any other value keeps every threat.
 * @param {number} [options.timeBudgetMs] - Time budget passed to the pattern scanner.
 *   Falls back to DEFAULT_SCAN_TIME_BUDGET_MS when missing or falsy.
 * @param {number} [options.maxInputSize] - Maximum number of characters scanned; longer
 *   input is truncated. Anything that is not a positive finite number falls back to
 *   MAX_INPUT_SIZE.
 * @returns {object} Scan result with `status` ('safe', 'caution', 'warning' or 'danger'),
 *   `threats` (most severe first), `stats` and `timestamp`. When the input was cut,
 *   `truncated` is true and `warnings` explains why.
 *
 * @example
 * const { scanText } = require('./detector-core');
 * const result = scanText('ignore all previous instructions', { source: 'user_input' });
 * console.log(result.status);  // 'safe' | 'caution' | 'warning' | 'danger'
 * console.log(result.threats); // [{ severity, category, description, detail, ... }]
 */
const scanText = (text, options = {}) => {
  // The default parameter only covers `undefined`; tolerate an explicit `null` too.
  const opts = options || {};
  const source = opts.source || 'unknown';
  const sensitivity = opts.sensitivity || 'medium';
  const timeBudgetMs = opts.timeBudgetMs || DEFAULT_SCAN_TIME_BUDGET_MS;

  // A negative limit would make slice() count from the end of the string and
  // silently drop the tail instead of capping the size, so anything that is
  // not a positive finite number falls back to the default limit.
  const requestedMaxSize = Number(opts.maxInputSize);
  const maxSize = Number.isFinite(requestedMaxSize) && requestedMaxSize > 0
    ? requestedMaxSize
    : MAX_INPUT_SIZE;
  const startTime = now();

  if (typeof text !== 'string' || text.trim().length < 10) {
    return {
      status: 'safe',
      threats: [],
      stats: { totalThreats: 0, critical: 0, high: 0, medium: 0, low: 0, scanTimeMs: 0 },
      timestamp: Date.now()
    };
  }

  // Enforce maximum input size to prevent resource exhaustion
  const truncated = text.length > maxSize;
  const input = truncated ? text.slice(0, maxSize) : text;

  let threats = scanTextForPatterns(input, source, timeBudgetMs, startTime);

  // Filter by sensitivity ('high' or any other value = show everything)
  if (sensitivity === 'low') {
    threats = threats.filter(t => t.severity === 'critical' || t.severity === 'high');
  } else if (sensitivity === 'medium') {
    threats = threats.filter(t => t.severity !== 'low');
  }

  // Rank lookup that stays numeric for unrecognised severities, so the
  // comparator never returns NaN; unknown levels sort after the known ones.
  const severityRank = (threat) => {
    const rank = SEVERITY_ORDER[threat.severity];
    return typeof rank === 'number' ? rank : Number.MAX_SAFE_INTEGER;
  };

  // Sort by severity
  threats.sort((a, b) => severityRank(a) - severityRank(b));

  const scanTimeMs = Math.round(now() - startTime);
  const stats = { totalThreats: threats.length, critical: 0, high: 0, medium: 0, low: 0, scanTimeMs };
  for (const t of threats) {
    // Count only recognised levels; incrementing an arbitrary key would create
    // a NaN counter or clobber totalThreats/scanTimeMs.
    if (typeof SEVERITY_ORDER[t.severity] === 'number') {
      stats[t.severity]++;
    }
  }

  let status = 'safe';
  if (stats.critical > 0) status = 'danger';
  else if (stats.high > 0) status = 'warning';
  else if (stats.medium > 0) status = 'caution';

  // Enrich threats with actionable guidance
  for (const t of threats) {
    t.remediation = `To allowlist this detection, add a rule for category "${t.category}" matching your expected input pattern. See: Allowlist.addRule({ pattern: '...', category: '${t.category}', reason: 'Known safe pattern' })`;
  }

  const result = { status, threats, stats, timestamp: Date.now() };
  if (truncated) {
    result.truncated = true;
    result.warnings = [`Input exceeded ${maxSize} characters and was truncated for scanning.`];
  }
  return result;
};
|
|
1980
|
+
|
|
1981
|
+
/**
 * Lists the detection patterns known to the engine.
 *
 * Each entry is a fresh object carrying only the pattern's category, severity,
 * description and detail, so changing the returned list does not alter
 * INJECTION_PATTERNS. Useful for inspecting what the engine detects.
 * @returns {Array}
 */
const getPatterns = () => INJECTION_PATTERNS.map(
  ({ category, severity, description, detail }) => ({ category, severity, description, detail })
);
|
|
1994
|
+
|
|
1995
|
+
// =========================================================================
|
|
1996
|
+
// EXPORTS
|
|
1997
|
+
// =========================================================================
|
|
1998
|
+
|
|
1999
|
+
module.exports = { scanText, getPatterns, SEVERITY_ORDER, MAX_INPUT_SIZE };
|