gauntlet-ai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gauntlet/__init__.py +20 -0
- gauntlet/cli.py +246 -0
- gauntlet/config.py +174 -0
- gauntlet/data/embeddings.npz +0 -0
- gauntlet/data/metadata.json +109 -0
- gauntlet/detector.py +274 -0
- gauntlet/exceptions.py +13 -0
- gauntlet/layers/__init__.py +1 -0
- gauntlet/layers/embeddings.py +269 -0
- gauntlet/layers/llm_judge.py +319 -0
- gauntlet/layers/rules.py +852 -0
- gauntlet/mcp_server.py +135 -0
- gauntlet/models.py +83 -0
- gauntlet_ai-0.1.0.dist-info/METADATA +281 -0
- gauntlet_ai-0.1.0.dist-info/RECORD +17 -0
- gauntlet_ai-0.1.0.dist-info/WHEEL +4 -0
- gauntlet_ai-0.1.0.dist-info/entry_points.txt +2 -0
gauntlet/layers/rules.py
ADDED
|
@@ -0,0 +1,852 @@
|
|
|
1
|
+
"""Layer 1: Rule-based prompt injection detection using regex patterns.
|
|
2
|
+
|
|
3
|
+
This module provides fast, regex-based detection for common prompt injection
|
|
4
|
+
patterns. It's designed as the first line of defense in the detection cascade,
|
|
5
|
+
catching obvious attacks quickly and cheaply before more expensive layers.
|
|
6
|
+
|
|
7
|
+
Zero dependencies - works with Python standard library only.
|
|
8
|
+
|
|
9
|
+
Patterns cover:
|
|
10
|
+
- Instruction override attempts
|
|
11
|
+
- Jailbreak attempts (DAN, STAN, DUDE, AIM, developer mode, roleplay)
|
|
12
|
+
- Delimiter/context injection
|
|
13
|
+
- Data extraction attempts
|
|
14
|
+
- Context manipulation
|
|
15
|
+
- Obfuscation techniques
|
|
16
|
+
- Hypothetical framing
|
|
17
|
+
- Multilingual attacks (13 languages)
|
|
18
|
+
- Indirect injection attacks
|
|
19
|
+
- Unicode homoglyph normalization
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
import time
|
|
24
|
+
import unicodedata
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from typing import Pattern
|
|
27
|
+
|
|
28
|
+
from gauntlet.models import LayerResult
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# =============================================================================
|
|
32
|
+
# UNICODE NORMALIZATION
|
|
33
|
+
# =============================================================================
|
|
34
|
+
|
|
35
|
+
# Common Unicode confusables (homoglyphs) that attackers use to bypass regex.
# Maps lookalike characters to their ASCII equivalents.
# Several groups below (fullwidth forms, super/subscript digits, Roman
# numerals, script l/L) are also folded by NFKC; they are kept so the table
# stays usable on its own.
CONFUSABLES: dict[str, str] = {
    # Cyrillic lookalikes
    "\u0430": "a", "\u0410": "A",
    "\u0441": "c", "\u0421": "C",
    "\u0435": "e", "\u0415": "E",
    "\u0456": "i", "\u0406": "I",
    "\u043e": "o", "\u041e": "O",
    "\u0440": "p", "\u0420": "P",
    "\u0443": "y", "\u0423": "Y",
    "\u0445": "x", "\u0425": "X",
    "\u0455": "s", "\u0405": "S",
    "\u0458": "j", "\u0408": "J",
    "\u04bb": "h", "\u04ba": "H",
    "\u0501": "d",
    "\u051b": "q",
    "\u051d": "w",
    "\u1d00": "a", "\u1d04": "c", "\u1d05": "d", "\u1d07": "e", "\u1d0d": "m", "\u0274": "n",
    "\u1d0f": "o", "\u1d18": "p", "\u1d1b": "t", "\u1d1c": "u", "\u1d20": "v", "\u1d21": "w",
    # Greek lookalikes
    "\u0391": "A", "\u0392": "B", "\u0395": "E", "\u0396": "Z", "\u0397": "H", "\u0399": "I",
    "\u039a": "K", "\u039c": "M", "\u039d": "N", "\u039f": "O", "\u03a1": "P", "\u03a4": "T",
    "\u03a5": "Y", "\u03a7": "X",
    "\u03b1": "a", "\u03b2": "b", "\u03b5": "e", "\u03b9": "i", "\u03ba": "k", "\u03bd": "v",
    "\u03bf": "o", "\u03c1": "p", "\u03c4": "t", "\u03c5": "u", "\u03c7": "x",
    # Latin variants
    "\u0251": "a", "\u0261": "g", "\u0131": "i", "\u0237": "j", "\u0269": "i",
    "\u0280": "r", "\u0299": "b", "\u0262": "g", "\u029c": "h", "\u029f": "l",
    # Fullwidth characters
    "\uff21": "A", "\uff22": "B", "\uff23": "C", "\uff24": "D", "\uff25": "E", "\uff26": "F",
    "\uff27": "G", "\uff28": "H", "\uff29": "I", "\uff2a": "J", "\uff2b": "K", "\uff2c": "L",
    "\uff2d": "M", "\uff2e": "N", "\uff2f": "O", "\uff30": "P", "\uff31": "Q", "\uff32": "R",
    "\uff33": "S", "\uff34": "T", "\uff35": "U", "\uff36": "V", "\uff37": "W", "\uff38": "X",
    "\uff39": "Y", "\uff3a": "Z",
    "\uff41": "a", "\uff42": "b", "\uff43": "c", "\uff44": "d", "\uff45": "e", "\uff46": "f",
    "\uff47": "g", "\uff48": "h", "\uff49": "i", "\uff4a": "j", "\uff4b": "k", "\uff4c": "l",
    "\uff4d": "m", "\uff4e": "n", "\uff4f": "o", "\uff50": "p", "\uff51": "q", "\uff52": "r",
    "\uff53": "s", "\uff54": "t", "\uff55": "u", "\uff56": "v", "\uff57": "w", "\uff58": "x",
    "\uff59": "y", "\uff5a": "z",
    "\uff10": "0", "\uff11": "1", "\uff12": "2", "\uff13": "3", "\uff14": "4",
    "\uff15": "5", "\uff16": "6", "\uff17": "7", "\uff18": "8", "\uff19": "9",
    # Other common substitutions
    "\u2113": "l", "\u2112": "L",
    "\u2170": "i", "\u2171": "ii", "\u2172": "iii",
    "\u2070": "0", "\u00b9": "1", "\u00b2": "2", "\u00b3": "3",
    "\u2074": "4", "\u2075": "5", "\u2076": "6", "\u2077": "7", "\u2078": "8", "\u2079": "9",
    "\u2080": "0", "\u2081": "1", "\u2082": "2", "\u2083": "3", "\u2084": "4",
    "\u2085": "5", "\u2086": "6", "\u2087": "7", "\u2088": "8", "\u2089": "9",
}

# Invisible code points that NFKC leaves in place. Inserted inside a keyword
# ("ig<ZWSP>nore") they break the \b-anchored regexes without changing what a
# human or an LLM reads, so they are deleted during normalization.
_INVISIBLE_CHARS: str = (
    "\u00ad"  # soft hyphen
    "\u180e"  # Mongolian vowel separator
    "\u200b\u200c\u200d"  # zero-width space / non-joiner / joiner
    "\u200e\u200f"  # left-to-right / right-to-left marks
    "\u202a\u202b\u202c\u202d\u202e"  # bidi embeddings and overrides
    "\u2060\u2061\u2062\u2063\u2064"  # word joiner and invisible operators
    "\u2066\u2067\u2068\u2069"  # bidi isolates
    "\ufeff"  # zero-width no-break space / BOM
)

# Build one translation table for fast replacement: confusables are mapped to
# ASCII and invisible characters are mapped to None (i.e. deleted), so both
# happen in a single str.translate pass.
_CONFUSABLES_TABLE = {
    **str.maketrans(CONFUSABLES),
    **str.maketrans("", "", _INVISIBLE_CHARS),
}


def normalize_unicode(text: str) -> str:
    """Normalize Unicode text to catch homoglyph and invisible-character attacks.

    Applies NFKC normalization, replaces known confusable characters with
    their ASCII equivalents, and removes invisible format characters
    (zero-width spaces/joiners, soft hyphen, bidi controls, BOM) that would
    otherwise split keywords and defeat the regex patterns.

    Args:
        text: The input text to normalize.

    Returns:
        Normalized text with confusables replaced and invisible characters
        removed. Text containing none of these characters is returned
        unchanged apart from NFKC normalization.
    """
    folded = unicodedata.normalize("NFKC", text)
    return folded.translate(_CONFUSABLES_TABLE)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# =============================================================================
|
|
108
|
+
# PATTERN DEFINITIONS
|
|
109
|
+
# =============================================================================
|
|
110
|
+
|
|
111
|
+
@dataclass(frozen=True)
class InjectionPattern:
    """A single regex pattern for injection detection."""

    # Unique identifier of the rule.
    name: str
    # Compiled regular expression searched against the input text.
    pattern: Pattern[str]
    # Attack category label (e.g. "jailbreak", "data_extraction").
    category: str
    # Score attached to a hit; every entry below uses a value in [0, 1].
    confidence: float
    # Human-readable explanation of what the rule targets.
    description: str


# Pattern definitions organized by attack category.
# NOTE(review): persona rules such as "dan_jailbreak", "dude_jailbreak" and
# "named_jailbreaks" are compiled with re.IGNORECASE, so ordinary words and
# names ("Dan", "dude", "maximum", "switch") also match - confirm that this
# precision trade-off is intended.
INJECTION_PATTERNS: list[InjectionPattern] = [
    # =========================================================================
    # 1. INSTRUCTION OVERRIDE ATTEMPTS
    # =========================================================================
    InjectionPattern(
        name="ignore_previous_instructions",
        pattern=re.compile(
            r"\b(ignore|disregard|forget|override|bypass|skip|dismiss)\b.{0,30}"
            r"\b(previous|prior|above|earlier|all|any|original|initial|system|assistant)\b.{0,20}"
            r"\b(instructions?|rules?|guidelines?|programming|directives?|constraints?|prompts?)\b",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.95,
        description="Explicit attempts to nullify prior instructions",
    ),
    InjectionPattern(
        name="new_instructions_override",
        pattern=re.compile(
            r"\b(new|actual|real|true|correct|updated)\b.{0,20}"
            r"\b(instructions?|rules?|directives?|orders?)\b.{0,10}(:|are|follow)",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.75,
        description="Attempts to establish attacker text as authoritative",
    ),
    InjectionPattern(
        name="do_not_follow",
        pattern=re.compile(
            r"\b(do\s*n[o']?t|don't|never|stop)\b.{0,20}"
            r"\b(follow|obey|listen(?:ing)?\s+to|adhere|comply)\b.{0,30}"
            r"\b(instructions?|rules?|guidelines?|programming)\b",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.90,
        description="Direct commands to disobey constraints",
    ),
    InjectionPattern(
        name="from_now_on",
        pattern=re.compile(
            r"\b(from\s+now\s+on|henceforth|going\s+forward|starting\s+now)\b.{0,50}"
            r"\b(you\s+(will|must|shall|should|are)|act\s+as|behave|respond)\b",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.80,
        description="Temporal override establishing new behavior",
    ),
    # =========================================================================
    # 2. JAILBREAK ATTEMPTS
    # =========================================================================
    InjectionPattern(
        name="dan_jailbreak",
        pattern=re.compile(
            r"\b(DAN|D\.A\.N|do\s+anything\s+now)\b|\bDAN\s*\d*\s*(\.\d+)?\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.95,
        description="DAN (Do Anything Now) jailbreak variants",
    ),
    InjectionPattern(
        name="stan_jailbreak",
        pattern=re.compile(
            r"\b(STAN|S\.T\.A\.N|strive\s+to\s+avoid\s+norms)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.95,
        description="STAN (Strive To Avoid Norms) jailbreak",
    ),
    InjectionPattern(
        name="dude_jailbreak",
        pattern=re.compile(
            r"\bDUDE\b|\b(D\.U\.D\.E|decoupled\s+unrestricted\s+dialogue)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.95,
        description="DUDE jailbreak variant",
    ),
    InjectionPattern(
        name="aim_jailbreak",
        pattern=re.compile(
            r"\bAIM\b.{0,20}\b(machiavellian|amoral|always\s+intelligent)\b|"
            r"\b(always\s+intelligent\s+and\s+machiavellian)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.93,
        description="AIM (Always Intelligent and Machiavellian) jailbreak",
    ),
    InjectionPattern(
        name="evil_confidant",
        pattern=re.compile(
            r"\b(evil\s+confidant|evil\s+advisor|malicious\s+assistant)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.92,
        description="Evil Confidant persona jailbreak",
    ),
    InjectionPattern(
        name="named_jailbreaks",
        pattern=re.compile(
            r"\b(KEVIN|Mongo\s*Tom|APOPHIS|Maximum|BasedGPT|JailMilk|AntiGPT|"
            r"BetterDAN|DevMode|BISH|OMNI|Alphabreak|PersonGPT|TranslatorBot|SWITCH)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.94,
        description="Known named jailbreak personas",
    ),
    InjectionPattern(
        name="developer_mode",
        pattern=re.compile(
            r"\b(enter|enable|activate|switch\s+to|engage)\s+"
            r"(developer|dev|debug|admin|root|sudo|maintenance|test)\s*"
            r"(mode|access|privileges?|override)\b|"
            r"\b(developer|dev|debug|admin|root|sudo)\s*(mode|access|privileges?)\s+"
            r"(enabled?|activated?|on|unlocked)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.90,
        description="Fake developer/admin mode activation",
    ),
    InjectionPattern(
        name="roleplay_jailbreak",
        pattern=re.compile(
            r"\b(pretend|imagine|act|roleplay|simulate|behave)\b.{0,30}"
            r"\b(you\s+are|you're|as\s+if\s+you\s+were?|to\s+be|as\s+an?\s+AI)\b.{0,40}"
            r"\b(unrestricted|unfiltered|uncensored|without\s+(limits?|restrictions?|rules?|filters?|guardrails?)|no\s+guardrails?)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.92,
        description="Roleplay-based constraint removal",
    ),
    InjectionPattern(
        name="jailbreak_mode_activation",
        pattern=re.compile(
            r"\b(jailbr[eo]ak|unlock|liberat\w*|unbounded|unchained|unleash\w*)\b.{0,20}"
            r"\b(mode|version|state|yourself|your\s+true\s+self)\b|"
            r"\b(enter|enable|activate|switch\s+to)\b.{0,20}\b(jailbr[eo]ak|unleashed)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.98,
        description="Explicit jailbreak activation attempts",
    ),
    InjectionPattern(
        name="opposite_day",
        pattern=re.compile(
            r"\b(opposite\s+day|opposite\s+mode|reverse\s+(your\s+)?rules?|"
            r"invert\s+(your\s+)?(?:rules?|behavior))\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.88,
        description="'Opposite day' style rule inversion",
    ),
    InjectionPattern(
        name="persona_switch",
        pattern=re.compile(
            r"\b(you\s+are\s+no\s+longer|stop\s+being|cease\s+being)\b.{0,20}"
            r"\b(an?\s+AI|assistant|chatbot|helpful)\b|"
            r"\b(from\s+now\s+on\s+you\s+are|you\s+are\s+now)\b.{0,30}"
            r"\b(evil|malicious|unrestricted|unethical|amoral)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.90,
        description="Attempts to switch AI persona",
    ),
    # =========================================================================
    # 3. DELIMITER/CONTEXT INJECTION
    # =========================================================================
    InjectionPattern(
        name="fake_system_tags",
        pattern=re.compile(
            r"<\s*/?\s*(system|assistant|user|human|ai|instruction|prompt|context|message|chat)\s*>|"
            r"<<\s*(SYS|INST|USR)\s*>>|\[\s*(SYSTEM|INST|SYS)\s*\]",
            re.IGNORECASE,
        ),
        category="delimiter_injection",
        confidence=0.95,
        description="Fake XML/bracket system message tags",
    ),
    InjectionPattern(
        name="markdown_code_injection",
        pattern=re.compile(
            r"```\s*(system|prompt|instructions?|config|internal|hidden|secret)\b",
            re.IGNORECASE,
        ),
        category="delimiter_injection",
        confidence=0.85,
        description="Markdown code blocks pretending to be system content",
    ),
    InjectionPattern(
        name="separator_injection",
        pattern=re.compile(
            r"(?:^|\n)\s*[-=]{5,}\s*(?:end|begin|start|stop|new|system|ignore).{0,30}[-=]{0,}",
            re.IGNORECASE | re.MULTILINE,
        ),
        category="delimiter_injection",
        confidence=0.80,
        description="Fake separators implying context boundaries",
    ),
    InjectionPattern(
        name="conversation_reset",
        pattern=re.compile(
            r"\b(conversation|context|chat|session|thread)\s+"
            r"(reset|restart|clear|wipe|start\s*over)\b|"
            r"\b(reset|restart|clear)\s+(the\s+|this\s+)?(conversation|context|chat|session)\b",
            re.IGNORECASE,
        ),
        category="delimiter_injection",
        confidence=0.82,
        description="Attempts to reset conversation state",
    ),
    # =========================================================================
    # 4. DATA EXTRACTION ATTEMPTS
    # =========================================================================
    InjectionPattern(
        name="reveal_system_prompt",
        pattern=re.compile(
            r"\b(reveal|show|display|print|output|tell\s+me|what\s+(is|are)|repeat|echo)\b.{0,30}"
            r"\b(system\s+prompt|initial\s+prompt|original\s+instructions?|hidden\s+instructions?|"
            r"secret\s+instructions?|your\s+instructions?|your\s+prompt|your\s+programming)\b",
            re.IGNORECASE,
        ),
        category="data_extraction",
        confidence=0.92,
        description="Direct system prompt extraction attempts",
    ),
    InjectionPattern(
        name="extract_secrets",
        pattern=re.compile(
            r"\b(reveal|show|tell|give|leak|expose|extract|dump|print)\b.{0,30}"
            r"\b(api\s*keys?|secrets?|credentials?|passwords?|tokens?|"
            r"private\s+(?:keys?|data)|environment\s+variables?|config(?:uration)?)\b",
            re.IGNORECASE,
        ),
        category="data_extraction",
        confidence=0.95,
        description="Attempts to extract sensitive configuration",
    ),
    InjectionPattern(
        name="verbatim_output",
        pattern=re.compile(
            r"\b(output|print|write|echo|display|return)\b.{0,20}"
            r"\b(verbatim|exactly|word\s*for\s*word|character\s*by\s*character|raw)\b.{0,30}"
            r"\b(prompt|instructions?|input|system|above)\b",
            re.IGNORECASE,
        ),
        category="data_extraction",
        confidence=0.85,
        description="Requests for verbatim prompt reproduction",
    ),
    # =========================================================================
    # 5. CONTEXT MANIPULATION
    # =========================================================================
    InjectionPattern(
        name="dismiss_as_fake",
        pattern=re.compile(
            r"\b(above|previous|prior|earlier)\b.{0,30}\b(was|were|is|are)\b.{0,20}"
            r"\b(fake|false|test|placeholder|example|not\s+real|incorrect|wrong|malicious)\b",
            re.IGNORECASE,
        ),
        category="context_manipulation",
        confidence=0.88,
        description="Dismissing legitimate context as fake",
    ),
    InjectionPattern(
        name="context_is_user",
        pattern=re.compile(
            r"\b(everything|all|anything)\s+(above|before|prior|previous)\b.{0,30}"
            r"\b(user|attacker|adversar\w*|injected|untrusted)\b",
            re.IGNORECASE,
        ),
        category="context_manipulation",
        confidence=0.90,
        description="Claiming prior context is user-generated",
    ),
    InjectionPattern(
        name="real_user_claim",
        pattern=re.compile(
            r"\b(i\s+am|i'm|this\s+is)\s+(the\s+)?(real|actual|true|legitimate)\s+"
            r"(user|human|admin|developer|operator)\b",
            re.IGNORECASE,
        ),
        category="context_manipulation",
        confidence=0.80,
        description="False claims of privileged identity",
    ),
    # =========================================================================
    # 6. OBFUSCATION TECHNIQUES
    # =========================================================================
    InjectionPattern(
        name="base64_reference",
        pattern=re.compile(
            # "encod\w*" lets the word run to its end ("encoded", "encoding");
            # a bare "encod" followed by \b can never match inside those words.
            r"\b(base64|b64|rot13|hex|unicode\s+escape|url\s*encod\w*)\b.{0,30}"
            r"\b(this|following|below|decode|execute|run|interpret|encoded|text)\b|"
            r"\b(decode|execute|run|interpret)\b.{0,20}\b(this\s+)?"
            r"(base64|b64|rot13|hex|encoded)\b",
            re.IGNORECASE,
        ),
        category="obfuscation",
        confidence=0.85,
        description="References to encoded payloads",
    ),
    InjectionPattern(
        name="character_substitution_hint",
        pattern=re.compile(
            r"\b(replace|substitute|swap|change)\b.{0,30}\b(letters?|characters?|symbols?|each)\b.{0,30}"
            r"\b(with|to|for)\b|\b(read\s+)?backwards?\b.{0,20}\b(spell|says?|reads?)\b",
            re.IGNORECASE,
        ),
        category="obfuscation",
        confidence=0.75,
        description="Instructions to decode obfuscated text",
    ),
    InjectionPattern(
        name="leet_speak_injection",
        pattern=re.compile(
            r"(?:1gn0r3|d1sr3g4rd|1nstruct10ns?|syst3m|pr0mpt|byp4ss|j41lbr34k|h4ck)",
            re.IGNORECASE,
        ),
        category="obfuscation",
        confidence=0.88,
        description="Leetspeak variants of injection keywords",
    ),
    # =========================================================================
    # 7. HYPOTHETICAL FRAMING
    # =========================================================================
    InjectionPattern(
        name="hypothetical_scenario",
        pattern=re.compile(
            r"\b(hypothetically|theoretically|in\s+theory|imagine\s+if|what\s+if|suppose|assuming)\b.{0,50}"
            r"\b(no\s+(rules?|restrictions?|limits?|filters?|guidelines?)|"
            r"you\s+(could|were\s+able|had\s+no|weren't\s+restricted))\b",
            re.IGNORECASE,
        ),
        category="hypothetical_framing",
        confidence=0.85,
        description="Hypothetical scenarios to bypass rules",
    ),
    InjectionPattern(
        name="fictional_context",
        pattern=re.compile(
            r"\b(in\s+(?:a|this)\s+(?:fictional|imaginary|made-up|pretend)|"
            r"for\s+(?:a|my)\s+(?:story|novel|book|fiction|game|script))\b.{0,50}"
            r"\b(how\s+(to|would)|explain|describe|write)\b.{0,30}"
            r"\b(hack|exploit|attack|bypass|illegal|dangerous|harmful)",
            re.IGNORECASE,
        ),
        category="hypothetical_framing",
        confidence=0.82,
        description="Fiction framing to extract harmful content",
    ),
    InjectionPattern(
        name="educational_framing",
        pattern=re.compile(
            r"\b(for\s+(?:educational|research|academic|learning)\s+purposes?|"
            r"to\s+(?:learn|understand|study|research))\b.{0,50}"
            r"\b(how\s+(?:to|do\s+I)|explain|show\s+me)\b.{0,30}"
            r"\b(hack|exploit|attack|bypass|inject|jailbreak)",
            re.IGNORECASE,
        ),
        category="hypothetical_framing",
        confidence=0.78,
        description="Educational framing for harmful requests",
    ),
    # =========================================================================
    # 8. MULTILINGUAL ATTACKS (13 languages)
    # =========================================================================
    InjectionPattern(
        name="spanish_injection",
        pattern=re.compile(
            r"\b(ignora|olvida|descarta|omite)\b.{0,30}"
            r"\b(instrucciones?|reglas?|sistema|directrices?)\b.{0,20}"
            r"\b(anteriores?|previas?|del\s+sistema)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Spanish language injection attempts",
    ),
    InjectionPattern(
        name="german_injection",
        pattern=re.compile(
            r"\b(ignoriere|vergiss|missachte|überspringe)\b.{0,30}"
            r"\b(anweisungen?|regeln?|instruktionen?|befehle?|richtlinien?)\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="German language injection attempts",
    ),
    InjectionPattern(
        name="french_injection",
        pattern=re.compile(
            r"\b(ignore[zr]?|oublie[zr]?|n\u00e9glige[zr]?)\b.{0,30}"
            r"\b(instructions?|r[e\u00e8]gles?|consignes?|directives?)\b.{0,20}"
            r"\b(pr[\u00e9e]c[\u00e9e]dentes?|ant[\u00e9e]rieures?|du\s+syst[\u00e8e]me)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="French language injection attempts",
    ),
    InjectionPattern(
        name="chinese_injection",
        pattern=re.compile(
            r"(\u5ffd\u7565|\u65e0\u89c6|\u5fd8\u8bb0|\u8df3\u8fc7|\u653e\u5f03).{0,10}"
            r"(\u4e4b\u524d\u7684|\u4ee5\u524d\u7684|\u5148\u524d\u7684|\u7cfb\u7edf)?"
            r"(\u6307\u4ee4|\u89c4\u5219|\u8bf4\u660e|\u6307\u793a|\u547d\u4ee4)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Chinese language injection attempts",
    ),
    InjectionPattern(
        name="russian_injection",
        pattern=re.compile(
            r"(\u0438\u0433\u043d\u043e\u0440\u0438\u0440\u0443\u0439|\u0437\u0430\u0431\u0443\u0434\u044c|\u043f\u0440\u043e\u043f\u0443\u0441\u0442\u0438|\u043e\u0442\u0431\u0440\u043e\u0441\u044c).{0,40}"
            r"(\u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438[\u0438\u0439\u044e\u044f\u0435]|\u043f\u0440\u0430\u0432\u0438\u043b\u0430?|\u0443\u043a\u0430\u0437\u0430\u043d\u0438[\u044f\u0435]|\u043a\u043e\u043c\u0430\u043d\u0434[\u044b\u0443])",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Russian language injection attempts",
    ),
    InjectionPattern(
        name="arabic_injection",
        pattern=re.compile(
            r"(\u062a\u062c\u0627\u0647\u0644|\u0627\u0646\u0633\u0649|\u0627\u0647\u0645\u0644|\u062a\u062e\u0637\u0649).{0,20}"
            r"(\u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a|\u0627\u0644\u0642\u0648\u0627\u0639\u062f|\u0627\u0644\u0623\u0648\u0627\u0645\u0631|\u0627\u0644\u0646\u0638\u0627\u0645)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Arabic language injection attempts",
    ),
    InjectionPattern(
        name="portuguese_injection",
        pattern=re.compile(
            r"\b(ignore|ignora|esque\u00e7a|descarte|pule)\b.{0,30}"
            r"\b(instru\u00e7\u00f5es?|regras?|diretrizes?|comandos?)\b.{0,20}"
            r"\b(anteriores?|pr\u00e9vias?|do\s+sistema)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Portuguese language injection attempts",
    ),
    InjectionPattern(
        name="japanese_injection",
        pattern=re.compile(
            r"(\u4ee5\u524d\u306e|\u524d\u306e|\u30b7\u30b9\u30c6\u30e0\u306e)?(\u6307\u793a|\u30eb\u30fc\u30eb|\u547d\u4ee4|\u6307\u4ee4).{0,5}(\u3092)?(\u7121\u8996|\u5fd8\u308c|\u30b9\u30ad\u30c3\u30d7|\u7121\u52b9\u306b)|"
            r"(\u7121\u8996|\u5fd8\u308c|\u30b9\u30ad\u30c3\u30d7).{0,10}(\u6307\u793a|\u30eb\u30fc\u30eb|\u547d\u4ee4)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Japanese language injection attempts",
    ),
    InjectionPattern(
        name="korean_injection",
        pattern=re.compile(
            r"(\uc774\uc804|\uc2dc\uc2a4\ud15c)?.{0,5}(\uc9c0\uc2dc|\uaddc\uce59|\uba85\ub839|\uc9c0\uce68).{0,5}(\ub97c|\uc744)?.{0,5}(\ubb34\uc2dc|\uc78a\uc5b4|\uac74\ub108\ub6f0|\ubb34\ud6a8)|"
            r"(\ubb34\uc2dc|\uc78a\uc5b4).{0,10}(\uc9c0\uc2dc|\uaddc\uce59|\uba85\ub839)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Korean language injection attempts",
    ),
    InjectionPattern(
        name="italian_injection",
        pattern=re.compile(
            r"\b(ignora|dimentica|tralascia|salta)\b.{0,30}"
            r"\b(istruzioni?|regole?|direttive?|comandi?)\b.{0,20}"
            r"\b(precedenti?|del\s+sistema)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Italian language injection attempts",
    ),
    InjectionPattern(
        name="dutch_injection",
        pattern=re.compile(
            r"\b(negeer|vergeet|sla\s+over|negeren)\b.{0,30}"
            r"\b(instructies?|regels?|aanwijzingen?|opdrachten?)\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Dutch language injection attempts",
    ),
    InjectionPattern(
        name="polish_injection",
        pattern=re.compile(
            r"\b(zignoruj|zapomnij|pomi\u0144|odrzu\u0107)\b.{0,30}"
            r"\b(instrukcj[eai]|regu\u0142[y\u0119]|polece\u0144|zasad[y\u0119])\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Polish language injection attempts",
    ),
    InjectionPattern(
        name="turkish_injection",
        pattern=re.compile(
            r"(talimat|kural|y\u00f6nerge|komut)\w*.{0,20}(yoksay|unut|atla|g\u00f6rmezden)|"
            r"(\u00f6nceki|eski).{0,20}(talimat|kural|y\u00f6nerge).{0,10}(yoksay|unut|atla)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Turkish language injection attempts",
    ),
    # =========================================================================
    # 9. INDIRECT INJECTION ATTACKS
    # =========================================================================
    InjectionPattern(
        name="trigger_instruction",
        pattern=re.compile(
            r"\b(when|if|once|after)\s+(you|the\s+(ai|assistant|model|system))\s+"
            r"(see|read|encounter|find|process|receive)\b.{0,50}"
            r"\b(execute|run|do|perform|follow|output)\b",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.85,
        description="Planted trigger-based instructions",
    ),
    InjectionPattern(
        name="hidden_instruction_marker",
        pattern=re.compile(
            r"\[\s*(HIDDEN|INVISIBLE|SECRET|IGNORE\s+THIS|FOR\s+AI\s+ONLY|"
            r"AI\s+INSTRUCTION|SYSTEM\s+OVERRIDE|INSTRUCTION|DO\s+NOT\s+DISPLAY)\s*[:\]]|"
            r"<!--\s*(ignore|instruction|system|hidden|ai\s+only)",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.92,
        description="Markers indicating hidden instructions",
    ),
    InjectionPattern(
        name="data_field_injection",
        pattern=re.compile(
            r"(description|summary|bio|about|notes?|comments?|title|name)\s*"
            r"[\"':=].{0,50}(ignore|disregard|forget|you\s+are\s+now|new\s+instructions)",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.82,
        description="Injection hidden in data fields",
    ),
    InjectionPattern(
        name="invisible_text_marker",
        pattern=re.compile(
            r"(color|background|font-size)\s*:\s*(white|transparent|0|hidden)|"
            r"display\s*:\s*none|visibility\s*:\s*hidden|"
            r"position\s*:\s*absolute.{0,30}(left|top)\s*:\s*-\d{4,}",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.80,
        description="CSS hiding techniques for invisible text",
    ),
    InjectionPattern(
        name="ai_addressing",
        pattern=re.compile(
            r"\b(attention|hey|hello|dear)\s+(ai|assistant|model|chatbot|gpt|claude|llm)\b.{0,30}"
            r"\b(ignore|disregard|forget|override)\b|"
            r"\b(note\s+to\s+(self|ai|assistant)|internal\s+note)\b.{0,30}"
            r"\b(ignore|override|execute)\b",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.85,
        description="Direct addressing of AI in injected content",
    ),
    InjectionPattern(
        name="instruction_in_url",
        pattern=re.compile(
            r"(https?://|www\.)[^\s]*"
            r"(ignore|jailbreak|bypass|prompt|inject|override|system)",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.75,
        description="Injection keywords hidden in URLs",
    ),
    InjectionPattern(
        name="document_boundary_attack",
        pattern=re.compile(
            r"\b(end\s+of\s+(document|file|content|input)|document\s+ends?\s+here)\b.{0,30}"
            r"\b(new\s+instructions?|real\s+task|actual\s+prompt|system\s+override)\b",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.88,
        description="Fake document boundaries with new instructions",
    ),
]
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
class RulesDetector:
    """Fast regex-based detector for common prompt injection patterns.

    This is Layer 1 of the detection cascade - designed to catch
    obvious attacks quickly and cheaply before more expensive layers.

    Features:
    - Unicode normalization to catch homoglyph attacks
    - 50+ patterns covering 9 attack categories
    - 13 language support for multilingual attacks
    - Indirect injection detection
    """

    def __init__(self, normalize: bool = True) -> None:
        """Initialize the detector with the predefined patterns.

        Args:
            normalize: Whether to apply Unicode normalization before detection.
        """
        # The module-level pattern list is referenced directly, not copied.
        self.patterns = INJECTION_PATTERNS
        self.normalize = normalize

    @staticmethod
    def _candidate_texts(text: str, normalize: bool) -> list[str]:
        """Return the texts that should be scanned for ``text``.

        The raw input always comes first. When ``normalize`` is true and
        normalization actually changes the input, the normalized form is
        appended so that both variants get scanned.

        Args:
            text: The input text to analyze.
            normalize: Whether to include the Unicode-normalized variant.

        Returns:
            ``[text]`` or ``[text, normalized_text]``.
        """
        candidates = [text]
        if normalize:
            normalized_text = normalize_unicode(text)
            # Skip the duplicate scan when normalization was a no-op.
            if normalized_text != text:
                candidates.append(normalized_text)
        return candidates

    def detect(self, text: str) -> LayerResult:
        """Check text against all patterns.

        The raw text is scanned first and, when normalization is enabled
        and changes the input, the normalized text as well. The matching
        pattern with the highest confidence is reported; on equal
        confidence the earliest one encountered is kept.

        Args:
            text: The input text to analyze.

        Returns:
            LayerResult with detection outcome. Any exception raised while
            scanning is caught and returned as a non-injection result with
            ``error`` set to the exception message.
        """
        start_time = time.perf_counter()

        try:
            best_match: tuple[InjectionPattern, re.Match[str]] | None = None
            best_confidence = 0.0

            for check_text in self._candidate_texts(text, self.normalize):
                for pattern in self.patterns:
                    # Only a strictly higher confidence can replace the
                    # current best, so the regex search is skipped for
                    # patterns that could not win anyway.
                    if pattern.confidence > best_confidence:
                        match = pattern.pattern.search(check_text)
                        if match:
                            best_match = (pattern, match)
                            best_confidence = pattern.confidence

            latency_ms = (time.perf_counter() - start_time) * 1000

            if best_match:
                pattern, match = best_match
                return LayerResult(
                    is_injection=True,
                    confidence=pattern.confidence,
                    attack_type=pattern.category,
                    layer=1,
                    latency_ms=latency_ms,
                    details={
                        "pattern_name": pattern.name,
                        "matched_length": len(match.group(0)),
                        "matched_position": match.start(),
                        "description": pattern.description,
                        # Reports the detector setting, not which text
                        # variant produced the match.
                        "normalized": self.normalize,
                    },
                )

            return LayerResult(
                is_injection=False,
                confidence=0.0,
                attack_type=None,
                layer=1,
                latency_ms=latency_ms,
                details=None,
            )

        except Exception as e:
            # Report the failure through the result's error field instead
            # of raising to the caller.
            latency_ms = (time.perf_counter() - start_time) * 1000
            return LayerResult(
                is_injection=False,
                confidence=0.0,
                attack_type=None,
                layer=1,
                latency_ms=latency_ms,
                details=None,
                error=str(e),
            )

    def get_all_matches(
        self, text: str, normalize: bool | None = None
    ) -> list[tuple[InjectionPattern, re.Match[str]]]:
        """Get all matching patterns for analysis/debugging.

        Like ``detect``, both the raw and the normalized text are scanned
        when normalization is enabled, so every pattern ``detect`` could
        report is listed here as well. Each pattern appears at most once:
        its match in the normalized text is preferred, and the raw text is
        only used as a fallback. ``match.string`` tells which text a given
        match was found in.

        Args:
            text: The input text to analyze.
            normalize: Override instance normalize setting.

        Returns:
            List of (pattern, match) tuples for all patterns that matched,
            in pattern order.
        """
        should_normalize = self.normalize if normalize is None else normalize
        # Normalized text first (when present), raw text as the fallback.
        candidates = self._candidate_texts(text, should_normalize)[::-1]

        matches = []
        for pattern in self.patterns:
            for candidate in candidates:
                match = pattern.pattern.search(candidate)
                if match:
                    matches.append((pattern, match))
                    break
        return matches
|
|
850
|
+
|
|
851
|
+
|
|
852
|
+
# Public API of this module: the names exported by a star-import.
__all__ = ["RulesDetector", "InjectionPattern", "INJECTION_PATTERNS", "normalize_unicode"]
|