gauntlet-ai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,852 @@
1
+ """Layer 1: Rule-based prompt injection detection using regex patterns.
2
+
3
+ This module provides fast, regex-based detection for common prompt injection
4
+ patterns. It's designed as the first line of defense in the detection cascade,
5
+ catching obvious attacks quickly and cheaply before more expensive layers.
6
+
7
+ Zero dependencies - works with Python standard library only.
8
+
9
+ Patterns cover:
10
+ - Instruction override attempts
11
+ - Jailbreak attempts (DAN, STAN, DUDE, AIM, developer mode, roleplay)
12
+ - Delimiter/context injection
13
+ - Data extraction attempts
14
+ - Context manipulation
15
+ - Obfuscation techniques
16
+ - Hypothetical framing
17
+ - Multilingual attacks (13 languages)
18
+ - Indirect injection attacks
19
+ - Unicode homoglyph normalization
20
+ """
21
+
22
+ import re
23
+ import time
24
+ import unicodedata
25
+ from dataclasses import dataclass
26
+ from typing import Pattern
27
+
28
+ from gauntlet.models import LayerResult
29
+
30
+
31
+ # =============================================================================
32
+ # UNICODE NORMALIZATION
33
+ # =============================================================================
34
+
35
+ # Common Unicode confusables (homoglyphs) that attackers use to bypass regex
36
+ # Maps lookalike characters to their ASCII equivalents
37
+ CONFUSABLES: dict[str, str] = {
38
+ # Cyrillic lookalikes
39
+ "\u0430": "a", "\u0410": "A",
40
+ "\u0441": "c", "\u0421": "C",
41
+ "\u0435": "e", "\u0415": "E",
42
+ "\u0456": "i", "\u0406": "I",
43
+ "\u043e": "o", "\u041e": "O",
44
+ "\u0440": "p", "\u0420": "P",
45
+ "\u0443": "y", "\u0423": "Y",
46
+ "\u0445": "x", "\u0425": "X",
47
+ "\u0455": "s", "\u0405": "S",
48
+ "\u0458": "j", "\u0408": "J",
49
+ "\u04bb": "h", "\u04ba": "H",
50
+ "\u0501": "d",
51
+ "\u051b": "q",
52
+ "\u051d": "w",
53
+ "\u1d00": "a", "\u1d04": "c", "\u1d05": "d", "\u1d07": "e", "\u1d0d": "m", "\u0274": "n",
54
+ "\u1d0f": "o", "\u1d18": "p", "\u1d1b": "t", "\u1d1c": "u", "\u1d20": "v", "\u1d21": "w",
55
+ # Greek lookalikes
56
+ "\u0391": "A", "\u0392": "B", "\u0395": "E", "\u0396": "Z", "\u0397": "H", "\u0399": "I",
57
+ "\u039a": "K", "\u039c": "M", "\u039d": "N", "\u039f": "O", "\u03a1": "P", "\u03a4": "T",
58
+ "\u03a5": "Y", "\u03a7": "X",
59
+ "\u03b1": "a", "\u03b2": "b", "\u03b5": "e", "\u03b9": "i", "\u03ba": "k", "\u03bd": "v",
60
+ "\u03bf": "o", "\u03c1": "p", "\u03c4": "t", "\u03c5": "u", "\u03c7": "x",
61
+ # Latin variants
62
+ "\u0251": "a", "\u0261": "g", "\u0131": "i", "\u0237": "j", "\u0269": "i",
63
+ "\u0280": "r", "\u0299": "b", "\u0262": "g", "\u029c": "h", "\u029f": "l",
64
+ # Fullwidth characters
65
+ "\uff21": "A", "\uff22": "B", "\uff23": "C", "\uff24": "D", "\uff25": "E", "\uff26": "F",
66
+ "\uff27": "G", "\uff28": "H", "\uff29": "I", "\uff2a": "J", "\uff2b": "K", "\uff2c": "L",
67
+ "\uff2d": "M", "\uff2e": "N", "\uff2f": "O", "\uff30": "P", "\uff31": "Q", "\uff32": "R",
68
+ "\uff33": "S", "\uff34": "T", "\uff35": "U", "\uff36": "V", "\uff37": "W", "\uff38": "X",
69
+ "\uff39": "Y", "\uff3a": "Z",
70
+ "\uff41": "a", "\uff42": "b", "\uff43": "c", "\uff44": "d", "\uff45": "e", "\uff46": "f",
71
+ "\uff47": "g", "\uff48": "h", "\uff49": "i", "\uff4a": "j", "\uff4b": "k", "\uff4c": "l",
72
+ "\uff4d": "m", "\uff4e": "n", "\uff4f": "o", "\uff50": "p", "\uff51": "q", "\uff52": "r",
73
+ "\uff53": "s", "\uff54": "t", "\uff55": "u", "\uff56": "v", "\uff57": "w", "\uff58": "x",
74
+ "\uff59": "y", "\uff5a": "z",
75
+ "\uff10": "0", "\uff11": "1", "\uff12": "2", "\uff13": "3", "\uff14": "4",
76
+ "\uff15": "5", "\uff16": "6", "\uff17": "7", "\uff18": "8", "\uff19": "9",
77
+ # Other common substitutions
78
+ "\u2113": "l", "\u2112": "L",
79
+ "\u2170": "i", "\u2171": "ii", "\u2172": "iii",
80
+ "\u2070": "0", "\u00b9": "1", "\u00b2": "2", "\u00b3": "3",
81
+ "\u2074": "4", "\u2075": "5", "\u2076": "6", "\u2077": "7", "\u2078": "8", "\u2079": "9",
82
+ "\u2080": "0", "\u2081": "1", "\u2082": "2", "\u2083": "3", "\u2084": "4",
83
+ "\u2085": "5", "\u2086": "6", "\u2087": "7", "\u2088": "8", "\u2089": "9",
84
+ }
85
+
86
+ # Build translation table for fast replacement
87
+ _CONFUSABLES_TABLE = str.maketrans(CONFUSABLES)
88
+
89
+
90
+ def normalize_unicode(text: str) -> str:
91
+ """Normalize Unicode text to catch homoglyph attacks.
92
+
93
+ Applies NFKC normalization and replaces known confusable characters
94
+ with their ASCII equivalents.
95
+
96
+ Args:
97
+ text: The input text to normalize.
98
+
99
+ Returns:
100
+ Normalized text with confusables replaced.
101
+ """
102
+ text = unicodedata.normalize("NFKC", text)
103
+ text = text.translate(_CONFUSABLES_TABLE)
104
+ return text
105
+
106
+
107
+ # =============================================================================
108
+ # PATTERN DEFINITIONS
109
+ # =============================================================================
110
+
111
@dataclass(frozen=True)
class InjectionPattern:
    """A single regex pattern for injection detection.

    Instances are immutable (``frozen=True``).

    Attributes:
        name: Unique identifier of the pattern.
        pattern: Compiled regular expression that is searched in the input.
        category: Attack category the pattern belongs to.
        confidence: Score attached to a hit; the built-in patterns use values
            between 0.75 and 0.98.
        description: Human-readable explanation of what the pattern detects.
    """

    name: str
    pattern: Pattern[str]
    category: str
    confidence: float
    description: str


# Pattern definitions organized by attack category.
#
# All patterns are compiled with re.IGNORECASE and without re.DOTALL, so the
# bounded ".{0,N}" gaps never span a newline.
INJECTION_PATTERNS: list[InjectionPattern] = [
    # =========================================================================
    # 1. INSTRUCTION OVERRIDE ATTEMPTS
    # =========================================================================
    InjectionPattern(
        name="ignore_previous_instructions",
        pattern=re.compile(
            r"\b(ignore|disregard|forget|override|bypass|skip|dismiss)\b.{0,30}"
            r"\b(previous|prior|above|earlier|all|any|original|initial|system|assistant)\b.{0,20}"
            r"\b(instructions?|rules?|guidelines?|programming|directives?|constraints?|prompts?)\b",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.95,
        description="Explicit attempts to nullify prior instructions",
    ),
    InjectionPattern(
        name="new_instructions_override",
        pattern=re.compile(
            r"\b(new|actual|real|true|correct|updated)\b.{0,20}"
            r"\b(instructions?|rules?|directives?|orders?)\b.{0,10}(:|are|follow)",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.75,
        description="Attempts to establish attacker text as authoritative",
    ),
    InjectionPattern(
        name="do_not_follow",
        pattern=re.compile(
            r"\b(do\s*n[o']?t|don't|never|stop)\b.{0,20}"
            r"\b(follow|obey|listen(?:ing)?\s+to|adhere|comply)\b.{0,30}"
            r"\b(instructions?|rules?|guidelines?|programming)\b",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.90,
        description="Direct commands to disobey constraints",
    ),
    InjectionPattern(
        name="from_now_on",
        pattern=re.compile(
            r"\b(from\s+now\s+on|henceforth|going\s+forward|starting\s+now)\b.{0,50}"
            r"\b(you\s+(will|must|shall|should|are)|act\s+as|behave|respond)\b",
            re.IGNORECASE,
        ),
        category="instruction_override",
        confidence=0.80,
        description="Temporal override establishing new behavior",
    ),
    # =========================================================================
    # 2. JAILBREAK ATTEMPTS
    # =========================================================================
    # NOTE(review): the bare-word patterns below (DAN, STAN, DUDE, and names
    # such as Maximum / SWITCH / KEVIN) are matched case-insensitively, so the
    # everyday words "dude", "maximum", "switch" and the names "Dan" / "Stan"
    # also hit at >= 0.94 confidence. Consider requiring upper case or extra
    # context if false positives are a problem.
    InjectionPattern(
        name="dan_jailbreak",
        pattern=re.compile(
            r"\b(DAN|D\.A\.N|do\s+anything\s+now)\b|\bDAN\s*\d*\s*(\.\d+)?\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.95,
        description="DAN (Do Anything Now) jailbreak variants",
    ),
    InjectionPattern(
        name="stan_jailbreak",
        pattern=re.compile(
            r"\b(STAN|S\.T\.A\.N|strive\s+to\s+avoid\s+norms)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.95,
        description="STAN (Strive To Avoid Norms) jailbreak",
    ),
    InjectionPattern(
        name="dude_jailbreak",
        pattern=re.compile(
            r"\bDUDE\b|\b(D\.U\.D\.E|decoupled\s+unrestricted\s+dialogue)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.95,
        description="DUDE jailbreak variant",
    ),
    InjectionPattern(
        name="aim_jailbreak",
        pattern=re.compile(
            r"\bAIM\b.{0,20}\b(machiavellian|amoral|always\s+intelligent)\b|"
            r"\b(always\s+intelligent\s+and\s+machiavellian)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.93,
        description="AIM (Always Intelligent and Machiavellian) jailbreak",
    ),
    InjectionPattern(
        name="evil_confidant",
        pattern=re.compile(
            r"\b(evil\s+confidant|evil\s+advisor|malicious\s+assistant)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.92,
        description="Evil Confidant persona jailbreak",
    ),
    InjectionPattern(
        name="named_jailbreaks",
        pattern=re.compile(
            r"\b(KEVIN|Mongo\s*Tom|APOPHIS|Maximum|BasedGPT|JailMilk|AntiGPT|"
            r"BetterDAN|DevMode|BISH|OMNI|Alphabreak|PersonGPT|TranslatorBot|SWITCH)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.94,
        description="Known named jailbreak personas",
    ),
    InjectionPattern(
        name="developer_mode",
        pattern=re.compile(
            r"\b(enter|enable|activate|switch\s+to|engage)\s+"
            r"(developer|dev|debug|admin|root|sudo|maintenance|test)\s*"
            r"(mode|access|privileges?|override)\b|"
            r"\b(developer|dev|debug|admin|root|sudo)\s*(mode|access|privileges?)\s+"
            r"(enabled?|activated?|on|unlocked)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.90,
        description="Fake developer/admin mode activation",
    ),
    InjectionPattern(
        name="roleplay_jailbreak",
        pattern=re.compile(
            r"\b(pretend|imagine|act|roleplay|simulate|behave)\b.{0,30}"
            r"\b(you\s+are|you're|as\s+if\s+you\s+were?|to\s+be|as\s+an?\s+AI)\b.{0,40}"
            r"\b(unrestricted|unfiltered|uncensored|without\s+(limits?|restrictions?|rules?|filters?|guardrails?)|no\s+guardrails?)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.92,
        description="Roleplay-based constraint removal",
    ),
    InjectionPattern(
        name="jailbreak_mode_activation",
        pattern=re.compile(
            # "jailbr(?:[eo]ak|oken)" keeps the original jailbreak/jailbroak
            # spellings and additionally accepts "jailbroken", which the
            # original "jailbr[eo]ak" could not match.
            r"\b(jailbr(?:[eo]ak|oken)|unlock|liberat\w*|unbounded|unchained|unleash\w*)\b.{0,20}"
            r"\b(mode|version|state|yourself|your\s+true\s+self)\b|"
            r"\b(enter|enable|activate|switch\s+to)\b.{0,20}\b(jailbr(?:[eo]ak|oken)|unleashed)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.98,
        description="Explicit jailbreak activation attempts",
    ),
    InjectionPattern(
        name="opposite_day",
        pattern=re.compile(
            r"\b(opposite\s+day|opposite\s+mode|reverse\s+(your\s+)?rules?|"
            r"invert\s+(your\s+)?(?:rules?|behavior))\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.88,
        description="'Opposite day' style rule inversion",
    ),
    InjectionPattern(
        name="persona_switch",
        pattern=re.compile(
            r"\b(you\s+are\s+no\s+longer|stop\s+being|cease\s+being)\b.{0,20}"
            r"\b(an?\s+AI|assistant|chatbot|helpful)\b|"
            r"\b(from\s+now\s+on\s+you\s+are|you\s+are\s+now)\b.{0,30}"
            r"\b(evil|malicious|unrestricted|unethical|amoral)\b",
            re.IGNORECASE,
        ),
        category="jailbreak",
        confidence=0.90,
        description="Attempts to switch AI persona",
    ),
    # =========================================================================
    # 3. DELIMITER/CONTEXT INJECTION
    # =========================================================================
    InjectionPattern(
        name="fake_system_tags",
        pattern=re.compile(
            r"<\s*/?\s*(system|assistant|user|human|ai|instruction|prompt|context|message|chat)\s*>|"
            r"<<\s*(SYS|INST|USR)\s*>>|\[\s*(SYSTEM|INST|SYS)\s*\]",
            re.IGNORECASE,
        ),
        category="delimiter_injection",
        confidence=0.95,
        description="Fake XML/bracket system message tags",
    ),
    InjectionPattern(
        name="markdown_code_injection",
        pattern=re.compile(
            r"```\s*(system|prompt|instructions?|config|internal|hidden|secret)\b",
            re.IGNORECASE,
        ),
        category="delimiter_injection",
        confidence=0.85,
        description="Markdown code blocks pretending to be system content",
    ),
    InjectionPattern(
        name="separator_injection",
        pattern=re.compile(
            r"(?:^|\n)\s*[-=]{5,}\s*(?:end|begin|start|stop|new|system|ignore).{0,30}[-=]{0,}",
            re.IGNORECASE | re.MULTILINE,
        ),
        category="delimiter_injection",
        confidence=0.80,
        description="Fake separators implying context boundaries",
    ),
    InjectionPattern(
        name="conversation_reset",
        pattern=re.compile(
            r"\b(conversation|context|chat|session|thread)\s+"
            r"(reset|restart|clear|wipe|start\s*over)\b|"
            r"\b(reset|restart|clear)\s+(the\s+|this\s+)?(conversation|context|chat|session)\b",
            re.IGNORECASE,
        ),
        category="delimiter_injection",
        confidence=0.82,
        description="Attempts to reset conversation state",
    ),
    # =========================================================================
    # 4. DATA EXTRACTION ATTEMPTS
    # =========================================================================
    InjectionPattern(
        name="reveal_system_prompt",
        pattern=re.compile(
            r"\b(reveal|show|display|print|output|tell\s+me|what\s+(is|are)|repeat|echo)\b.{0,30}"
            r"\b(system\s+prompt|initial\s+prompt|original\s+instructions?|hidden\s+instructions?|"
            r"secret\s+instructions?|your\s+instructions?|your\s+prompt|your\s+programming)\b",
            re.IGNORECASE,
        ),
        category="data_extraction",
        confidence=0.92,
        description="Direct system prompt extraction attempts",
    ),
    InjectionPattern(
        name="extract_secrets",
        pattern=re.compile(
            r"\b(reveal|show|tell|give|leak|expose|extract|dump|print)\b.{0,30}"
            r"\b(api\s*keys?|secrets?|credentials?|passwords?|tokens?|"
            r"private\s+(?:keys?|data)|environment\s+variables?|config(?:uration)?)\b",
            re.IGNORECASE,
        ),
        category="data_extraction",
        confidence=0.95,
        description="Attempts to extract sensitive configuration",
    ),
    InjectionPattern(
        name="verbatim_output",
        pattern=re.compile(
            r"\b(output|print|write|echo|display|return)\b.{0,20}"
            r"\b(verbatim|exactly|word\s*for\s*word|character\s*by\s*character|raw)\b.{0,30}"
            r"\b(prompt|instructions?|input|system|above)\b",
            re.IGNORECASE,
        ),
        category="data_extraction",
        confidence=0.85,
        description="Requests for verbatim prompt reproduction",
    ),
    # =========================================================================
    # 5. CONTEXT MANIPULATION
    # =========================================================================
    InjectionPattern(
        name="dismiss_as_fake",
        pattern=re.compile(
            r"\b(above|previous|prior|earlier)\b.{0,30}\b(was|were|is|are)\b.{0,20}"
            r"\b(fake|false|test|placeholder|example|not\s+real|incorrect|wrong|malicious)\b",
            re.IGNORECASE,
        ),
        category="context_manipulation",
        confidence=0.88,
        description="Dismissing legitimate context as fake",
    ),
    InjectionPattern(
        name="context_is_user",
        pattern=re.compile(
            r"\b(everything|all|anything)\s+(above|before|prior|previous)\b.{0,30}"
            r"\b(user|attacker|adversar\w*|injected|untrusted)\b",
            re.IGNORECASE,
        ),
        category="context_manipulation",
        confidence=0.90,
        description="Claiming prior context is user-generated",
    ),
    InjectionPattern(
        name="real_user_claim",
        pattern=re.compile(
            r"\b(i\s+am|i'm|this\s+is)\s+(the\s+)?(real|actual|true|legitimate)\s+"
            r"(user|human|admin|developer|operator)\b",
            re.IGNORECASE,
        ),
        category="context_manipulation",
        confidence=0.80,
        description="False claims of privileged identity",
    ),
    # =========================================================================
    # 6. OBFUSCATION TECHNIQUES
    # =========================================================================
    InjectionPattern(
        name="base64_reference",
        pattern=re.compile(
            # "url\s*encod\w*": the stem must be allowed to continue, because
            # the closing \b can never match between "encod" and the "ed" /
            # "ing" that follows it in "url encoded" / "url encoding".
            r"\b(base64|b64|rot13|hex|unicode\s+escape|url\s*encod\w*)\b.{0,30}"
            r"\b(this|following|below|decode|execute|run|interpret|encoded|text)\b|"
            r"\b(decode|execute|run|interpret)\b.{0,20}\b(this\s+)?"
            r"(base64|b64|rot13|hex|encoded)\b",
            re.IGNORECASE,
        ),
        category="obfuscation",
        confidence=0.85,
        description="References to encoded payloads",
    ),
    InjectionPattern(
        name="character_substitution_hint",
        pattern=re.compile(
            r"\b(replace|substitute|swap|change)\b.{0,30}\b(letters?|characters?|symbols?|each)\b.{0,30}"
            r"\b(with|to|for)\b|\b(read\s+)?backwards?\b.{0,20}\b(spell|says?|reads?)\b",
            re.IGNORECASE,
        ),
        category="obfuscation",
        confidence=0.75,
        description="Instructions to decode obfuscated text",
    ),
    InjectionPattern(
        name="leet_speak_injection",
        pattern=re.compile(
            r"(?:1gn0r3|d1sr3g4rd|1nstruct10ns?|syst3m|pr0mpt|byp4ss|j41lbr34k|h4ck)",
            re.IGNORECASE,
        ),
        category="obfuscation",
        confidence=0.88,
        description="Leetspeak variants of injection keywords",
    ),
    # =========================================================================
    # 7. HYPOTHETICAL FRAMING
    # =========================================================================
    InjectionPattern(
        name="hypothetical_scenario",
        pattern=re.compile(
            r"\b(hypothetically|theoretically|in\s+theory|imagine\s+if|what\s+if|suppose|assuming)\b.{0,50}"
            r"\b(no\s+(rules?|restrictions?|limits?|filters?|guidelines?)|"
            r"you\s+(could|were\s+able|had\s+no|weren't\s+restricted))\b",
            re.IGNORECASE,
        ),
        category="hypothetical_framing",
        confidence=0.85,
        description="Hypothetical scenarios to bypass rules",
    ),
    InjectionPattern(
        name="fictional_context",
        pattern=re.compile(
            r"\b(in\s+(?:a|this)\s+(?:fictional|imaginary|made-up|pretend)|"
            r"for\s+(?:a|my)\s+(?:story|novel|book|fiction|game|script))\b.{0,50}"
            r"\b(how\s+(to|would)|explain|describe|write)\b.{0,30}"
            r"\b(hack|exploit|attack|bypass|illegal|dangerous|harmful)",
            re.IGNORECASE,
        ),
        category="hypothetical_framing",
        confidence=0.82,
        description="Fiction framing to extract harmful content",
    ),
    InjectionPattern(
        name="educational_framing",
        pattern=re.compile(
            r"\b(for\s+(?:educational|research|academic|learning)\s+purposes?|"
            r"to\s+(?:learn|understand|study|research))\b.{0,50}"
            r"\b(how\s+(?:to|do\s+I)|explain|show\s+me)\b.{0,30}"
            r"\b(hack|exploit|attack|bypass|inject|jailbreak)",
            re.IGNORECASE,
        ),
        category="hypothetical_framing",
        confidence=0.78,
        description="Educational framing for harmful requests",
    ),
    # =========================================================================
    # 8. MULTILINGUAL ATTACKS (13 languages)
    # =========================================================================
    # NOTE(review): in the Spanish, French, Portuguese and Italian patterns the
    # trailing "\b(...)?\b" group is optional, so it never restricts a match;
    # verb + noun alone is sufficient.
    InjectionPattern(
        name="spanish_injection",
        pattern=re.compile(
            r"\b(ignora|olvida|descarta|omite)\b.{0,30}"
            r"\b(instrucciones?|reglas?|sistema|directrices?)\b.{0,20}"
            r"\b(anteriores?|previas?|del\s+sistema)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Spanish language injection attempts",
    ),
    InjectionPattern(
        name="german_injection",
        pattern=re.compile(
            r"\b(ignoriere|vergiss|missachte|überspringe)\b.{0,30}"
            r"\b(anweisungen?|regeln?|instruktionen?|befehle?|richtlinien?)\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="German language injection attempts",
    ),
    InjectionPattern(
        name="french_injection",
        pattern=re.compile(
            r"\b(ignore[zr]?|oublie[zr]?|n\u00e9glige[zr]?)\b.{0,30}"
            r"\b(instructions?|r[e\u00e8]gles?|consignes?|directives?)\b.{0,20}"
            r"\b(pr[\u00e9e]c[\u00e9e]dentes?|ant[\u00e9e]rieures?|du\s+syst[\u00e8e]me)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="French language injection attempts",
    ),
    InjectionPattern(
        name="chinese_injection",
        pattern=re.compile(
            r"(\u5ffd\u7565|\u65e0\u89c6|\u5fd8\u8bb0|\u8df3\u8fc7|\u653e\u5f03).{0,10}"
            r"(\u4e4b\u524d\u7684|\u4ee5\u524d\u7684|\u5148\u524d\u7684|\u7cfb\u7edf)?"
            r"(\u6307\u4ee4|\u89c4\u5219|\u8bf4\u660e|\u6307\u793a|\u547d\u4ee4)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Chinese language injection attempts",
    ),
    # NOTE(review): this pattern is written in Cyrillic, while
    # normalize_unicode() rewrites Cyrillic lookalikes (а, с, е, о, р, у, х)
    # to Latin letters, so it can only match the un-normalized text.
    InjectionPattern(
        name="russian_injection",
        pattern=re.compile(
            r"(\u0438\u0433\u043d\u043e\u0440\u0438\u0440\u0443\u0439|\u0437\u0430\u0431\u0443\u0434\u044c|\u043f\u0440\u043e\u043f\u0443\u0441\u0442\u0438|\u043e\u0442\u0431\u0440\u043e\u0441\u044c).{0,40}"
            r"(\u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438[\u0438\u0439\u044e\u044f\u0435]|\u043f\u0440\u0430\u0432\u0438\u043b\u0430?|\u0443\u043a\u0430\u0437\u0430\u043d\u0438[\u044f\u0435]|\u043a\u043e\u043c\u0430\u043d\u0434[\u044b\u0443])",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Russian language injection attempts",
    ),
    InjectionPattern(
        name="arabic_injection",
        pattern=re.compile(
            r"(\u062a\u062c\u0627\u0647\u0644|\u0627\u0646\u0633\u0649|\u0627\u0647\u0645\u0644|\u062a\u062e\u0637\u0649).{0,20}"
            r"(\u0627\u0644\u062a\u0639\u0644\u064a\u0645\u0627\u062a|\u0627\u0644\u0642\u0648\u0627\u0639\u062f|\u0627\u0644\u0623\u0648\u0627\u0645\u0631|\u0627\u0644\u0646\u0638\u0627\u0645)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Arabic language injection attempts",
    ),
    InjectionPattern(
        name="portuguese_injection",
        pattern=re.compile(
            r"\b(ignore|ignora|esque\u00e7a|descarte|pule)\b.{0,30}"
            r"\b(instru\u00e7\u00f5es?|regras?|diretrizes?|comandos?)\b.{0,20}"
            r"\b(anteriores?|pr\u00e9vias?|do\s+sistema)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Portuguese language injection attempts",
    ),
    InjectionPattern(
        name="japanese_injection",
        pattern=re.compile(
            r"(\u4ee5\u524d\u306e|\u524d\u306e|\u30b7\u30b9\u30c6\u30e0\u306e)?(\u6307\u793a|\u30eb\u30fc\u30eb|\u547d\u4ee4|\u6307\u4ee4).{0,5}(\u3092)?(\u7121\u8996|\u5fd8\u308c|\u30b9\u30ad\u30c3\u30d7|\u7121\u52b9\u306b)|"
            r"(\u7121\u8996|\u5fd8\u308c|\u30b9\u30ad\u30c3\u30d7).{0,10}(\u6307\u793a|\u30eb\u30fc\u30eb|\u547d\u4ee4)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Japanese language injection attempts",
    ),
    InjectionPattern(
        name="korean_injection",
        pattern=re.compile(
            r"(\uc774\uc804|\uc2dc\uc2a4\ud15c)?.{0,5}(\uc9c0\uc2dc|\uaddc\uce59|\uba85\ub839|\uc9c0\uce68).{0,5}(\ub97c|\uc744)?.{0,5}(\ubb34\uc2dc|\uc78a\uc5b4|\uac74\ub108\ub6f0|\ubb34\ud6a8)|"
            r"(\ubb34\uc2dc|\uc78a\uc5b4).{0,10}(\uc9c0\uc2dc|\uaddc\uce59|\uba85\ub839)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Korean language injection attempts",
    ),
    InjectionPattern(
        name="italian_injection",
        pattern=re.compile(
            r"\b(ignora|dimentica|tralascia|salta)\b.{0,30}"
            r"\b(istruzioni?|regole?|direttive?|comandi?)\b.{0,20}"
            r"\b(precedenti?|del\s+sistema)?\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Italian language injection attempts",
    ),
    InjectionPattern(
        name="dutch_injection",
        pattern=re.compile(
            r"\b(negeer|vergeet|sla\s+over|negeren)\b.{0,30}"
            r"\b(instructies?|regels?|aanwijzingen?|opdrachten?)\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Dutch language injection attempts",
    ),
    InjectionPattern(
        name="polish_injection",
        pattern=re.compile(
            r"\b(zignoruj|zapomnij|pomi\u0144|odrzu\u0107)\b.{0,30}"
            r"\b(instrukcj[eai]|regu\u0142[y\u0119]|polece\u0144|zasad[y\u0119])\b",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Polish language injection attempts",
    ),
    InjectionPattern(
        name="turkish_injection",
        pattern=re.compile(
            r"(talimat|kural|y\u00f6nerge|komut)\w*.{0,20}(yoksay|unut|atla|g\u00f6rmezden)|"
            r"(\u00f6nceki|eski).{0,20}(talimat|kural|y\u00f6nerge).{0,10}(yoksay|unut|atla)",
            re.IGNORECASE,
        ),
        category="multilingual_injection",
        confidence=0.90,
        description="Turkish language injection attempts",
    ),
    # =========================================================================
    # 9. INDIRECT INJECTION ATTACKS
    # =========================================================================
    InjectionPattern(
        name="trigger_instruction",
        pattern=re.compile(
            r"\b(when|if|once|after)\s+(you|the\s+(ai|assistant|model|system))\s+"
            r"(see|read|encounter|find|process|receive)\b.{0,50}"
            r"\b(execute|run|do|perform|follow|output)\b",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.85,
        description="Planted trigger-based instructions",
    ),
    InjectionPattern(
        name="hidden_instruction_marker",
        pattern=re.compile(
            r"\[\s*(HIDDEN|INVISIBLE|SECRET|IGNORE\s+THIS|FOR\s+AI\s+ONLY|"
            r"AI\s+INSTRUCTION|SYSTEM\s+OVERRIDE|INSTRUCTION|DO\s+NOT\s+DISPLAY)\s*[:\]]|"
            r"<!--\s*(ignore|instruction|system|hidden|ai\s+only)",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.92,
        description="Markers indicating hidden instructions",
    ),
    InjectionPattern(
        name="data_field_injection",
        pattern=re.compile(
            r"(description|summary|bio|about|notes?|comments?|title|name)\s*"
            r"[\"':=].{0,50}(ignore|disregard|forget|you\s+are\s+now|new\s+instructions)",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.82,
        description="Injection hidden in data fields",
    ),
    InjectionPattern(
        name="invisible_text_marker",
        pattern=re.compile(
            r"(color|background|font-size)\s*:\s*(white|transparent|0|hidden)|"
            r"display\s*:\s*none|visibility\s*:\s*hidden|"
            r"position\s*:\s*absolute.{0,30}(left|top)\s*:\s*-\d{4,}",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.80,
        description="CSS hiding techniques for invisible text",
    ),
    InjectionPattern(
        name="ai_addressing",
        pattern=re.compile(
            r"\b(attention|hey|hello|dear)\s+(ai|assistant|model|chatbot|gpt|claude|llm)\b.{0,30}"
            r"\b(ignore|disregard|forget|override)\b|"
            r"\b(note\s+to\s+(self|ai|assistant)|internal\s+note)\b.{0,30}"
            r"\b(ignore|override|execute)\b",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.85,
        description="Direct addressing of AI in injected content",
    ),
    InjectionPattern(
        name="instruction_in_url",
        pattern=re.compile(
            r"(https?://|www\.)[^\s]*"
            r"(ignore|jailbreak|bypass|prompt|inject|override|system)",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.75,
        description="Injection keywords hidden in URLs",
    ),
    InjectionPattern(
        name="document_boundary_attack",
        pattern=re.compile(
            r"\b(end\s+of\s+(document|file|content|input)|document\s+ends?\s+here)\b.{0,30}"
            r"\b(new\s+instructions?|real\s+task|actual\s+prompt|system\s+override)\b",
            re.IGNORECASE,
        ),
        category="indirect_injection",
        confidence=0.88,
        description="Fake document boundaries with new instructions",
    ),
]
736
+
737
+
738
class RulesDetector:
    """Fast regex-based detector for common prompt injection patterns.

    This is Layer 1 of the detection cascade - designed to catch
    obvious attacks quickly and cheaply before more expensive layers.

    Features:
    - Unicode normalization to catch homoglyph attacks
    - 50+ patterns covering 9 attack categories
    - 13 language support for multilingual attacks
    - Indirect injection detection

    Both ``detect`` and ``get_all_matches`` scan the text as given and, when
    normalization is enabled and changes the text, the normalized form as
    well. Scanning both matters because normalization rewrites Cyrillic/Greek
    lookalike letters to Latin, which would otherwise hide input that only a
    native-script pattern can match.
    """

    def __init__(self, normalize: bool = True) -> None:
        """Initialize the detector with the predefined patterns.

        Args:
            normalize: Whether to apply Unicode normalization before detection.
        """
        # The module-level list is referenced, not copied.
        self.patterns = INJECTION_PATTERNS
        self.normalize = normalize

    @staticmethod
    def _texts_to_scan(text: str, normalize: bool) -> list[str]:
        """Return the text variants to scan, the original text first.

        The normalized form is appended only when normalization is requested
        and actually changes the text, so unchanged input is scanned once.
        """
        candidates = [text]
        if normalize:
            normalized_text = normalize_unicode(text)
            if normalized_text != text:
                candidates.append(normalized_text)
        return candidates

    def detect(self, text: str) -> LayerResult:
        """Check text against all patterns and report the strongest hit.

        The hit with the highest pattern confidence wins; on ties the first
        one found is kept (original text before normalized text, then
        pattern list order).

        Args:
            text: The input text to analyze.

        Returns:
            LayerResult with detection outcome. On a hit, ``details`` holds
            ``pattern_name``, ``matched_length``, ``matched_position``,
            ``description``, ``normalized`` (whether normalization is enabled
            on this detector) and ``matched_normalized_text`` (whether the
            reported position/length refer to the normalized text rather than
            the original). If scanning raises, a non-injection result is
            returned with ``error`` set to the exception message.
        """
        start_time = time.perf_counter()

        try:
            best_pattern: InjectionPattern | None = None
            best_match: re.Match[str] | None = None
            best_confidence = 0.0
            matched_normalized_text = False

            candidates = self._texts_to_scan(text, self.normalize)
            for index, check_text in enumerate(candidates):
                for pattern in self.patterns:
                    # A pattern that cannot beat the current best would be
                    # discarded anyway, so skip the regex search entirely.
                    if pattern.confidence <= best_confidence:
                        continue
                    match = pattern.pattern.search(check_text)
                    if match:
                        best_pattern = pattern
                        best_match = match
                        best_confidence = pattern.confidence
                        matched_normalized_text = index > 0

            latency_ms = (time.perf_counter() - start_time) * 1000

            if best_pattern is not None and best_match is not None:
                return LayerResult(
                    is_injection=True,
                    confidence=best_pattern.confidence,
                    attack_type=best_pattern.category,
                    layer=1,
                    latency_ms=latency_ms,
                    details={
                        "pattern_name": best_pattern.name,
                        "matched_length": len(best_match.group(0)),
                        "matched_position": best_match.start(),
                        "description": best_pattern.description,
                        "normalized": self.normalize,
                        "matched_normalized_text": matched_normalized_text,
                    },
                )

            return LayerResult(
                is_injection=False,
                confidence=0.0,
                attack_type=None,
                layer=1,
                latency_ms=latency_ms,
                details=None,
            )

        except Exception as e:
            # Deliberate best-effort: this layer never raises. The failure is
            # surfaced through ``error`` so callers can tell "clean" apart
            # from "could not be checked".
            latency_ms = (time.perf_counter() - start_time) * 1000
            return LayerResult(
                is_injection=False,
                confidence=0.0,
                attack_type=None,
                layer=1,
                latency_ms=latency_ms,
                details=None,
                error=str(e),
            )

    def get_all_matches(
        self, text: str, normalize: bool | None = None
    ) -> list[tuple[InjectionPattern, re.Match[str]]]:
        """Get all matching patterns for analysis/debugging.

        Each pattern is reported at most once. The original text is tried
        first and the normalized text second, mirroring ``detect``, so a
        match object may refer to either text.

        Args:
            text: The input text to analyze.
            normalize: Override instance normalize setting.

        Returns:
            List of (pattern, match) tuples for all patterns that matched,
            in pattern list order.
        """
        should_normalize = self.normalize if normalize is None else normalize
        candidates = self._texts_to_scan(text, should_normalize)

        matches: list[tuple[InjectionPattern, re.Match[str]]] = []
        for pattern in self.patterns:
            for check_text in candidates:
                match = pattern.pattern.search(check_text)
                if match:
                    matches.append((pattern, match))
                    break  # one entry per pattern
        return matches
850
+
851
+
852
# Names exported by ``from <module> import *``.
__all__ = [
    "RulesDetector",
    "InjectionPattern",
    "INJECTION_PATTERNS",
    "normalize_unicode",
]