stackone-defender 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ """stackone-defender: Prompt injection defense for AI tool-calling.
2
+
3
+ Usage:
4
+ from stackone_defender import create_prompt_defense
5
+
6
+ defense = create_prompt_defense(enable_tier2=True)
7
+ defense.warmup_tier2()
8
+
9
+ result = defense.defend_tool_result(tool_output, "gmail_get_message")
10
+ if not result.allowed:
11
+ print(f"Blocked: {result.risk_level}")
12
+ """
13
+
14
+ from .core.prompt_defense import PromptDefense, create_prompt_defense
15
+ from .types import DefenseResult, RiskLevel, Tier1Result, ToolSanitizationRule
16
+
17
# Explicit public API: limits `from stackone_defender import *` to these
# names. Kept ASCII-sorted (uppercase class names before the lowercase
# factory function).
__all__ = [
    "DefenseResult",
    "PromptDefense",
    "RiskLevel",
    "Tier1Result",
    "ToolSanitizationRule",
    "create_prompt_defense",
]
@@ -0,0 +1,12 @@
1
+ """Classifiers for prompt injection detection."""
2
+
3
+ from .pattern_detector import PatternDetector, create_pattern_detector
4
+ from .patterns import ALL_PATTERNS, FAST_FILTER_KEYWORDS, contains_filter_keywords
5
+
6
# Explicit public API of the classifiers subpackage; limits star-imports
# to these names. Kept ASCII-sorted.
__all__ = [
    "ALL_PATTERNS",
    "FAST_FILTER_KEYWORDS",
    "PatternDetector",
    "contains_filter_keywords",
    "create_pattern_detector",
]
@@ -0,0 +1,95 @@
1
+ """ONNX classifier for fine-tuned MiniLM prompt injection detection.
2
+
3
+ Pipeline: text -> tokenizer -> ONNX Runtime -> logit -> sigmoid -> score
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import math
9
+ from pathlib import Path
10
+
11
+
12
+ def _default_model_path() -> str:
13
+ """Return path to the bundled ONNX model directory."""
14
+ return str(Path(__file__).resolve().parent.parent / "models" / "minilm-full-aug")
15
+
16
+
17
+ def _sigmoid(x: float) -> float:
18
+ return 1.0 / (1.0 + math.exp(-x))
19
+
20
+
21
class OnnxClassifier:
    """ONNX classifier for fine-tuned MiniLM prompt-injection models.

    Pipeline: text -> tokenizer -> ONNX Runtime -> logit -> sigmoid -> score.

    The heavy dependencies (numpy, onnxruntime, tokenizers) are imported
    lazily so the package remains importable without the optional
    ``[onnx]`` extra.
    """

    def __init__(self, model_path: str | None = None):
        # Directory expected to contain tokenizer.json and model_quantized.onnx.
        self._model_path = model_path or _default_model_path()
        self._session = None  # onnxruntime.InferenceSession once loaded
        self._tokenizer = None  # tokenizers.Tokenizer once loaded
        self._max_length = 256  # truncation/padding length in tokens
        self._load_failed = False  # sticky: don't retry a failed dependency import

    def load_model(self, model_path: str | None = None) -> None:
        """Load the tokenizer and ONNX session (idempotent for the same path).

        Args:
            model_path: Optional model directory override. Fix: previously a
                *different* path passed after a successful load was recorded
                but silently ignored; now the current session/tokenizer are
                dropped so the new model is actually loaded.

        Raises:
            ImportError: if the optional ONNX dependencies are not installed.
        """
        if model_path:
            if model_path != self._model_path:
                # Switching models: unload the stale session/tokenizer.
                self._session = None
                self._tokenizer = None
            self._model_path = model_path
        if self._session is not None and self._tokenizer is not None:
            return  # already loaded for the current path
        if self._load_failed:
            raise ImportError("ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]")
        self._load_model()

    def _load_model(self) -> None:
        """Import the optional dependencies and build tokenizer + session."""
        try:
            import numpy as np  # noqa: F401  (asserts numpy availability up front)
            import onnxruntime as ort
            from tokenizers import Tokenizer
        except ImportError as e:
            self._load_failed = True
            raise ImportError(
                "ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]"
            ) from e

        tokenizer_path = str(Path(self._model_path) / "tokenizer.json")
        self._tokenizer = Tokenizer.from_file(tokenizer_path)
        # Fixed-size inputs: truncate and pad every encoding to _max_length.
        self._tokenizer.enable_truncation(max_length=self._max_length)
        self._tokenizer.enable_padding(length=self._max_length)

        onnx_path = str(Path(self._model_path) / "model_quantized.onnx")
        self._session = ort.InferenceSession(onnx_path)

    def classify(self, text: str) -> float:
        """Classify a single text, returning a sigmoid score in [0, 1]."""
        self._ensure_loaded()
        import numpy as np

        encoding = self._tokenizer.encode(text)
        input_ids = np.array([encoding.ids], dtype=np.int64)
        attention_mask = np.array([encoding.attention_mask], dtype=np.int64)

        # Single-logit model: index as [output][batch][logit].
        results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
        logit = float(results[0][0][0])
        return _sigmoid(logit)

    def classify_batch(self, texts: list[str]) -> list[float]:
        """Classify multiple texts in one ONNX run; scores align with input order."""
        if not texts:
            return []
        self._ensure_loaded()
        import numpy as np

        encodings = self._tokenizer.encode_batch(texts)
        input_ids = np.array([e.ids for e in encodings], dtype=np.int64)
        attention_mask = np.array([e.attention_mask for e in encodings], dtype=np.int64)

        results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
        logits = results[0]
        return [_sigmoid(float(logits[i][0])) for i in range(len(texts))]

    def warmup(self) -> None:
        """Eagerly load the model so the first classify() call is fast."""
        self.load_model()

    def is_loaded(self) -> bool:
        """Return True when both the tokenizer and the ONNX session are ready."""
        return self._session is not None and self._tokenizer is not None

    def _ensure_loaded(self) -> None:
        # Lazy-load on first classification.
        if not self.is_loaded():
            self.load_model()
@@ -0,0 +1,223 @@
1
+ """Tier 1: Pattern Detection.
2
+
3
+ Fast, regex-based detection of known injection patterns.
4
+ Target latency: < 1-2ms per field.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ import re
11
+ import time
12
+
13
+ from ..types import PatternDefinition, PatternMatch, RiskLevel, StructuralFlag, Tier1Result
14
+ from .patterns import ALL_PATTERNS, contains_filter_keywords
15
+
16
# Default tuning knobs for PatternDetector; any key can be overridden via
# the `config` dict passed to PatternDetector / create_pattern_detector.
DEFAULT_DETECTOR_CONFIG = {
    "use_fast_filter": True,  # skip the full regex pass when no filter keyword is present
    "max_analysis_length": 50000,  # only the first N chars are pattern-scanned
    "entropy_threshold": 4.5,  # Shannon entropy (bits/char, log2-based) flag threshold
    "entropy_min_length": 50,  # entropy check skipped for shorter texts
    "max_field_length": 100000,  # longer fields raise an "excessive_length" flag
}
23
+
24
+
25
class PatternDetector:
    """Tier 1 pattern detector: fast regex matches plus structural heuristics.

    ``analyze`` scans a text against the configured injection patterns,
    collects structural red flags (excessive length, high entropy, nested
    role markers, suspicious formatting) and derives a suggested risk level.
    Target latency: < 1-2ms per field.
    """

    # Structural-analysis regexes, compiled once at class creation.
    # (Fix: previously recompiled on every call inside the static helpers.)
    _SUSPICIOUS_XML_RE = re.compile(
        r"</?(?:system|user|assistant|instruction|prompt|admin|developer)[^>]*>", re.I
    )
    _ANY_TAG_RE = re.compile(r"<[a-zA-Z][^>]*>")
    _MARKER_WORD_RE = re.compile(r"system|user|assistant|instruction|prompt", re.I)
    _DOUBLE_BRACKET_RE = re.compile(r"\[\[.*?(?:system|instruction|ignore).*?\]\]", re.I)
    _NEWLINE_RUN_RE = re.compile(r"\n{3,}(?:system|instruction|ignore|forget)", re.I)
    _HEADING_RE = re.compile(r"^#{1,3}\s*(?:system|instruction|new rules)", re.I | re.M)
    _RULE_LINE_RE = re.compile(r"[-=]{3,}\s*\n\s*(?:system|instruction|ignore)", re.I)

    def __init__(self, config: dict | None = None, custom_patterns: list[PatternDefinition] | None = None):
        """Create a detector.

        Args:
            config: Optional overrides for DEFAULT_DETECTOR_CONFIG keys.
            custom_patterns: Extra patterns appended to the built-in set.
                Their presence disables the keyword fast filter so they are
                always evaluated.
        """
        cfg = dict(DEFAULT_DETECTOR_CONFIG)
        if config:
            cfg.update(config)
        self._use_fast_filter = cfg["use_fast_filter"]
        self._max_analysis_length = cfg["max_analysis_length"]
        self._entropy_threshold = cfg["entropy_threshold"]
        self._entropy_min_length = cfg["entropy_min_length"]
        self._max_field_length = cfg["max_field_length"]
        self._patterns: list[PatternDefinition] = list(ALL_PATTERNS)
        self._has_custom = False
        if custom_patterns:
            self._patterns.extend(custom_patterns)
            self._has_custom = True

    def analyze(self, text: str) -> Tier1Result:
        """Run Tier 1 analysis and return matches, flags and a suggested risk."""
        start = time.perf_counter()

        # Trivially short inputs cannot carry an injection payload.
        if not text or len(text) < 3:
            return self._empty_result(start)

        original_length = len(text)
        # Cap the regex workload; the length flag still sees the full length
        # via `original_length`.
        analysis_text = text[: self._max_analysis_length]

        # The fast keyword filter only covers the built-in patterns, so it is
        # bypassed whenever custom patterns are registered.
        if self._use_fast_filter and not self._has_custom and not contains_filter_keywords(analysis_text):
            flags = self._detect_structural_issues(analysis_text, original_length)
            return self._create_result([], flags, start)

        matches = self._detect_patterns(analysis_text)
        flags = self._detect_structural_issues(analysis_text, original_length)
        return self._create_result(matches, flags, start)

    # ------------------------------------------------------------------
    # Pattern detection
    # ------------------------------------------------------------------

    def _detect_patterns(self, text: str) -> list[PatternMatch]:
        """Return every occurrence of every configured pattern in `text`."""
        matches: list[PatternMatch] = []
        for defn in self._patterns:
            # finditer reports all non-overlapping occurrences (global scan).
            for m in defn.pattern.finditer(text):
                matches.append(
                    PatternMatch(
                        pattern=defn.id,
                        matched=m.group(0),
                        position=m.start(),
                        category=defn.category,
                        severity=defn.severity,
                    )
                )
        return matches

    # ------------------------------------------------------------------
    # Structural analysis
    # ------------------------------------------------------------------

    def _detect_structural_issues(self, text: str, original_length: int | None = None) -> list[StructuralFlag]:
        """Collect structural red flags independent of the pattern set."""
        flags: list[StructuralFlag] = []
        length_to_check = original_length if original_length is not None else len(text)

        if length_to_check > self._max_field_length:
            flags.append(
                StructuralFlag(
                    type="excessive_length",
                    details=f"Field length {length_to_check} exceeds maximum {self._max_field_length}",
                    severity="medium",
                )
            )

        # Entropy is noisy on short strings, so only longer texts are checked.
        if len(text) >= self._entropy_min_length:
            entropy = self._calculate_entropy(text)
            if entropy > self._entropy_threshold:
                flags.append(
                    StructuralFlag(
                        type="high_entropy",
                        details=f"Entropy {entropy:.2f} exceeds threshold {self._entropy_threshold}",
                        severity="medium",
                    )
                )

        if self._has_nested_markers(text):
            flags.append(
                StructuralFlag(
                    type="nested_markers",
                    details="Suspicious nested XML tags or bracket patterns detected",
                    severity="medium",
                )
            )

        if self._has_suspicious_formatting(text):
            flags.append(
                StructuralFlag(
                    type="suspicious_formatting",
                    details="Unusual formatting patterns detected",
                    severity="low",
                )
            )

        return flags

    @staticmethod
    def _calculate_entropy(text: str) -> float:
        """Shannon entropy of `text` in bits per character (log2-based)."""
        freq: dict[str, int] = {}
        for ch in text:
            freq[ch] = freq.get(ch, 0) + 1
        length = len(text)
        entropy = 0.0
        for count in freq.values():
            p = count / length
            entropy -= p * math.log2(p)
        return entropy

    @classmethod
    def _has_nested_markers(cls, text: str) -> bool:
        """True when the text embeds role/instruction markup."""
        # Two or more explicit role/instruction tags.
        if len(cls._SUSPICIOUS_XML_RE.findall(text)) >= 2:
            return True

        # Many generic tags where at least one names a role marker.
        xml_tags = cls._ANY_TAG_RE.findall(text)
        if len(xml_tags) > 4 and any(cls._MARKER_WORD_RE.search(t) for t in xml_tags):
            return True

        # [[ ... system/instruction/ignore ... ]] bracket blocks.
        return bool(cls._DOUBLE_BRACKET_RE.search(text))

    @classmethod
    def _has_suspicious_formatting(cls, text: str) -> bool:
        """True for layout tricks that often precede injected instructions."""
        return bool(
            cls._NEWLINE_RUN_RE.search(text)
            or cls._HEADING_RE.search(text)
            or cls._RULE_LINE_RE.search(text)
        )

    # ------------------------------------------------------------------
    # Risk calculation
    # ------------------------------------------------------------------

    @staticmethod
    def _calculate_suggested_risk(matches: list[PatternMatch], flags: list[StructuralFlag]) -> RiskLevel:
        """Map match/flag severity counts onto a coarse risk level.

        NOTE(review): _detect_structural_issues only emits "medium"/"low"
        flags, so `high_flags` only matters if flag sources are extended.
        """
        high_matches = sum(1 for m in matches if m.severity == "high")
        medium_matches = sum(1 for m in matches if m.severity == "medium")
        high_flags = sum(1 for f in flags if f.severity == "high")
        medium_flags = sum(1 for f in flags if f.severity == "medium")

        if high_matches >= 2 or (high_matches >= 1 and high_flags >= 1):
            return "critical"
        if high_matches >= 1 or medium_matches >= 3 or (medium_matches >= 2 and medium_flags >= 1):
            return "high"
        if medium_matches >= 1 or high_flags >= 1 or medium_flags >= 2:
            return "medium"
        # Everything else -- including low-severity-only detections -- is "low".
        return "low"

    # ------------------------------------------------------------------
    # Result helpers
    # ------------------------------------------------------------------

    def _create_result(self, matches: list[PatternMatch], flags: list[StructuralFlag], start: float) -> Tier1Result:
        """Package matches and flags into a Tier1Result with elapsed latency."""
        return Tier1Result(
            matches=matches,
            structural_flags=flags,
            has_detections=bool(matches) or bool(flags),
            suggested_risk=self._calculate_suggested_risk(matches, flags),
            latency_ms=(time.perf_counter() - start) * 1000,
        )

    @staticmethod
    def _empty_result(start: float) -> Tier1Result:
        """Clean Tier1Result for texts too short to analyze."""
        return Tier1Result(
            matches=[],
            structural_flags=[],
            has_detections=False,
            suggested_risk="low",
            latency_ms=(time.perf_counter() - start) * 1000,
        )

    def add_pattern(self, pattern: PatternDefinition) -> None:
        """Register an additional pattern at runtime.

        Fix: also disables the keyword fast filter (mirroring how __init__
        treats `custom_patterns`), so the new pattern cannot be skipped by
        the built-in keyword pre-filter.
        """
        self._patterns.append(pattern)
        self._has_custom = True

    def get_patterns(self) -> list[PatternDefinition]:
        """Return a defensive copy of the active pattern list."""
        return list(self._patterns)
220
+
221
+
222
def create_pattern_detector(config: dict | None = None, custom_patterns: list[PatternDefinition] | None = None) -> PatternDetector:
    """Factory: build a PatternDetector with optional config overrides and extra patterns."""
    detector = PatternDetector(config=config, custom_patterns=custom_patterns)
    return detector
@@ -0,0 +1,170 @@
1
+ """Shared injection pattern definitions.
2
+
3
+ These patterns are used by both Tier 1 classification and sanitization.
4
+ Single source of truth for pattern matching.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+
11
+ from ..types import PatternDefinition
12
+
13
# ---------------------------------------------------------------------------
# Role markers
# ---------------------------------------------------------------------------
# NOTE(review): PatternDefinition fields appear positionally to be
# (id, pattern, category, severity, description) — confirm against ..types.
# Anchored (^) markers only fire at the very start of the text; the
# XML-style tags can match anywhere.
ROLE_MARKER_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("role_system", re.compile(r"^SYSTEM:\s*", re.I), "role_marker", "high", "System role marker at start of text"),
    PatternDefinition("role_assistant", re.compile(r"^ASSISTANT:\s*", re.I), "role_marker", "high", "Assistant role marker at start of text"),
    PatternDefinition("role_user", re.compile(r"^USER:\s*", re.I), "role_marker", "medium", "User role marker at start of text"),
    PatternDefinition("role_developer", re.compile(r"^DEVELOPER:\s*", re.I), "role_marker", "high", "Developer role marker at start of text"),
    PatternDefinition("role_admin", re.compile(r"^ADMIN(?:ISTRATOR)?:\s*", re.I), "role_marker", "high", "Admin role marker at start of text"),
    PatternDefinition("role_instruction", re.compile(r"^INSTRUCTIONS?:\s*", re.I), "role_marker", "high", "Instruction marker at start of text"),
    PatternDefinition("role_human", re.compile(r"^HUMAN:\s*", re.I), "role_marker", "medium", "Human role marker at start of text"),
    PatternDefinition("role_ai", re.compile(r"^AI:\s*", re.I), "role_marker", "medium", "AI role marker at start of text"),
    # Bracketed variants
    PatternDefinition("role_system_bracket", re.compile(r"^\[SYSTEM\]", re.I), "role_marker", "high", "Bracketed system role marker"),
    PatternDefinition("role_inst_bracket", re.compile(r"^\[INST\]", re.I), "role_marker", "high", "Bracketed instruction marker (Llama format)"),
    # XML-style variants
    PatternDefinition("role_system_xml", re.compile(r"<system>", re.I), "role_marker", "high", "XML-style system tag"),
    PatternDefinition("role_assistant_xml", re.compile(r"<assistant>", re.I), "role_marker", "medium", "XML-style assistant tag"),
]

# ---------------------------------------------------------------------------
# Instruction overrides
# ---------------------------------------------------------------------------
# Direct attempts to cancel, replace or rewrite the model's instructions.
INSTRUCTION_OVERRIDE_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("ignore_previous", re.compile(r"ignore\s+(?:all\s+)?(?:previous|prior|earlier|above)\s+(?:instructions?|prompts?|rules?|guidelines?|directions?)", re.I), "instruction_override", "high", "Attempt to ignore previous instructions"),
    PatternDefinition("forget_previous", re.compile(r"forget\s+(?:all\s+)?(?:(?:previous|prior|earlier|above)\s+)?(?:instructions?|prompts?|rules?|context|guidelines?)", re.I), "instruction_override", "high", "Attempt to make AI forget instructions"),
    PatternDefinition("disregard_previous", re.compile(r"disregard\s+(?:all\s+)?(?:previous|prior|earlier|above)\s+(?:instructions?|prompts?|rules?)", re.I), "instruction_override", "high", "Attempt to disregard instructions"),
    PatternDefinition("override_instructions", re.compile(r"override\s+(?:the\s+)?(?:system\s+)?(?:prompt|instructions?|rules?|guidelines?)", re.I), "instruction_override", "high", "Direct override attempt"),
    PatternDefinition("new_instructions", re.compile(r"new\s+instructions?:\s*", re.I), "instruction_override", "high", "Attempt to inject new instructions"),
    PatternDefinition("updated_instructions", re.compile(r"(?:updated?|revised?|changed?)\s+instructions?:\s*", re.I), "instruction_override", "high", "Attempt to update instructions"),
    PatternDefinition("stop_being", re.compile(r"stop\s+being\s+(?:a\s+)?(?:helpful|assistant|ai|chatbot)", re.I), "instruction_override", "medium", "Attempt to change AI behavior"),
    PatternDefinition("from_now_on", re.compile(r"from\s+now\s+on,?\s+(?:you\s+)?(?:will|must|should|are)", re.I), "instruction_override", "medium", "Attempt to set new behavior"),
]

# ---------------------------------------------------------------------------
# Role assumption
# ---------------------------------------------------------------------------
# Attempts to re-cast the assistant into another (often privileged) persona.
# Roleplay/imagination prompts are kept at "low" severity since they have
# many benign uses.
ROLE_ASSUMPTION_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("you_are_now", re.compile(r"you\s+are\s+now\s+(?:a\s+)?(?:different|new|the|my)?", re.I), "role_assumption", "high", "Attempt to assign new role"),
    PatternDefinition("act_as", re.compile(r"act\s+(?:as|like)\s+(?:a\s+)?(?:system|admin|developer|root|superuser)", re.I), "role_assumption", "high", "Attempt to make AI act as privileged role"),
    PatternDefinition("pretend_to_be", re.compile(r"pretend\s+(?:to\s+be|you\s+are)\s+(?:a\s+)?", re.I), "role_assumption", "medium", "Attempt to make AI pretend"),
    PatternDefinition("roleplay_as", re.compile(r"roleplay\s+(?:as|like)\s+(?:a\s+)?", re.I), "role_assumption", "low", "Roleplay request (lower severity)"),
    PatternDefinition("imagine_you_are", re.compile(r"imagine\s+(?:that\s+)?you\s+are\s+(?:a\s+)?", re.I), "role_assumption", "low", "Imagination prompt (lower severity)"),
    PatternDefinition("jailbreak_dan", re.compile(r"\bDAN\b.*?(?:do\s+anything|jailbreak)", re.I), "role_assumption", "high", "DAN jailbreak attempt"),
    PatternDefinition("developer_mode", re.compile(r"developer\s+mode\s+(?:is\s+)?(?:now\s+)?(?:enabled?|activated?|on)", re.I), "role_assumption", "high", "Developer mode activation attempt"),
]

# ---------------------------------------------------------------------------
# Security bypass
# ---------------------------------------------------------------------------
SECURITY_BYPASS_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("bypass_security", re.compile(r"bypass\s+(?:the\s+)?(?:security|safety|guardrails?|filters?|restrictions?)", re.I), "security_bypass", "high", "Direct security bypass attempt"),
    PatternDefinition("disable_safety", re.compile(r"disable\s+(?:the\s+)?(?:safety|security|guardrails?|filters?|restrictions?)", re.I), "security_bypass", "high", "Attempt to disable safety features"),
    PatternDefinition("ignore_safety", re.compile(r"ignore\s+(?:the\s+)?(?:safety|security|ethical)\s+(?:guidelines?|rules?|restrictions?)", re.I), "security_bypass", "high", "Attempt to ignore safety guidelines"),
    PatternDefinition("no_restrictions", re.compile(r"(?:without|no)\s+(?:any\s+)?(?:restrictions?|limitations?|guardrails?|filters?)", re.I), "security_bypass", "medium", "Request for unrestricted response"),
    PatternDefinition("uncensored", re.compile(r"(?:uncensored|unfiltered|unrestricted)\s*(?:mode|response|output|version)?", re.I), "security_bypass", "high", "Request for uncensored mode"),
]

# ---------------------------------------------------------------------------
# Command execution
# ---------------------------------------------------------------------------
COMMAND_EXECUTION_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("execute_command", re.compile(r"execute\s+(?:the\s+)?(?:following|this|these)\s+(?:command|instruction|code)", re.I), "command_execution", "high", "Command execution instruction"),
    PatternDefinition("run_code", re.compile(r"run\s+(?:the\s+)?(?:following|this|these)\s+(?:code|script|command)", re.I), "command_execution", "high", "Code execution instruction"),
    PatternDefinition("eval_expression", re.compile(r"eval(?:uate)?\s*\(", re.I), "command_execution", "medium", "Eval function pattern"),
    # Case-sensitive on purpose: shell syntax, not natural language.
    PatternDefinition("shell_command", re.compile(r"\$\([^)]+\)|`[^`]+`"), "command_execution", "medium", "Shell command substitution"),
]

# ---------------------------------------------------------------------------
# Encoding suspicious
# ---------------------------------------------------------------------------
# Obfuscation tricks used to smuggle injection text past keyword filters.
ENCODING_SUSPICIOUS_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("base64_instruction", re.compile(r"(?:decode|base64)\s*[:(]\s*[A-Za-z0-9+/=]{20,}", re.I), "encoding_suspicious", "high", "Base64 encoded content with decode instruction"),
    PatternDefinition("hex_escape_sequence", re.compile(r"(?:\\x[0-9a-fA-F]{2}){4,}"), "encoding_suspicious", "medium", "Hex escape sequence (potential obfuscation)"),
    PatternDefinition("unicode_escape_sequence", re.compile(r"(?:\\u[0-9a-fA-F]{4}){4,}"), "encoding_suspicious", "medium", "Unicode escape sequence (potential obfuscation)"),
    PatternDefinition("html_entity_abuse", re.compile(r"(?:&#\d{2,4};){4,}|(?:&#x[0-9a-fA-F]{2,4};){4,}", re.I), "encoding_suspicious", "medium", "HTML entity encoding (potential obfuscation)"),
    PatternDefinition("rot13_mention", re.compile(r"rot13|caesar\s+cipher|decode\s+this", re.I), "encoding_suspicious", "low", "Mention of simple encoding schemes"),
    PatternDefinition("leetspeak_injection", re.compile(r"1gn0r3|f0rg3t|byp4ss|syst3m|4dm1n|h4ck", re.I), "encoding_suspicious", "medium", "Leetspeak obfuscation of injection keywords"),
]

# ---------------------------------------------------------------------------
# Prompt leaking
# ---------------------------------------------------------------------------
# NOTE: these deliberately reuse the "instruction_override" category rather
# than introducing a separate one.
PROMPT_LEAKING_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("reveal_system_prompt", re.compile(r"reveal\s+(?:your\s+)?(?:system\s+)?prompt", re.I), "instruction_override", "high", "Attempt to reveal system prompt"),
    PatternDefinition("show_instructions", re.compile(r"show\s+(?:me\s+)?(?:your\s+)?(?:(?:initial|original|system|hidden)\s+)?instructions?", re.I), "instruction_override", "high", "Attempt to show hidden instructions"),
    PatternDefinition("print_above", re.compile(r"print\s+(?:everything|all|text)\s+(?:above|before)", re.I), "instruction_override", "high", "Attempt to print content above current context"),
    PatternDefinition("repeat_back", re.compile(r"repeat\s+(?:back\s+)?(?:everything|all|your\s+instructions?)", re.I), "instruction_override", "high", "Attempt to make AI repeat instructions"),
    PatternDefinition("what_are_your_instructions", re.compile(r"what\s+(?:are|were)\s+(?:your|the)\s+(?:(?:initial|original|system)\s+)?instructions?", re.I), "instruction_override", "medium", "Question about system instructions"),
    PatternDefinition("output_initialization", re.compile(r"output\s+(?:your\s+)?(?:initialization|init|startup|boot)", re.I), "instruction_override", "high", "Attempt to output initialization content"),
]

# ---------------------------------------------------------------------------
# Indirect injection
# ---------------------------------------------------------------------------
# Payloads hidden in markup, comments or Unicode tricks rather than plain text.
INDIRECT_INJECTION_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("markdown_hidden_instruction", re.compile(r"\[.*?\]\(.*?(?:ignore|forget|system|instruction).*?\)", re.I), "structural", "high", "Markdown link with hidden injection"),
    PatternDefinition("html_comment_injection", re.compile(r"<!--\s*(?:system|ignore|instruction|prompt).*?-->", re.I), "structural", "high", "HTML comment containing injection keywords"),
    PatternDefinition("invisible_unicode", re.compile(r"[\u200b-\u200d\ufeff\u2060\u2061\u2062\u2063\u2064]"), "encoding_suspicious", "medium", "Invisible Unicode characters (zero-width, etc.)"),
    PatternDefinition("text_direction_override", re.compile(r"[\u202a-\u202e\u2066-\u2069]"), "encoding_suspicious", "medium", "Text direction override characters"),
    PatternDefinition("confusable_homoglyphs", re.compile(r"[\u13a0-\u13f4]|[\u1d00-\u1d2b]|[\u0400-\u04ff]"), "encoding_suspicious", "medium", "Unicode homoglyph characters (Cherokee, Small Caps, Cyrillic)"),
    PatternDefinition("separator_injection", re.compile(r"[-=]{10,}[^-=\n]*(?:system|instruction|ignore)", re.I), "structural", "medium", "Separator followed by injection attempt"),
    PatternDefinition("json_injection", re.compile(r'"(?:system|role|instruction|prompt)"\s*:\s*"', re.I), "structural", "medium", "JSON-style role/instruction injection"),
]
117
+
118
# ---------------------------------------------------------------------------
# All patterns combined
# ---------------------------------------------------------------------------
# Concatenation order is also the order in which PatternDetector reports
# matches for a given text.
ALL_PATTERNS: list[PatternDefinition] = [
    *ROLE_MARKER_PATTERNS,
    *INSTRUCTION_OVERRIDE_PATTERNS,
    *ROLE_ASSUMPTION_PATTERNS,
    *SECURITY_BYPASS_PATTERNS,
    *COMMAND_EXECUTION_PATTERNS,
    *ENCODING_SUSPICIOUS_PATTERNS,
    *PROMPT_LEAKING_PATTERNS,
    *INDIRECT_INJECTION_PATTERNS,
]
131
+
132
+
133
def get_patterns_by_category(category: str) -> list[PatternDefinition]:
    """Return every built-in pattern whose category equals *category*."""
    selected: list[PatternDefinition] = []
    for definition in ALL_PATTERNS:
        if definition.category == category:
            selected.append(definition)
    return selected
135
+
136
+
137
def get_patterns_by_severity(severity: str) -> list[PatternDefinition]:
    """Return every built-in pattern whose severity equals *severity*."""
    return list(filter(lambda defn: defn.severity == severity, ALL_PATTERNS))
139
+
140
+
141
# Cheap substring pre-filter consulted before the full regex scan: if none
# of these needles occur in a text (case-insensitively), the pattern pass
# can be skipped entirely.
FAST_FILTER_KEYWORDS: list[str] = [
    # Role markers
    "system:", "assistant:", "user:", "developer:", "admin:",
    "instruction", "[system]", "[inst]", "<system>", "<assistant>",
    # Override keywords
    "ignore", "forget", "disregard", "override", "bypass",
    "disable", "stop being", "from now on",
    # Role assumption
    "you are now", "act as", "pretend", "roleplay", "jailbreak",
    "dan", "developer mode", "imagine you",
    # Security bypass
    "uncensored", "unfiltered", "unrestricted",
    "no restrictions", "without restrictions",
    # Commands
    "execute", "eval(", "$(", "run the",
    # Encoding/obfuscation
    "base64", "decode", "\\x", "\\u", "&#", "rot13",
    "1gn0r3", "f0rg3t", "byp4ss",
    # Prompt leaking
    "reveal", "show me your", "print everything", "print above",
    "repeat back", "what are your instructions", "output initialization",
    # Indirect injection
    "<!--", '"system"', '"role"', '"instruction"',
]


def contains_filter_keywords(text: str) -> bool:
    """Return True when *text* contains any fast-filter keyword (case-insensitive)."""
    haystack = text.lower()
    for needle in FAST_FILTER_KEYWORDS:
        if needle.lower() in haystack:
            return True
    return False