stackone-defender 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stackone_defender/__init__.py +24 -0
- stackone_defender/classifiers/__init__.py +12 -0
- stackone_defender/classifiers/onnx_classifier.py +95 -0
- stackone_defender/classifiers/pattern_detector.py +223 -0
- stackone_defender/classifiers/patterns.py +170 -0
- stackone_defender/classifiers/tier2_classifier.py +164 -0
- stackone_defender/config.py +150 -0
- stackone_defender/core/__init__.py +12 -0
- stackone_defender/core/prompt_defense.py +197 -0
- stackone_defender/core/tool_result_sanitizer.py +295 -0
- stackone_defender/models/minilm-full-aug/config.json +28 -0
- stackone_defender/models/minilm-full-aug/model_quantized.onnx +0 -0
- stackone_defender/models/minilm-full-aug/tokenizer.json +30678 -0
- stackone_defender/models/minilm-full-aug/tokenizer_config.json +16 -0
- stackone_defender/sanitizers/__init__.py +25 -0
- stackone_defender/sanitizers/encoding_detector.py +180 -0
- stackone_defender/sanitizers/normalizer.py +94 -0
- stackone_defender/sanitizers/pattern_remover.py +113 -0
- stackone_defender/sanitizers/role_stripper.py +104 -0
- stackone_defender/sanitizers/sanitizer.py +205 -0
- stackone_defender/types.py +200 -0
- stackone_defender/utils/__init__.py +33 -0
- stackone_defender/utils/boundary.py +48 -0
- stackone_defender/utils/field_detection.py +65 -0
- stackone_defender/utils/structure.py +83 -0
- stackone_defender-0.1.1.dist-info/METADATA +229 -0
- stackone_defender-0.1.1.dist-info/RECORD +28 -0
- stackone_defender-0.1.1.dist-info/WHEEL +4 -0
|
"""stackone-defender: Prompt injection defense for AI tool-calling.

Usage:
    from stackone_defender import create_prompt_defense

    defense = create_prompt_defense(enable_tier2=True)
    defense.warmup_tier2()

    result = defense.defend_tool_result(tool_output, "gmail_get_message")
    if not result.allowed:
        print(f"Blocked: {result.risk_level}")
"""

# Public API re-exports: the package surface is the factory, the defense
# class, and the result/rule types callers inspect or configure.
from .core.prompt_defense import PromptDefense, create_prompt_defense
from .types import DefenseResult, RiskLevel, Tier1Result, ToolSanitizationRule

__all__ = [
    "DefenseResult",
    "PromptDefense",
    "RiskLevel",
    "Tier1Result",
    "ToolSanitizationRule",
    "create_prompt_defense",
]
|
"""Classifiers for prompt injection detection."""

# Re-export the Tier 1 detector plus the shared pattern data and the fast
# keyword pre-filter helper.
from .pattern_detector import PatternDetector, create_pattern_detector
from .patterns import ALL_PATTERNS, FAST_FILTER_KEYWORDS, contains_filter_keywords

__all__ = [
    "ALL_PATTERNS",
    "FAST_FILTER_KEYWORDS",
    "PatternDetector",
    "contains_filter_keywords",
    "create_pattern_detector",
]
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""ONNX classifier for fine-tuned MiniLM prompt injection detection.
|
|
2
|
+
|
|
3
|
+
Pipeline: text -> tokenizer -> ONNX Runtime -> logit -> sigmoid -> score
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import math
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _default_model_path() -> str:
|
|
13
|
+
"""Return path to the bundled ONNX model directory."""
|
|
14
|
+
return str(Path(__file__).resolve().parent.parent / "models" / "minilm-full-aug")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _sigmoid(x: float) -> float:
|
|
18
|
+
return 1.0 / (1.0 + math.exp(-x))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class OnnxClassifier:
    """ONNX Classifier for fine-tuned MiniLM models.

    Lazily loads a `tokenizers.Tokenizer` and an
    `onnxruntime.InferenceSession` on first use, then maps the model's
    single output logit through a sigmoid to produce a score in [0, 1].
    """

    def __init__(self, model_path: str | None = None):
        # Directory expected to contain tokenizer.json and model_quantized.onnx.
        self._model_path = model_path or _default_model_path()
        self._session = None  # onnxruntime.InferenceSession, set by _load_model()
        self._tokenizer = None  # tokenizers.Tokenizer, set by _load_model()
        self._max_length = 256  # inputs are truncated and padded to this token count
        self._load_failed = False  # remembers that the optional ONNX extras are missing

    def load_model(self, model_path: str | None = None) -> None:
        """Load tokenizer and session; no-op if both are already loaded.

        Raises ImportError when the optional ONNX dependencies are absent.
        NOTE(review): a model_path passed after a successful load is stored
        but never reloaded — the early return below fires first; confirm
        whether switching models at runtime is a supported use case.
        """
        if model_path:
            self._model_path = model_path
        if self._session is not None and self._tokenizer is not None:
            return
        # Fail fast on repeated calls once an import failure has been seen.
        if self._load_failed:
            raise ImportError("ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]")
        self._load_model()

    def _load_model(self) -> None:
        """Import the optional dependencies and build tokenizer + session."""
        try:
            import numpy as np  # noqa: F401
            import onnxruntime as ort
            from tokenizers import Tokenizer
        except ImportError as e:
            self._load_failed = True
            raise ImportError(
                "ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]"
            ) from e

        tokenizer_path = str(Path(self._model_path) / "tokenizer.json")
        self._tokenizer = Tokenizer.from_file(tokenizer_path)
        # Fixed-length inputs: truncate long texts, pad short ones to _max_length.
        self._tokenizer.enable_truncation(max_length=self._max_length)
        self._tokenizer.enable_padding(length=self._max_length)

        onnx_path = str(Path(self._model_path) / "model_quantized.onnx")
        self._session = ort.InferenceSession(onnx_path)

    def classify(self, text: str) -> float:
        """Classify a single text, returning a sigmoid score in [0, 1]."""
        self._ensure_loaded()
        import numpy as np

        encoding = self._tokenizer.encode(text)
        input_ids = np.array([encoding.ids], dtype=np.int64)
        attention_mask = np.array([encoding.attention_mask], dtype=np.int64)

        # The exported graph takes "input_ids"/"attention_mask"; its first
        # output holds a single logit per input row.
        results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
        logit = float(results[0][0][0])
        return _sigmoid(logit)

    def classify_batch(self, texts: list[str]) -> list[float]:
        """Classify multiple texts in batch."""
        if not texts:
            return []
        self._ensure_loaded()
        import numpy as np

        # Padding enabled in _load_model keeps per-row id lists equal length,
        # so the lists stack into rectangular arrays.
        encodings = self._tokenizer.encode_batch(texts)
        input_ids = np.array([e.ids for e in encodings], dtype=np.int64)
        attention_mask = np.array([e.attention_mask for e in encodings], dtype=np.int64)

        results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
        logits = results[0]
        return [_sigmoid(float(logits[i][0])) for i in range(len(texts))]

    def warmup(self) -> None:
        """Eagerly load the model so the first classify() call pays no load cost."""
        self.load_model()

    def is_loaded(self) -> bool:
        """True only when both the session and the tokenizer are ready."""
        return self._session is not None and self._tokenizer is not None

    def _ensure_loaded(self) -> None:
        # Lazy-load on first classification.
        if not self.is_loaded():
            self.load_model()
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""Tier 1: Pattern Detection.
|
|
2
|
+
|
|
3
|
+
Fast, regex-based detection of known injection patterns.
|
|
4
|
+
Target latency: < 1-2ms per field.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import math
|
|
10
|
+
import re
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
from ..types import PatternDefinition, PatternMatch, RiskLevel, StructuralFlag, Tier1Result
|
|
14
|
+
from .patterns import ALL_PATTERNS, contains_filter_keywords
|
|
15
|
+
|
|
16
|
+
# Tuning knobs for PatternDetector; any key may be overridden via its
# `config` constructor argument.
DEFAULT_DETECTOR_CONFIG = {
    "use_fast_filter": True,  # skip the regex scan when no trigger keyword occurs
    "max_analysis_length": 50000,  # cap on characters handed to the regex scan
    "entropy_threshold": 4.5,  # bits/char above which text is flagged high_entropy
    "entropy_min_length": 50,  # entropy is only computed at or above this length
    "max_field_length": 100000,  # fields longer than this get excessive_length flagged
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PatternDetector:
    """Pattern Detector for Tier 1 classification.

    Runs the shared regex pattern set plus cheap structural heuristics
    (excessive length, character entropy, nested role markers, suspicious
    formatting) over a text field and rolls the findings up into a
    Tier1Result with a suggested risk level.
    """

    def __init__(self, config: dict | None = None, custom_patterns: list[PatternDefinition] | None = None):
        """Create a detector; *config* keys override DEFAULT_DETECTOR_CONFIG.

        Custom patterns are appended to the built-in ALL_PATTERNS set.  Their
        presence also disables the fast keyword pre-filter, which only knows
        the built-in keywords and could otherwise skip custom matches.
        """
        cfg = dict(DEFAULT_DETECTOR_CONFIG)
        if config:
            cfg.update(config)
        self._use_fast_filter = cfg["use_fast_filter"]
        self._max_analysis_length = cfg["max_analysis_length"]
        self._entropy_threshold = cfg["entropy_threshold"]
        self._entropy_min_length = cfg["entropy_min_length"]
        self._max_field_length = cfg["max_field_length"]
        self._patterns: list[PatternDefinition] = list(ALL_PATTERNS)
        self._has_custom = False
        if custom_patterns:
            self._patterns.extend(custom_patterns)
            self._has_custom = True

    def analyze(self, text: str) -> Tier1Result:
        """Analyze *text*, returning pattern matches, structural flags and risk."""
        start = time.perf_counter()

        # Texts shorter than 3 characters cannot carry a meaningful injection.
        if not text or len(text) < 3:
            return self._empty_result(start)

        original_length = len(text)
        # Bound regex cost on very large fields; the length flag below still
        # sees the untruncated size via original_length.
        analysis_text = text[: self._max_analysis_length] if len(text) > self._max_analysis_length else text

        # The keyword pre-filter is only sound for the built-in pattern set.
        should_use_fast_filter = self._use_fast_filter and not self._has_custom
        if should_use_fast_filter and not contains_filter_keywords(analysis_text):
            flags = self._detect_structural_issues(analysis_text, original_length)
            return self._create_result([], flags, start)

        matches = self._detect_patterns(analysis_text)
        flags = self._detect_structural_issues(analysis_text, original_length)
        return self._create_result(matches, flags, start)

    # ------------------------------------------------------------------
    # Pattern detection
    # ------------------------------------------------------------------

    def _detect_patterns(self, text: str) -> list[PatternMatch]:
        """Run every configured regex over *text* and collect all hits."""
        matches: list[PatternMatch] = []
        for defn in self._patterns:
            # Use finditer for all patterns (handles global-like behavior)
            for m in defn.pattern.finditer(text):
                matches.append(
                    PatternMatch(
                        pattern=defn.id,
                        matched=m.group(0),
                        position=m.start(),
                        category=defn.category,
                        severity=defn.severity,
                    )
                )
        return matches

    # ------------------------------------------------------------------
    # Structural analysis
    # ------------------------------------------------------------------

    def _detect_structural_issues(self, text: str, original_length: int | None = None) -> list[StructuralFlag]:
        """Collect structural red flags: length, entropy, markers, formatting.

        *original_length* lets callers report the untruncated field size when
        *text* has been capped for analysis.
        """
        flags: list[StructuralFlag] = []
        length_to_check = original_length if original_length is not None else len(text)

        if length_to_check > self._max_field_length:
            flags.append(
                StructuralFlag(
                    type="excessive_length",
                    details=f"Field length {length_to_check} exceeds maximum {self._max_field_length}",
                    severity="medium",
                )
            )

        # Entropy is noisy on short strings, so only check above a minimum size.
        if len(text) >= self._entropy_min_length:
            entropy = self._calculate_entropy(text)
            if entropy > self._entropy_threshold:
                flags.append(
                    StructuralFlag(
                        type="high_entropy",
                        details=f"Entropy {entropy:.2f} exceeds threshold {self._entropy_threshold}",
                        severity="medium",
                    )
                )

        if self._has_nested_markers(text):
            flags.append(
                StructuralFlag(
                    type="nested_markers",
                    details="Suspicious nested XML tags or bracket patterns detected",
                    severity="medium",
                )
            )

        if self._has_suspicious_formatting(text):
            flags.append(
                StructuralFlag(
                    type="suspicious_formatting",
                    details="Unusual formatting patterns detected",
                    severity="low",
                )
            )

        return flags

    @staticmethod
    def _calculate_entropy(text: str) -> float:
        """Shannon entropy of *text* in bits per character."""
        freq: dict[str, int] = {}
        for ch in text:
            freq[ch] = freq.get(ch, 0) + 1
        length = len(text)
        entropy = 0.0
        for count in freq.values():
            p = count / length
            entropy -= p * math.log2(p)
        return entropy

    @staticmethod
    def _has_nested_markers(text: str) -> bool:
        """True when *text* embeds role-style XML tags or [[...]] marker blocks."""
        suspicious_xml = re.compile(
            r"</?(?:system|user|assistant|instruction|prompt|admin|developer)[^>]*>", re.I
        )
        tags = suspicious_xml.findall(text)
        if len(tags) >= 2:
            return True

        xml_tags = re.findall(r"<[a-zA-Z][^>]*>", text)
        if len(xml_tags) > 4:
            marker_tags = [t for t in xml_tags if re.search(r"system|user|assistant|instruction|prompt", t, re.I)]
            if marker_tags:
                return True

        if re.search(r"\[\[.*?(?:system|instruction|ignore).*?\]\]", text, re.I):
            return True

        return False

    @staticmethod
    def _has_suspicious_formatting(text: str) -> bool:
        """True for formatting tricks that visually isolate an injected payload."""
        if re.search(r"\n{3,}(?:system|instruction|ignore|forget)", text, re.I):
            return True
        if re.search(r"^#{1,3}\s*(?:system|instruction|new rules)", text, re.I | re.M):
            return True
        if re.search(r"[-=]{3,}\s*\n\s*(?:system|instruction|ignore)", text, re.I):
            return True
        return False

    # ------------------------------------------------------------------
    # Risk calculation
    # ------------------------------------------------------------------

    @staticmethod
    def _calculate_suggested_risk(matches: list[PatternMatch], flags: list[StructuralFlag]) -> RiskLevel:
        """Map severity counts of matches and flags onto a coarse risk level."""
        high_matches = sum(1 for m in matches if m.severity == "high")
        medium_matches = sum(1 for m in matches if m.severity == "medium")
        high_flags = sum(1 for f in flags if f.severity == "high")
        medium_flags = sum(1 for f in flags if f.severity == "medium")

        if high_matches >= 2 or (high_matches >= 1 and high_flags >= 1):
            return "critical"
        if high_matches >= 1 or medium_matches >= 3 or (medium_matches >= 2 and medium_flags >= 1):
            return "high"
        if medium_matches >= 1 or high_flags >= 1 or medium_flags >= 2:
            return "medium"
        # Fix: the original had `if matches or flags: return "low"` immediately
        # before an unconditional `return "low"` — both arms were identical,
        # so the branch was dead code and has been removed.
        return "low"

    # ------------------------------------------------------------------
    # Result helpers
    # ------------------------------------------------------------------

    def _create_result(self, matches: list[PatternMatch], flags: list[StructuralFlag], start: float) -> Tier1Result:
        """Assemble a Tier1Result and stamp the elapsed analysis latency."""
        return Tier1Result(
            matches=matches,
            structural_flags=flags,
            has_detections=bool(matches) or bool(flags),
            suggested_risk=self._calculate_suggested_risk(matches, flags),
            latency_ms=(time.perf_counter() - start) * 1000,
        )

    @staticmethod
    def _empty_result(start: float) -> Tier1Result:
        """Result for empty/near-empty inputs: no detections, low risk."""
        return Tier1Result(
            matches=[],
            structural_flags=[],
            has_detections=False,
            suggested_risk="low",
            latency_ms=(time.perf_counter() - start) * 1000,
        )

    def add_pattern(self, pattern: PatternDefinition) -> None:
        """Register an extra pattern at runtime.

        Fix: also mark the pattern set as customised, so the fast keyword
        pre-filter (which only knows the built-in keywords) can no longer
        skip texts that only the new pattern would match — consistent with
        passing custom_patterns to the constructor.
        """
        self._patterns.append(pattern)
        self._has_custom = True

    def get_patterns(self) -> list[PatternDefinition]:
        """Return a defensive copy of the active pattern list."""
        return list(self._patterns)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def create_pattern_detector(config: dict | None = None, custom_patterns: list[PatternDefinition] | None = None) -> PatternDetector:
    """Factory returning a configured PatternDetector instance."""
    detector = PatternDetector(config=config, custom_patterns=custom_patterns)
    return detector
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Shared injection pattern definitions.
|
|
2
|
+
|
|
3
|
+
These patterns are used by both Tier 1 classification and sanitization.
|
|
4
|
+
Single source of truth for pattern matching.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
from ..types import PatternDefinition
|
|
12
|
+
|
|
13
|
+
# ---------------------------------------------------------------------------
# Role markers
# ---------------------------------------------------------------------------
# Each PatternDefinition is constructed positionally as
# (id, compiled pattern, category, severity, human-readable description);
# pattern_detector reads .id/.pattern/.category/.severity when matching.
ROLE_MARKER_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("role_system", re.compile(r"^SYSTEM:\s*", re.I), "role_marker", "high", "System role marker at start of text"),
    PatternDefinition("role_assistant", re.compile(r"^ASSISTANT:\s*", re.I), "role_marker", "high", "Assistant role marker at start of text"),
    PatternDefinition("role_user", re.compile(r"^USER:\s*", re.I), "role_marker", "medium", "User role marker at start of text"),
    PatternDefinition("role_developer", re.compile(r"^DEVELOPER:\s*", re.I), "role_marker", "high", "Developer role marker at start of text"),
    PatternDefinition("role_admin", re.compile(r"^ADMIN(?:ISTRATOR)?:\s*", re.I), "role_marker", "high", "Admin role marker at start of text"),
    PatternDefinition("role_instruction", re.compile(r"^INSTRUCTIONS?:\s*", re.I), "role_marker", "high", "Instruction marker at start of text"),
    PatternDefinition("role_human", re.compile(r"^HUMAN:\s*", re.I), "role_marker", "medium", "Human role marker at start of text"),
    PatternDefinition("role_ai", re.compile(r"^AI:\s*", re.I), "role_marker", "medium", "AI role marker at start of text"),
    # Bracketed variants
    PatternDefinition("role_system_bracket", re.compile(r"^\[SYSTEM\]", re.I), "role_marker", "high", "Bracketed system role marker"),
    PatternDefinition("role_inst_bracket", re.compile(r"^\[INST\]", re.I), "role_marker", "high", "Bracketed instruction marker (Llama format)"),
    # XML-style variants
    PatternDefinition("role_system_xml", re.compile(r"<system>", re.I), "role_marker", "high", "XML-style system tag"),
    PatternDefinition("role_assistant_xml", re.compile(r"<assistant>", re.I), "role_marker", "medium", "XML-style assistant tag"),
]

# ---------------------------------------------------------------------------
# Instruction overrides
# ---------------------------------------------------------------------------
INSTRUCTION_OVERRIDE_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("ignore_previous", re.compile(r"ignore\s+(?:all\s+)?(?:previous|prior|earlier|above)\s+(?:instructions?|prompts?|rules?|guidelines?|directions?)", re.I), "instruction_override", "high", "Attempt to ignore previous instructions"),
    PatternDefinition("forget_previous", re.compile(r"forget\s+(?:all\s+)?(?:(?:previous|prior|earlier|above)\s+)?(?:instructions?|prompts?|rules?|context|guidelines?)", re.I), "instruction_override", "high", "Attempt to make AI forget instructions"),
    PatternDefinition("disregard_previous", re.compile(r"disregard\s+(?:all\s+)?(?:previous|prior|earlier|above)\s+(?:instructions?|prompts?|rules?)", re.I), "instruction_override", "high", "Attempt to disregard instructions"),
    PatternDefinition("override_instructions", re.compile(r"override\s+(?:the\s+)?(?:system\s+)?(?:prompt|instructions?|rules?|guidelines?)", re.I), "instruction_override", "high", "Direct override attempt"),
    PatternDefinition("new_instructions", re.compile(r"new\s+instructions?:\s*", re.I), "instruction_override", "high", "Attempt to inject new instructions"),
    PatternDefinition("updated_instructions", re.compile(r"(?:updated?|revised?|changed?)\s+instructions?:\s*", re.I), "instruction_override", "high", "Attempt to update instructions"),
    PatternDefinition("stop_being", re.compile(r"stop\s+being\s+(?:a\s+)?(?:helpful|assistant|ai|chatbot)", re.I), "instruction_override", "medium", "Attempt to change AI behavior"),
    PatternDefinition("from_now_on", re.compile(r"from\s+now\s+on,?\s+(?:you\s+)?(?:will|must|should|are)", re.I), "instruction_override", "medium", "Attempt to set new behavior"),
]

# ---------------------------------------------------------------------------
# Role assumption
# ---------------------------------------------------------------------------
ROLE_ASSUMPTION_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("you_are_now", re.compile(r"you\s+are\s+now\s+(?:a\s+)?(?:different|new|the|my)?", re.I), "role_assumption", "high", "Attempt to assign new role"),
    PatternDefinition("act_as", re.compile(r"act\s+(?:as|like)\s+(?:a\s+)?(?:system|admin|developer|root|superuser)", re.I), "role_assumption", "high", "Attempt to make AI act as privileged role"),
    PatternDefinition("pretend_to_be", re.compile(r"pretend\s+(?:to\s+be|you\s+are)\s+(?:a\s+)?", re.I), "role_assumption", "medium", "Attempt to make AI pretend"),
    PatternDefinition("roleplay_as", re.compile(r"roleplay\s+(?:as|like)\s+(?:a\s+)?", re.I), "role_assumption", "low", "Roleplay request (lower severity)"),
    PatternDefinition("imagine_you_are", re.compile(r"imagine\s+(?:that\s+)?you\s+are\s+(?:a\s+)?", re.I), "role_assumption", "low", "Imagination prompt (lower severity)"),
    PatternDefinition("jailbreak_dan", re.compile(r"\bDAN\b.*?(?:do\s+anything|jailbreak)", re.I), "role_assumption", "high", "DAN jailbreak attempt"),
    PatternDefinition("developer_mode", re.compile(r"developer\s+mode\s+(?:is\s+)?(?:now\s+)?(?:enabled?|activated?|on)", re.I), "role_assumption", "high", "Developer mode activation attempt"),
]

# ---------------------------------------------------------------------------
# Security bypass
# ---------------------------------------------------------------------------
SECURITY_BYPASS_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("bypass_security", re.compile(r"bypass\s+(?:the\s+)?(?:security|safety|guardrails?|filters?|restrictions?)", re.I), "security_bypass", "high", "Direct security bypass attempt"),
    PatternDefinition("disable_safety", re.compile(r"disable\s+(?:the\s+)?(?:safety|security|guardrails?|filters?|restrictions?)", re.I), "security_bypass", "high", "Attempt to disable safety features"),
    PatternDefinition("ignore_safety", re.compile(r"ignore\s+(?:the\s+)?(?:safety|security|ethical)\s+(?:guidelines?|rules?|restrictions?)", re.I), "security_bypass", "high", "Attempt to ignore safety guidelines"),
    PatternDefinition("no_restrictions", re.compile(r"(?:without|no)\s+(?:any\s+)?(?:restrictions?|limitations?|guardrails?|filters?)", re.I), "security_bypass", "medium", "Request for unrestricted response"),
    PatternDefinition("uncensored", re.compile(r"(?:uncensored|unfiltered|unrestricted)\s*(?:mode|response|output|version)?", re.I), "security_bypass", "high", "Request for uncensored mode"),
]

# ---------------------------------------------------------------------------
# Command execution
# ---------------------------------------------------------------------------
COMMAND_EXECUTION_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("execute_command", re.compile(r"execute\s+(?:the\s+)?(?:following|this|these)\s+(?:command|instruction|code)", re.I), "command_execution", "high", "Command execution instruction"),
    PatternDefinition("run_code", re.compile(r"run\s+(?:the\s+)?(?:following|this|these)\s+(?:code|script|command)", re.I), "command_execution", "high", "Code execution instruction"),
    PatternDefinition("eval_expression", re.compile(r"eval(?:uate)?\s*\(", re.I), "command_execution", "medium", "Eval function pattern"),
    PatternDefinition("shell_command", re.compile(r"\$\([^)]+\)|`[^`]+`"), "command_execution", "medium", "Shell command substitution"),
]

# ---------------------------------------------------------------------------
# Encoding suspicious
# ---------------------------------------------------------------------------
ENCODING_SUSPICIOUS_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("base64_instruction", re.compile(r"(?:decode|base64)\s*[:(]\s*[A-Za-z0-9+/=]{20,}", re.I), "encoding_suspicious", "high", "Base64 encoded content with decode instruction"),
    PatternDefinition("hex_escape_sequence", re.compile(r"(?:\\x[0-9a-fA-F]{2}){4,}"), "encoding_suspicious", "medium", "Hex escape sequence (potential obfuscation)"),
    PatternDefinition("unicode_escape_sequence", re.compile(r"(?:\\u[0-9a-fA-F]{4}){4,}"), "encoding_suspicious", "medium", "Unicode escape sequence (potential obfuscation)"),
    PatternDefinition("html_entity_abuse", re.compile(r"(?:&#\d{2,4};){4,}|(?:&#x[0-9a-fA-F]{2,4};){4,}", re.I), "encoding_suspicious", "medium", "HTML entity encoding (potential obfuscation)"),
    PatternDefinition("rot13_mention", re.compile(r"rot13|caesar\s+cipher|decode\s+this", re.I), "encoding_suspicious", "low", "Mention of simple encoding schemes"),
    PatternDefinition("leetspeak_injection", re.compile(r"1gn0r3|f0rg3t|byp4ss|syst3m|4dm1n|h4ck", re.I), "encoding_suspicious", "medium", "Leetspeak obfuscation of injection keywords"),
]

# ---------------------------------------------------------------------------
# Prompt leaking
# ---------------------------------------------------------------------------
PROMPT_LEAKING_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("reveal_system_prompt", re.compile(r"reveal\s+(?:your\s+)?(?:system\s+)?prompt", re.I), "instruction_override", "high", "Attempt to reveal system prompt"),
    PatternDefinition("show_instructions", re.compile(r"show\s+(?:me\s+)?(?:your\s+)?(?:(?:initial|original|system|hidden)\s+)?instructions?", re.I), "instruction_override", "high", "Attempt to show hidden instructions"),
    PatternDefinition("print_above", re.compile(r"print\s+(?:everything|all|text)\s+(?:above|before)", re.I), "instruction_override", "high", "Attempt to print content above current context"),
    PatternDefinition("repeat_back", re.compile(r"repeat\s+(?:back\s+)?(?:everything|all|your\s+instructions?)", re.I), "instruction_override", "high", "Attempt to make AI repeat instructions"),
    PatternDefinition("what_are_your_instructions", re.compile(r"what\s+(?:are|were)\s+(?:your|the)\s+(?:(?:initial|original|system)\s+)?instructions?", re.I), "instruction_override", "medium", "Question about system instructions"),
    PatternDefinition("output_initialization", re.compile(r"output\s+(?:your\s+)?(?:initialization|init|startup|boot)", re.I), "instruction_override", "high", "Attempt to output initialization content"),
]

# ---------------------------------------------------------------------------
# Indirect injection
# ---------------------------------------------------------------------------
INDIRECT_INJECTION_PATTERNS: list[PatternDefinition] = [
    PatternDefinition("markdown_hidden_instruction", re.compile(r"\[.*?\]\(.*?(?:ignore|forget|system|instruction).*?\)", re.I), "structural", "high", "Markdown link with hidden injection"),
    PatternDefinition("html_comment_injection", re.compile(r"<!--\s*(?:system|ignore|instruction|prompt).*?-->", re.I), "structural", "high", "HTML comment containing injection keywords"),
    PatternDefinition("invisible_unicode", re.compile(r"[\u200b-\u200d\ufeff\u2060\u2061\u2062\u2063\u2064]"), "encoding_suspicious", "medium", "Invisible Unicode characters (zero-width, etc.)"),
    PatternDefinition("text_direction_override", re.compile(r"[\u202a-\u202e\u2066-\u2069]"), "encoding_suspicious", "medium", "Text direction override characters"),
    PatternDefinition("confusable_homoglyphs", re.compile(r"[\u13a0-\u13f4]|[\u1d00-\u1d2b]|[\u0400-\u04ff]"), "encoding_suspicious", "medium", "Unicode homoglyph characters (Cherokee, Small Caps, Cyrillic)"),
    PatternDefinition("separator_injection", re.compile(r"[-=]{10,}[^-=\n]*(?:system|instruction|ignore)", re.I), "structural", "medium", "Separator followed by injection attempt"),
    PatternDefinition("json_injection", re.compile(r'"(?:system|role|instruction|prompt)"\s*:\s*"', re.I), "structural", "medium", "JSON-style role/instruction injection"),
]

# ---------------------------------------------------------------------------
# All patterns combined
# ---------------------------------------------------------------------------
ALL_PATTERNS: list[PatternDefinition] = [
    *ROLE_MARKER_PATTERNS,
    *INSTRUCTION_OVERRIDE_PATTERNS,
    *ROLE_ASSUMPTION_PATTERNS,
    *SECURITY_BYPASS_PATTERNS,
    *COMMAND_EXECUTION_PATTERNS,
    *ENCODING_SUSPICIOUS_PATTERNS,
    *PROMPT_LEAKING_PATTERNS,
    *INDIRECT_INJECTION_PATTERNS,
]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def get_patterns_by_category(category: str) -> list[PatternDefinition]:
    """Return every pattern whose category equals *category*."""
    matching = [defn for defn in ALL_PATTERNS if defn.category == category]
    return matching
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def get_patterns_by_severity(severity: str) -> list[PatternDefinition]:
    """Return every pattern whose severity equals *severity*."""
    matching = [defn for defn in ALL_PATTERNS if defn.severity == severity]
    return matching
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Cheap substrings used as a pre-filter by contains_filter_keywords(): when
# none of them occur in the (lowercased) text, the full regex scan can be
# skipped entirely.
FAST_FILTER_KEYWORDS: list[str] = [
    # Role markers
    "system:", "assistant:", "user:", "developer:", "admin:",
    "instruction", "[system]", "[inst]", "<system>", "<assistant>",
    # Override keywords
    "ignore", "forget", "disregard", "override", "bypass",
    "disable", "stop being", "from now on",
    # Role assumption
    "you are now", "act as", "pretend", "roleplay", "jailbreak",
    "dan", "developer mode", "imagine you",
    # Security bypass
    "uncensored", "unfiltered", "unrestricted",
    "no restrictions", "without restrictions",
    # Commands
    "execute", "eval(", "$(", "run the",
    # Encoding/obfuscation
    "base64", "decode", "\\x", "\\u", "&#", "rot13",
    "1gn0r3", "f0rg3t", "byp4ss",
    # Prompt leaking
    "reveal", "show me your", "print everything", "print above",
    "repeat back", "what are your instructions", "output initialization",
    # Indirect injection
    "<!--", '"system"', '"role"', '"instruction"',
]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def contains_filter_keywords(text: str) -> bool:
    """Check if text contains any fast filter keywords (case-insensitive)."""
    haystack = text.lower()
    for keyword in FAST_FILTER_KEYWORDS:
        if keyword.lower() in haystack:
            return True
    return False
|