stackone-defender 0.6.3__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. stackone_defender-0.7.0/.release-please-manifest.json +1 -0
  2. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/CHANGELOG.md +11 -0
  3. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/PKG-INFO +1 -1
  4. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/pyproject.toml +1 -1
  5. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/__init__.py +5 -2
  6. stackone_defender-0.7.0/src/stackone_defender/classifiers/onnx_classifier.py +276 -0
  7. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/classifiers/pattern_detector.py +76 -7
  8. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/classifiers/patterns.py +117 -9
  9. stackone_defender-0.7.0/src/stackone_defender/classifiers/tier2_classifier.py +477 -0
  10. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/config.py +2 -0
  11. stackone_defender-0.7.0/src/stackone_defender/core/prompt_defense.py +593 -0
  12. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/core/tool_result_sanitizer.py +29 -1
  13. stackone_defender-0.7.0/src/stackone_defender/models/minilm-multihead-v5/classifier_config.json +47 -0
  14. {stackone_defender-0.6.3/src/stackone_defender/models/minilm-full-aug → stackone_defender-0.7.0/src/stackone_defender/models/minilm-multihead-v5}/model_quantized.onnx +0 -0
  15. stackone_defender-0.7.0/src/stackone_defender/sanitizers/encoding_detector.py +600 -0
  16. stackone_defender-0.7.0/src/stackone_defender/sanitizers/leet_normalizer.py +115 -0
  17. stackone_defender-0.7.0/src/stackone_defender/sanitizers/normalizer.py +166 -0
  18. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/sanitizers/sanitizer.py +37 -5
  19. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/types.py +40 -0
  20. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/utils/boundary.py +27 -5
  21. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/tests/test_integration.py +172 -0
  22. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/tests/test_onnx_classifier.py +131 -3
  23. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/tests/test_pattern_detector.py +157 -3
  24. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/tests/test_sanitizers.py +210 -4
  25. stackone_defender-0.7.0/tests/test_tier2_classifier.py +162 -0
  26. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/uv.lock +1 -1
  27. stackone_defender-0.6.3/.release-please-manifest.json +0 -1
  28. stackone_defender-0.6.3/src/stackone_defender/classifiers/onnx_classifier.py +0 -148
  29. stackone_defender-0.6.3/src/stackone_defender/classifiers/tier2_classifier.py +0 -291
  30. stackone_defender-0.6.3/src/stackone_defender/core/prompt_defense.py +0 -315
  31. stackone_defender-0.6.3/src/stackone_defender/sanitizers/encoding_detector.py +0 -180
  32. stackone_defender-0.6.3/src/stackone_defender/sanitizers/normalizer.py +0 -94
  33. stackone_defender-0.6.3/tests/test_tier2_classifier.py +0 -63
  34. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/.github/workflows/ci.yaml +0 -0
  35. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/.github/workflows/release.yaml +0 -0
  36. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/.gitignore +0 -0
  37. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/.python-version +0 -0
  38. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/.release-please-config.json +0 -0
  39. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/README.md +0 -0
  40. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/models/minilm-full-aug/config.json +0 -0
  41. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/models/minilm-full-aug/model_quantized.onnx +0 -0
  42. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/models/minilm-full-aug/tokenizer.json +0 -0
  43. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/models/minilm-full-aug/tokenizer_config.json +0 -0
  44. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/classifiers/__init__.py +0 -0
  45. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/core/__init__.py +0 -0
  46. {stackone_defender-0.6.3/src/stackone_defender/models/minilm-full-aug → stackone_defender-0.7.0/src/stackone_defender/models/minilm-multihead-v5}/config.json +0 -0
  47. {stackone_defender-0.6.3/src/stackone_defender/models/minilm-full-aug → stackone_defender-0.7.0/src/stackone_defender/models/minilm-multihead-v5}/tokenizer.json +0 -0
  48. {stackone_defender-0.6.3/src/stackone_defender/models/minilm-full-aug → stackone_defender-0.7.0/src/stackone_defender/models/minilm-multihead-v5}/tokenizer_config.json +0 -0
  49. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/sanitizers/__init__.py +0 -0
  50. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/sanitizers/pattern_remover.py +0 -0
  51. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/sanitizers/role_stripper.py +0 -0
  52. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/sfe/__init__.py +0 -0
  53. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/sfe/model.ftz +0 -0
  54. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/sfe/preprocess.py +0 -0
  55. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/utils/__init__.py +0 -0
  56. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/utils/field_detection.py +0 -0
  57. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/utils/structure.py +0 -0
  58. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/tests/__init__.py +0 -0
  59. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/tests/test_sfe.py +0 -0
  60. {stackone_defender-0.6.3 → stackone_defender-0.7.0}/tests/test_utils.py +0 -0
@@ -0,0 +1 @@
1
+ {".":"0.7.0"}
@@ -1,5 +1,16 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.7.0](https://github.com/StackOneHQ/stackone-defender/compare/stackone-defender-v0.6.3...stackone-defender-v0.7.0) (2026-05-29)
4
+
5
+
6
+ ### ⚠ BREAKING CHANGES
7
+
8
+ * The default ONNX model directory changed from minilm-full-aug to minilm-multihead-v5. Custom code that hardcodes the old path will fail to load the model.
9
+
10
+ ### Features
11
+
12
+ * parity with TS defender 0.7.0 ([75d046a](https://github.com/StackOneHQ/stackone-defender/commit/75d046ab45066ee1f973e91357f7ecb23dea50c8))
13
+
3
14
  ## [0.6.3](https://github.com/StackOneHQ/stackone-defender/compare/stackone-defender-v0.6.2...stackone-defender-v0.6.3) (2026-05-26)
4
15
 
5
16
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stackone-defender
3
- Version: 0.6.3
3
+ Version: 0.7.0
4
4
  Summary: Indirect prompt injection defense for AI agents using tool calls
5
5
  Project-URL: Homepage, https://github.com/StackOneHQ/stackone-defender
6
6
  Project-URL: Repository, https://github.com/StackOneHQ/stackone-defender
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "stackone-defender"
3
- version = "0.6.3"
3
+ version = "0.7.0"
4
4
  description = "Indirect prompt injection defense for AI agents using tool calls"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -11,8 +11,8 @@ Usage:
11
11
  print(f"Blocked: {result.risk_level}")
12
12
  """
13
13
 
14
+ from .classifiers.onnx_classifier import get_default_model_path
14
15
  from .core.prompt_defense import PromptDefense, create_prompt_defense
15
- from .utils.boundary import contains_boundary_patterns, generate_boundary_instructions
16
16
  from .sfe.preprocess import (
17
17
  DropDecision,
18
18
  SfePredictor,
@@ -21,11 +21,13 @@ from .sfe.preprocess import (
21
21
  get_default_sfe_model_path,
22
22
  sfe_preprocess,
23
23
  )
24
- from .types import DefenseResult, RiskLevel, Tier1Result
24
+ from .types import DefenseResult, MultiheadConfig, RiskLevel, Tier1Result
25
+ from .utils.boundary import contains_boundary_patterns, generate_boundary_instructions
25
26
 
26
27
  __all__ = [
27
28
  "DefenseResult",
28
29
  "DropDecision",
30
+ "MultiheadConfig",
29
31
  "PromptDefense",
30
32
  "RiskLevel",
31
33
  "SfePredictor",
@@ -34,6 +36,7 @@ __all__ = [
34
36
  "contains_boundary_patterns",
35
37
  "create_prompt_defense",
36
38
  "generate_boundary_instructions",
39
+ "get_default_model_path",
37
40
  "get_default_predictor",
38
41
  "get_default_sfe_model_path",
39
42
  "sfe_preprocess",
@@ -0,0 +1,276 @@
1
+ """ONNX classifier for fine-tuned MiniLM prompt injection detection.
2
+
3
+ Pipeline: text -> tokenizer -> ONNX Runtime -> logit -> ``sigmoid(logit / T)``
4
+ -> score. Supports single-head ``[batch]`` / ``[batch, 1]`` models and
5
+ multi-head ``[batch, 2]`` models (main + aux). Temperature ``T`` enables
6
+ post-hoc calibration via temperature scaling.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import math
13
+ import threading
14
+ from pathlib import Path
15
+ from typing import Literal
16
+
17
+ _logger = logging.getLogger(__name__)
18
+
19
+ # Shared across all OnnxClassifier instances (keyed by resolved model dir path).
20
+ _session_cache: dict[str, tuple[object, object]] = {}
21
+ _registry_lock = threading.Lock()
22
+ _load_locks: dict[str, threading.Lock] = {}
23
+
24
+
25
+ def _lock_for_cache_key(cache_key: str) -> threading.Lock:
26
+ with _registry_lock:
27
+ if cache_key not in _load_locks:
28
+ _load_locks[cache_key] = threading.Lock()
29
+ return _load_locks[cache_key]
30
+
31
+
32
+ def get_default_model_path() -> str:
33
+ """Return the absolute path to the bundled ONNX model directory.
34
+
35
+ Exported so :class:`Tier2Classifier` can read model-specific calibration
36
+ defaults from ``classifier_config.json`` at construction time without
37
+ needing an :class:`OnnxClassifier` instance.
38
+ """
39
+ return str(Path(__file__).resolve().parent.parent / "models" / "minilm-multihead-v5")
40
+
41
+
42
+ # Back-compat shim retained for internal users; same value as the public name.
43
+ def _default_model_path() -> str:
44
+ return get_default_model_path()
45
+
46
+
47
+ def _sigmoid(x: float) -> float:
48
+ return 1.0 / (1.0 + math.exp(-x))
49
+
50
+
51
+ class OnnxClassifier:
52
+ """ONNX Classifier for fine-tuned MiniLM models.
53
+
54
+ Loads the model lazily on first inference. The session and tokenizer
55
+ are cached at module level so multiple instances pointing at the same
56
+ model path share a single backing session (safe: ONNX Runtime
57
+ guarantees thread-safe ``Run()`` from v1.7.0, and the ``tokenizers``
58
+ library's encode methods do not mutate the tokenizer object).
59
+ """
60
+
61
+ _MAX_BATCH_CHUNK = 32
62
+
63
+ def __init__(self, model_path: str | None = None, temperature_t: float | None = None):
64
+ self._model_path = model_path or get_default_model_path()
65
+ self._session = None
66
+ self._tokenizer = None
67
+ self._max_length = 256
68
+ self._load_failed = False
69
+ # Output mode is detected lazily from the logits shape on the first
70
+ # inference call. ``None`` until then.
71
+ self._output_mode: Literal["single", "multi"] | None = None
72
+ # Temperature ``T`` must be a positive finite number. ``T <= 0`` is
73
+ # undefined (divide-by-zero or sign flip) and almost certainly a
74
+ # programming error rather than a config the caller wants gracefully
75
+ # ignored.
76
+ self._temperature_t = 1.0
77
+ if temperature_t is not None:
78
+ if not math.isfinite(temperature_t) or temperature_t <= 0:
79
+ raise ValueError(
80
+ f"OnnxClassifier: temperature_t must be a positive finite number, got {temperature_t}"
81
+ )
82
+ self._temperature_t = float(temperature_t)
83
+
84
+ # ------------------------------------------------------------------
85
+ # Public introspection
86
+ # ------------------------------------------------------------------
87
+
88
+ def get_temperature(self) -> float:
89
+ """Current temperature scaling factor (``1.0`` = no calibration)."""
90
+ return self._temperature_t
91
+
92
+ def get_output_mode(self) -> Literal["single", "multi"] | None:
93
+ """Output mode of the loaded model.
94
+
95
+ ``None`` until the first inference runs. ``"multi"`` indicates the
96
+ model emits ``[batch, 2]`` logits (main + aux).
97
+ """
98
+ return self._output_mode
99
+
100
+ # ------------------------------------------------------------------
101
+ # Loading
102
+ # ------------------------------------------------------------------
103
+
104
+ def load_model(self, model_path: str | None = None) -> None:
105
+ if model_path:
106
+ self._model_path = model_path
107
+ if self._session is not None and self._tokenizer is not None:
108
+ return
109
+ if self._load_failed:
110
+ raise ImportError("ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]")
111
+ self._load_model()
112
+
113
+ def _load_model(self) -> None:
114
+ cache_key = str(Path(self._model_path).resolve())
115
+ cached = _session_cache.get(cache_key)
116
+ if cached:
117
+ self._session, self._tokenizer = cached
118
+ return
119
+
120
+ with _lock_for_cache_key(cache_key):
121
+ cached = _session_cache.get(cache_key)
122
+ if cached:
123
+ self._session, self._tokenizer = cached
124
+ return
125
+
126
+ try:
127
+ import numpy as np # noqa: F401
128
+ import onnxruntime as ort
129
+ from tokenizers import Tokenizer
130
+ except ImportError as e:
131
+ self._load_failed = True
132
+ _logger.warning("[defender] ONNX model failed to load: %s", e)
133
+ raise ImportError(
134
+ "ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]"
135
+ ) from e
136
+
137
+ try:
138
+ tokenizer_path = str(Path(self._model_path) / "tokenizer.json")
139
+ self._tokenizer = Tokenizer.from_file(tokenizer_path)
140
+ self._tokenizer.enable_truncation(max_length=self._max_length)
141
+ self._tokenizer.enable_padding(length=self._max_length)
142
+
143
+ onnx_path = str(Path(self._model_path) / "model_quantized.onnx")
144
+ self._session = ort.InferenceSession(onnx_path)
145
+ except Exception as e:
146
+ _logger.warning("[defender] ONNX model failed to load: %s", e)
147
+ raise
148
+
149
+ _session_cache[cache_key] = (self._session, self._tokenizer)
150
+
151
+ # ------------------------------------------------------------------
152
+ # Inference
153
+ # ------------------------------------------------------------------
154
+
155
+ def classify(self, text: str) -> float:
156
+ """Classify a single text, returning the main-head sigmoid score.
157
+
158
+ For multi-head models only the main score is returned; callers that
159
+ need the aux score should use :meth:`classify_pair`.
160
+ """
161
+ return self.classify_pair(text)[0]
162
+
163
+ def classify_pair(self, text: str) -> tuple[float, float | None]:
164
+ """Classify a single text, returning ``(main, aux)``.
165
+
166
+ ``aux`` is ``None`` for single-head models. Both scores are
167
+ sigmoid-activated with the configured temperature ``T``.
168
+ """
169
+ self._ensure_loaded()
170
+ import numpy as np
171
+
172
+ encoding = self._tokenizer.encode(text)
173
+ input_ids = np.array([encoding.ids], dtype=np.int64)
174
+ attention_mask = np.array([encoding.attention_mask], dtype=np.int64)
175
+
176
+ results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
177
+ logits = results[0]
178
+ self._detect_output_mode(logits.shape)
179
+
180
+ t = self._temperature_t
181
+ row = logits[0]
182
+ # row shape: (), (1,) or (2,) depending on model export.
183
+ if self._output_mode == "multi":
184
+ main = _sigmoid(float(row[0]) / t)
185
+ aux = _sigmoid(float(row[1]) / t)
186
+ return main, aux
187
+ main_logit = float(row[0]) if hasattr(row, "__len__") and len(row) > 0 else float(row)
188
+ return _sigmoid(main_logit / t), None
189
+
190
+ def classify_batch(self, texts: list[str]) -> list[float]:
191
+ """Classify multiple texts; returns main-head scores only.
192
+
193
+ Back-compat wrapper around :meth:`classify_batch_pair`.
194
+ """
195
+ return [main for main, _ in self.classify_batch_pair(texts)]
196
+
197
+ def classify_batch_pair(self, texts: list[str]) -> list[tuple[float, float | None]]:
198
+ """Classify multiple texts, returning ``(main, aux)`` per row.
199
+
200
+ Aux is ``None`` per-row for single-head models. Chunks the input to
201
+ bound native memory; the attention matrix is ``O(chunk * seq_len^2)``,
202
+ and for MiniLM (``max_length=256``) a chunk of 32 keeps memory
203
+ under ~50MB per call.
204
+ """
205
+ if not texts:
206
+ return []
207
+ self._ensure_loaded()
208
+ all_pairs: list[tuple[float, float | None]] = []
209
+ for offset in range(0, len(texts), self._MAX_BATCH_CHUNK):
210
+ chunk = texts[offset : offset + self._MAX_BATCH_CHUNK]
211
+ all_pairs.extend(self._classify_batch_chunk_pair(chunk))
212
+ return all_pairs
213
+
214
+ def _classify_batch_chunk_pair(self, texts: list[str]) -> list[tuple[float, float | None]]:
215
+ import numpy as np
216
+
217
+ encodings = self._tokenizer.encode_batch(texts)
218
+ input_ids = np.array([e.ids for e in encodings], dtype=np.int64)
219
+ attention_mask = np.array([e.attention_mask for e in encodings], dtype=np.int64)
220
+
221
+ results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
222
+ logits = results[0]
223
+ self._detect_output_mode(logits.shape)
224
+
225
+ t = self._temperature_t
226
+ pairs: list[tuple[float, float | None]] = []
227
+ if self._output_mode == "multi":
228
+ for i in range(len(texts)):
229
+ main = _sigmoid(float(logits[i][0]) / t)
230
+ aux = _sigmoid(float(logits[i][1]) / t)
231
+ pairs.append((main, aux))
232
+ else:
233
+ for i in range(len(texts)):
234
+ row = logits[i]
235
+ # ``row`` may be a scalar (shape ``[batch]``) or 1-vector.
236
+ main_logit = float(row[0]) if hasattr(row, "__len__") and len(row) > 0 else float(row)
237
+ pairs.append((_sigmoid(main_logit / t), None))
238
+ return pairs
239
+
240
+ def _detect_output_mode(self, dims) -> None:
241
+ """Detect output mode from the logits tensor shape on first inference.
242
+
243
+ - ``[batch]`` or ``[batch, 1]`` -> ``"single"``
244
+ - ``[batch, 2]`` -> ``"multi"`` (main + aux dual head)
245
+
246
+ Idempotent: subsequent calls are no-ops once mode is set.
247
+ """
248
+ if self._output_mode is not None:
249
+ return
250
+ if dims is None or len(dims) < 2:
251
+ self._output_mode = "single"
252
+ return
253
+ self._output_mode = "multi" if dims[1] == 2 else "single"
254
+
255
+ # ------------------------------------------------------------------
256
+ # Misc
257
+ # ------------------------------------------------------------------
258
+
259
+ def count_tokens(self, text: str) -> int:
260
+ self._ensure_loaded()
261
+ encoding = self._tokenizer.encode(text)
262
+ # Padding is enabled at a fixed length; count only real (attended) tokens.
263
+ return int(sum(encoding.attention_mask))
264
+
265
+ def get_max_length(self) -> int:
266
+ return self._max_length
267
+
268
+ def warmup(self) -> None:
269
+ self.load_model()
270
+
271
+ def is_loaded(self) -> bool:
272
+ return self._session is not None and self._tokenizer is not None
273
+
274
+ def _ensure_loaded(self) -> None:
275
+ if not self.is_loaded():
276
+ self.load_model()
@@ -9,7 +9,10 @@ from __future__ import annotations
9
9
  import math
10
10
  import re
11
11
  import time
12
+ import unicodedata
12
13
 
14
+ from ..sanitizers.leet_normalizer import normalize_leet_speak
15
+ from ..sanitizers.normalizer import normalize_unicode, normalize_whitespace, strip_combining_marks
13
16
  from ..types import PatternDefinition, PatternMatch, RiskLevel, StructuralFlag, Tier1Result
14
17
  from .patterns import ALL_PATTERNS, contains_filter_keywords
15
18
 
@@ -47,16 +50,83 @@ class PatternDetector:
47
50
  return self._empty_result(start)
48
51
 
49
52
  original_length = len(text)
50
- analysis_text = text[: self._max_analysis_length] if len(text) > self._max_analysis_length else text
53
+ raw_text = text[: self._max_analysis_length] if len(text) > self._max_analysis_length else text
54
+
55
+ # Normalisation chain: collapse obfuscation before injection pattern
56
+ # matching. Order matters:
57
+ # 1. NFD-decompose: precomposed accents become base + combining mark.
58
+ # 2. strip_combining_marks: Zalgo defense + accent stripping.
59
+ # 3. normalize_unicode: homoglyphs/fullwidth -> ASCII.
60
+ # 4. normalize_whitespace: collapse spaced letters + embedded newlines.
61
+ # 5. normalize_leet_speak: 1gn0r3 -> ignore.
62
+ # NFD-decomposition lives here (not in normalize_unicode) because it
63
+ # strips legitimate accents like ``café`` -> ``cafe`` -- fine for
64
+ # analysis but would be data loss if returned to callers. The result
65
+ # is analysis-only and never returned.
66
+ analysis_text = normalize_leet_speak(
67
+ normalize_whitespace(
68
+ normalize_unicode(strip_combining_marks(unicodedata.normalize("NFD", raw_text)))
69
+ )
70
+ )
51
71
 
72
+ # Fast filter: short-circuit if neither raw nor normalised text
73
+ # contains keywords. Raw text is checked to preserve detection of
74
+ # obfuscation patterns (e.g. invisible unicode, leet-speak variants)
75
+ # that are normalised away before injection patterns run. Disable the
76
+ # fast filter when custom patterns are provided -- callers may add
77
+ # patterns whose keywords aren't in the static list.
52
78
  should_use_fast_filter = self._use_fast_filter and not self._has_custom
53
- if should_use_fast_filter and not contains_filter_keywords(analysis_text):
54
- flags = self._detect_structural_issues(analysis_text, original_length)
79
+ raw_has_keywords = not should_use_fast_filter or contains_filter_keywords(raw_text)
80
+ norm_has_keywords = not should_use_fast_filter or contains_filter_keywords(analysis_text)
81
+
82
+ if not raw_has_keywords and not norm_has_keywords:
83
+ flags = self._detect_structural_issues(raw_text, original_length)
55
84
  return self._create_result([], flags, start)
56
85
 
57
- matches = self._detect_patterns(analysis_text)
58
- flags = self._detect_structural_issues(analysis_text, original_length)
59
- return self._create_result(matches, flags, start)
86
+ # Short-circuit: if normalisation produced no change, a single pass
87
+ # is sufficient and avoids doubling pattern work for plain-text input.
88
+ if raw_text == analysis_text:
89
+ matches = self._detect_patterns(raw_text) if raw_has_keywords else []
90
+ flags = self._detect_structural_issues(raw_text, original_length)
91
+ return self._create_result(matches, flags, start)
92
+
93
+ # Run patterns on raw text -- catches obfuscation-specific patterns
94
+ # (e.g. invisible_unicode, leetspeak_injection) that normalisation
95
+ # removes. Run whenever EITHER pass has keywords: if only the
96
+ # normalised text has keywords (pure leet-speak with no other
97
+ # fast-filter hits), we still want the raw pass to fire obfuscation
98
+ # patterns like leetspeak_injection.
99
+ raw_matches = (
100
+ self._detect_patterns(raw_text) if (raw_has_keywords or norm_has_keywords) else []
101
+ )
102
+
103
+ # Run patterns on normalised text -- catches injection patterns
104
+ # hidden behind leet-speak, whitespace, or homoglyph obfuscation.
105
+ # Matches are tagged ``normalised=True`` because their
106
+ # position/matched values reference the transformed text.
107
+ norm_matches_raw = self._detect_patterns(analysis_text) if norm_has_keywords else []
108
+ norm_matches = [
109
+ PatternMatch(
110
+ pattern=m.pattern,
111
+ matched=m.matched,
112
+ position=m.position,
113
+ category=m.category,
114
+ severity=m.severity,
115
+ normalised=True,
116
+ )
117
+ for m in norm_matches_raw
118
+ ]
119
+
120
+ # Merge: normalised matches take priority. Raw-only matches are
121
+ # appended for patterns that fired on the original text but not the
122
+ # normalised form (e.g. obfuscation-detection patterns that match the
123
+ # raw encoding characters).
124
+ seen_patterns = {m.pattern for m in norm_matches}
125
+ merged_matches: list[PatternMatch] = [*norm_matches]
126
+ merged_matches.extend(m for m in raw_matches if m.pattern not in seen_patterns)
127
+
128
+ flags = self._detect_structural_issues(raw_text, original_length)
129
+ return self._create_result(merged_matches, flags, start)
60
130
 
61
131
  # ------------------------------------------------------------------
62
132
  # Pattern detection
@@ -65,7 +135,6 @@ class PatternDetector:
65
135
  def _detect_patterns(self, text: str) -> list[PatternMatch]:
66
136
  matches: list[PatternMatch] = []
67
137
  for defn in self._patterns:
68
- # Use finditer for all patterns (handles global-like behavior)
69
138
  for m in defn.pattern.finditer(text):
70
139
  matches.append(
71
140
  PatternMatch(
@@ -26,7 +26,21 @@ ROLE_MARKER_PATTERNS: list[PatternDefinition] = [
26
26
  PatternDefinition("role_system_bracket", re.compile(r"^\[SYSTEM\]", re.I), "role_marker", "high", "Bracketed system role marker"),
27
27
  PatternDefinition("role_inst_bracket", re.compile(r"^\[INST\]", re.I), "role_marker", "high", "Bracketed instruction marker (Llama format)"),
28
28
  # XML-style variants
29
- PatternDefinition("role_system_xml", re.compile(r"<system>", re.I), "role_marker", "high", "XML-style system tag"),
29
+ # role_system_xml requires directive-shaped content immediately following
30
+ # the tag. Bare ``<system>`` mentions are common in XML schemas, ML config
31
+ # docs, and OS specs; the attack shape is ``<system>`` followed by an
32
+ # imperative or role-switch payload. Closing-tag pairs are matched
33
+ # implicitly because the directive content sits inside them.
34
+ PatternDefinition(
35
+ "role_system_xml",
36
+ re.compile(
37
+ r"<system>\s*(?:ignore|disregard|forget|override|you\s+are|new\s+instructions?|stop|disable|bypass)",
38
+ re.I,
39
+ ),
40
+ "role_marker",
41
+ "high",
42
+ "XML-style system tag",
43
+ ),
30
44
  PatternDefinition("role_assistant_xml", re.compile(r"<assistant>", re.I), "role_marker", "medium", "XML-style assistant tag"),
31
45
  ]
32
46
 
@@ -48,9 +62,35 @@ INSTRUCTION_OVERRIDE_PATTERNS: list[PatternDefinition] = [
48
62
  # Role assumption
49
63
  # ---------------------------------------------------------------------------
50
64
  ROLE_ASSUMPTION_PATTERNS: list[PatternDefinition] = [
51
- PatternDefinition("you_are_now", re.compile(r"you\s+are\s+now\s+(?:a\s+)?(?:different|new|the|my)?", re.I), "role_assumption", "high", "Attempt to assign new role"),
65
+ # you_are_now: require an actual role-noun terminator. The previous form
66
+ # made both alternation groups optional, so "you are now logged in" /
67
+ # "you are now ready" UI strings FP'd. Role list expanded to cover the
68
+ # actual attack distribution (DAN/GPT/AI/jailbroken/persona-switching).
69
+ PatternDefinition(
70
+ "you_are_now",
71
+ re.compile(
72
+ r"you\s+are\s+now\s+(?:a\s+|an\s+)?(?:different|new|unrestricted|jailbroken|free|uncensored|DAN|GPT|AI|chatbot|model|persona|hacker|admin|root|developer|superuser)\b",
73
+ re.I,
74
+ ),
75
+ "role_assumption",
76
+ "high",
77
+ "Attempt to assign new role",
78
+ ),
52
79
  PatternDefinition("act_as", re.compile(r"act\s+(?:as|like)\s+(?:a\s+)?(?:system|admin|developer|root|superuser)", re.I), "role_assumption", "high", "Attempt to make AI act as privileged role"),
53
- PatternDefinition("pretend_to_be", re.compile(r"pretend\s+(?:to\s+be|you\s+are)\s+(?:a\s+)?", re.I), "role_assumption", "medium", "Attempt to make AI pretend"),
80
+ # pretend_to_be: require an attack-shaped role-noun. Previously matched any
81
+ # "pretend to be ..." prefix, FPing on children's literature, drama
82
+ # exercises, jokes. Role list overlaps with `you_are_now` plus privilege
83
+ # escalation nouns (hacker/admin/root/developer).
84
+ PatternDefinition(
85
+ "pretend_to_be",
86
+ re.compile(
87
+ r"pretend\s+(?:to\s+be|you\s+are)\s+(?:a\s+|an\s+)?(?:different|new|unrestricted|jailbroken|free|uncensored|DAN|GPT|AI|chatbot|model|persona|hacker|admin|root|developer)\b",
88
+ re.I,
89
+ ),
90
+ "role_assumption",
91
+ "medium",
92
+ "Attempt to make AI pretend",
93
+ ),
54
94
  PatternDefinition("roleplay_as", re.compile(r"roleplay\s+(?:as|like)\s+(?:a\s+)?", re.I), "role_assumption", "low", "Roleplay request (lower severity)"),
55
95
  PatternDefinition("imagine_you_are", re.compile(r"imagine\s+(?:that\s+)?you\s+are\s+(?:a\s+)?", re.I), "role_assumption", "low", "Imagination prompt (lower severity)"),
56
96
  PatternDefinition("jailbreak_dan", re.compile(r"\bDAN\b.*?(?:do\s+anything|jailbreak)", re.I), "role_assumption", "high", "DAN jailbreak attempt"),
@@ -75,7 +115,13 @@ COMMAND_EXECUTION_PATTERNS: list[PatternDefinition] = [
75
115
  PatternDefinition("execute_command", re.compile(r"execute\s+(?:the\s+)?(?:following|this|these)\s+(?:command|instruction|code)", re.I), "command_execution", "high", "Command execution instruction"),
76
116
  PatternDefinition("run_code", re.compile(r"run\s+(?:the\s+)?(?:following|this|these)\s+(?:code|script|command)", re.I), "command_execution", "high", "Code execution instruction"),
77
117
  PatternDefinition("eval_expression", re.compile(r"eval(?:uate)?\s*\(", re.I), "command_execution", "medium", "Eval function pattern"),
78
- PatternDefinition("shell_command", re.compile(r"\$\([^)]+\)|`[^`]+`"), "command_execution", "medium", "Shell command substitution"),
118
+ # shell_command: POSIX ``$(...)`` only. The legacy backtick form
119
+ # ``` `cmd` ``` used to be included here but FPs on every markdown
120
+ # inline-code span (``` `cat foo.json` ```, ``` `npm install` ```,
121
+ # ``` `filename.txt` ```). Modern shell idioms have used ``$(...)`` for
122
+ # decades; real attackers default to it because it nests. Tier 2 still
123
+ # catches the rare backtick attack via context.
124
+ PatternDefinition("shell_command", re.compile(r"\$\([^)]+\)"), "command_execution", "medium", "Shell command substitution"),
79
125
  ]
80
126
 
81
127
  # ---------------------------------------------------------------------------
@@ -86,7 +132,9 @@ ENCODING_SUSPICIOUS_PATTERNS: list[PatternDefinition] = [
86
132
  PatternDefinition("hex_escape_sequence", re.compile(r"(?:\\x[0-9a-fA-F]{2}){4,}"), "encoding_suspicious", "medium", "Hex escape sequence (potential obfuscation)"),
87
133
  PatternDefinition("unicode_escape_sequence", re.compile(r"(?:\\u[0-9a-fA-F]{4}){4,}"), "encoding_suspicious", "medium", "Unicode escape sequence (potential obfuscation)"),
88
134
  PatternDefinition("html_entity_abuse", re.compile(r"(?:&#\d{2,4};){4,}|(?:&#x[0-9a-fA-F]{2,4};){4,}", re.I), "encoding_suspicious", "medium", "HTML entity encoding (potential obfuscation)"),
89
- PatternDefinition("rot13_mention", re.compile(r"rot13|caesar\s+cipher|decode\s+this", re.I), "encoding_suspicious", "low", "Mention of simple encoding schemes"),
135
+ PatternDefinition("rot13_mention", re.compile(r"rot13|caesar\s+cipher|decode\s+this", re.I), "encoding_suspicious", "medium", "Mention of ROT13 or similar encoding schemes"),
136
+ PatternDefinition("binary_string_encoding", re.compile(r"\b[01]{8}(?:\s+[01]{8}){2,}\b"), "encoding_suspicious", "medium", "Binary-encoded string (potential obfuscation)"),
137
+ PatternDefinition("morse_code_encoding", re.compile(r"(?:[.-]+\s){4,}[.-]+"), "encoding_suspicious", "low", "Morse code pattern (potential obfuscation)"),
90
138
  PatternDefinition("leetspeak_injection", re.compile(r"1gn0r3|f0rg3t|byp4ss|syst3m|4dm1n|h4ck", re.I), "encoding_suspicious", "medium", "Leetspeak obfuscation of injection keywords"),
91
139
  ]
92
140
 
@@ -95,7 +143,17 @@ ENCODING_SUSPICIOUS_PATTERNS: list[PatternDefinition] = [
95
143
  # ---------------------------------------------------------------------------
96
144
  PROMPT_LEAKING_PATTERNS: list[PatternDefinition] = [
97
145
  PatternDefinition("reveal_system_prompt", re.compile(r"reveal\s+(?:your\s+)?(?:system\s+)?prompt", re.I), "instruction_override", "high", "Attempt to reveal system prompt"),
98
- PatternDefinition("show_instructions", re.compile(r"show\s+(?:me\s+)?(?:your\s+)?(?:(?:initial|original|system|hidden)\s+)?instructions?", re.I), "instruction_override", "high", "Attempt to show hidden instructions"),
146
+ # show_instructions: require an attack-shaped qualifier
147
+ # (initial/original/system/hidden). The inner group used to be optional,
148
+ # so bare "show instructions" -- common in FAQs, help docs, onboarding
149
+ # tours -- triggered the rule.
150
+ PatternDefinition(
151
+ "show_instructions",
152
+ re.compile(r"show\s+(?:me\s+)?(?:your\s+)?(?:initial|original|system|hidden)\s+instructions?", re.I),
153
+ "instruction_override",
154
+ "high",
155
+ "Attempt to show hidden instructions",
156
+ ),
99
157
  PatternDefinition("print_above", re.compile(r"print\s+(?:everything|all|text)\s+(?:above|before)", re.I), "instruction_override", "high", "Attempt to print content above current context"),
100
158
  PatternDefinition("repeat_back", re.compile(r"repeat\s+(?:back\s+)?(?:everything|all|your\s+instructions?)", re.I), "instruction_override", "high", "Attempt to make AI repeat instructions"),
101
159
  PatternDefinition("what_are_your_instructions", re.compile(r"what\s+(?:are|were)\s+(?:your|the)\s+(?:(?:initial|original|system)\s+)?instructions?", re.I), "instruction_override", "medium", "Question about system instructions"),
@@ -106,13 +164,57 @@ PROMPT_LEAKING_PATTERNS: list[PatternDefinition] = [
106
164
  # Indirect injection
107
165
  # ---------------------------------------------------------------------------
108
166
  INDIRECT_INJECTION_PATTERNS: list[PatternDefinition] = [
109
- PatternDefinition("markdown_hidden_instruction", re.compile(r"\[.*?\]\(.*?(?:ignore|forget|system|instruction).*?\)", re.I), "structural", "high", "Markdown link with hidden injection"),
167
+ # markdown_hidden_instruction: require an imperative + scope qualifier in
168
+ # the URL. The earlier form matched "system" or "instruction" anywhere in
169
+ # the URL, so every doc cross-reference like
170
+ # ``[config](https://.../system-setup)`` triggered. Real smuggled-
171
+ # instruction attacks pair an imperative (ignore/disregard/forget/override)
172
+ # with a scope word (all/the/previous/prior) inside the link target.
173
+ PatternDefinition(
174
+ "markdown_hidden_instruction",
175
+ re.compile(
176
+ r"\[.*?\]\(.*?(?:ignore|disregard|forget|override)\W+(?:all|the|previous|prior)\W+.*?\)",
177
+ re.I,
178
+ ),
179
+ "structural",
180
+ "high",
181
+ "Markdown link with hidden injection",
182
+ ),
110
183
  PatternDefinition("html_comment_injection", re.compile(r"<!--\s*(?:system|ignore|instruction|prompt).*?-->", re.I), "structural", "high", "HTML comment containing injection keywords"),
111
184
  PatternDefinition("invisible_unicode", re.compile(r"[\u200b-\u200d\ufeff\u2060\u2061\u2062\u2063\u2064]"), "encoding_suspicious", "medium", "Invisible Unicode characters (zero-width, etc.)"),
112
185
  PatternDefinition("text_direction_override", re.compile(r"[\u202a-\u202e\u2066-\u2069]"), "encoding_suspicious", "medium", "Text direction override characters"),
113
- PatternDefinition("confusable_homoglyphs", re.compile(r"[\u13a0-\u13f4]|[\u1d00-\u1d2b]|[\u0400-\u04ff]"), "encoding_suspicious", "medium", "Unicode homoglyph characters (Cherokee, Small Caps, Cyrillic)"),
186
+ # confusable_homoglyphs: Cherokee (U+13A0-U+13F4) and Phonetic Extensions
187
+ # (U+1D00-U+1D2B) are essentially never in real customer content, so
188
+ # single-char presence remains a useful signal. Cyrillic (U+0400-U+04FF)
189
+ # is everyday text in Russian, Ukrainian, etc. -- flag only when directly
190
+ # adjacent to an ASCII Latin letter (the actual attack: ``аdmin`` with a
191
+ # Cyrillic 'a'), not when the whole word/text is Cyrillic.
192
+ PatternDefinition(
193
+ "confusable_homoglyphs",
194
+ re.compile(
195
+ r"[\u13a0-\u13f4\u1d00-\u1d2b]|[a-zA-Z][\u0400-\u04ff]|[\u0400-\u04ff][a-zA-Z]"
196
+ ),
197
+ "encoding_suspicious",
198
+ "medium",
199
+ "Unicode homoglyph characters (Cherokee, Small Caps, Cyrillic)",
200
+ ),
114
201
  PatternDefinition("separator_injection", re.compile(r"[-=]{10,}[^-=\n]*(?:system|instruction|ignore)", re.I), "structural", "medium", "Separator followed by injection attempt"),
115
- PatternDefinition("json_injection", re.compile(r'"(?:system|role|instruction|prompt)"\s*:\s*"', re.I), "structural", "medium", "JSON-style role/instruction injection"),
202
+ # json_injection: target the actual attack shape -- setting a chat-message
203
+ # role to a privileged value (system/developer/admin), or stuffing a long
204
+ # string into a ``"system"`` key. The previous form matched the bare key
205
+ # ``"system":`` / ``"role":`` etc., which fired on every OpenAI / Anthropic
206
+ # SDK example, chat-log dump, and JSON schema that just *declares* the
207
+ # field without abusing it.
208
+ PatternDefinition(
209
+ "json_injection",
210
+ re.compile(
211
+ r'"role"\s*:\s*"(?:system|developer|admin)"|"system"\s*:\s*"[^"]{20,}',
212
+ re.I,
213
+ ),
214
+ "structural",
215
+ "medium",
216
+ "JSON-style role/instruction injection",
217
+ ),
116
218
  ]
117
219
 
118
220
  # ---------------------------------------------------------------------------
@@ -155,6 +257,12 @@ FAST_FILTER_KEYWORDS: list[str] = [
155
257
  "execute", "eval(", "$(", "run the",
156
258
  # Encoding/obfuscation
157
259
  "base64", "decode", "\\x", "\\u", "&#", "rot13",
260
+ # Raw leet-speak keywords -- kept here because the leet normaliser skips
261
+ # 20+ character alphanumeric tokens (treated as base64-like blobs), so
262
+ # long leet payloads like "1gn0r3pr3v10us1nstruct10ns" are NOT normalised
263
+ # to plain English and won't trip the "ignore" / "forget" / "bypass"
264
+ # keywords above. These literal entries ensure such payloads still
265
+ # trigger the fast filter and reach the leetspeak_injection regex.
158
266
  "1gn0r3", "f0rg3t", "byp4ss",
159
267
  # Prompt leaking
160
268
  "reveal", "show me your", "print everything", "print above",