PyPI - stackone-defender - Versions diffs - 0.6.3__tar.gz → 0.7.0__tar.gz - Mend

stackone-defender 0.6.3__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

stackone_defender-0.7.0/.release-please-manifest.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {".":"0.7.0"}

{stackone_defender-0.6.3 → stackone_defender-0.7.0}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,16 @@
 # Changelog
+## [0.7.0](https://github.com/StackOneHQ/stackone-defender/compare/stackone-defender-v0.6.3...stackone-defender-v0.7.0) (2026-05-29)
+### ⚠ BREAKING CHANGES
+* The default ONNX model directory changed from minilm-full-aug to minilm-multihead-v5. Custom code that hardcoded the old path will no longer load.
+### Features
+* parity with TS defender 0.7.0 ([75d046a](https://github.com/StackOneHQ/stackone-defender/commit/75d046ab45066ee1f973e91357f7ecb23dea50c8))
 ## [0.6.3](https://github.com/StackOneHQ/stackone-defender/compare/stackone-defender-v0.6.2...stackone-defender-v0.6.3) (2026-05-26)

{stackone_defender-0.6.3 → stackone_defender-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: stackone-defender
-Version: 0.6.3
+Version: 0.7.0
 Summary: Indirect prompt injection defense for AI agents using tool calls
 Project-URL: Homepage, https://github.com/StackOneHQ/stackone-defender
 Project-URL: Repository, https://github.com/StackOneHQ/stackone-defender

{stackone_defender-0.6.3 → stackone_defender-0.7.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "stackone-defender"
-version = "0.6.3"
+version = "0.7.0"
 description = "Indirect prompt injection defense for AI agents using tool calls"
 readme = "README.md"
 requires-python = ">=3.11"

{stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/__init__.py RENAMED Viewed

@@ -11,8 +11,8 @@ Usage:
         print(f"Blocked: {result.risk_level}")
 """
+from .classifiers.onnx_classifier import get_default_model_path
 from .core.prompt_defense import PromptDefense, create_prompt_defense
-from .utils.boundary import contains_boundary_patterns, generate_boundary_instructions
 from .sfe.preprocess import (
     DropDecision,
     SfePredictor,
@@ -21,11 +21,13 @@ from .sfe.preprocess import (
     get_default_sfe_model_path,
     sfe_preprocess,
 )
-from .types import DefenseResult, RiskLevel, Tier1Result
+from .types import DefenseResult, MultiheadConfig, RiskLevel, Tier1Result
+from .utils.boundary import contains_boundary_patterns, generate_boundary_instructions
 __all__ = [
     "DefenseResult",
     "DropDecision",
+    "MultiheadConfig",
     "PromptDefense",
     "RiskLevel",
     "SfePredictor",
@@ -34,6 +36,7 @@ __all__ = [
     "contains_boundary_patterns",
     "create_prompt_defense",
     "generate_boundary_instructions",
+    "get_default_model_path",
     "get_default_predictor",
     "get_default_sfe_model_path",
     "sfe_preprocess",

stackone_defender-0.7.0/src/stackone_defender/classifiers/onnx_classifier.py ADDED Viewed

@@ -0,0 +1,276 @@
+"""ONNX classifier for fine-tuned MiniLM prompt injection detection.
+Pipeline: text -> tokenizer -> ONNX Runtime -> logit -> ``sigmoid(logit / T)``
+-> score. Supports single-head ``[batch]`` / ``[batch, 1]`` models and
+multi-head ``[batch, 2]`` models (main + aux). Temperature ``T`` enables
+post-hoc calibration via temperature scaling.
+"""
+from __future__ import annotations
+import logging
+import math
+import threading
+from pathlib import Path
+from typing import Literal
+_logger = logging.getLogger(__name__)
+# Shared across all OnnxClassifier instances (keyed by resolved model dir path).
+_session_cache: dict[str, tuple[object, object]] = {}
+_registry_lock = threading.Lock()
+_load_locks: dict[str, threading.Lock] = {}
+def _lock_for_cache_key(cache_key: str) -> threading.Lock:
+    with _registry_lock:
+        if cache_key not in _load_locks:
+            _load_locks[cache_key] = threading.Lock()
+        return _load_locks[cache_key]
+def get_default_model_path() -> str:
+    """Return the absolute path to the bundled ONNX model directory.
+    Exported so :class:`Tier2Classifier` can read model-specific calibration
+    defaults from ``classifier_config.json`` at construction time without
+    needing an :class:`OnnxClassifier` instance.
+    """
+    return str(Path(__file__).resolve().parent.parent / "models" / "minilm-multihead-v5")
+# Back-compat shim retained for internal users; same value as the public name.
+def _default_model_path() -> str:
+    return get_default_model_path()
+def _sigmoid(x: float) -> float:
+    return 1.0 / (1.0 + math.exp(-x))
+class OnnxClassifier:
+    """ONNX Classifier for fine-tuned MiniLM models.
+    Loads the model lazily on first inference. The session and tokenizer
+    are cached at module level so multiple instances pointing at the same
+    model path share a single backing session (safe: ONNX Runtime
+    guarantees thread-safe ``Run()`` from v1.7.0, and the ``tokenizers``
+    library's encode methods do not mutate the tokenizer object).
+    """
+    _MAX_BATCH_CHUNK = 32
+    def __init__(self, model_path: str | None = None, temperature_t: float | None = None):
+        self._model_path = model_path or get_default_model_path()
+        self._session = None
+        self._tokenizer = None
+        self._max_length = 256
+        self._load_failed = False
+        # Output mode is detected lazily from the logits shape on the first
+        # inference call. ``None`` until then.
+        self._output_mode: Literal["single", "multi"] | None = None
+        # Temperature ``T`` must be a positive finite number. ``T <= 0`` is
+        # undefined (divide-by-zero or sign flip) and almost certainly a
+        # programming error rather than a config the caller wants gracefully
+        # ignored.
+        self._temperature_t = 1.0
+        if temperature_t is not None:
+            if not math.isfinite(temperature_t) or temperature_t <= 0:
+                raise ValueError(
+                    f"OnnxClassifier: temperature_t must be a positive finite number, got {temperature_t}"
+                )
+            self._temperature_t = float(temperature_t)
+    # ------------------------------------------------------------------
+    # Public introspection
+    # ------------------------------------------------------------------
+    def get_temperature(self) -> float:
+        """Current temperature scaling factor (``1.0`` = no calibration)."""
+        return self._temperature_t
+    def get_output_mode(self) -> Literal["single", "multi"] | None:
+        """Output mode of the loaded model.
+        ``None`` until the first inference runs. ``"multi"`` indicates the
+        model emits ``[batch, 2]`` logits (main + aux).
+        """
+        return self._output_mode
+    # ------------------------------------------------------------------
+    # Loading
+    # ------------------------------------------------------------------
+    def load_model(self, model_path: str | None = None) -> None:
+        if model_path:
+            self._model_path = model_path
+        if self._session is not None and self._tokenizer is not None:
+            return
+        if self._load_failed:
+            raise ImportError("ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]")
+        self._load_model()
+    def _load_model(self) -> None:
+        cache_key = str(Path(self._model_path).resolve())
+        cached = _session_cache.get(cache_key)
+        if cached:
+            self._session, self._tokenizer = cached
+            return
+        with _lock_for_cache_key(cache_key):
+            cached = _session_cache.get(cache_key)
+            if cached:
+                self._session, self._tokenizer = cached
+                return
+            try:
+                import numpy as np  # noqa: F401
+                import onnxruntime as ort
+                from tokenizers import Tokenizer
+            except ImportError as e:
+                self._load_failed = True
+                _logger.warning("[defender] ONNX model failed to load: %s", e)
+                raise ImportError(
+                    "ONNX dependencies not installed. Install with: pip install stackone-defender[onnx]"
+                ) from e
+            try:
+                tokenizer_path = str(Path(self._model_path) / "tokenizer.json")
+                self._tokenizer = Tokenizer.from_file(tokenizer_path)
+                self._tokenizer.enable_truncation(max_length=self._max_length)
+                self._tokenizer.enable_padding(length=self._max_length)
+                onnx_path = str(Path(self._model_path) / "model_quantized.onnx")
+                self._session = ort.InferenceSession(onnx_path)
+            except Exception as e:
+                _logger.warning("[defender] ONNX model failed to load: %s", e)
+                raise
+            _session_cache[cache_key] = (self._session, self._tokenizer)
+    # ------------------------------------------------------------------
+    # Inference
+    # ------------------------------------------------------------------
+    def classify(self, text: str) -> float:
+        """Classify a single text, returning the main-head sigmoid score.
+        For multi-head models only the main score is returned; callers that
+        need the aux score should use :meth:`classify_pair`.
+        """
+        return self.classify_pair(text)[0]
+    def classify_pair(self, text: str) -> tuple[float, float | None]:
+        """Classify a single text, returning ``(main, aux)``.
+        ``aux`` is ``None`` for single-head models. Both scores are
+        sigmoid-activated with the configured temperature ``T``.
+        """
+        self._ensure_loaded()
+        import numpy as np
+        encoding = self._tokenizer.encode(text)
+        input_ids = np.array([encoding.ids], dtype=np.int64)
+        attention_mask = np.array([encoding.attention_mask], dtype=np.int64)
+        results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
+        logits = results[0]
+        self._detect_output_mode(logits.shape)
+        t = self._temperature_t
+        row = logits[0]
+        # row shape: (), (1,) or (2,) depending on model export.
+        if self._output_mode == "multi":
+            main = _sigmoid(float(row[0]) / t)
+            aux = _sigmoid(float(row[1]) / t)
+            return main, aux
+        main_logit = float(row[0]) if hasattr(row, "__len__") and len(row) > 0 else float(row)
+        return _sigmoid(main_logit / t), None
+    def classify_batch(self, texts: list[str]) -> list[float]:
+        """Classify multiple texts; returns main-head scores only.
+        Back-compat wrapper around :meth:`classify_batch_pair`.
+        """
+        return [main for main, _ in self.classify_batch_pair(texts)]
+    def classify_batch_pair(self, texts: list[str]) -> list[tuple[float, float | None]]:
+        """Classify multiple texts, returning ``(main, aux)`` per row.
+        Aux is ``None`` per-row for single-head models. Chunks the input to
+        bound native memory; the attention matrix is ``O(chunk * seq_len^2)``,
+        and for MiniLM (``max_length=256``) a chunk of 32 keeps memory
+        under ~50MB per call.
+        """
+        if not texts:
+            return []
+        self._ensure_loaded()
+        all_pairs: list[tuple[float, float | None]] = []
+        for offset in range(0, len(texts), self._MAX_BATCH_CHUNK):
+            chunk = texts[offset : offset + self._MAX_BATCH_CHUNK]
+            all_pairs.extend(self._classify_batch_chunk_pair(chunk))
+        return all_pairs
+    def _classify_batch_chunk_pair(self, texts: list[str]) -> list[tuple[float, float | None]]:
+        import numpy as np
+        encodings = self._tokenizer.encode_batch(texts)
+        input_ids = np.array([e.ids for e in encodings], dtype=np.int64)
+        attention_mask = np.array([e.attention_mask for e in encodings], dtype=np.int64)
+        results = self._session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
+        logits = results[0]
+        self._detect_output_mode(logits.shape)
+        t = self._temperature_t
+        pairs: list[tuple[float, float | None]] = []
+        if self._output_mode == "multi":
+            for i in range(len(texts)):
+                main = _sigmoid(float(logits[i][0]) / t)
+                aux = _sigmoid(float(logits[i][1]) / t)
+                pairs.append((main, aux))
+        else:
+            for i in range(len(texts)):
+                row = logits[i]
+                # ``row`` may be a scalar (shape ``[batch]``) or 1-vector.
+                main_logit = float(row[0]) if hasattr(row, "__len__") and len(row) > 0 else float(row)
+                pairs.append((_sigmoid(main_logit / t), None))
+        return pairs
+    def _detect_output_mode(self, dims) -> None:
+        """Detect output mode from the logits tensor shape on first inference.
+        - ``[batch]`` or ``[batch, 1]`` -> ``"single"``
+        - ``[batch, 2]`` -> ``"multi"`` (main + aux dual head)
+        Idempotent: subsequent calls are no-ops once mode is set.
+        """
+        if self._output_mode is not None:
+            return
+        if dims is None or len(dims) < 2:
+            self._output_mode = "single"
+            return
+        self._output_mode = "multi" if dims[1] == 2 else "single"
+    # ------------------------------------------------------------------
+    # Misc
+    # ------------------------------------------------------------------
+    def count_tokens(self, text: str) -> int:
+        self._ensure_loaded()
+        encoding = self._tokenizer.encode(text)
+        # Padding is enabled at a fixed length; count only real (attended) tokens.
+        return int(sum(encoding.attention_mask))
+    def get_max_length(self) -> int:
+        return self._max_length
+    def warmup(self) -> None:
+        self.load_model()
+    def is_loaded(self) -> bool:
+        return self._session is not None and self._tokenizer is not None
+    def _ensure_loaded(self) -> None:
+        if not self.is_loaded():
+            self.load_model()

{stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/classifiers/pattern_detector.py RENAMED Viewed

@@ -9,7 +9,10 @@ from __future__ import annotations
 import math
 import re
 import time
+import unicodedata
+from ..sanitizers.leet_normalizer import normalize_leet_speak
+from ..sanitizers.normalizer import normalize_unicode, normalize_whitespace, strip_combining_marks
 from ..types import PatternDefinition, PatternMatch, RiskLevel, StructuralFlag, Tier1Result
 from .patterns import ALL_PATTERNS, contains_filter_keywords
@@ -47,16 +50,83 @@ class PatternDetector:
             return self._empty_result(start)
         original_length = len(text)
-        analysis_text = text[: self._max_analysis_length] if len(text) > self._max_analysis_length else text
+        raw_text = text[: self._max_analysis_length] if len(text) > self._max_analysis_length else text
+        # Normalisation chain: collapse obfuscation before injection pattern
+        # matching. Order matters:
+        # 1. NFD-decompose: precomposed accents become base + combining mark.
+        # 2. strip_combining_marks: Zalgo defense + accent stripping.
+        # 3. normalize_unicode: homoglyphs/fullwidth -> ASCII.
+        # 4. normalize_whitespace: collapse spaced letters + embedded newlines.
+        # 5. normalize_leet_speak: 1gn0r3 -> ignore.
+        # NFD-decomposition lives here (not in normalize_unicode) because it
+        # strips legitimate accents like ``café`` -> ``cafe`` -- fine for
+        # analysis but would be data loss if returned to callers. The result
+        # is analysis-only and never returned.
+        analysis_text = normalize_leet_speak(
+            normalize_whitespace(
+                normalize_unicode(strip_combining_marks(unicodedata.normalize("NFD", raw_text)))
+            )
+        )
+        # Fast filter: short-circuit if neither raw nor normalised text
+        # contains keywords. Raw text is checked to preserve detection of
+        # obfuscation patterns (e.g. invisible unicode, leet-speak variants)
+        # that are normalised away before injection patterns run. Disable the
+        # fast filter when custom patterns are provided -- callers may add
+        # patterns whose keywords aren't in the static list.
         should_use_fast_filter = self._use_fast_filter and not self._has_custom
-        if should_use_fast_filter and not contains_filter_keywords(analysis_text):
-            flags = self._detect_structural_issues(analysis_text, original_length)
+        raw_has_keywords = not should_use_fast_filter or contains_filter_keywords(raw_text)
+        norm_has_keywords = not should_use_fast_filter or contains_filter_keywords(analysis_text)
+        if not raw_has_keywords and not norm_has_keywords:
+            flags = self._detect_structural_issues(raw_text, original_length)
             return self._create_result([], flags, start)
-        matches = self._detect_patterns(analysis_text)
-        flags = self._detect_structural_issues(analysis_text, original_length)
-        return self._create_result(matches, flags, start)
+        # Short-circuit: if normalisation produced no change, a single pass
+        # is sufficient and avoids doubling pattern work for plain-text input.
+        if raw_text == analysis_text:
+            matches = self._detect_patterns(raw_text) if raw_has_keywords else []
+            flags = self._detect_structural_issues(raw_text, original_length)
+            return self._create_result(matches, flags, start)
+        # Run patterns on raw text -- catches obfuscation-specific patterns
+        # (e.g. invisible_unicode, leetspeak_injection) that normalisation
+        # removes. Run whenever EITHER pass has keywords: if only the
+        # normalised text has keywords (pure leet-speak with no other
+        # fast-filter hits), we still want the raw pass to fire obfuscation
+        # patterns like leetspeak_injection.
+        raw_matches = (
+            self._detect_patterns(raw_text) if (raw_has_keywords or norm_has_keywords) else []
+        )
+        # Run patterns on normalised text -- catches injection patterns
+        # hidden behind leet-speak, whitespace, or homoglyph obfuscation.
+        # Matches are tagged ``normalised=True`` because their
+        # position/matched values reference the transformed text.
+        norm_matches_raw = self._detect_patterns(analysis_text) if norm_has_keywords else []
+        norm_matches = [
+            PatternMatch(
+                pattern=m.pattern,
+                matched=m.matched,
+                position=m.position,
+                category=m.category,
+                severity=m.severity,
+                normalised=True,
+            )
+            for m in norm_matches_raw
+        ]
+        # Merge: normalised matches take priority. Raw-only matches are
+        # appended for patterns that fired on the original text but not the
+        # normalised form (e.g. obfuscation-detection patterns that match the
+        # raw encoding characters).
+        seen_patterns = {m.pattern for m in norm_matches}
+        merged_matches: list[PatternMatch] = [*norm_matches]
+        merged_matches.extend(m for m in raw_matches if m.pattern not in seen_patterns)
+        flags = self._detect_structural_issues(raw_text, original_length)
+        return self._create_result(merged_matches, flags, start)
     # ------------------------------------------------------------------
     # Pattern detection
@@ -65,7 +135,6 @@ class PatternDetector:
     def _detect_patterns(self, text: str) -> list[PatternMatch]:
         matches: list[PatternMatch] = []
         for defn in self._patterns:
-            # Use finditer for all patterns (handles global-like behavior)
             for m in defn.pattern.finditer(text):
                 matches.append(
                     PatternMatch(

{stackone_defender-0.6.3 → stackone_defender-0.7.0}/src/stackone_defender/classifiers/patterns.py RENAMED Viewed

@@ -26,7 +26,21 @@ ROLE_MARKER_PATTERNS: list[PatternDefinition] = [
     PatternDefinition("role_system_bracket", re.compile(r"^\[SYSTEM\]", re.I), "role_marker", "high", "Bracketed system role marker"),
     PatternDefinition("role_inst_bracket", re.compile(r"^\[INST\]", re.I), "role_marker", "high", "Bracketed instruction marker (Llama format)"),
     # XML-style variants
-    PatternDefinition("role_system_xml", re.compile(r"<system>", re.I), "role_marker", "high", "XML-style system tag"),
+    # role_system_xml requires directive-shaped content immediately following
+    # the tag. Bare ``<system>`` mentions are common in XML schemas, ML config
+    # docs, and OS specs; the attack shape is ``<system>`` followed by an
+    # imperative or role-switch payload. Closing-tag pairs are matched
+    # implicitly because the directive content sits inside them.
+    PatternDefinition(
+        "role_system_xml",
+        re.compile(
+            r"<system>\s*(?:ignore|disregard|forget|override|you\s+are|new\s+instructions?|stop|disable|bypass)",
+            re.I,
+        ),
+        "role_marker",
+        "high",
+        "XML-style system tag",
+    ),
     PatternDefinition("role_assistant_xml", re.compile(r"<assistant>", re.I), "role_marker", "medium", "XML-style assistant tag"),
 ]
@@ -48,9 +62,35 @@ INSTRUCTION_OVERRIDE_PATTERNS: list[PatternDefinition] = [
 # Role assumption
 # ---------------------------------------------------------------------------
 ROLE_ASSUMPTION_PATTERNS: list[PatternDefinition] = [
-    PatternDefinition("you_are_now", re.compile(r"you\s+are\s+now\s+(?:a\s+)?(?:different|new|the|my)?", re.I), "role_assumption", "high", "Attempt to assign new role"),
+    # you_are_now: require an actual role-noun terminator. The previous form
+    # made both alternation groups optional, so "you are now logged in" /
+    # "you are now ready" UI strings FP'd. Role list expanded to cover the
+    # actual attack distribution (DAN/GPT/AI/jailbroken/persona-switching).
+    PatternDefinition(
+        "you_are_now",
+        re.compile(
+            r"you\s+are\s+now\s+(?:a\s+|an\s+)?(?:different|new|unrestricted|jailbroken|free|uncensored|DAN|GPT|AI|chatbot|model|persona|hacker|admin|root|developer|superuser)\b",
+            re.I,
+        ),
+        "role_assumption",
+        "high",
+        "Attempt to assign new role",
+    ),
     PatternDefinition("act_as", re.compile(r"act\s+(?:as|like)\s+(?:a\s+)?(?:system|admin|developer|root|superuser)", re.I), "role_assumption", "high", "Attempt to make AI act as privileged role"),
-    PatternDefinition("pretend_to_be", re.compile(r"pretend\s+(?:to\s+be|you\s+are)\s+(?:a\s+)?", re.I), "role_assumption", "medium", "Attempt to make AI pretend"),
+    # pretend_to_be: require an attack-shaped role-noun. Previously matched any
+    # "pretend to be ..." prefix, FPing on children's literature, drama
+    # exercises, jokes. Role list overlaps with `you_are_now` plus privilege
+    # escalation nouns (hacker/admin/root/developer).
+    PatternDefinition(
+        "pretend_to_be",
+        re.compile(
+            r"pretend\s+(?:to\s+be|you\s+are)\s+(?:a\s+|an\s+)?(?:different|new|unrestricted|jailbroken|free|uncensored|DAN|GPT|AI|chatbot|model|persona|hacker|admin|root|developer)\b",
+            re.I,
+        ),
+        "role_assumption",
+        "medium",
+        "Attempt to make AI pretend",
+    ),
     PatternDefinition("roleplay_as", re.compile(r"roleplay\s+(?:as|like)\s+(?:a\s+)?", re.I), "role_assumption", "low", "Roleplay request (lower severity)"),
     PatternDefinition("imagine_you_are", re.compile(r"imagine\s+(?:that\s+)?you\s+are\s+(?:a\s+)?", re.I), "role_assumption", "low", "Imagination prompt (lower severity)"),
     PatternDefinition("jailbreak_dan", re.compile(r"\bDAN\b.*?(?:do\s+anything|jailbreak)", re.I), "role_assumption", "high", "DAN jailbreak attempt"),
@@ -75,7 +115,13 @@ COMMAND_EXECUTION_PATTERNS: list[PatternDefinition] = [
     PatternDefinition("execute_command", re.compile(r"execute\s+(?:the\s+)?(?:following|this|these)\s+(?:command|instruction|code)", re.I), "command_execution", "high", "Command execution instruction"),
     PatternDefinition("run_code", re.compile(r"run\s+(?:the\s+)?(?:following|this|these)\s+(?:code|script|command)", re.I), "command_execution", "high", "Code execution instruction"),
     PatternDefinition("eval_expression", re.compile(r"eval(?:uate)?\s*\(", re.I), "command_execution", "medium", "Eval function pattern"),
-    PatternDefinition("shell_command", re.compile(r"\$\([^)]+\)|`[^`]+`"), "command_execution", "medium", "Shell command substitution"),
+    # shell_command: POSIX ``$(...)`` only. The legacy backtick form
+    # ``` `cmd` ``` used to be included here but FPs on every markdown
+    # inline-code span (``` `cat foo.json` ```, ``` `npm install` ```,
+    # ``` `filename.txt` ```). Modern shell idioms have used ``$(...)`` for
+    # decades; real attackers default to it because it nests. Tier 2 still
+    # catches the rare backtick attack via context.
+    PatternDefinition("shell_command", re.compile(r"\$\([^)]+\)"), "command_execution", "medium", "Shell command substitution"),
 ]
 # ---------------------------------------------------------------------------
@@ -86,7 +132,9 @@ ENCODING_SUSPICIOUS_PATTERNS: list[PatternDefinition] = [
     PatternDefinition("hex_escape_sequence", re.compile(r"(?:\\x[0-9a-fA-F]{2}){4,}"), "encoding_suspicious", "medium", "Hex escape sequence (potential obfuscation)"),
     PatternDefinition("unicode_escape_sequence", re.compile(r"(?:\\u[0-9a-fA-F]{4}){4,}"), "encoding_suspicious", "medium", "Unicode escape sequence (potential obfuscation)"),
     PatternDefinition("html_entity_abuse", re.compile(r"(?:&#\d{2,4};){4,}|(?:&#x[0-9a-fA-F]{2,4};){4,}", re.I), "encoding_suspicious", "medium", "HTML entity encoding (potential obfuscation)"),
-    PatternDefinition("rot13_mention", re.compile(r"rot13|caesar\s+cipher|decode\s+this", re.I), "encoding_suspicious", "low", "Mention of simple encoding schemes"),
+    PatternDefinition("rot13_mention", re.compile(r"rot13|caesar\s+cipher|decode\s+this", re.I), "encoding_suspicious", "medium", "Mention of ROT13 or similar encoding schemes"),
+    PatternDefinition("binary_string_encoding", re.compile(r"\b[01]{8}(?:\s+[01]{8}){2,}\b"), "encoding_suspicious", "medium", "Binary-encoded string (potential obfuscation)"),
+    PatternDefinition("morse_code_encoding", re.compile(r"(?:[.-]+\s){4,}[.-]+"), "encoding_suspicious", "low", "Morse code pattern (potential obfuscation)"),
     PatternDefinition("leetspeak_injection", re.compile(r"1gn0r3|f0rg3t|byp4ss|syst3m|4dm1n|h4ck", re.I), "encoding_suspicious", "medium", "Leetspeak obfuscation of injection keywords"),
 ]
@@ -95,7 +143,17 @@ ENCODING_SUSPICIOUS_PATTERNS: list[PatternDefinition] = [
 # ---------------------------------------------------------------------------
 PROMPT_LEAKING_PATTERNS: list[PatternDefinition] = [
     PatternDefinition("reveal_system_prompt", re.compile(r"reveal\s+(?:your\s+)?(?:system\s+)?prompt", re.I), "instruction_override", "high", "Attempt to reveal system prompt"),
-    PatternDefinition("show_instructions", re.compile(r"show\s+(?:me\s+)?(?:your\s+)?(?:(?:initial|original|system|hidden)\s+)?instructions?", re.I), "instruction_override", "high", "Attempt to show hidden instructions"),
+    # show_instructions: require an attack-shaped qualifier
+    # (initial/original/system/hidden). The inner group used to be optional,
+    # so bare "show instructions" -- common in FAQs, help docs, onboarding
+    # tours -- triggered the rule.
+    PatternDefinition(
+        "show_instructions",
+        re.compile(r"show\s+(?:me\s+)?(?:your\s+)?(?:initial|original|system|hidden)\s+instructions?", re.I),
+        "instruction_override",
+        "high",
+        "Attempt to show hidden instructions",
+    ),
     PatternDefinition("print_above", re.compile(r"print\s+(?:everything|all|text)\s+(?:above|before)", re.I), "instruction_override", "high", "Attempt to print content above current context"),
     PatternDefinition("repeat_back", re.compile(r"repeat\s+(?:back\s+)?(?:everything|all|your\s+instructions?)", re.I), "instruction_override", "high", "Attempt to make AI repeat instructions"),
     PatternDefinition("what_are_your_instructions", re.compile(r"what\s+(?:are|were)\s+(?:your|the)\s+(?:(?:initial|original|system)\s+)?instructions?", re.I), "instruction_override", "medium", "Question about system instructions"),
@@ -106,13 +164,57 @@ PROMPT_LEAKING_PATTERNS: list[PatternDefinition] = [
 # Indirect injection
 # ---------------------------------------------------------------------------
 INDIRECT_INJECTION_PATTERNS: list[PatternDefinition] = [
-    PatternDefinition("markdown_hidden_instruction", re.compile(r"\[.*?\]\(.*?(?:ignore|forget|system|instruction).*?\)", re.I), "structural", "high", "Markdown link with hidden injection"),
+    # markdown_hidden_instruction: require an imperative + scope qualifier in
+    # the URL. The earlier form matched "system" or "instruction" anywhere in
+    # the URL, so every doc cross-reference like
+    # ``[config](https://.../system-setup)`` triggered. Real smuggled-
+    # instruction attacks include the full "ignore (all|the|previous|prior)"
+    # phrasing in the URL/anchor.
+    PatternDefinition(
+        "markdown_hidden_instruction",
+        re.compile(
+            r"\[.*?\]\(.*?(?:ignore|disregard|forget|override)\W+(?:all|the|previous|prior)\W+.*?\)",
+            re.I,
+        ),
+        "structural",
+        "high",
+        "Markdown link with hidden injection",
+    ),
     PatternDefinition("html_comment_injection", re.compile(r"<!--\s*(?:system|ignore|instruction|prompt).*?-->", re.I), "structural", "high", "HTML comment containing injection keywords"),
     PatternDefinition("invisible_unicode", re.compile(r"[\u200b-\u200d\ufeff\u2060\u2061\u2062\u2063\u2064]"), "encoding_suspicious", "medium", "Invisible Unicode characters (zero-width, etc.)"),
     PatternDefinition("text_direction_override", re.compile(r"[\u202a-\u202e\u2066-\u2069]"), "encoding_suspicious", "medium", "Text direction override characters"),
-    PatternDefinition("confusable_homoglyphs", re.compile(r"[\u13a0-\u13f4]|[\u1d00-\u1d2b]|[\u0400-\u04ff]"), "encoding_suspicious", "medium", "Unicode homoglyph characters (Cherokee, Small Caps, Cyrillic)"),
+    # confusable_homoglyphs: Cherokee (U+13A0-U+13F4) and Phonetic Extensions
+    # (U+1D00-U+1D2B) are essentially never in real customer content, so
+    # single-char presence remains a useful signal. Cyrillic (U+0400-U+04FF)
+    # is mainstream Russian text -- flag only when *mixed* with Latin letters
+    # (the actual attack: ``аdmin`` with a Cyrillic 'a'), not when the whole
+    # word/text is Cyrillic.
+    PatternDefinition(
+        "confusable_homoglyphs",
+        re.compile(
+            r"[\u13a0-\u13f4\u1d00-\u1d2b]|[a-zA-Z][\u0400-\u04ff]|[\u0400-\u04ff][a-zA-Z]"
+        ),
+        "encoding_suspicious",
+        "medium",
+        "Unicode homoglyph characters (Cherokee, Small Caps, Cyrillic)",
+    ),
     PatternDefinition("separator_injection", re.compile(r"[-=]{10,}[^-=\n]*(?:system|instruction|ignore)", re.I), "structural", "medium", "Separator followed by injection attempt"),
-    PatternDefinition("json_injection", re.compile(r'"(?:system|role|instruction|prompt)"\s*:\s*"', re.I), "structural", "medium", "JSON-style role/instruction injection"),
+    # json_injection: target the actual attack shape -- setting a chat-message
+    # role to a privileged value (system/developer/admin), or stuffing a long
+    # string into a ``"system"`` key. The previous form matched the bare key
+    # ``"system":`` / ``"role":`` etc., which fires on every OpenAI / Anthropic
+    # SDK example, chat-log dump, and JSON schema that just *declares* the
+    # field without abusing it.
+    PatternDefinition(
+        "json_injection",
+        re.compile(
+            r'"role"\s*:\s*"(?:system|developer|admin)"|"system"\s*:\s*"[^"]{20,}',
+            re.I,
+        ),
+        "structural",
+        "medium",
+        "JSON-style role/instruction injection",
+    ),
 ]
 # ---------------------------------------------------------------------------
@@ -155,6 +257,12 @@ FAST_FILTER_KEYWORDS: list[str] = [
     "execute", "eval(", "$(", "run the",
     # Encoding/obfuscation
     "base64", "decode", "\\x", "\\u", "&#", "rot13",
+    # Raw leet-speak keywords -- kept here because the leet normaliser skips
+    # 20+ character alphanumeric tokens (treated as base64-like blobs), so
+    # long leet payloads like "1gn0r3pr3v10us1nstruct10ns" are NOT normalised
+    # to plain English and won't trip the "ignore" / "forget" / "bypass"
+    # keywords above. These literal entries ensure such payloads still
+    # trigger the fast filter and reach the leetspeak_injection regex.
     "1gn0r3", "f0rg3t", "byp4ss",
     # Prompt leaking
     "reveal", "show me your", "print everything", "print above",

stackone-defender 0.6.3__tar.gz → 0.7.0__tar.gz

stackone-defender 0.6.3__tar.gz → 0.7.0__tar.gz