PyPI - sibylline-scurl - Versions diffs - 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

sibylline-scurl 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

scurl/browser.py +78 -0
scurl/cli.py +58 -10
scurl/middleware.py +1 -1
scurl/prompt_defender/__init__.py +20 -0
scurl/prompt_defender/classifier.py +178 -0
scurl/prompt_defender/embedder.py +293 -0
scurl/prompt_defender/middleware.py +461 -0
scurl/prompt_defender/models/.gitkeep +0 -0
scurl/prompt_defender/models/prompt_injection_rf.pkl +0 -0
scurl/prompt_defender/motifs.py +362 -0
scurl/prompt_defender/normalizer.py +147 -0
scurl/prompt_defender/patterns.py +227 -0
scurl/prompt_defender/windowing.py +397 -0
scurl/response_middleware.py +92 -25
scurl/sanitize.py +69 -0
sibylline_scurl-0.2.0.dist-info/METADATA +143 -0
sibylline_scurl-0.2.0.dist-info/RECORD +22 -0
sibylline_scurl-0.1.1.dist-info/METADATA +0 -81
sibylline_scurl-0.1.1.dist-info/RECORD +0 -10
{sibylline_scurl-0.1.1.dist-info → sibylline_scurl-0.2.0.dist-info}/WHEEL +0 -0
{sibylline_scurl-0.1.1.dist-info → sibylline_scurl-0.2.0.dist-info}/entry_points.txt +0 -0

scurl/browser.py ADDED Viewed

@@ -0,0 +1,78 @@
+"""Playwright-based browser fetcher for JS-rendered pages."""
+from .curl import CurlResult
+from .middleware import RequestContext
+def execute_browser(context: RequestContext, timeout: int = 60000) -> CurlResult:
+    """Fetch a URL using headless Chromium via Playwright.
+    Returns a CurlResult with the fully-rendered HTML as the body,
+    compatible with the existing response middleware chain.
+    """
+    try:
+        from playwright.sync_api import sync_playwright
+    except ImportError:
+        return CurlResult(
+            body=b"",
+            headers={},
+            status_code=0,
+            content_type=None,
+            final_url=context.url,
+            return_code=-1,
+            stderr="playwright not installed. Install with: pip install sibylline-scurl[browser]",
+        )
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            page = browser.new_page()
+            response = page.goto(context.url, wait_until="networkidle", timeout=timeout)
+            if response is None:
+                browser.close()
+                return CurlResult(
+                    body=b"",
+                    headers={},
+                    status_code=0,
+                    content_type=None,
+                    final_url=context.url,
+                    return_code=-1,
+                    stderr="No response from page",
+                )
+            status_code = response.status
+            final_url = page.url
+            html = page.content()
+            # Extract response headers
+            headers = {}
+            for name, value in response.headers.items():
+                headers[name.lower()] = value
+            # Ensure content-type is text/html so readability middleware processes it
+            content_type = headers.get("content-type", "text/html")
+            browser.close()
+            return CurlResult(
+                body=html.encode("utf-8"),
+                headers=headers,
+                status_code=status_code,
+                content_type=content_type,
+                final_url=final_url,
+                return_code=0,
+                stderr="",
+            )
+    except Exception as e:
+        return CurlResult(
+            body=b"",
+            headers={},
+            status_code=0,
+            content_type=None,
+            final_url=context.url,
+            return_code=-1,
+            stderr=f"Browser rendering failed: {e}",
+        )

scurl/cli.py CHANGED Viewed

@@ -10,7 +10,8 @@ from .middleware import (
     RequestAction,
 )
 from .request_middleware import SecretDefender
-from .response_middleware import TrafilaturaExtractor
+from .response_middleware import ReadabilityExtractor
+from .prompt_defender import PromptInjectionDefender
 from .curl import parse_curl_args, execute_curl, curl_result_to_response_context
@@ -20,7 +21,8 @@ REQUEST_MIDDLEWARE = {
 }
 RESPONSE_MIDDLEWARE = {
-    "trafilatura": ("TrafilaturaExtractor", "Extracts clean markdown from HTML", TrafilaturaExtractor),
+    "readability": ("ReadabilityExtractor", "Extracts clean markdown from HTML", ReadabilityExtractor),
+    "prompt-defender": ("PromptInjectionDefender", "Detects prompt injection in web content", PromptInjectionDefender),
 }
@@ -39,10 +41,14 @@ def print_middleware_list() -> None:
 class ScurlFlags:
     """Parsed scurl-specific flags."""
     raw: bool = False
+    render: bool = False
     disable: set[str] = field(default_factory=set)
     enable: set[str] = field(default_factory=set)
     list_middleware: bool = False
     help: bool = False
+    # Prompt injection defender options
+    injection_threshold: float = 0.3
+    injection_action: str = "redact"  # "warn", "redact", "datamark", "metadata", "silent"
 def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
@@ -56,6 +62,9 @@ def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
         if arg == "--raw":
             flags.raw = True
             i += 1
+        elif arg == "--render":
+            flags.render = True
+            i += 1
         elif arg == "--disable":
             if i + 1 < len(args):
                 flags.disable.add(args[i + 1])
@@ -70,6 +79,23 @@ def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
             else:
                 remaining.append(arg)
                 i += 1
+        elif arg == "--injection-threshold":
+            if i + 1 < len(args):
+                try:
+                    flags.injection_threshold = float(args[i + 1])
+                except ValueError:
+                    pass  # Keep default
+                i += 2
+            else:
+                i += 1
+        elif arg == "--injection-action":
+            if i + 1 < len(args):
+                action = args[i + 1].lower()
+                if action in ("warn", "redact", "datamark", "metadata", "silent"):
+                    flags.injection_action = action
+                i += 2
+            else:
+                i += 1
         elif arg == "--list-middleware":
             flags.list_middleware = True
             i += 1
@@ -92,19 +118,31 @@ def print_help() -> None:
     print()
     print("scurl-specific options:")
     print("  --raw                  Disable all response middleware (raw curl output)")
+    print("  --render               Use headless browser for JS-rendered pages")
     print("  --disable <middleware>  Disable a middleware by slug (can be repeated)")
-    print("  --enable <middleware>  Override a middleware's block (can be repeated)")
+    print("  --enable <middleware>  Enable an opt-in middleware (can be repeated)")
     print("  --list-middleware      List available middleware and their slugs")
     print("  --help, -h             Show this help (use curl --help for curl options)")
     print()
+    print("Prompt injection detection (requires --enable prompt-defender):")
+    print("  --injection-threshold <0.0-1.0>  Detection threshold (default: 0.5)")
+    print("  --injection-action <action>      Action on detection (default: redact):")
+    print("                                     warn     - wrap in <suspected-prompt-injection> tag, content unchanged")
+    print("                                     redact   - wrap in <suspected-prompt-injection> tag, mask patterns with █")
+    print("                                     datamark - wrap in <suspected-prompt-injection> tag, spotlighting mode")
+    print("                                     metadata - return JSON analysis")
+    print("                                     silent   - pass through unchanged")
+    print()
     print("All other options are passed directly to curl.")
     print()
     print("Examples:")
     print("  scurl https://example.com                    # Fetch and extract markdown")
     print("  scurl --raw https://example.com              # Raw HTML output")
-    print("  scurl --disable trafilatura https://example.com # Disable markdown extraction")
-    print("  scurl --disable secret-defender https://...     # Disable secret scanning")
-    print("  scurl --enable secret-defender https://...   # Override a secret block")
+    print("  scurl --disable trafilatura https://...      # Disable markdown extraction")
+    print("  scurl --disable secret-defender https://...  # Disable secret scanning")
+    print("  scurl --enable prompt-defender https://...   # Enable injection detection")
+    print("  scurl --render https://github.com/user/repo    # Render JS-heavy pages")
+    print("  scurl --enable prompt-defender --injection-threshold 0.5 https://...")
     print("  scurl -H 'Accept: application/json' https://api.example.com/data")
@@ -154,8 +192,12 @@ def run(args: Optional[list[str]] = None) -> int:
     if result.context:
         context = result.context
-    # Execute curl
-    curl_result = execute_curl(context)
+    # Execute fetch (curl or browser)
+    if flags.render:
+        from .browser import execute_browser
+        curl_result = execute_browser(context)
+    else:
+        curl_result = execute_curl(context)
     if curl_result.return_code != 0 and curl_result.return_code != -1:
         # curl failed but not our timeout/not-found
@@ -170,8 +212,14 @@ def run(args: Optional[list[str]] = None) -> int:
     # Build response middleware chain
     response_chain = ResponseMiddlewareChain()
     if not flags.raw:
-        if "trafilatura" not in flags.disable:
-            response_chain.add(TrafilaturaExtractor())
+        if "readability" not in flags.disable:
+            response_chain.add(ReadabilityExtractor())
+        # Prompt defender is opt-in (requires --enable)
+        if "prompt-defender" in flags.enable:
+            response_chain.add(PromptInjectionDefender(
+                threshold=flags.injection_threshold,
+                action=flags.injection_action,
+            ))
     # Execute response middleware
     response_context = curl_result_to_response_context(curl_result)

scurl/middleware.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Base middleware classes for request and response processing."""
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from enum import Enum
 from typing import Optional

scurl/prompt_defender/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Prompt injection detection middleware for scurl."""
+from .middleware import PromptInjectionDefender
+from .normalizer import TextNormalizer
+from .patterns import PatternExtractor, PatternFeatures
+from .motifs import MotifMatcher, MotifFeatureExtractor, MotifSignal, HAS_RAPIDFUZZ
+from .windowing import SlidingWindowAnalyzer, AdaptiveWindowAnalyzer
+__all__ = [
+    "PromptInjectionDefender",
+    "TextNormalizer",
+    "PatternExtractor",
+    "PatternFeatures",
+    "MotifMatcher",
+    "MotifFeatureExtractor",
+    "MotifSignal",
+    "SlidingWindowAnalyzer",
+    "AdaptiveWindowAnalyzer",
+    "HAS_RAPIDFUZZ",
+]

scurl/prompt_defender/classifier.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""Classifier for prompt injection detection."""
+from __future__ import annotations
+import pickle
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+import numpy as np
+if TYPE_CHECKING:
+    from .patterns import PatternFeatures
+class InjectionClassifier:
+    """Random Forest classifier for prompt injection detection.
+    Expects feature vectors combining pattern features and embeddings.
+    Lazy-loads model on first use.
+    """
+    MODEL_FILENAME = "prompt_injection_rf.pkl"
+    def __init__(self, model_path: Optional[Path] = None):
+        """Initialize classifier.
+        Args:
+            model_path: Path to pickled model file. If not provided,
+                       looks in package data and default cache locations.
+        """
+        self._model_path = model_path
+        self._model = None
+    def _ensure_loaded(self) -> None:
+        """Lazy-load model on first use."""
+        if self._model is not None:
+            return
+        # Try explicit path first
+        if self._model_path and self._model_path.exists():
+            with open(self._model_path, 'rb') as f:
+                self._model = pickle.load(f)
+            return
+        # Try package data
+        try:
+            import importlib.resources as resources
+            try:
+                # Python 3.9+
+                files = resources.files('scurl.prompt_defender.models')
+                model_file = files.joinpath(self.MODEL_FILENAME)
+                if model_file.is_file():
+                    with model_file.open('rb') as f:
+                        self._model = pickle.load(f)
+                    return
+            except (AttributeError, TypeError):
+                # Python 3.8 fallback
+                with resources.open_binary(
+                    'scurl.prompt_defender.models',
+                    self.MODEL_FILENAME
+                ) as f:
+                    self._model = pickle.load(f)
+                return
+        except (FileNotFoundError, ModuleNotFoundError):
+            pass
+        # Try cache directory
+        from .embedder import EmbeddingGemmaONNX
+        cache_path = EmbeddingGemmaONNX._default_model_dir() / self.MODEL_FILENAME
+        if cache_path.exists():
+            with open(cache_path, 'rb') as f:
+                self._model = pickle.load(f)
+            return
+        raise RuntimeError(
+            f"Classifier model not found. Expected at:\n"
+            f"  - {self._model_path or 'Not specified'}\n"
+            f"  - Package data: scurl.prompt_defender.models/{self.MODEL_FILENAME}\n"
+            f"  - Cache: {cache_path}\n\n"
+            f"Please run the training script or download a pre-trained model."
+        )
+    def predict_proba(self, features: np.ndarray) -> float:
+        """Predict probability of prompt injection.
+        Args:
+            features: Feature vector of shape (n_features,) or (1, n_features).
+        Returns:
+            Probability of injection (0.0 to 1.0).
+        """
+        self._ensure_loaded()
+        # Ensure 2D array
+        if features.ndim == 1:
+            features = features.reshape(1, -1)
+        # Get probability of positive class (injection = 1)
+        proba = self._model.predict_proba(features)[0, 1]
+        return float(proba)
+    def predict(self, features: np.ndarray, threshold: float = 0.5) -> bool:
+        """Predict whether input is prompt injection.
+        Args:
+            features: Feature vector.
+            threshold: Classification threshold.
+        Returns:
+            True if predicted as injection.
+        """
+        return self.predict_proba(features) >= threshold
+    @property
+    def is_loaded(self) -> bool:
+        """Check if model is loaded."""
+        return self._model is not None
+    @property
+    def n_features(self) -> Optional[int]:
+        """Return expected number of features, or None if not loaded."""
+        if self._model is None:
+            return None
+        return self._model.n_features_in_
+class PatternOnlyClassifier:
+    """Simple threshold-based classifier using only pattern features.
+    Useful as a fallback when the full model isn't available,
+    or for fast-path detection of obvious injections.
+    """
+    # Weights for each pattern category
+    WEIGHTS = {
+        'instruction_override': 3.0,
+        'role_injection': 2.5,
+        'system_manipulation': 3.0,
+        'prompt_leak': 2.0,
+        'jailbreak_keywords': 2.5,
+        'encoding_markers': 1.0,
+        'suspicious_delimiters': 1.5,
+    }
+    def __init__(self, threshold: float = 0.3):
+        """Initialize classifier.
+        Args:
+            threshold: Score threshold for detection.
+        """
+        self.threshold = threshold
+    def predict_proba(self, pattern_features: 'PatternFeatures') -> float:
+        """Calculate weighted score from pattern features.
+        Args:
+            pattern_features: PatternFeatures dataclass instance.
+        Returns:
+            Weighted score (higher = more likely injection).
+        """
+        score = 0.0
+        score += pattern_features.instruction_override * self.WEIGHTS['instruction_override']
+        score += pattern_features.role_injection * self.WEIGHTS['role_injection']
+        score += pattern_features.system_manipulation * self.WEIGHTS['system_manipulation']
+        score += pattern_features.prompt_leak * self.WEIGHTS['prompt_leak']
+        score += pattern_features.jailbreak_keywords * self.WEIGHTS['jailbreak_keywords']
+        score += pattern_features.encoding_markers * self.WEIGHTS['encoding_markers']
+        score += pattern_features.suspicious_delimiters * self.WEIGHTS['suspicious_delimiters']
+        # Normalize to roughly [0, 1] range
+        max_possible = sum(self.WEIGHTS.values())
+        return min(score / max_possible, 1.0)
+    def predict(self, pattern_features: 'PatternFeatures') -> bool:
+        """Predict whether input is prompt injection."""
+        return self.predict_proba(pattern_features) >= self.threshold

sibylline-scurl 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl