sibylline-scurl 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scurl/browser.py ADDED
@@ -0,0 +1,78 @@
1
+ """Playwright-based browser fetcher for JS-rendered pages."""
2
+
3
+ from .curl import CurlResult
4
+ from .middleware import RequestContext
5
+
6
+
7
def execute_browser(context: RequestContext, timeout: int = 60000) -> CurlResult:
    """Fetch a URL using headless Chromium via Playwright.

    Args:
        context: Request context carrying the target URL.
        timeout: Navigation timeout in milliseconds (default: 60000).

    Returns:
        CurlResult with the fully-rendered HTML as the body, compatible
        with the existing response middleware chain.  On any failure a
        CurlResult with ``return_code=-1`` and the error text in
        ``stderr`` is returned instead of raising.
    """

    def _error(message: str) -> CurlResult:
        # All failure paths share the same empty-result shape.
        return CurlResult(
            body=b"",
            headers={},
            status_code=0,
            content_type=None,
            final_url=context.url,
            return_code=-1,
            stderr=message,
        )

    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        return _error("playwright not installed. Install with: pip install sibylline-scurl[browser]")

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                response = page.goto(context.url, wait_until="networkidle", timeout=timeout)

                if response is None:
                    return _error("No response from page")

                status_code = response.status
                final_url = page.url
                html = page.content()

                # Extract response headers, normalizing names to lowercase
                # so the middleware chain can look them up consistently.
                headers = {name.lower(): value for name, value in response.headers.items()}

                # Ensure content-type is text/html so readability middleware processes it
                content_type = headers.get("content-type", "text/html")
            finally:
                # Close the browser even if navigation/rendering raised,
                # so a failed fetch does not leak a Chromium process.
                browser.close()

        return CurlResult(
            body=html.encode("utf-8"),
            headers=headers,
            status_code=status_code,
            content_type=content_type,
            final_url=final_url,
            return_code=0,
            stderr="",
        )

    except Exception as e:
        return _error(f"Browser rendering failed: {e}")
scurl/cli.py CHANGED
@@ -10,7 +10,8 @@ from .middleware import (
10
10
  RequestAction,
11
11
  )
12
12
  from .request_middleware import SecretDefender
13
- from .response_middleware import TrafilaturaExtractor
13
+ from .response_middleware import ReadabilityExtractor
14
+ from .prompt_defender import PromptInjectionDefender
14
15
  from .curl import parse_curl_args, execute_curl, curl_result_to_response_context
15
16
 
16
17
 
@@ -20,7 +21,8 @@ REQUEST_MIDDLEWARE = {
20
21
  }
21
22
 
22
23
  RESPONSE_MIDDLEWARE = {
23
- "trafilatura": ("TrafilaturaExtractor", "Extracts clean markdown from HTML", TrafilaturaExtractor),
24
+ "readability": ("ReadabilityExtractor", "Extracts clean markdown from HTML", ReadabilityExtractor),
25
+ "prompt-defender": ("PromptInjectionDefender", "Detects prompt injection in web content", PromptInjectionDefender),
24
26
  }
25
27
 
26
28
 
@@ -39,10 +41,14 @@ def print_middleware_list() -> None:
39
41
  class ScurlFlags:
40
42
  """Parsed scurl-specific flags."""
41
43
  raw: bool = False
44
+ render: bool = False
42
45
  disable: set[str] = field(default_factory=set)
43
46
  enable: set[str] = field(default_factory=set)
44
47
  list_middleware: bool = False
45
48
  help: bool = False
49
+ # Prompt injection defender options
50
+ injection_threshold: float = 0.3
51
+ injection_action: str = "redact" # "warn", "redact", "datamark", "metadata", "silent"
46
52
 
47
53
 
48
54
  def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
@@ -56,6 +62,9 @@ def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
56
62
  if arg == "--raw":
57
63
  flags.raw = True
58
64
  i += 1
65
+ elif arg == "--render":
66
+ flags.render = True
67
+ i += 1
59
68
  elif arg == "--disable":
60
69
  if i + 1 < len(args):
61
70
  flags.disable.add(args[i + 1])
@@ -70,6 +79,23 @@ def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
70
79
  else:
71
80
  remaining.append(arg)
72
81
  i += 1
82
+ elif arg == "--injection-threshold":
83
+ if i + 1 < len(args):
84
+ try:
85
+ flags.injection_threshold = float(args[i + 1])
86
+ except ValueError:
87
+ pass # Keep default
88
+ i += 2
89
+ else:
90
+ i += 1
91
+ elif arg == "--injection-action":
92
+ if i + 1 < len(args):
93
+ action = args[i + 1].lower()
94
+ if action in ("warn", "redact", "datamark", "metadata", "silent"):
95
+ flags.injection_action = action
96
+ i += 2
97
+ else:
98
+ i += 1
73
99
  elif arg == "--list-middleware":
74
100
  flags.list_middleware = True
75
101
  i += 1
@@ -92,19 +118,31 @@ def print_help() -> None:
92
118
  print()
93
119
  print("scurl-specific options:")
94
120
  print(" --raw Disable all response middleware (raw curl output)")
121
+ print(" --render Use headless browser for JS-rendered pages")
95
122
  print(" --disable <middleware> Disable a middleware by slug (can be repeated)")
96
- print(" --enable <middleware> Override a middleware's block (can be repeated)")
123
+ print(" --enable <middleware> Enable an opt-in middleware (can be repeated)")
97
124
  print(" --list-middleware List available middleware and their slugs")
98
125
  print(" --help, -h Show this help (use curl --help for curl options)")
99
126
  print()
127
+ print("Prompt injection detection (requires --enable prompt-defender):")
128
+ print(" --injection-threshold <0.0-1.0> Detection threshold (default: 0.3)")
129
+ print(" --injection-action <action> Action on detection (default: redact):")
130
+ print(" warn - wrap in <suspected-prompt-injection> tag, content unchanged")
131
+ print(" redact - wrap in <suspected-prompt-injection> tag, mask patterns with █")
132
+ print(" datamark - wrap in <suspected-prompt-injection> tag, spotlighting mode")
133
+ print(" metadata - return JSON analysis")
134
+ print(" silent - pass through unchanged")
135
+ print()
100
136
  print("All other options are passed directly to curl.")
101
137
  print()
102
138
  print("Examples:")
103
139
  print(" scurl https://example.com # Fetch and extract markdown")
104
140
  print(" scurl --raw https://example.com # Raw HTML output")
105
- print(" scurl --disable trafilatura https://example.com # Disable markdown extraction")
106
- print(" scurl --disable secret-defender https://... # Disable secret scanning")
107
- print(" scurl --enable secret-defender https://... # Override a secret block")
141
+ print(" scurl --disable trafilatura https://... # Disable markdown extraction")
142
+ print(" scurl --disable secret-defender https://... # Disable secret scanning")
143
+ print(" scurl --enable prompt-defender https://... # Enable injection detection")
144
+ print(" scurl --render https://github.com/user/repo # Render JS-heavy pages")
145
+ print(" scurl --enable prompt-defender --injection-threshold 0.5 https://...")
108
146
  print(" scurl -H 'Accept: application/json' https://api.example.com/data")
109
147
 
110
148
 
@@ -154,8 +192,12 @@ def run(args: Optional[list[str]] = None) -> int:
154
192
  if result.context:
155
193
  context = result.context
156
194
 
157
- # Execute curl
158
- curl_result = execute_curl(context)
195
+ # Execute fetch (curl or browser)
196
+ if flags.render:
197
+ from .browser import execute_browser
198
+ curl_result = execute_browser(context)
199
+ else:
200
+ curl_result = execute_curl(context)
159
201
 
160
202
  if curl_result.return_code != 0 and curl_result.return_code != -1:
161
203
  # curl failed but not our timeout/not-found
@@ -170,8 +212,14 @@ def run(args: Optional[list[str]] = None) -> int:
170
212
  # Build response middleware chain
171
213
  response_chain = ResponseMiddlewareChain()
172
214
  if not flags.raw:
173
- if "trafilatura" not in flags.disable:
174
- response_chain.add(TrafilaturaExtractor())
215
+ if "readability" not in flags.disable:
216
+ response_chain.add(ReadabilityExtractor())
217
+ # Prompt defender is opt-in (requires --enable)
218
+ if "prompt-defender" in flags.enable:
219
+ response_chain.add(PromptInjectionDefender(
220
+ threshold=flags.injection_threshold,
221
+ action=flags.injection_action,
222
+ ))
175
223
 
176
224
  # Execute response middleware
177
225
  response_context = curl_result_to_response_context(curl_result)
scurl/middleware.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """Base middleware classes for request and response processing."""
2
2
 
3
3
  from abc import ABC, abstractmethod
4
- from dataclasses import dataclass, field
4
+ from dataclasses import dataclass
5
5
  from enum import Enum
6
6
  from typing import Optional
7
7
 
@@ -0,0 +1,20 @@
1
+ """Prompt injection detection middleware for scurl."""
2
+
3
+ from .middleware import PromptInjectionDefender
4
+ from .normalizer import TextNormalizer
5
+ from .patterns import PatternExtractor, PatternFeatures
6
+ from .motifs import MotifMatcher, MotifFeatureExtractor, MotifSignal, HAS_RAPIDFUZZ
7
+ from .windowing import SlidingWindowAnalyzer, AdaptiveWindowAnalyzer
8
+
9
+ __all__ = [
10
+ "PromptInjectionDefender",
11
+ "TextNormalizer",
12
+ "PatternExtractor",
13
+ "PatternFeatures",
14
+ "MotifMatcher",
15
+ "MotifFeatureExtractor",
16
+ "MotifSignal",
17
+ "SlidingWindowAnalyzer",
18
+ "AdaptiveWindowAnalyzer",
19
+ "HAS_RAPIDFUZZ",
20
+ ]
@@ -0,0 +1,178 @@
1
+ """Classifier for prompt injection detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pickle
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Optional
8
+
9
+ import numpy as np
10
+
11
+ if TYPE_CHECKING:
12
+ from .patterns import PatternFeatures
13
+
14
+
15
class InjectionClassifier:
    """Random Forest classifier for prompt injection detection.

    Expects feature vectors combining pattern features and embeddings.
    Lazy-loads model on first use.
    """

    # Pickled model filename searched for in package data and the cache dir.
    MODEL_FILENAME = "prompt_injection_rf.pkl"

    def __init__(self, model_path: Optional[Path] = None):
        """Initialize classifier.

        Args:
            model_path: Path to pickled model file. If not provided,
                looks in package data and default cache locations.
        """
        self._model_path = model_path
        # Loaded lazily by _ensure_loaded(); stays None until first prediction.
        self._model = None

    def _ensure_loaded(self) -> None:
        """Lazy-load model on first use.

        Search order: explicit ``model_path``, then package data
        (``scurl.prompt_defender.models``), then the embedder's default
        cache directory.

        Raises:
            RuntimeError: If no model file is found in any location.

        NOTE(review): the model is deserialized with ``pickle``, which can
        execute arbitrary code on load — model files must come from a
        trusted source.
        """
        if self._model is not None:
            return

        # Try explicit path first
        if self._model_path and self._model_path.exists():
            with open(self._model_path, 'rb') as f:
                self._model = pickle.load(f)
            return

        # Try package data
        try:
            import importlib.resources as resources
            try:
                # Python 3.9+
                files = resources.files('scurl.prompt_defender.models')
                model_file = files.joinpath(self.MODEL_FILENAME)
                if model_file.is_file():
                    with model_file.open('rb') as f:
                        self._model = pickle.load(f)
                    return
            except (AttributeError, TypeError):
                # Python 3.8 fallback
                with resources.open_binary(
                    'scurl.prompt_defender.models',
                    self.MODEL_FILENAME
                ) as f:
                    self._model = pickle.load(f)
                return
        except (FileNotFoundError, ModuleNotFoundError):
            # Package data directory absent — fall through to the cache dir.
            pass

        # Try cache directory
        from .embedder import EmbeddingGemmaONNX
        cache_path = EmbeddingGemmaONNX._default_model_dir() / self.MODEL_FILENAME
        if cache_path.exists():
            with open(cache_path, 'rb') as f:
                self._model = pickle.load(f)
            return

        raise RuntimeError(
            f"Classifier model not found. Expected at:\n"
            f" - {self._model_path or 'Not specified'}\n"
            f" - Package data: scurl.prompt_defender.models/{self.MODEL_FILENAME}\n"
            f" - Cache: {cache_path}\n\n"
            f"Please run the training script or download a pre-trained model."
        )

    def predict_proba(self, features: np.ndarray) -> float:
        """Predict probability of prompt injection.

        Args:
            features: Feature vector of shape (n_features,) or (1, n_features).

        Returns:
            Probability of injection (0.0 to 1.0).

        Raises:
            RuntimeError: If no model file can be located (see _ensure_loaded).
        """
        self._ensure_loaded()

        # Ensure 2D array
        if features.ndim == 1:
            features = features.reshape(1, -1)

        # Get probability of positive class (injection = 1)
        proba = self._model.predict_proba(features)[0, 1]
        return float(proba)

    def predict(self, features: np.ndarray, threshold: float = 0.5) -> bool:
        """Predict whether input is prompt injection.

        Args:
            features: Feature vector.
            threshold: Classification threshold.

        Returns:
            True if predicted as injection.
        """
        return self.predict_proba(features) >= threshold

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._model is not None

    @property
    def n_features(self) -> Optional[int]:
        """Return expected number of features, or None if not loaded."""
        if self._model is None:
            return None
        # scikit-learn estimators expose the fitted feature count here.
        return self._model.n_features_in_
125
+
126
+
127
class PatternOnlyClassifier:
    """Simple threshold-based classifier using only pattern features.

    Useful as a fallback when the full model isn't available,
    or for fast-path detection of obvious injections.
    """

    # Weights for each pattern category
    WEIGHTS = {
        'instruction_override': 3.0,
        'role_injection': 2.5,
        'system_manipulation': 3.0,
        'prompt_leak': 2.0,
        'jailbreak_keywords': 2.5,
        'encoding_markers': 1.0,
        'suspicious_delimiters': 1.5,
    }

    def __init__(self, threshold: float = 0.3):
        """Initialize classifier.

        Args:
            threshold: Score threshold for detection.
        """
        self.threshold = threshold

    def predict_proba(self, pattern_features: 'PatternFeatures') -> float:
        """Calculate weighted score from pattern features.

        Args:
            pattern_features: PatternFeatures dataclass instance.

        Returns:
            Weighted score (higher = more likely injection).
        """
        # Each WEIGHTS key doubles as the attribute name on the features
        # object, so the weighted sum is a single pass over the table.
        raw_score = sum(
            getattr(pattern_features, category) * weight
            for category, weight in self.WEIGHTS.items()
        )

        # Normalize to roughly [0, 1] range
        ceiling = sum(self.WEIGHTS.values())
        return min(raw_score / ceiling, 1.0)

    def predict(self, pattern_features: 'PatternFeatures') -> bool:
        """Predict whether input is prompt injection."""
        return self.predict_proba(pattern_features) >= self.threshold