sibylline-scurl 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- scurl/cli.py +8 -3
- scurl/prompt_defender/middleware.py +2 -2
- scurl/response_middleware.py +9 -5
- {sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/METADATA +5 -3
- {sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/RECORD +7 -7
- {sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/WHEEL +0 -0
- {sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/entry_points.txt +0 -0
scurl/cli.py
CHANGED
|
@@ -11,7 +11,6 @@ from .middleware import (
|
|
|
11
11
|
)
|
|
12
12
|
from .request_middleware import SecretDefender
|
|
13
13
|
from .response_middleware import ReadabilityExtractor
|
|
14
|
-
from .prompt_defender import PromptInjectionDefender
|
|
15
14
|
from .curl import parse_curl_args, execute_curl, curl_result_to_response_context
|
|
16
15
|
|
|
17
16
|
|
|
@@ -22,7 +21,7 @@ REQUEST_MIDDLEWARE = {
|
|
|
22
21
|
|
|
23
22
|
RESPONSE_MIDDLEWARE = {
|
|
24
23
|
"readability": ("ReadabilityExtractor", "Extracts clean markdown from HTML", ReadabilityExtractor),
|
|
25
|
-
"prompt-defender": ("PromptInjectionDefender", "Detects prompt injection in web content", PromptInjectionDefender),
|
|
24
|
+
"prompt-defender": ("PromptInjectionDefender", "Detects prompt injection in web content", None), # Lazy loaded
|
|
26
25
|
}
|
|
27
26
|
|
|
28
27
|
|
|
@@ -42,6 +41,7 @@ class ScurlFlags:
|
|
|
42
41
|
"""Parsed scurl-specific flags."""
|
|
43
42
|
raw: bool = False
|
|
44
43
|
render: bool = False
|
|
44
|
+
readability: bool = False
|
|
45
45
|
disable: set[str] = field(default_factory=set)
|
|
46
46
|
enable: set[str] = field(default_factory=set)
|
|
47
47
|
list_middleware: bool = False
|
|
@@ -65,6 +65,9 @@ def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
|
|
|
65
65
|
elif arg == "--render":
|
|
66
66
|
flags.render = True
|
|
67
67
|
i += 1
|
|
68
|
+
elif arg == "--readability":
|
|
69
|
+
flags.readability = True
|
|
70
|
+
i += 1
|
|
68
71
|
elif arg == "--disable":
|
|
69
72
|
if i + 1 < len(args):
|
|
70
73
|
flags.disable.add(args[i + 1])
|
|
@@ -119,6 +122,7 @@ def print_help() -> None:
|
|
|
119
122
|
print("scurl-specific options:")
|
|
120
123
|
print(" --raw Disable all response middleware (raw curl output)")
|
|
121
124
|
print(" --render Use headless browser for JS-rendered pages")
|
|
125
|
+
print(" --readability Extract article content (strips nav, ads, etc.)")
|
|
122
126
|
print(" --disable <middleware> Disable a middleware by slug (can be repeated)")
|
|
123
127
|
print(" --enable <middleware> Enable an opt-in middleware (can be repeated)")
|
|
124
128
|
print(" --list-middleware List available middleware and their slugs")
|
|
@@ -213,9 +217,10 @@ def run(args: Optional[list[str]] = None) -> int:
|
|
|
213
217
|
response_chain = ResponseMiddlewareChain()
|
|
214
218
|
if not flags.raw:
|
|
215
219
|
if "readability" not in flags.disable:
|
|
216
|
-
response_chain.add(ReadabilityExtractor())
|
|
220
|
+
response_chain.add(ReadabilityExtractor(use_readability=flags.readability))
|
|
217
221
|
# Prompt defender is opt-in (requires --enable)
|
|
218
222
|
if "prompt-defender" in flags.enable:
|
|
223
|
+
from .prompt_defender import PromptInjectionDefender
|
|
219
224
|
response_chain.add(PromptInjectionDefender(
|
|
220
225
|
threshold=flags.injection_threshold,
|
|
221
226
|
action=flags.injection_action,
|
|
scurl/prompt_defender/middleware.py
CHANGED
|
@@ -5,8 +5,6 @@ from dataclasses import dataclass
|
|
|
5
5
|
from typing import Dict, List, Set, Tuple
|
|
6
6
|
import json
|
|
7
7
|
|
|
8
|
-
import numpy as np
|
|
9
|
-
|
|
10
8
|
from ..middleware import ResponseMiddleware, ResponseContext, ResponseMiddlewareResult
|
|
11
9
|
from .normalizer import TextNormalizer
|
|
12
10
|
from .patterns import PatternExtractor, PATTERN_CATEGORIES
|
|
@@ -209,6 +207,8 @@ class PromptInjectionDefender(ResponseMiddleware):
|
|
|
209
207
|
Returns:
|
|
210
208
|
InjectionAnalysis with detection results.
|
|
211
209
|
"""
|
|
210
|
+
import numpy as np # Lazy import - optional dependency
|
|
211
|
+
|
|
212
212
|
# Normalize text to defeat obfuscation
|
|
213
213
|
normalized = self._normalizer.normalize(text)
|
|
214
214
|
|
scurl/response_middleware.py
CHANGED
|
@@ -20,11 +20,13 @@ class ReadabilityExtractor(ResponseMiddleware):
|
|
|
20
20
|
include_images: bool = True,
|
|
21
21
|
include_tables: bool = True,
|
|
22
22
|
body_width: int = 0,
|
|
23
|
+
use_readability: bool = False,
|
|
23
24
|
):
|
|
24
25
|
self._include_links = include_links
|
|
25
26
|
self._include_images = include_images
|
|
26
27
|
self._include_tables = include_tables
|
|
27
28
|
self._body_width = body_width
|
|
29
|
+
self._use_readability = use_readability
|
|
28
30
|
|
|
29
31
|
@property
|
|
30
32
|
def name(self) -> str:
|
|
@@ -105,16 +107,18 @@ class ReadabilityExtractor(ResponseMiddleware):
|
|
|
105
107
|
def process(self, context: ResponseContext) -> ResponseMiddlewareResult:
|
|
106
108
|
"""Extract markdown from HTML.
|
|
107
109
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
+
By default uses html2text directly for full page content.
|
|
111
|
+
With use_readability=True, tries readability article extraction first.
|
|
110
112
|
"""
|
|
111
113
|
html = context.body.decode("utf-8", errors="replace")
|
|
112
114
|
url = context.url or ""
|
|
113
115
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
+
result = None
|
|
117
|
+
if self._use_readability:
|
|
118
|
+
# Try readability + html2text (best for article-like content)
|
|
119
|
+
result = self._extract_with_readability(html, url)
|
|
116
120
|
|
|
117
|
-
#
|
|
121
|
+
# Use html2text direct (full page, preserves all content)
|
|
118
122
|
if not result:
|
|
119
123
|
result = self._extract_with_html2text_direct(html)
|
|
120
124
|
|
|
{sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sibylline-scurl
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A secure curl wrapper with middleware support and HTML-to-markdown extraction
|
|
5
5
|
Author: Nathan
|
|
6
6
|
License: MIT
|
|
@@ -76,7 +76,7 @@ scurl -H "Accept: application/json" https://api.example.com/data
|
|
|
76
76
|
## Features
|
|
77
77
|
|
|
78
78
|
- **SecretDefender**: Automatically detects and blocks requests containing exposed secrets/tokens
|
|
79
|
-
- **
|
|
79
|
+
- **HTML to Markdown**: Converts HTML responses to clean markdown (use `--readability` for article extraction)
|
|
80
80
|
- **Middleware System**: Composable request and response middleware
|
|
81
81
|
|
|
82
82
|
## Why scurl?
|
|
@@ -124,7 +124,9 @@ was added later...
|
|
|
124
124
|
|
|
125
125
|
| Flag | Description |
|
|
126
126
|
|------|-------------|
|
|
127
|
-
| `--raw` | Disable all response middleware |
|
|
127
|
+
| `--raw` | Disable all response middleware (raw HTML output) |
|
|
128
|
+
| `--readability` | Extract article content only (strips nav, ads, sidebars) |
|
|
129
|
+
| `--render` | Use headless browser for JS-rendered pages |
|
|
128
130
|
| `--disable <slug>` | Disable a middleware by slug (can be repeated) |
|
|
129
131
|
| `--enable <slug>` | Override a middleware's block (can be repeated) |
|
|
130
132
|
| `--list-middleware` | List available middleware and their slugs |
|
|
{sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/RECORD
CHANGED
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
scurl/__init__.py,sha256=ycKB1BvXjpJObD-zYujBLX0n1BGu9WpqkLvizv_JH8E,84
|
|
2
2
|
scurl/browser.py,sha256=ICLz2AI8b9zmNunLbk__IArvJZ1dJv5e4GKcZaXIhcI,2466
|
|
3
|
-
scurl/cli.py,sha256=
|
|
3
|
+
scurl/cli.py,sha256=rqAinrbhnOdbiHMO4a-jtMTrTqQ5ZS7t58bZST5H-oI,9027
|
|
4
4
|
scurl/curl.py,sha256=S4NfAd0VfYrTbjn0RyMwZk-C14AkBa7YOT6GfOdedz0,5648
|
|
5
5
|
scurl/middleware.py,sha256=-On84ovv9y5U5Ti5oJOtEGQyunnk0nipFwOuLnvYphw,5275
|
|
6
6
|
scurl/request_middleware.py,sha256=LLGwQJ96cj5lX0Umkwf0T61W6fEmwieBCKr0Is3EfQk,8202
|
|
7
|
-
scurl/response_middleware.py,sha256=
|
|
7
|
+
scurl/response_middleware.py,sha256=QqbUs7OZ9k_XfJSD-IzOOEA7DQI6OzYWRoGUKwY82gI,5441
|
|
8
8
|
scurl/sanitize.py,sha256=tSsLLHoSsohDFgaWOzrv9Qfzi-vzKUOmkRR-LcnMH6o,2144
|
|
9
9
|
scurl/prompt_defender/__init__.py,sha256=2cGKzoG85MaWvaQNuCcsPYEKQ_TzC0E54o9hH-NiHvA,615
|
|
10
10
|
scurl/prompt_defender/classifier.py,sha256=NEIN2oh2eiBoeGObFgG9NnE5n5qJktkttVAhgNE74t4,5993
|
|
11
11
|
scurl/prompt_defender/embedder.py,sha256=VlyEkoTZd47bIBUaLfKy-flPO3YIrYqAiFRAzcHx31Q,8769
|
|
12
|
-
scurl/prompt_defender/middleware.py,sha256=
|
|
12
|
+
scurl/prompt_defender/middleware.py,sha256=H7p0TrQ1Mc5wjTIFR938X78JcuheTnoo1RDY890MPIw,18166
|
|
13
13
|
scurl/prompt_defender/motifs.py,sha256=Rm606Lp0s0haz-uc8aURbeb3iGQExvIGJJ7KtEICC3M,11491
|
|
14
14
|
scurl/prompt_defender/normalizer.py,sha256=IKwlbWXhQbem1pzuFjqR8HUssT6v1jpal4Y8KybSkY4,5026
|
|
15
15
|
scurl/prompt_defender/patterns.py,sha256=d0fJjtkKwgK6kI-H-ucpHtStZi0yM2tBA9wiyEmFxjQ,9674
|
|
16
16
|
scurl/prompt_defender/windowing.py,sha256=HJS-w6lsPWt1_SIDo5nG1m22r4vslWZyRHG8n-QqgH8,12598
|
|
17
17
|
scurl/prompt_defender/models/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
scurl/prompt_defender/models/prompt_injection_rf.pkl,sha256=wW1NXC2xiExx2dj5D3uZivKlaZ_XrF59KxdUU4b5BQg,1321504
|
|
19
|
-
sibylline_scurl-0.2.
|
|
20
|
-
sibylline_scurl-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
21
|
-
sibylline_scurl-0.2.0.dist-info/entry_points.txt,sha256=iza-x5PFyqniQVHvoFBlKKvr32aJCulbgqGdYC8grAA,41
|
|
22
|
-
sibylline_scurl-0.2.0.dist-info/RECORD,,
|
|
19
|
+
sibylline_scurl-0.2.3.dist-info/METADATA,sha256=ITs-GFqntnKrNCwqARy8TpKIpnZPkLhs9YuxvXCzVNo,5279
|
|
20
|
+
sibylline_scurl-0.2.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
21
|
+
sibylline_scurl-0.2.3.dist-info/entry_points.txt,sha256=iza-x5PFyqniQVHvoFBlKKvr32aJCulbgqGdYC8grAA,41
|
|
22
|
+
sibylline_scurl-0.2.3.dist-info/RECORD,,
|
|
{sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/WHEEL
File without changes
|
|
{sibylline_scurl-0.2.0.dist-info → sibylline_scurl-0.2.3.dist-info}/entry_points.txt
File without changes
|