sibylline-scurl: sibylline_scurl-0.2.0-py3-none-any.whl → sibylline_scurl-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scurl/cli.py CHANGED
@@ -11,7 +11,6 @@ from .middleware import (
11
11
  )
12
12
  from .request_middleware import SecretDefender
13
13
  from .response_middleware import ReadabilityExtractor
14
- from .prompt_defender import PromptInjectionDefender
15
14
  from .curl import parse_curl_args, execute_curl, curl_result_to_response_context
16
15
 
17
16
 
@@ -22,7 +21,7 @@ REQUEST_MIDDLEWARE = {
22
21
 
23
22
  RESPONSE_MIDDLEWARE = {
24
23
  "readability": ("ReadabilityExtractor", "Extracts clean markdown from HTML", ReadabilityExtractor),
25
- "prompt-defender": ("PromptInjectionDefender", "Detects prompt injection in web content", PromptInjectionDefender),
24
+ "prompt-defender": ("PromptInjectionDefender", "Detects prompt injection in web content", None), # Lazy loaded
26
25
  }
27
26
 
28
27
 
@@ -42,6 +41,7 @@ class ScurlFlags:
42
41
  """Parsed scurl-specific flags."""
43
42
  raw: bool = False
44
43
  render: bool = False
44
+ readability: bool = False
45
45
  disable: set[str] = field(default_factory=set)
46
46
  enable: set[str] = field(default_factory=set)
47
47
  list_middleware: bool = False
@@ -65,6 +65,9 @@ def extract_scurl_flags(args: list[str]) -> tuple[ScurlFlags, list[str]]:
65
65
  elif arg == "--render":
66
66
  flags.render = True
67
67
  i += 1
68
+ elif arg == "--readability":
69
+ flags.readability = True
70
+ i += 1
68
71
  elif arg == "--disable":
69
72
  if i + 1 < len(args):
70
73
  flags.disable.add(args[i + 1])
@@ -119,6 +122,7 @@ def print_help() -> None:
119
122
  print("scurl-specific options:")
120
123
  print(" --raw Disable all response middleware (raw curl output)")
121
124
  print(" --render Use headless browser for JS-rendered pages")
125
+ print(" --readability Extract article content (strips nav, ads, etc.)")
122
126
  print(" --disable <middleware> Disable a middleware by slug (can be repeated)")
123
127
  print(" --enable <middleware> Enable an opt-in middleware (can be repeated)")
124
128
  print(" --list-middleware List available middleware and their slugs")
@@ -213,9 +217,10 @@ def run(args: Optional[list[str]] = None) -> int:
213
217
  response_chain = ResponseMiddlewareChain()
214
218
  if not flags.raw:
215
219
  if "readability" not in flags.disable:
216
- response_chain.add(ReadabilityExtractor())
220
+ response_chain.add(ReadabilityExtractor(use_readability=flags.readability))
217
221
  # Prompt defender is opt-in (requires --enable)
218
222
  if "prompt-defender" in flags.enable:
223
+ from .prompt_defender import PromptInjectionDefender
219
224
  response_chain.add(PromptInjectionDefender(
220
225
  threshold=flags.injection_threshold,
221
226
  action=flags.injection_action,
scurl/prompt_defender/middleware.py CHANGED
@@ -5,8 +5,6 @@ from dataclasses import dataclass
5
5
  from typing import Dict, List, Set, Tuple
6
6
  import json
7
7
 
8
- import numpy as np
9
-
10
8
  from ..middleware import ResponseMiddleware, ResponseContext, ResponseMiddlewareResult
11
9
  from .normalizer import TextNormalizer
12
10
  from .patterns import PatternExtractor, PATTERN_CATEGORIES
@@ -209,6 +207,8 @@ class PromptInjectionDefender(ResponseMiddleware):
209
207
  Returns:
210
208
  InjectionAnalysis with detection results.
211
209
  """
210
+ import numpy as np # Lazy import - optional dependency
211
+
212
212
  # Normalize text to defeat obfuscation
213
213
  normalized = self._normalizer.normalize(text)
214
214
 
scurl/response_middleware.py CHANGED
@@ -20,11 +20,13 @@ class ReadabilityExtractor(ResponseMiddleware):
20
20
  include_images: bool = True,
21
21
  include_tables: bool = True,
22
22
  body_width: int = 0,
23
+ use_readability: bool = False,
23
24
  ):
24
25
  self._include_links = include_links
25
26
  self._include_images = include_images
26
27
  self._include_tables = include_tables
27
28
  self._body_width = body_width
29
+ self._use_readability = use_readability
28
30
 
29
31
  @property
30
32
  def name(self) -> str:
@@ -105,16 +107,18 @@ class ReadabilityExtractor(ResponseMiddleware):
105
107
  def process(self, context: ResponseContext) -> ResponseMiddlewareResult:
106
108
  """Extract markdown from HTML.
107
109
 
108
- Tries readability + html2text first, falls back to html2text direct
109
- for content readability can't handle.
110
+ By default uses html2text directly for full page content.
111
+ With use_readability=True, tries readability article extraction first.
110
112
  """
111
113
  html = context.body.decode("utf-8", errors="replace")
112
114
  url = context.url or ""
113
115
 
114
- # Try readability + html2text first (best for article-like content)
115
- result = self._extract_with_readability(html, url)
116
+ result = None
117
+ if self._use_readability:
118
+ # Try readability + html2text (best for article-like content)
119
+ result = self._extract_with_readability(html, url)
116
120
 
117
- # Fall back to html2text direct (preserves links, may include boilerplate)
121
+ # Use html2text direct (full page, preserves all content)
118
122
  if not result:
119
123
  result = self._extract_with_html2text_direct(html)
120
124
 
sibylline_scurl-0.2.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sibylline-scurl
3
- Version: 0.2.0
3
+ Version: 0.2.3
4
4
  Summary: A secure curl wrapper with middleware support and HTML-to-markdown extraction
5
5
  Author: Nathan
6
6
  License: MIT
@@ -76,7 +76,7 @@ scurl -H "Accept: application/json" https://api.example.com/data
76
76
  ## Features
77
77
 
78
78
  - **SecretDefender**: Automatically detects and blocks requests containing exposed secrets/tokens
79
- - **ReadabilityExtractor**: Extracts clean markdown from HTML responses using readability + html2text
79
+ - **HTML to Markdown**: Converts HTML responses to clean markdown (use `--readability` for article extraction)
80
80
  - **Middleware System**: Composable request and response middleware
81
81
 
82
82
  ## Why scurl?
@@ -124,7 +124,9 @@ was added later...
124
124
 
125
125
  | Flag | Description |
126
126
  |------|-------------|
127
- | `--raw` | Disable all response middleware |
127
+ | `--raw` | Disable all response middleware (raw HTML output) |
128
+ | `--readability` | Extract article content only (strips nav, ads, sidebars) |
129
+ | `--render` | Use headless browser for JS-rendered pages |
128
130
  | `--disable <slug>` | Disable a middleware by slug (can be repeated) |
129
131
  | `--enable <slug>` | Override a middleware's block (can be repeated) |
130
132
  | `--list-middleware` | List available middleware and their slugs |
sibylline_scurl-0.2.3.dist-info/RECORD CHANGED
@@ -1,22 +1,22 @@
1
1
  scurl/__init__.py,sha256=ycKB1BvXjpJObD-zYujBLX0n1BGu9WpqkLvizv_JH8E,84
2
2
  scurl/browser.py,sha256=ICLz2AI8b9zmNunLbk__IArvJZ1dJv5e4GKcZaXIhcI,2466
3
- scurl/cli.py,sha256=mfi_YzCNlF3h7ipx9Unj2Y0cljmt1r584s3cI7tNSsc,8777
3
+ scurl/cli.py,sha256=rqAinrbhnOdbiHMO4a-jtMTrTqQ5ZS7t58bZST5H-oI,9027
4
4
  scurl/curl.py,sha256=S4NfAd0VfYrTbjn0RyMwZk-C14AkBa7YOT6GfOdedz0,5648
5
5
  scurl/middleware.py,sha256=-On84ovv9y5U5Ti5oJOtEGQyunnk0nipFwOuLnvYphw,5275
6
6
  scurl/request_middleware.py,sha256=LLGwQJ96cj5lX0Umkwf0T61W6fEmwieBCKr0Is3EfQk,8202
7
- scurl/response_middleware.py,sha256=FeS9XaM191QyJRrXuGL9-D-g_tVzCQ9M8mCuvb4NjYg,5290
7
+ scurl/response_middleware.py,sha256=QqbUs7OZ9k_XfJSD-IzOOEA7DQI6OzYWRoGUKwY82gI,5441
8
8
  scurl/sanitize.py,sha256=tSsLLHoSsohDFgaWOzrv9Qfzi-vzKUOmkRR-LcnMH6o,2144
9
9
  scurl/prompt_defender/__init__.py,sha256=2cGKzoG85MaWvaQNuCcsPYEKQ_TzC0E54o9hH-NiHvA,615
10
10
  scurl/prompt_defender/classifier.py,sha256=NEIN2oh2eiBoeGObFgG9NnE5n5qJktkttVAhgNE74t4,5993
11
11
  scurl/prompt_defender/embedder.py,sha256=VlyEkoTZd47bIBUaLfKy-flPO3YIrYqAiFRAzcHx31Q,8769
12
- scurl/prompt_defender/middleware.py,sha256=Ilx5p3mi9vgyTwWCU2Y2oigv9nqc8wIc-X8hRUP2keY,18121
12
+ scurl/prompt_defender/middleware.py,sha256=H7p0TrQ1Mc5wjTIFR938X78JcuheTnoo1RDY890MPIw,18166
13
13
  scurl/prompt_defender/motifs.py,sha256=Rm606Lp0s0haz-uc8aURbeb3iGQExvIGJJ7KtEICC3M,11491
14
14
  scurl/prompt_defender/normalizer.py,sha256=IKwlbWXhQbem1pzuFjqR8HUssT6v1jpal4Y8KybSkY4,5026
15
15
  scurl/prompt_defender/patterns.py,sha256=d0fJjtkKwgK6kI-H-ucpHtStZi0yM2tBA9wiyEmFxjQ,9674
16
16
  scurl/prompt_defender/windowing.py,sha256=HJS-w6lsPWt1_SIDo5nG1m22r4vslWZyRHG8n-QqgH8,12598
17
17
  scurl/prompt_defender/models/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  scurl/prompt_defender/models/prompt_injection_rf.pkl,sha256=wW1NXC2xiExx2dj5D3uZivKlaZ_XrF59KxdUU4b5BQg,1321504
19
- sibylline_scurl-0.2.0.dist-info/METADATA,sha256=hxhwCJ2DSS2Dj8oKWuP-4_YJWDsQuVW2m9mWOvFgd_o,5113
20
- sibylline_scurl-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
21
- sibylline_scurl-0.2.0.dist-info/entry_points.txt,sha256=iza-x5PFyqniQVHvoFBlKKvr32aJCulbgqGdYC8grAA,41
22
- sibylline_scurl-0.2.0.dist-info/RECORD,,
19
+ sibylline_scurl-0.2.3.dist-info/METADATA,sha256=ITs-GFqntnKrNCwqARy8TpKIpnZPkLhs9YuxvXCzVNo,5279
20
+ sibylline_scurl-0.2.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
21
+ sibylline_scurl-0.2.3.dist-info/entry_points.txt,sha256=iza-x5PFyqniQVHvoFBlKKvr32aJCulbgqGdYC8grAA,41
22
+ sibylline_scurl-0.2.3.dist-info/RECORD,,