sibylline-scurl 0.1.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/.github/workflows/ci.yml +25 -1
  2. sibylline_scurl-0.2.3/.github/workflows/docs.yml +60 -0
  3. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/.gitignore +7 -0
  4. sibylline_scurl-0.2.3/PKG-INFO +145 -0
  5. sibylline_scurl-0.2.3/README.md +102 -0
  6. sibylline_scurl-0.2.3/cf/.gitignore +9 -0
  7. sibylline_scurl-0.2.3/cf/container/.gitignore +2 -0
  8. sibylline_scurl-0.2.3/cf/container/Dockerfile +22 -0
  9. sibylline_scurl-0.2.3/cf/container/inline_model.py +24 -0
  10. sibylline_scurl-0.2.3/cf/container/server.py +191 -0
  11. sibylline_scurl-0.2.3/cf/package-lock.json +1536 -0
  12. sibylline_scurl-0.2.3/cf/package.json +19 -0
  13. sibylline_scurl-0.2.3/cf/public/app.js +206 -0
  14. sibylline_scurl-0.2.3/cf/public/index.html +171 -0
  15. sibylline_scurl-0.2.3/cf/public/styles.css +616 -0
  16. sibylline_scurl-0.2.3/cf/src/container.ts +16 -0
  17. sibylline_scurl-0.2.3/cf/src/index.ts +215 -0
  18. sibylline_scurl-0.2.3/cf/src/ratelimit.ts +50 -0
  19. sibylline_scurl-0.2.3/cf/tsconfig.json +16 -0
  20. sibylline_scurl-0.2.3/cf/wrangler.toml +24 -0
  21. sibylline_scurl-0.2.3/docs/algorithm.md +511 -0
  22. sibylline_scurl-0.2.3/docs/benchmarks.md +369 -0
  23. sibylline_scurl-0.2.3/docs/prompt-injection-primer.md +384 -0
  24. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/pyproject.toml +27 -2
  25. sibylline_scurl-0.2.3/scripts/train_classifier.py +344 -0
  26. sibylline_scurl-0.2.3/site/eleventy.config.js +53 -0
  27. sibylline_scurl-0.2.3/site/package-lock.json +1801 -0
  28. sibylline_scurl-0.2.3/site/package.json +17 -0
  29. sibylline_scurl-0.2.3/site/src/.nojekyll +0 -0
  30. sibylline_scurl-0.2.3/site/src/CNAME +1 -0
  31. sibylline_scurl-0.2.3/site/src/_data/site.js +7 -0
  32. sibylline_scurl-0.2.3/site/src/_includes/layouts/base.njk +46 -0
  33. sibylline_scurl-0.2.3/site/src/_includes/layouts/doc.njk +38 -0
  34. sibylline_scurl-0.2.3/site/src/docs/algorithm.md +352 -0
  35. sibylline_scurl-0.2.3/site/src/docs/benchmarks.md +213 -0
  36. sibylline_scurl-0.2.3/site/src/docs/primer.md +220 -0
  37. sibylline_scurl-0.2.3/site/src/img/favicon.svg +4 -0
  38. sibylline_scurl-0.2.3/site/src/index.njk +69 -0
  39. sibylline_scurl-0.2.3/site/src/styles/main.css +529 -0
  40. sibylline_scurl-0.2.3/site/src/styles/prism-dark.css +159 -0
  41. sibylline_scurl-0.2.3/src/scurl/browser.py +78 -0
  42. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/cli.py +63 -10
  43. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/middleware.py +1 -1
  44. sibylline_scurl-0.2.3/src/scurl/prompt_defender/__init__.py +20 -0
  45. sibylline_scurl-0.2.3/src/scurl/prompt_defender/classifier.py +178 -0
  46. sibylline_scurl-0.2.3/src/scurl/prompt_defender/embedder.py +293 -0
  47. sibylline_scurl-0.2.3/src/scurl/prompt_defender/middleware.py +461 -0
  48. sibylline_scurl-0.2.3/src/scurl/prompt_defender/models/.gitkeep +0 -0
  49. sibylline_scurl-0.2.3/src/scurl/prompt_defender/models/prompt_injection_rf.pkl +0 -0
  50. sibylline_scurl-0.2.3/src/scurl/prompt_defender/motifs.py +362 -0
  51. sibylline_scurl-0.2.3/src/scurl/prompt_defender/normalizer.py +147 -0
  52. sibylline_scurl-0.2.3/src/scurl/prompt_defender/patterns.py +227 -0
  53. sibylline_scurl-0.2.3/src/scurl/prompt_defender/windowing.py +397 -0
  54. sibylline_scurl-0.2.3/src/scurl/response_middleware.py +166 -0
  55. sibylline_scurl-0.2.3/src/scurl/sanitize.py +69 -0
  56. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_cli.py +11 -12
  57. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_curl.py +0 -1
  58. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_middleware.py +0 -1
  59. sibylline_scurl-0.2.3/tests/test_motifs.py +205 -0
  60. sibylline_scurl-0.2.3/tests/test_prompt_defender.py +269 -0
  61. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_response_middleware.py +36 -13
  62. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_secret_defender.py +1 -2
  63. sibylline_scurl-0.2.3/tests/test_windowing.py +249 -0
  64. sibylline_scurl-0.2.3/uv.lock +2686 -0
  65. sibylline_scurl-0.1.1/PKG-INFO +0 -81
  66. sibylline_scurl-0.1.1/README.md +0 -57
  67. sibylline_scurl-0.1.1/src/scurl/response_middleware.py +0 -95
  68. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/.github/workflows/publish.yml +0 -0
  69. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/__init__.py +0 -0
  70. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/curl.py +0 -0
  71. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/request_middleware.py +0 -0
  72. {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/__init__.py +0 -0
@@ -3,6 +3,7 @@ name: CI
3
3
  on:
4
4
  push:
5
5
  branches: [main]
6
+ tags: ['v*']
6
7
  pull_request:
7
8
  branches: [main]
8
9
 
@@ -24,7 +25,7 @@ jobs:
24
25
  - name: Install dependencies
25
26
  run: |
26
27
  python -m pip install --upgrade pip
27
- pip install -e ".[dev]" pytest-mock
28
+ pip install -e ".[dev,prompt-defender]" pytest-mock
28
29
 
29
30
  - name: Run tests
30
31
  run: pytest -v
@@ -44,3 +45,26 @@ jobs:
44
45
 
45
46
  - name: Run ruff
46
47
  run: ruff check src/ tests/
48
+
49
+ publish:
50
+ needs: [test, lint]
51
+ runs-on: ubuntu-latest
52
+ if: startsWith(github.ref, 'refs/tags/v')
53
+ permissions:
54
+ id-token: write
55
+ steps:
56
+ - uses: actions/checkout@v4
57
+
58
+ - name: Set up Python
59
+ uses: actions/setup-python@v5
60
+ with:
61
+ python-version: "3.12"
62
+
63
+ - name: Install build tools
64
+ run: pip install build
65
+
66
+ - name: Build package
67
+ run: python -m build
68
+
69
+ - name: Publish to PyPI
70
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,60 @@
1
+ name: Deploy Documentation
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - 'site/**'
8
+ - 'docs/**'
9
+ - '.github/workflows/docs.yml'
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: read
14
+ pages: write
15
+ id-token: write
16
+
17
+ concurrency:
18
+ group: "pages"
19
+ cancel-in-progress: false
20
+
21
+ jobs:
22
+ build:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - name: Checkout
26
+ uses: actions/checkout@v4
27
+
28
+ - name: Setup Node.js
29
+ uses: actions/setup-node@v4
30
+ with:
31
+ node-version: '20'
32
+ cache: 'npm'
33
+ cache-dependency-path: site/package-lock.json
34
+
35
+ - name: Install dependencies
36
+ working-directory: site
37
+ run: npm ci
38
+
39
+ - name: Build site
40
+ working-directory: site
41
+ run: npm run build:production
42
+
43
+ - name: Setup Pages
44
+ uses: actions/configure-pages@v4
45
+
46
+ - name: Upload artifact
47
+ uses: actions/upload-pages-artifact@v3
48
+ with:
49
+ path: site/_site
50
+
51
+ deploy:
52
+ environment:
53
+ name: github-pages
54
+ url: ${{ steps.deployment.outputs.page_url }}
55
+ runs-on: ubuntu-latest
56
+ needs: build
57
+ steps:
58
+ - name: Deploy to GitHub Pages
59
+ id: deployment
60
+ uses: actions/deploy-pages@v4
@@ -41,3 +41,10 @@ htmlcov/
41
41
  # OS
42
42
  .DS_Store
43
43
  Thumbs.db
44
+
45
+ # Generated docs site
46
+ site/node_modules/
47
+ site/_site/
48
+
49
+ # Arbiter (project management)
50
+ .arbiter/
@@ -0,0 +1,145 @@
1
+ Metadata-Version: 2.4
2
+ Name: sibylline-scurl
3
+ Version: 0.2.3
4
+ Summary: A secure curl wrapper with middleware support and HTML-to-markdown extraction
5
+ Author: Nathan
6
+ License: MIT
7
+ Keywords: curl,markdown,security,web-scraping
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Internet :: WWW/HTTP
17
+ Classifier: Topic :: Security
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: html2text>=2024.2.26
20
+ Requires-Dist: readability-lxml>=0.8.1
21
+ Provides-Extra: browser
22
+ Requires-Dist: playwright>=1.40.0; extra == 'browser'
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
25
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
26
+ Provides-Extra: prompt-defender
27
+ Requires-Dist: confusable-homoglyphs>=3.2.0; extra == 'prompt-defender'
28
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'prompt-defender'
29
+ Requires-Dist: numpy>=1.24.0; extra == 'prompt-defender'
30
+ Requires-Dist: onnxruntime>=1.17.0; extra == 'prompt-defender'
31
+ Requires-Dist: rapidfuzz>=3.5.0; extra == 'prompt-defender'
32
+ Requires-Dist: scikit-learn>=1.3.0; extra == 'prompt-defender'
33
+ Requires-Dist: tokenizers>=0.15.0; extra == 'prompt-defender'
34
+ Provides-Extra: training
35
+ Requires-Dist: confusable-homoglyphs>=3.2.0; extra == 'training'
36
+ Requires-Dist: datasets>=2.14.0; extra == 'training'
37
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'training'
38
+ Requires-Dist: numpy>=1.24.0; extra == 'training'
39
+ Requires-Dist: onnxruntime>=1.17.0; extra == 'training'
40
+ Requires-Dist: scikit-learn>=1.3.0; extra == 'training'
41
+ Requires-Dist: tokenizers>=0.15.0; extra == 'training'
42
+ Description-Content-Type: text/markdown
43
+
44
+ # scurl
45
+
46
+ [![PyPI version](https://badge.fury.io/py/sibylline-scurl.svg)](https://badge.fury.io/py/sibylline-scurl)
47
+ [![CI](https://github.com/yourusername/scurl/actions/workflows/ci.yml/badge.svg)](https://github.com/yourusername/scurl/actions/workflows/ci.yml)
48
+
49
+ A secure curl wrapper with middleware support and HTML-to-markdown extraction.
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install sibylline-scurl
55
+ ```
56
+
57
+ Or with [pipx](https://pipx.pypa.io/) (recommended for CLI tools):
58
+
59
+ ```bash
60
+ pipx install sibylline-scurl
61
+ ```
62
+
63
+ ## Usage
64
+
65
+ ```bash
66
+ # Fetch a URL and extract clean markdown from HTML
67
+ scurl https://example.com
68
+
69
+ # Raw output (disable response middleware)
70
+ scurl --raw https://example.com
71
+
72
+ # All curl flags work
73
+ scurl -H "Accept: application/json" https://api.example.com/data
74
+ ```
75
+
76
+ ## Features
77
+
78
+ - **SecretDefender**: Automatically detects and blocks requests containing exposed secrets/tokens
79
+ - **HTML to Markdown**: Converts HTML responses to clean markdown (use `--readability` for article extraction)
80
+ - **Middleware System**: Composable request and response middleware
81
+
82
+ ## Why scurl?
83
+
84
+ scurl extracts clean, readable content from web pages - perfect for LLM consumption, readability, or bandwidth savings.
85
+
86
+ ### Size Comparison
87
+
88
+ | Website | curl | scurl | Reduction |
89
+ |---------|------|-------|-----------|
90
+ | example.com | 513 | 167 | 67.4% |
91
+ | news.ycombinator.com | 34,082 | 10,739 | 68.5% |
92
+ | en.wikipedia.org/wiki/Curl | 110,373 | 10,044 | 90.9% |
93
+ | github.com/anthropics | 296,788 | 353 | 99.9% |
94
+ | docs.python.org | 319,554 | 12,348 | 96.1% |
95
+
96
+ ### Visual Comparison
97
+
98
+ **curl output** (Wikipedia, first 500 chars):
99
+ ```html
100
+ <!DOCTYPE html><html class="client-nojs" lang="en" dir="ltr"><head>
101
+ <meta charset="UTF-8"/><title>Curl (programming language) - Wikipedia</title>
102
+ <script>(function(){var className="client-js";var cookie=document.cookie.
103
+ match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].
104
+ split('%2C').forEach(function(pref){className=className.replace(new
105
+ RegExp('(^| )'+pref.replace(/-hierarchical-hierarchical/,'')
106
+ +'($| )'),'$1teleported-hierarchical$2');});...
107
+ ```
108
+
109
+ **scurl output** (same page):
110
+ ```markdown
111
+ # Curl (programming language) - Wikipedia
112
+
113
+ **Curl** is a reflective object-oriented programming language for interactive
114
+ web applications, whose goal is to provide a smoother transition between
115
+ content formatting and computer programming. It makes it possible to embed
116
+ complex objects in simple documents without needing to switch between
117
+ programming languages or development platforms.
118
+
119
+ The Curl implementation initially consisted of an interpreter only; a compiler
120
+ was added later...
121
+ ```
122
+
123
+ ## Flags
124
+
125
+ | Flag | Description |
126
+ |------|-------------|
127
+ | `--raw` | Disable all response middleware (raw HTML output) |
128
+ | `--readability` | Extract article content only (strips nav, ads, sidebars) |
129
+ | `--render` | Use headless browser for JS-rendered pages |
130
+ | `--disable <slug>` | Disable a middleware by slug (can be repeated) |
131
+ | `--enable <slug>` | Override a middleware's block (can be repeated) |
132
+ | `--list-middleware` | List available middleware and their slugs |
133
+
134
+ ## Middleware Slugs
135
+
136
+ | Slug | Type | Description |
137
+ |------|------|-------------|
138
+ | `secret-defender` | Request | Detects and blocks requests containing secrets |
139
+ | `readability` | Response | Extracts clean markdown from HTML |
140
+
141
+ ## License
142
+
143
+ Copyright 2026 [Sibylline Software](https://sibylline.dev)
144
+
145
+ MIT
@@ -0,0 +1,102 @@
1
+ # scurl
2
+
3
+ [![PyPI version](https://badge.fury.io/py/sibylline-scurl.svg)](https://badge.fury.io/py/sibylline-scurl)
4
+ [![CI](https://github.com/yourusername/scurl/actions/workflows/ci.yml/badge.svg)](https://github.com/yourusername/scurl/actions/workflows/ci.yml)
5
+
6
+ A secure curl wrapper with middleware support and HTML-to-markdown extraction.
7
+
8
+ ## Installation
9
+
10
+ ```bash
11
+ pip install sibylline-scurl
12
+ ```
13
+
14
+ Or with [pipx](https://pipx.pypa.io/) (recommended for CLI tools):
15
+
16
+ ```bash
17
+ pipx install sibylline-scurl
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ ```bash
23
+ # Fetch a URL and extract clean markdown from HTML
24
+ scurl https://example.com
25
+
26
+ # Raw output (disable response middleware)
27
+ scurl --raw https://example.com
28
+
29
+ # All curl flags work
30
+ scurl -H "Accept: application/json" https://api.example.com/data
31
+ ```
32
+
33
+ ## Features
34
+
35
+ - **SecretDefender**: Automatically detects and blocks requests containing exposed secrets/tokens
36
+ - **HTML to Markdown**: Converts HTML responses to clean markdown (use `--readability` for article extraction)
37
+ - **Middleware System**: Composable request and response middleware
38
+
39
+ ## Why scurl?
40
+
41
+ scurl extracts clean, readable content from web pages - perfect for LLM consumption, readability, or bandwidth savings.
42
+
43
+ ### Size Comparison
44
+
45
+ | Website | curl | scurl | Reduction |
46
+ |---------|------|-------|-----------|
47
+ | example.com | 513 | 167 | 67.4% |
48
+ | news.ycombinator.com | 34,082 | 10,739 | 68.5% |
49
+ | en.wikipedia.org/wiki/Curl | 110,373 | 10,044 | 90.9% |
50
+ | github.com/anthropics | 296,788 | 353 | 99.9% |
51
+ | docs.python.org | 319,554 | 12,348 | 96.1% |
52
+
53
+ ### Visual Comparison
54
+
55
+ **curl output** (Wikipedia, first 500 chars):
56
+ ```html
57
+ <!DOCTYPE html><html class="client-nojs" lang="en" dir="ltr"><head>
58
+ <meta charset="UTF-8"/><title>Curl (programming language) - Wikipedia</title>
59
+ <script>(function(){var className="client-js";var cookie=document.cookie.
60
+ match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].
61
+ split('%2C').forEach(function(pref){className=className.replace(new
62
+ RegExp('(^| )'+pref.replace(/-hierarchical-hierarchical/,'')
63
+ +'($| )'),'$1teleported-hierarchical$2');});...
64
+ ```
65
+
66
+ **scurl output** (same page):
67
+ ```markdown
68
+ # Curl (programming language) - Wikipedia
69
+
70
+ **Curl** is a reflective object-oriented programming language for interactive
71
+ web applications, whose goal is to provide a smoother transition between
72
+ content formatting and computer programming. It makes it possible to embed
73
+ complex objects in simple documents without needing to switch between
74
+ programming languages or development platforms.
75
+
76
+ The Curl implementation initially consisted of an interpreter only; a compiler
77
+ was added later...
78
+ ```
79
+
80
+ ## Flags
81
+
82
+ | Flag | Description |
83
+ |------|-------------|
84
+ | `--raw` | Disable all response middleware (raw HTML output) |
85
+ | `--readability` | Extract article content only (strips nav, ads, sidebars) |
86
+ | `--render` | Use headless browser for JS-rendered pages |
87
+ | `--disable <slug>` | Disable a middleware by slug (can be repeated) |
88
+ | `--enable <slug>` | Override a middleware's block (can be repeated) |
89
+ | `--list-middleware` | List available middleware and their slugs |
90
+
91
+ ## Middleware Slugs
92
+
93
+ | Slug | Type | Description |
94
+ |------|------|-------------|
95
+ | `secret-defender` | Request | Detects and blocks requests containing secrets |
96
+ | `readability` | Response | Extracts clean markdown from HTML |
97
+
98
+ ## License
99
+
100
+ Copyright 2026 [Sibylline Software](https://sibylline.dev)
101
+
102
+ MIT
@@ -0,0 +1,9 @@
1
+ # Dependencies
2
+ node_modules/
3
+
4
+ # Wrangler
5
+ .wrangler/
6
+ .dev.vars
7
+
8
+ # Build artifacts
9
+ *.log
@@ -0,0 +1,2 @@
1
+ # Build artifacts - wheel is copied in before docker build
2
+ *.whl
@@ -0,0 +1,22 @@
1
FROM python:3.12-slim

RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install scurl from the local wheel (copied in before `docker build`) with the
# browser + prompt-defender extras. Resolve the wheel name by glob so the
# Dockerfile does not break on every version bump — it previously hardcoded
# sibylline_scurl-0.2.0 while the package is at 0.2.3, which would fail.
COPY *.whl /tmp/
RUN WHEEL="$(ls /tmp/sibylline_scurl-*.whl)" && \
    pip install --no-cache-dir "${WHEEL}[browser,prompt-defender]" numpy && \
    playwright install chromium --only-shell && \
    playwright install-deps chromium && \
    rm -f /tmp/*.whl

# Pre-download MiniLM model (86MB, no external data file)
COPY inline_model.py /tmp/
RUN python /tmp/inline_model.py && rm /tmp/inline_model.py

COPY server.py .

EXPOSE 8080

CMD ["python", "server.py"]
@@ -0,0 +1,24 @@
1
#!/usr/bin/env python3
"""Download the MiniLM embedding model used for prompt injection detection.

Run at Docker build time so the ~86 MB ONNX model and its tokenizer are
baked into the image instead of being fetched on the first request.
"""

from huggingface_hub import hf_hub_download

# Must match the cache directory scurl's embedder reads at runtime —
# TODO confirm against scurl.prompt_defender.embedder.
CACHE_DIR = '/root/.cache/scurl/models'
REPO_ID = 'Qdrant/all-MiniLM-L6-v2-onnx'

# Download MiniLM-L6-v2 (86MB, single file - no external data)
print('Downloading MiniLM-L6-v2 model...')
model_path = hf_hub_download(
    REPO_ID,
    filename='model.onnx',
    cache_dir=CACHE_DIR,
)
tokenizer_path = hf_hub_download(
    REPO_ID,
    filename='tokenizer.json',
    cache_dir=CACHE_DIR,
)

print(f'Model downloaded to: {model_path}')
print(f'Tokenizer downloaded to: {tokenizer_path}')
print('Done')
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env python3
2
+ """Simple HTTP server that runs scurl commands."""
3
+
4
+ import json
5
+ import subprocess
6
+ import sys
7
+ import tempfile
8
+ from http.server import HTTPServer, BaseHTTPRequestHandler
9
+
10
+ from scurl.sanitize import sanitize_text
11
+ from scurl.middleware import ResponseContext
12
+
13
# Maximum seconds to wait for a `scurl` subprocess before answering 504.
TIMEOUT_SECONDS = 90
# Upper bound on a /convert payload; larger inputs are rejected with 413.
MAX_CONVERT_SIZE = 256 * 1024  # 256 KB

# Singleton defender — avoids re-loading the ONNX session on every request.
# Safe because HTTPServer.serve_forever() is single-threaded.
_defender = None
19
+
20
+
21
+ def _get_defender(threshold: float = 0.3, action: str = "redact"):
22
+ """Return the singleton PromptInjectionDefender, creating it on first call."""
23
+ global _defender
24
+ if _defender is None:
25
+ from scurl.prompt_defender import PromptInjectionDefender
26
+ _defender = PromptInjectionDefender(
27
+ threshold=threshold,
28
+ action=action,
29
+ )
30
+ else:
31
+ if _defender.threshold != threshold:
32
+ _defender.threshold = threshold
33
+ if _defender.action != action:
34
+ _defender.action = action
35
+ return _defender
36
+
37
+
38
+ def _injection_dict(analysis) -> dict:
39
+ """Convert an InjectionAnalysis into a JSON-safe dict."""
40
+ active_signals = [k for k, v in analysis.pattern_features.items() if v > 0]
41
+ return {
42
+ "score": round(float(analysis.score), 4),
43
+ "flagged": analysis.flagged,
44
+ "threshold": analysis.threshold,
45
+ "action_taken": analysis.action_taken,
46
+ "signals": active_signals,
47
+ }
48
+
49
+
50
+ class ScurlHandler(BaseHTTPRequestHandler):
51
+ def do_POST(self):
52
+ content_length = int(self.headers.get("Content-Length", 0))
53
+ body = self.rfile.read(content_length)
54
+
55
+ if self.path == "/convert":
56
+ self._handle_convert(body)
57
+ else:
58
+ self._handle_fetch(body)
59
+
60
+ def _handle_fetch(self, body: bytes):
61
+ try:
62
+ data = json.loads(body)
63
+ url = data.get("url")
64
+ render = data.get("render", True)
65
+ threshold = float(data.get("threshold", 0.3))
66
+ action = data.get("action", "redact")
67
+
68
+ if not url:
69
+ self._send_json(400, {"markdown": None, "error": "Missing 'url' field", "injection": None})
70
+ return
71
+
72
+ cmd = ["scurl", "--render", url] if render else ["scurl", url]
73
+ result = subprocess.run(
74
+ cmd,
75
+ capture_output=True,
76
+ text=True,
77
+ timeout=TIMEOUT_SECONDS,
78
+ )
79
+
80
+ if result.returncode == 0:
81
+ defender = _get_defender(threshold=threshold, action=action)
82
+ analysis = defender.analyze(result.stdout)
83
+ injection = _injection_dict(analysis)
84
+
85
+ ctx = ResponseContext(
86
+ body=result.stdout.encode("utf-8"),
87
+ headers={},
88
+ status_code=200,
89
+ content_type="text/plain",
90
+ url=url,
91
+ )
92
+ if analysis.flagged and defender.should_process(ctx):
93
+ processed = defender.process(ctx)
94
+ output = processed.body.decode("utf-8", errors="replace")
95
+ else:
96
+ output = result.stdout
97
+
98
+ self._send_json(200, {"markdown": output, "error": None, "injection": injection})
99
+ else:
100
+ error = result.stderr.strip() or f"scurl exited with code {result.returncode}"
101
+ self._send_json(500, {"markdown": None, "error": error, "injection": None})
102
+
103
+ except json.JSONDecodeError:
104
+ self._send_json(400, {"markdown": None, "error": "Invalid JSON", "injection": None})
105
+ except subprocess.TimeoutExpired:
106
+ self._send_json(504, {"markdown": None, "error": "Request timed out", "injection": None})
107
+ except Exception as e:
108
+ self._send_json(500, {"markdown": None, "error": str(e), "injection": None})
109
+
110
+ def _handle_convert(self, body: bytes):
111
+ try:
112
+ data = json.loads(body)
113
+ text = data.get("html") or data.get("text")
114
+ action = data.get("action", "redact")
115
+ threshold = float(data.get("threshold", 0.3))
116
+
117
+ if not text:
118
+ self._send_json(400, {"markdown": None, "error": "Missing 'html' or 'text' field", "injection": None})
119
+ return
120
+
121
+ text_bytes = len(text.encode("utf-8"))
122
+ if text_bytes > MAX_CONVERT_SIZE:
123
+ self._send_json(413, {
124
+ "markdown": None,
125
+ "error": f"Input too large ({text_bytes // 1024} KB). Maximum is 256 KB.",
126
+ "injection": None,
127
+ })
128
+ return
129
+
130
+ # First sanitize (strip HTML, normalize whitespace)
131
+ cleaned = sanitize_text(text)
132
+
133
+ # Reuse singleton defender (avoids re-loading ONNX session)
134
+ defender = _get_defender(threshold=threshold, action=action)
135
+ analysis = defender.analyze(cleaned)
136
+ injection = _injection_dict(analysis)
137
+
138
+ ctx = ResponseContext(
139
+ body=cleaned.encode("utf-8"),
140
+ headers={},
141
+ status_code=200,
142
+ content_type="text/plain",
143
+ url="",
144
+ )
145
+
146
+ if analysis.flagged and defender.should_process(ctx):
147
+ result = defender.process(ctx)
148
+ output = result.body.decode("utf-8", errors="replace")
149
+ else:
150
+ output = cleaned
151
+
152
+ self._send_json(200, {"markdown": output, "error": None, "injection": injection})
153
+
154
+ except json.JSONDecodeError:
155
+ self._send_json(400, {"markdown": None, "error": "Invalid JSON", "injection": None})
156
+ except Exception as e:
157
+ self._send_json(500, {"markdown": None, "error": str(e), "injection": None})
158
+
159
+ def do_GET(self):
160
+ if self.path == "/health":
161
+ self._send_json(200, {"status": "ok"})
162
+ else:
163
+ self._send_json(404, {"error": "Not found"})
164
+
165
+ def _send_json(self, status: int, data: dict):
166
+ response = json.dumps(data)
167
+ self.send_response(status)
168
+ self.send_header("Content-Type", "application/json")
169
+ self.send_header("Content-Length", str(len(response)))
170
+ self.end_headers()
171
+ self.wfile.write(response.encode())
172
+
173
+ def log_message(self, format, *args):
174
+ print(f"[scurl-server] {args[0]}", file=sys.stderr)
175
+
176
+
177
if __name__ == "__main__":
    # Eagerly load the defender + ONNX model so the cost is paid at boot,
    # not on the first request.
    print("[scurl-server] Pre-loading prompt injection model…", file=sys.stderr)
    try:
        defender = _get_defender()
        # NOTE(review): reaches into private members of PromptInjectionDefender
        # (_ensure_heavy_components, _embedder) — confirm these stay stable
        # across scurl versions, or expose a public warmup() instead.
        defender._ensure_heavy_components()
        # One throwaway embedding to warm the inference session.
        defender._embedder.embed("warmup")
        print("[scurl-server] Model ready", file=sys.stderr)
    except Exception as e:
        # Best-effort warmup: on failure we log and fall back to lazy loading
        # via _get_defender() on the first real request.
        print(f"[scurl-server] Model pre-load failed (will retry on first request): {e}", file=sys.stderr)

    server = HTTPServer(("0.0.0.0", 8080), ScurlHandler)
    print("[scurl-server] Listening on port 8080", file=sys.stderr)
    server.serve_forever()
+ server.serve_forever()