sibylline-scurl 0.1.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/.github/workflows/ci.yml +25 -1
- sibylline_scurl-0.2.3/.github/workflows/docs.yml +60 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/.gitignore +7 -0
- sibylline_scurl-0.2.3/PKG-INFO +145 -0
- sibylline_scurl-0.2.3/README.md +102 -0
- sibylline_scurl-0.2.3/cf/.gitignore +9 -0
- sibylline_scurl-0.2.3/cf/container/.gitignore +2 -0
- sibylline_scurl-0.2.3/cf/container/Dockerfile +22 -0
- sibylline_scurl-0.2.3/cf/container/inline_model.py +24 -0
- sibylline_scurl-0.2.3/cf/container/server.py +191 -0
- sibylline_scurl-0.2.3/cf/package-lock.json +1536 -0
- sibylline_scurl-0.2.3/cf/package.json +19 -0
- sibylline_scurl-0.2.3/cf/public/app.js +206 -0
- sibylline_scurl-0.2.3/cf/public/index.html +171 -0
- sibylline_scurl-0.2.3/cf/public/styles.css +616 -0
- sibylline_scurl-0.2.3/cf/src/container.ts +16 -0
- sibylline_scurl-0.2.3/cf/src/index.ts +215 -0
- sibylline_scurl-0.2.3/cf/src/ratelimit.ts +50 -0
- sibylline_scurl-0.2.3/cf/tsconfig.json +16 -0
- sibylline_scurl-0.2.3/cf/wrangler.toml +24 -0
- sibylline_scurl-0.2.3/docs/algorithm.md +511 -0
- sibylline_scurl-0.2.3/docs/benchmarks.md +369 -0
- sibylline_scurl-0.2.3/docs/prompt-injection-primer.md +384 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/pyproject.toml +27 -2
- sibylline_scurl-0.2.3/scripts/train_classifier.py +344 -0
- sibylline_scurl-0.2.3/site/eleventy.config.js +53 -0
- sibylline_scurl-0.2.3/site/package-lock.json +1801 -0
- sibylline_scurl-0.2.3/site/package.json +17 -0
- sibylline_scurl-0.2.3/site/src/.nojekyll +0 -0
- sibylline_scurl-0.2.3/site/src/CNAME +1 -0
- sibylline_scurl-0.2.3/site/src/_data/site.js +7 -0
- sibylline_scurl-0.2.3/site/src/_includes/layouts/base.njk +46 -0
- sibylline_scurl-0.2.3/site/src/_includes/layouts/doc.njk +38 -0
- sibylline_scurl-0.2.3/site/src/docs/algorithm.md +352 -0
- sibylline_scurl-0.2.3/site/src/docs/benchmarks.md +213 -0
- sibylline_scurl-0.2.3/site/src/docs/primer.md +220 -0
- sibylline_scurl-0.2.3/site/src/img/favicon.svg +4 -0
- sibylline_scurl-0.2.3/site/src/index.njk +69 -0
- sibylline_scurl-0.2.3/site/src/styles/main.css +529 -0
- sibylline_scurl-0.2.3/site/src/styles/prism-dark.css +159 -0
- sibylline_scurl-0.2.3/src/scurl/browser.py +78 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/cli.py +63 -10
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/middleware.py +1 -1
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/__init__.py +20 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/classifier.py +178 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/embedder.py +293 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/middleware.py +461 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/models/.gitkeep +0 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/models/prompt_injection_rf.pkl +0 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/motifs.py +362 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/normalizer.py +147 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/patterns.py +227 -0
- sibylline_scurl-0.2.3/src/scurl/prompt_defender/windowing.py +397 -0
- sibylline_scurl-0.2.3/src/scurl/response_middleware.py +166 -0
- sibylline_scurl-0.2.3/src/scurl/sanitize.py +69 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_cli.py +11 -12
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_curl.py +0 -1
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_middleware.py +0 -1
- sibylline_scurl-0.2.3/tests/test_motifs.py +205 -0
- sibylline_scurl-0.2.3/tests/test_prompt_defender.py +269 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_response_middleware.py +36 -13
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/test_secret_defender.py +1 -2
- sibylline_scurl-0.2.3/tests/test_windowing.py +249 -0
- sibylline_scurl-0.2.3/uv.lock +2686 -0
- sibylline_scurl-0.1.1/PKG-INFO +0 -81
- sibylline_scurl-0.1.1/README.md +0 -57
- sibylline_scurl-0.1.1/src/scurl/response_middleware.py +0 -95
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/.github/workflows/publish.yml +0 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/__init__.py +0 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/curl.py +0 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/src/scurl/request_middleware.py +0 -0
- {sibylline_scurl-0.1.1 → sibylline_scurl-0.2.3}/tests/__init__.py +0 -0
|
@@ -3,6 +3,7 @@ name: CI
|
|
|
3
3
|
on:
|
|
4
4
|
push:
|
|
5
5
|
branches: [main]
|
|
6
|
+
tags: ['v*']
|
|
6
7
|
pull_request:
|
|
7
8
|
branches: [main]
|
|
8
9
|
|
|
@@ -24,7 +25,7 @@ jobs:
|
|
|
24
25
|
- name: Install dependencies
|
|
25
26
|
run: |
|
|
26
27
|
python -m pip install --upgrade pip
|
|
27
|
-
pip install -e ".[dev]" pytest-mock
|
|
28
|
+
pip install -e ".[dev,prompt-defender]" pytest-mock
|
|
28
29
|
|
|
29
30
|
- name: Run tests
|
|
30
31
|
run: pytest -v
|
|
@@ -44,3 +45,26 @@ jobs:
|
|
|
44
45
|
|
|
45
46
|
- name: Run ruff
|
|
46
47
|
run: ruff check src/ tests/
|
|
48
|
+
|
|
49
|
+
publish:
|
|
50
|
+
needs: [test, lint]
|
|
51
|
+
runs-on: ubuntu-latest
|
|
52
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
53
|
+
permissions:
|
|
54
|
+
id-token: write
|
|
55
|
+
steps:
|
|
56
|
+
- uses: actions/checkout@v4
|
|
57
|
+
|
|
58
|
+
- name: Set up Python
|
|
59
|
+
uses: actions/setup-python@v5
|
|
60
|
+
with:
|
|
61
|
+
python-version: "3.12"
|
|
62
|
+
|
|
63
|
+
- name: Install build tools
|
|
64
|
+
run: pip install build
|
|
65
|
+
|
|
66
|
+
- name: Build package
|
|
67
|
+
run: python -m build
|
|
68
|
+
|
|
69
|
+
- name: Publish to PyPI
|
|
70
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
name: Deploy Documentation
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
paths:
|
|
7
|
+
- 'site/**'
|
|
8
|
+
- 'docs/**'
|
|
9
|
+
- '.github/workflows/docs.yml'
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
pages: write
|
|
15
|
+
id-token: write
|
|
16
|
+
|
|
17
|
+
concurrency:
|
|
18
|
+
group: "pages"
|
|
19
|
+
cancel-in-progress: false
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
build:
|
|
23
|
+
runs-on: ubuntu-latest
|
|
24
|
+
steps:
|
|
25
|
+
- name: Checkout
|
|
26
|
+
uses: actions/checkout@v4
|
|
27
|
+
|
|
28
|
+
- name: Setup Node.js
|
|
29
|
+
uses: actions/setup-node@v4
|
|
30
|
+
with:
|
|
31
|
+
node-version: '20'
|
|
32
|
+
cache: 'npm'
|
|
33
|
+
cache-dependency-path: site/package-lock.json
|
|
34
|
+
|
|
35
|
+
- name: Install dependencies
|
|
36
|
+
working-directory: site
|
|
37
|
+
run: npm ci
|
|
38
|
+
|
|
39
|
+
- name: Build site
|
|
40
|
+
working-directory: site
|
|
41
|
+
run: npm run build:production
|
|
42
|
+
|
|
43
|
+
- name: Setup Pages
|
|
44
|
+
uses: actions/configure-pages@v4
|
|
45
|
+
|
|
46
|
+
- name: Upload artifact
|
|
47
|
+
uses: actions/upload-pages-artifact@v3
|
|
48
|
+
with:
|
|
49
|
+
path: site/_site
|
|
50
|
+
|
|
51
|
+
deploy:
|
|
52
|
+
environment:
|
|
53
|
+
name: github-pages
|
|
54
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
55
|
+
runs-on: ubuntu-latest
|
|
56
|
+
needs: build
|
|
57
|
+
steps:
|
|
58
|
+
- name: Deploy to GitHub Pages
|
|
59
|
+
id: deployment
|
|
60
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sibylline-scurl
|
|
3
|
+
Version: 0.2.3
|
|
4
|
+
Summary: A secure curl wrapper with middleware support and HTML-to-markdown extraction
|
|
5
|
+
Author: Nathan
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: curl,markdown,security,web-scraping
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Topic :: Security
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: html2text>=2024.2.26
|
|
20
|
+
Requires-Dist: readability-lxml>=0.8.1
|
|
21
|
+
Provides-Extra: browser
|
|
22
|
+
Requires-Dist: playwright>=1.40.0; extra == 'browser'
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
26
|
+
Provides-Extra: prompt-defender
|
|
27
|
+
Requires-Dist: confusable-homoglyphs>=3.2.0; extra == 'prompt-defender'
|
|
28
|
+
Requires-Dist: huggingface-hub>=0.20.0; extra == 'prompt-defender'
|
|
29
|
+
Requires-Dist: numpy>=1.24.0; extra == 'prompt-defender'
|
|
30
|
+
Requires-Dist: onnxruntime>=1.17.0; extra == 'prompt-defender'
|
|
31
|
+
Requires-Dist: rapidfuzz>=3.5.0; extra == 'prompt-defender'
|
|
32
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == 'prompt-defender'
|
|
33
|
+
Requires-Dist: tokenizers>=0.15.0; extra == 'prompt-defender'
|
|
34
|
+
Provides-Extra: training
|
|
35
|
+
Requires-Dist: confusable-homoglyphs>=3.2.0; extra == 'training'
|
|
36
|
+
Requires-Dist: datasets>=2.14.0; extra == 'training'
|
|
37
|
+
Requires-Dist: huggingface-hub>=0.20.0; extra == 'training'
|
|
38
|
+
Requires-Dist: numpy>=1.24.0; extra == 'training'
|
|
39
|
+
Requires-Dist: onnxruntime>=1.17.0; extra == 'training'
|
|
40
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == 'training'
|
|
41
|
+
Requires-Dist: tokenizers>=0.15.0; extra == 'training'
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
|
|
44
|
+
# scurl
|
|
45
|
+
|
|
46
|
+
[](https://badge.fury.io/py/scurl)
|
|
47
|
+
[](https://github.com/yourusername/scurl/actions/workflows/ci.yml)
|
|
48
|
+
|
|
49
|
+
A secure curl wrapper with middleware support and HTML-to-markdown extraction.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install sibylline-scurl
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Or with [pipx](https://pipx.pypa.io/) (recommended for CLI tools):
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pipx install sibylline-scurl
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Usage
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# Fetch a URL and extract clean markdown from HTML
|
|
67
|
+
scurl https://example.com
|
|
68
|
+
|
|
69
|
+
# Raw output (disable response middleware)
|
|
70
|
+
scurl --raw https://example.com
|
|
71
|
+
|
|
72
|
+
# All curl flags work
|
|
73
|
+
scurl -H "Accept: application/json" https://api.example.com/data
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Features
|
|
77
|
+
|
|
78
|
+
- **SecretDefender**: Automatically detects and blocks requests containing exposed secrets/tokens
|
|
79
|
+
- **HTML to Markdown**: Converts HTML responses to clean markdown (use `--readability` for article extraction)
|
|
80
|
+
- **Middleware System**: Composable request and response middleware
|
|
81
|
+
|
|
82
|
+
## Why scurl?
|
|
83
|
+
|
|
84
|
+
scurl extracts clean, readable content from web pages - perfect for LLM consumption, readability, or bandwidth savings.
|
|
85
|
+
|
|
86
|
+
### Size Comparison
|
|
87
|
+
|
|
88
|
+
| Website | curl | scurl | Reduction |
|
|
89
|
+
|---------|------|-------|-----------|
|
|
90
|
+
| example.com | 513 | 167 | 67.4% |
|
|
91
|
+
| news.ycombinator.com | 34,082 | 10,739 | 68.5% |
|
|
92
|
+
| en.wikipedia.org/wiki/Curl | 110,373 | 10,044 | 90.9% |
|
|
93
|
+
| github.com/anthropics | 296,788 | 353 | 99.9% |
|
|
94
|
+
| docs.python.org | 319,554 | 12,348 | 96.1% |
|
|
95
|
+
|
|
96
|
+
### Visual Comparison
|
|
97
|
+
|
|
98
|
+
**curl output** (Wikipedia, first 500 chars):
|
|
99
|
+
```html
|
|
100
|
+
<!DOCTYPE html><html class="client-nojs" lang="en" dir="ltr"><head>
|
|
101
|
+
<meta charset="UTF-8"/><title>Curl (programming language) - Wikipedia</title>
|
|
102
|
+
<script>(function(){var className="client-js";var cookie=document.cookie.
|
|
103
|
+
match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].
|
|
104
|
+
split('%2C').forEach(function(pref){className=className.replace(new
|
|
105
|
+
RegExp('(^| )'+pref.replace(/-hierarchical-hierarchical/,'')
|
|
106
|
+
+'($| )'),'$1teleported-hierarchical$2');});...
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**scurl output** (same page):
|
|
110
|
+
```markdown
|
|
111
|
+
# Curl (programming language) - Wikipedia
|
|
112
|
+
|
|
113
|
+
**Curl** is a reflective object-oriented programming language for interactive
|
|
114
|
+
web applications, whose goal is to provide a smoother transition between
|
|
115
|
+
content formatting and computer programming. It makes it possible to embed
|
|
116
|
+
complex objects in simple documents without needing to switch between
|
|
117
|
+
programming languages or development platforms.
|
|
118
|
+
|
|
119
|
+
The Curl implementation initially consisted of an interpreter only; a compiler
|
|
120
|
+
was added later...
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Flags
|
|
124
|
+
|
|
125
|
+
| Flag | Description |
|
|
126
|
+
|------|-------------|
|
|
127
|
+
| `--raw` | Disable all response middleware (raw HTML output) |
|
|
128
|
+
| `--readability` | Extract article content only (strips nav, ads, sidebars) |
|
|
129
|
+
| `--render` | Use headless browser for JS-rendered pages |
|
|
130
|
+
| `--disable <slug>` | Disable a middleware by slug (can be repeated) |
|
|
131
|
+
| `--enable <slug>` | Override a middleware's block (can be repeated) |
|
|
132
|
+
| `--list-middleware` | List available middleware and their slugs |
|
|
133
|
+
|
|
134
|
+
## Middleware Slugs
|
|
135
|
+
|
|
136
|
+
| Slug | Type | Description |
|
|
137
|
+
|------|------|-------------|
|
|
138
|
+
| `secret-defender` | Request | Detects and blocks requests containing secrets |
|
|
139
|
+
| `readability` | Response | Extracts clean markdown from HTML |
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
Copyright 2026 [Sibylline Software](https://sibylline.dev)
|
|
144
|
+
|
|
145
|
+
MIT
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# scurl
|
|
2
|
+
|
|
3
|
+
[](https://badge.fury.io/py/scurl)
|
|
4
|
+
[](https://github.com/yourusername/scurl/actions/workflows/ci.yml)
|
|
5
|
+
|
|
6
|
+
A secure curl wrapper with middleware support and HTML-to-markdown extraction.
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
pip install sibylline-scurl
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Or with [pipx](https://pipx.pypa.io/) (recommended for CLI tools):
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pipx install sibylline-scurl
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Usage
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Fetch a URL and extract clean markdown from HTML
|
|
24
|
+
scurl https://example.com
|
|
25
|
+
|
|
26
|
+
# Raw output (disable response middleware)
|
|
27
|
+
scurl --raw https://example.com
|
|
28
|
+
|
|
29
|
+
# All curl flags work
|
|
30
|
+
scurl -H "Accept: application/json" https://api.example.com/data
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- **SecretDefender**: Automatically detects and blocks requests containing exposed secrets/tokens
|
|
36
|
+
- **HTML to Markdown**: Converts HTML responses to clean markdown (use `--readability` for article extraction)
|
|
37
|
+
- **Middleware System**: Composable request and response middleware
|
|
38
|
+
|
|
39
|
+
## Why scurl?
|
|
40
|
+
|
|
41
|
+
scurl extracts clean, readable content from web pages - perfect for LLM consumption, readability, or bandwidth savings.
|
|
42
|
+
|
|
43
|
+
### Size Comparison
|
|
44
|
+
|
|
45
|
+
| Website | curl | scurl | Reduction |
|
|
46
|
+
|---------|------|-------|-----------|
|
|
47
|
+
| example.com | 513 | 167 | 67.4% |
|
|
48
|
+
| news.ycombinator.com | 34,082 | 10,739 | 68.5% |
|
|
49
|
+
| en.wikipedia.org/wiki/Curl | 110,373 | 10,044 | 90.9% |
|
|
50
|
+
| github.com/anthropics | 296,788 | 353 | 99.9% |
|
|
51
|
+
| docs.python.org | 319,554 | 12,348 | 96.1% |
|
|
52
|
+
|
|
53
|
+
### Visual Comparison
|
|
54
|
+
|
|
55
|
+
**curl output** (Wikipedia, first 500 chars):
|
|
56
|
+
```html
|
|
57
|
+
<!DOCTYPE html><html class="client-nojs" lang="en" dir="ltr"><head>
|
|
58
|
+
<meta charset="UTF-8"/><title>Curl (programming language) - Wikipedia</title>
|
|
59
|
+
<script>(function(){var className="client-js";var cookie=document.cookie.
|
|
60
|
+
match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].
|
|
61
|
+
split('%2C').forEach(function(pref){className=className.replace(new
|
|
62
|
+
RegExp('(^| )'+pref.replace(/-hierarchical-hierarchical/,'')
|
|
63
|
+
+'($| )'),'$1teleported-hierarchical$2');});...
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
**scurl output** (same page):
|
|
67
|
+
```markdown
|
|
68
|
+
# Curl (programming language) - Wikipedia
|
|
69
|
+
|
|
70
|
+
**Curl** is a reflective object-oriented programming language for interactive
|
|
71
|
+
web applications, whose goal is to provide a smoother transition between
|
|
72
|
+
content formatting and computer programming. It makes it possible to embed
|
|
73
|
+
complex objects in simple documents without needing to switch between
|
|
74
|
+
programming languages or development platforms.
|
|
75
|
+
|
|
76
|
+
The Curl implementation initially consisted of an interpreter only; a compiler
|
|
77
|
+
was added later...
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Flags
|
|
81
|
+
|
|
82
|
+
| Flag | Description |
|
|
83
|
+
|------|-------------|
|
|
84
|
+
| `--raw` | Disable all response middleware (raw HTML output) |
|
|
85
|
+
| `--readability` | Extract article content only (strips nav, ads, sidebars) |
|
|
86
|
+
| `--render` | Use headless browser for JS-rendered pages |
|
|
87
|
+
| `--disable <slug>` | Disable a middleware by slug (can be repeated) |
|
|
88
|
+
| `--enable <slug>` | Override a middleware's block (can be repeated) |
|
|
89
|
+
| `--list-middleware` | List available middleware and their slugs |
|
|
90
|
+
|
|
91
|
+
## Middleware Slugs
|
|
92
|
+
|
|
93
|
+
| Slug | Type | Description |
|
|
94
|
+
|------|------|-------------|
|
|
95
|
+
| `secret-defender` | Request | Detects and blocks requests containing secrets |
|
|
96
|
+
| `readability` | Response | Extracts clean markdown from HTML |
|
|
97
|
+
|
|
98
|
+
## License
|
|
99
|
+
|
|
100
|
+
Copyright 2026 [Sibylline Software](https://sibylline.dev)
|
|
101
|
+
|
|
102
|
+
MIT
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
|
|
4
|
+
|
|
5
|
+
WORKDIR /app
|
|
6
|
+
|
|
7
|
+
# Install scurl from local wheel with browser extra
|
|
8
|
+
COPY *.whl /tmp/
|
|
9
|
+
RUN pip install --no-cache-dir "/tmp/sibylline_scurl-0.2.3-py3-none-any.whl[browser,prompt-defender]" numpy && \
|
|
10
|
+
playwright install chromium --only-shell && \
|
|
11
|
+
playwright install-deps chromium && \
|
|
12
|
+
rm -f /tmp/*.whl
|
|
13
|
+
|
|
14
|
+
# Pre-download MiniLM model (86MB, no external data file)
|
|
15
|
+
COPY inline_model.py /tmp/
|
|
16
|
+
RUN python /tmp/inline_model.py && rm /tmp/inline_model.py
|
|
17
|
+
|
|
18
|
+
COPY server.py .
|
|
19
|
+
|
|
20
|
+
EXPOSE 8080
|
|
21
|
+
|
|
22
|
+
CMD ["python", "server.py"]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""Download the MiniLM model for prompt injection detection.

Run at Docker build time so the image ships with the ONNX model and
tokenizer already in scurl's model cache (no download on first request).
"""

from huggingface_hub import hf_hub_download

# Cache directory scurl's embedder reads from at runtime (the container
# runs as root, hence the /root path).
CACHE_DIR = '/root/.cache/scurl/models'

# Pre-exported ONNX build of all-MiniLM-L6-v2 (86MB, single file - no
# external data file to track alongside the model).
REPO_ID = 'Qdrant/all-MiniLM-L6-v2-onnx'

print('Downloading MiniLM-L6-v2 model...')
model_path = hf_hub_download(
    REPO_ID,
    filename='model.onnx',
    cache_dir=CACHE_DIR,
)
tokenizer_path = hf_hub_download(
    REPO_ID,
    filename='tokenizer.json',
    cache_dir=CACHE_DIR,
)

print(f'Model downloaded to: {model_path}')
print(f'Tokenizer downloaded to: {tokenizer_path}')
print('Done')
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Simple HTTP server that runs scurl commands."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import tempfile
|
|
8
|
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
9
|
+
|
|
10
|
+
from scurl.sanitize import sanitize_text
|
|
11
|
+
from scurl.middleware import ResponseContext
|
|
12
|
+
|
|
13
|
+
# Hard ceiling on how long one scurl subprocess may run before the request
# is answered with a 504.
TIMEOUT_SECONDS = 90
# Largest body accepted by /convert; bigger inputs get a 413.
MAX_CONVERT_SIZE = 256 * 1024  # 256 KB

# Singleton defender — avoids re-loading the ONNX session on every request.
# Safe because HTTPServer.serve_forever() is single-threaded.
_defender = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_defender(threshold: float = 0.3, action: str = "redact"):
    """Return the singleton PromptInjectionDefender, creating it on first call.

    Later calls do not rebuild the defender; they only re-point the existing
    instance at the requested threshold/action when those differ.
    """
    global _defender

    if _defender is not None:
        # Reuse the already-loaded model; just sync the tunables.
        if _defender.threshold != threshold:
            _defender.threshold = threshold
        if _defender.action != action:
            _defender.action = action
        return _defender

    # Deferred import: the prompt-defender extra pulls in heavy dependencies.
    from scurl.prompt_defender import PromptInjectionDefender

    _defender = PromptInjectionDefender(threshold=threshold, action=action)
    return _defender
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _injection_dict(analysis) -> dict:
|
|
39
|
+
"""Convert an InjectionAnalysis into a JSON-safe dict."""
|
|
40
|
+
active_signals = [k for k, v in analysis.pattern_features.items() if v > 0]
|
|
41
|
+
return {
|
|
42
|
+
"score": round(float(analysis.score), 4),
|
|
43
|
+
"flagged": analysis.flagged,
|
|
44
|
+
"threshold": analysis.threshold,
|
|
45
|
+
"action_taken": analysis.action_taken,
|
|
46
|
+
"signals": active_signals,
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ScurlHandler(BaseHTTPRequestHandler):
    """HTTP handler exposing scurl as a small JSON API.

    Endpoints:
      POST /convert  - sanitize caller-supplied HTML/text, then run defense
      POST <other>   - fetch a URL via the scurl CLI, then run defense
      GET  /health   - liveness probe
    """

    def do_POST(self):
        """Route POST requests: /convert handles inline text, anything else fetches a URL."""
        content_length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(content_length)

        if self.path == "/convert":
            self._handle_convert(body)
        else:
            self._handle_fetch(body)

    def _apply_defense(self, text: str, threshold: float, action: str, url: str = ""):
        """Run prompt-injection analysis on *text*; return (output, injection_dict).

        Shared tail of both POST handlers: analyze, and when the analysis is
        flagged and the defender opts in, return the defended (e.g. redacted)
        body; otherwise the text passes through untouched.
        """
        defender = _get_defender(threshold=threshold, action=action)
        analysis = defender.analyze(text)
        injection = _injection_dict(analysis)

        ctx = ResponseContext(
            body=text.encode("utf-8"),
            headers={},
            status_code=200,
            content_type="text/plain",
            url=url,
        )
        if analysis.flagged and defender.should_process(ctx):
            processed = defender.process(ctx)
            output = processed.body.decode("utf-8", errors="replace")
        else:
            output = text
        return output, injection

    def _handle_fetch(self, body: bytes):
        """Fetch a URL with the scurl CLI and answer with defended markdown as JSON."""
        try:
            data = json.loads(body)
            url = data.get("url")
            render = data.get("render", True)
            threshold = float(data.get("threshold", 0.3))
            action = data.get("action", "redact")

            if not url:
                self._send_json(400, {"markdown": None, "error": "Missing 'url' field", "injection": None})
                return

            # --render uses the headless browser path inside scurl.
            cmd = ["scurl", "--render", url] if render else ["scurl", url]
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=TIMEOUT_SECONDS,
            )

            if result.returncode == 0:
                output, injection = self._apply_defense(
                    result.stdout, threshold=threshold, action=action, url=url
                )
                self._send_json(200, {"markdown": output, "error": None, "injection": injection})
            else:
                error = result.stderr.strip() or f"scurl exited with code {result.returncode}"
                self._send_json(500, {"markdown": None, "error": error, "injection": None})

        except json.JSONDecodeError:
            self._send_json(400, {"markdown": None, "error": "Invalid JSON", "injection": None})
        except subprocess.TimeoutExpired:
            self._send_json(504, {"markdown": None, "error": "Request timed out", "injection": None})
        except Exception as e:
            self._send_json(500, {"markdown": None, "error": str(e), "injection": None})

    def _handle_convert(self, body: bytes):
        """Sanitize caller-supplied HTML/text, run defense, and answer as JSON."""
        try:
            data = json.loads(body)
            text = data.get("html") or data.get("text")
            action = data.get("action", "redact")
            threshold = float(data.get("threshold", 0.3))

            if not text:
                self._send_json(400, {"markdown": None, "error": "Missing 'html' or 'text' field", "injection": None})
                return

            # Size cap is measured in encoded bytes, not characters.
            text_bytes = len(text.encode("utf-8"))
            if text_bytes > MAX_CONVERT_SIZE:
                self._send_json(413, {
                    "markdown": None,
                    "error": f"Input too large ({text_bytes // 1024} KB). Maximum is 256 KB.",
                    "injection": None,
                })
                return

            # First sanitize (strip HTML, normalize whitespace)
            cleaned = sanitize_text(text)

            output, injection = self._apply_defense(cleaned, threshold=threshold, action=action)
            self._send_json(200, {"markdown": output, "error": None, "injection": injection})

        except json.JSONDecodeError:
            self._send_json(400, {"markdown": None, "error": "Invalid JSON", "injection": None})
        except Exception as e:
            self._send_json(500, {"markdown": None, "error": str(e), "injection": None})

    def do_GET(self):
        """GET /health answers a liveness response; everything else is 404."""
        if self.path == "/health":
            self._send_json(200, {"status": "ok"})
        else:
            self._send_json(404, {"error": "Not found"})

    def _send_json(self, status: int, data: dict):
        """Serialize *data* and write a complete JSON HTTP response."""
        # Encode once and take the length of the BYTES: Content-Length must be
        # byte-accurate. (json.dumps defaults to ensure_ascii=True, so today
        # str-length happens to match, but this no longer depends on that.)
        payload = json.dumps(data).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, fmt, *args):
        """Minimal stderr logging: tag + first format arg (typically the request line).

        `fmt` (not `format`) to avoid shadowing the builtin.
        """
        print(f"[scurl-server] {args[0]}", file=sys.stderr)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
if __name__ == "__main__":
    # Warm up the defender + ONNX model at boot so the cost is not paid by
    # the first request.
    print("[scurl-server] Pre-loading prompt injection model…", file=sys.stderr)
    try:
        warm = _get_defender()
        warm._ensure_heavy_components()
        warm._embedder.embed("warmup")
    except Exception as e:
        print(f"[scurl-server] Model pre-load failed (will retry on first request): {e}", file=sys.stderr)
    else:
        print("[scurl-server] Model ready", file=sys.stderr)

    httpd = HTTPServer(("0.0.0.0", 8080), ScurlHandler)
    print("[scurl-server] Listening on port 8080", file=sys.stderr)
    httpd.serve_forever()
|