fetch-guard 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fetch_guard-0.9.0/.github/workflows/ci.yml +31 -0
- fetch_guard-0.9.0/.github/workflows/publish.yml +28 -0
- fetch_guard-0.9.0/.gitignore +44 -0
- fetch_guard-0.9.0/LICENSE +21 -0
- fetch_guard-0.9.0/PKG-INFO +247 -0
- fetch_guard-0.9.0/README.md +215 -0
- fetch_guard-0.9.0/fetch_guard/__init__.py +1 -0
- fetch_guard-0.9.0/fetch_guard/cli.py +118 -0
- fetch_guard-0.9.0/fetch_guard/extraction/__init__.py +20 -0
- fetch_guard-0.9.0/fetch_guard/extraction/content.py +18 -0
- fetch_guard-0.9.0/fetch_guard/extraction/content_type.py +263 -0
- fetch_guard-0.9.0/fetch_guard/extraction/edges.py +141 -0
- fetch_guard-0.9.0/fetch_guard/extraction/links.py +54 -0
- fetch_guard-0.9.0/fetch_guard/extraction/metadata.py +133 -0
- fetch_guard-0.9.0/fetch_guard/http/__init__.py +15 -0
- fetch_guard-0.9.0/fetch_guard/http/client.py +60 -0
- fetch_guard-0.9.0/fetch_guard/http/llms_txt.py +48 -0
- fetch_guard-0.9.0/fetch_guard/http/playwright.py +58 -0
- fetch_guard-0.9.0/fetch_guard/output/__init__.py +5 -0
- fetch_guard-0.9.0/fetch_guard/output/formatter.py +100 -0
- fetch_guard-0.9.0/fetch_guard/pipeline.py +248 -0
- fetch_guard-0.9.0/fetch_guard/security/__init__.py +21 -0
- fetch_guard-0.9.0/fetch_guard/security/guard.py +55 -0
- fetch_guard-0.9.0/fetch_guard/security/patterns.py +100 -0
- fetch_guard-0.9.0/fetch_guard/security/sanitizer.py +113 -0
- fetch_guard-0.9.0/fetch_guard/server.py +118 -0
- fetch_guard-0.9.0/pyproject.toml +84 -0
- fetch_guard-0.9.0/resources/fetch-guard/SKILL.md +84 -0
- fetch_guard-0.9.0/resources/fetch-guard.md +84 -0
- fetch_guard-0.9.0/tests/__init__.py +0 -0
- fetch_guard-0.9.0/tests/test_client.py +183 -0
- fetch_guard-0.9.0/tests/test_content.py +41 -0
- fetch_guard-0.9.0/tests/test_content_type.py +228 -0
- fetch_guard-0.9.0/tests/test_edges.py +147 -0
- fetch_guard-0.9.0/tests/test_formatter.py +259 -0
- fetch_guard-0.9.0/tests/test_guard.py +104 -0
- fetch_guard-0.9.0/tests/test_links.py +91 -0
- fetch_guard-0.9.0/tests/test_live.py +247 -0
- fetch_guard-0.9.0/tests/test_llms_txt.py +102 -0
- fetch_guard-0.9.0/tests/test_metadata.py +168 -0
- fetch_guard-0.9.0/tests/test_patterns.py +29 -0
- fetch_guard-0.9.0/tests/test_pipeline.py +642 -0
- fetch_guard-0.9.0/tests/test_playwright.py +94 -0
- fetch_guard-0.9.0/tests/test_sanitizer.py +102 -0
- fetch_guard-0.9.0/tests/test_server.py +214 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v5
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v6
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: pip install -e ".[dev]"
|
|
26
|
+
|
|
27
|
+
- name: Lint
|
|
28
|
+
run: ruff check fetch_guard/ tests/
|
|
29
|
+
|
|
30
|
+
- name: Test
|
|
31
|
+
run: pytest -q
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment: pypi
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v5
|
|
15
|
+
|
|
16
|
+
- name: Set up Python
|
|
17
|
+
uses: actions/setup-python@v6
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.13"
|
|
20
|
+
|
|
21
|
+
- name: Install uv
|
|
22
|
+
run: pip install uv
|
|
23
|
+
|
|
24
|
+
- name: Build
|
|
25
|
+
run: uv build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.whl
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# Testing
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.coverage
|
|
19
|
+
htmlcov/
|
|
20
|
+
*.cover
|
|
21
|
+
|
|
22
|
+
# Ruff
|
|
23
|
+
.ruff_cache/
|
|
24
|
+
|
|
25
|
+
# IDE
|
|
26
|
+
.vscode/
|
|
27
|
+
.idea/
|
|
28
|
+
*.swp
|
|
29
|
+
*.swo
|
|
30
|
+
*~
|
|
31
|
+
|
|
32
|
+
# OS
|
|
33
|
+
Thumbs.db
|
|
34
|
+
Desktop.ini
|
|
35
|
+
.DS_Store
|
|
36
|
+
|
|
37
|
+
# MCP local config
|
|
38
|
+
.mcp.json
|
|
39
|
+
|
|
40
|
+
# Plans / scope docs
|
|
41
|
+
plans/
|
|
42
|
+
|
|
43
|
+
# AI workspace
|
|
44
|
+
CLAUDE.md
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Owen Sterling
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fetch-guard
|
|
3
|
+
Version: 0.9.0
|
|
4
|
+
Summary: Fetch URLs and return clean, LLM-ready markdown with metadata and prompt injection defense
|
|
5
|
+
Project-URL: Homepage, https://github.com/Erodenn/fetch-guard
|
|
6
|
+
Project-URL: Repository, https://github.com/Erodenn/fetch-guard
|
|
7
|
+
Project-URL: Issues, https://github.com/Erodenn/fetch-guard/issues
|
|
8
|
+
Author-email: Owen Sterling <owen@erodenn.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: beautifulsoup4
|
|
22
|
+
Requires-Dist: extruct
|
|
23
|
+
Requires-Dist: mcp
|
|
24
|
+
Requires-Dist: requests
|
|
25
|
+
Requires-Dist: trafilatura
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
28
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
29
|
+
Provides-Extra: js
|
|
30
|
+
Requires-Dist: playwright; extra == 'js'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# Fetch Guard
|
|
34
|
+
|
|
35
|
+
[License: MIT](LICENSE)
|
|
36
|
+
[Python 3.10+](https://www.python.org/)
|
|
37
|
+
|
|
38
|
+
An [MCP](https://modelcontextprotocol.io/) server and CLI tool that fetches URLs and returns clean, LLM-ready markdown. Not a simple HTTP client, not a browser automation wrapper. A purpose-built extraction pipeline that sanitizes HTML, pulls structured metadata, detects prompt injection attempts, and handles the edge cases that break naive fetchers: bot blocks, paywalls, login walls, non-HTML content types, and pages that require JavaScript to render.
|
|
39
|
+
|
|
40
|
+
The core problem is straightforward: LLMs need web content, but raw HTML is noisy and potentially hostile. Fetched pages can contain hidden text, invisible Unicode, off-screen elements, and outright prompt injection attempts embedded in the content itself. This pipeline strips all of that before the content reaches the model.
|
|
41
|
+
|
|
42
|
+
Three layers handle the injection defense specifically:
|
|
43
|
+
|
|
44
|
+
1. **Pre-extraction sanitization** removes hidden elements (`display:none`, `visibility:hidden`, `opacity:0`), off-screen positioned content, `aria-hidden` elements, `<noscript>` tags, and 18 categories of non-printing Unicode characters. This happens before content extraction, so trafilatura never sees the attack vectors.
|
|
45
|
+
2. **Pattern scanning** runs 15 compiled regex patterns against the extracted text, covering system prompt overrides, ignore-previous instructions, role injection, fake conversation tags, hidden instruction markers (`[INST]`, `<<SYS>>`), and suspicious base64 blocks.
|
|
46
|
+
3. **Session-salted output wrapping** generates a random 8-character hex salt per invocation and wraps the body in `<fetch-content-{salt}>` tags. Since the salt is unpredictable, injected content cannot spoof the wrapper boundaries.
|
|
47
|
+
|
|
48
|
+
## Quick Start
|
|
49
|
+
|
|
50
|
+
### Prerequisites
|
|
51
|
+
|
|
52
|
+
- Python 3.10+
|
|
53
|
+
- pip
|
|
54
|
+
|
|
55
|
+
### Install
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install fetch-guard
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
For JavaScript rendering (optional):
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install 'fetch-guard[js]' && playwright install chromium
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Configure Your MCP Client
|
|
68
|
+
|
|
69
|
+
Add the following to your MCP client config. Works with Claude Code, Claude Desktop, Cursor, or any MCP-compatible client.
|
|
70
|
+
|
|
71
|
+
**Via uvx (recommended):**
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"mcpServers": {
|
|
76
|
+
"fetch-guard": {
|
|
77
|
+
"command": "uvx",
|
|
78
|
+
"args": ["fetch-guard"]
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Via pip install:**
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
{
|
|
88
|
+
"mcpServers": {
|
|
89
|
+
"fetch-guard": {
|
|
90
|
+
"command": "fetch-guard"
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**From source:**
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{
|
|
100
|
+
"mcpServers": {
|
|
101
|
+
"fetch-guard": {
|
|
102
|
+
"command": "python",
|
|
103
|
+
"args": ["-m", "fetch_guard.server"]
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Verify
|
|
110
|
+
|
|
111
|
+
Ask your AI assistant to fetch any URL. If it returns structured content with a status header, metadata, and risk assessment, you're connected.
|
|
112
|
+
|
|
113
|
+
### CLI
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
fetch-guard-cli <url> [options]
|
|
117
|
+
# or: python -m fetch_guard.cli <url> [options]
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
| Flag | Default | Description |
|
|
121
|
+
|---|---|---|
|
|
122
|
+
| `--timeout N` | 180 | Request timeout in seconds |
|
|
123
|
+
| `--max-words N` | none | Word cap on extracted body content |
|
|
124
|
+
| `--js` | off | Use Playwright for JS-rendered pages |
|
|
125
|
+
| `--strict` | off | Exit code 2 on high-risk injection |
|
|
126
|
+
| `--links MODE` | `domains` | `domains` for unique external domains, `full` for all URLs with anchor text |
|
|
127
|
+
|
|
128
|
+
### Claude Code Skill
|
|
129
|
+
|
|
130
|
+
Copy `resources/fetch-guard/` to `.claude/skills/fetch-guard/` in your project, or use the standalone command file `resources/fetch-guard.md` as a Claude Code command.
|
|
131
|
+
|
|
132
|
+
## What It Does
|
|
133
|
+
|
|
134
|
+
The pipeline runs a 13-step sequence from URL to structured output:
|
|
135
|
+
|
|
136
|
+
1. **`/llms.txt` preflight.** Checks the domain root for `/llms.txt` before the full fetch. If the requested URL is a domain root and `/llms.txt` exists, that content replaces the normal HTML pipeline entirely. This respects the emerging convention for LLM-friendly site summaries.
|
|
137
|
+
|
|
138
|
+
2. **Fetch.** Static HTTP request via `requests`, or Playwright-driven browser rendering if `--js` is set. No automatic fallback between the two: `--js` is explicit opt-in.
|
|
139
|
+
|
|
140
|
+
3. **Edge detection.** Classifies the response for bot blocks (Cloudflare challenges, 403/429/503 with block signatures, LinkedIn's custom 999), paywalls (subscription prompts, premium overlays), and login walls (sign-in redirects, members-only patterns).
|
|
141
|
+
|
|
142
|
+
4. **Automatic retry.** Bot blocks trigger one retry with a full Chrome User-Agent string before reporting. Paywalls and login walls are reported immediately with no retry.
|
|
143
|
+
|
|
144
|
+
5. **Content-type routing.** Non-HTML responses get a fast path: JSON is rendered as a fenced code block, RSS/Atom feeds are parsed into structured summaries, CSV becomes a markdown table (capped at 2,000 rows), and plain text passes through directly. Binary content types are rejected.
|
|
145
|
+
|
|
146
|
+
6. **HTML sanitization.** Strips hidden elements, off-screen positioned content, `aria-hidden` nodes, `<noscript>` tags, and non-printing Unicode. Returns a tally of everything removed.
|
|
147
|
+
|
|
148
|
+
7. **Content extraction.** trafilatura converts sanitized HTML to markdown with link preservation.
|
|
149
|
+
|
|
150
|
+
8. **Metadata extraction.** Pulls title, author, date, description, canonical URL, and image from three sources in priority order: JSON-LD, Open Graph, then meta tags.
|
|
151
|
+
|
|
152
|
+
9. **Link extraction.** Two modes: `domains` returns a sorted list of unique external domains, `full` returns all external URLs grouped by domain with anchor text.
|
|
153
|
+
|
|
154
|
+
10. **Injection scanning.** Runs all 15 patterns against the extracted markdown. Each match records the pattern name, severity (high/medium), and a 60-character context snippet.
|
|
155
|
+
|
|
156
|
+
11. **Truncation.** If `--max-words` is set, the body is truncated after extraction but before output wrapping.
|
|
157
|
+
|
|
158
|
+
12. **Salt wrapping.** The body gets wrapped in session-salted tags for defense-in-depth.
|
|
159
|
+
|
|
160
|
+
13. **Output formatting.** CLI produces five plaintext sections (status header, body, metadata, links, injection details). MCP server returns a structured JSON dict with the same data.
|
|
161
|
+
|
|
162
|
+
## Output
|
|
163
|
+
|
|
164
|
+
### CLI
|
|
165
|
+
|
|
166
|
+
Five sections, printed to stdout:
|
|
167
|
+
|
|
168
|
+
- **Status header:** URL, fetch timestamp, risk flag (`OK` or `INJECTION WARNING`), sanitization tally, edge case info if detected
|
|
169
|
+
- **Body:** clean markdown wrapped in `<fetch-content-{salt}>` tags
|
|
170
|
+
- **Metadata:** JSON block with title, author, date, description, canonical URL, image
|
|
171
|
+
- **External links:** domain list or full URL breakdown by domain
|
|
172
|
+
- **Injection details:** pattern name, severity, and context snippet for each match (only present when patterns detected)
|
|
173
|
+
|
|
174
|
+
### MCP Server
|
|
175
|
+
|
|
176
|
+
Returns a structured dict:
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
url, fetched_at, body, content_type, metadata, links, links_mode,
|
|
180
|
+
risk_level, injection_matches, edge_cases, sanitization,
|
|
181
|
+
llms_txt_available, llms_txt_replaced, js_rendered, js_hint,
|
|
182
|
+
retried, truncated_at
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
When `--strict` is set and the risk level is `HIGH`, the CLI exits with code 2 and the MCP server raises an error response. The full result is still available in both cases.
|
|
186
|
+
|
|
187
|
+
## Exit Codes
|
|
188
|
+
|
|
189
|
+
| Code | Meaning |
|
|
190
|
+
|---|---|
|
|
191
|
+
| 0 | Success |
|
|
192
|
+
| 1 | Fetch error (network failure, empty response, binary content) |
|
|
193
|
+
| 2 | High-risk injection detected (`--strict` only) |
|
|
194
|
+
|
|
195
|
+
## Architecture
|
|
196
|
+
|
|
197
|
+
```
|
|
198
|
+
fetch_guard/
|
|
199
|
+
├── pipeline.py # Core orchestration — 13-step sequence, shared by CLI and server
|
|
200
|
+
├── cli.py # CLI entry point — arg parsing, pipeline call, output
|
|
201
|
+
├── server.py # MCP server — FastMCP wrapper over the same pipeline
|
|
202
|
+
│
|
|
203
|
+
├── http/ # HTTP fetching layer
|
|
204
|
+
│ ├── client.py # Static HTTP fetch via requests
|
|
205
|
+
│ ├── playwright.py # JS rendering via Playwright (optional)
|
|
206
|
+
│ └── llms_txt.py # /llms.txt preflight check
|
|
207
|
+
│
|
|
208
|
+
├── extraction/ # Content extraction and edge detection
|
|
209
|
+
│ ├── content.py # trafilatura wrapper — HTML to markdown
|
|
210
|
+
│ ├── content_type.py # Non-HTML routing — JSON, XML/RSS, CSV, plain text
|
|
211
|
+
│ ├── edges.py # Bot block, paywall, login wall classification
|
|
212
|
+
│ ├── links.py # External link extraction (domain list or full URLs)
|
|
213
|
+
│ └── metadata.py # JSON-LD, Open Graph, meta tag extraction
|
|
214
|
+
│
|
|
215
|
+
├── security/ # Injection defense
|
|
216
|
+
│ ├── guard.py # Salt generation, content wrapping, pattern scanning
|
|
217
|
+
│ ├── patterns.py # 15 compiled regex patterns — single source of truth
|
|
218
|
+
│ └── sanitizer.py # Hidden element and non-printing character removal
|
|
219
|
+
│
|
|
220
|
+
└── output/ # Formatting
|
|
221
|
+
└── formatter.py # CLI output assembly
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Each module is a single-responsibility unit with a public function as its interface. `pipeline.py` is the shared core: both `cli.py` and `server.py` call `pipeline.run()` and handle the result in their own way.
|
|
225
|
+
|
|
226
|
+
## Development
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
# Run tests (217 unit tests, all mocked — no network calls)
|
|
230
|
+
pytest
|
|
231
|
+
|
|
232
|
+
# Run live integration tests (hits real URLs)
|
|
233
|
+
pytest -m live
|
|
234
|
+
|
|
235
|
+
# Lint
|
|
236
|
+
ruff check fetch_guard/ tests/
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
CI runs on push and PR to `main` via GitHub Actions, testing against Python 3.10, 3.12, and 3.13.
|
|
240
|
+
|
|
241
|
+
## Acknowledgements
|
|
242
|
+
|
|
243
|
+
Developed with [Claude Code](https://claude.ai/code).
|
|
244
|
+
|
|
245
|
+
## License
|
|
246
|
+
|
|
247
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# Fetch Guard
|
|
2
|
+
|
|
3
|
+
[License: MIT](LICENSE)
|
|
4
|
+
[Python 3.10+](https://www.python.org/)
|
|
5
|
+
|
|
6
|
+
An [MCP](https://modelcontextprotocol.io/) server and CLI tool that fetches URLs and returns clean, LLM-ready markdown. Not a simple HTTP client, not a browser automation wrapper. A purpose-built extraction pipeline that sanitizes HTML, pulls structured metadata, detects prompt injection attempts, and handles the edge cases that break naive fetchers: bot blocks, paywalls, login walls, non-HTML content types, and pages that require JavaScript to render.
|
|
7
|
+
|
|
8
|
+
The core problem is straightforward: LLMs need web content, but raw HTML is noisy and potentially hostile. Fetched pages can contain hidden text, invisible Unicode, off-screen elements, and outright prompt injection attempts embedded in the content itself. This pipeline strips all of that before the content reaches the model.
|
|
9
|
+
|
|
10
|
+
Three layers handle the injection defense specifically:
|
|
11
|
+
|
|
12
|
+
1. **Pre-extraction sanitization** removes hidden elements (`display:none`, `visibility:hidden`, `opacity:0`), off-screen positioned content, `aria-hidden` elements, `<noscript>` tags, and 18 categories of non-printing Unicode characters. This happens before content extraction, so trafilatura never sees the attack vectors.
|
|
13
|
+
2. **Pattern scanning** runs 15 compiled regex patterns against the extracted text, covering system prompt overrides, ignore-previous instructions, role injection, fake conversation tags, hidden instruction markers (`[INST]`, `<<SYS>>`), and suspicious base64 blocks.
|
|
14
|
+
3. **Session-salted output wrapping** generates a random 8-character hex salt per invocation and wraps the body in `<fetch-content-{salt}>` tags. Since the salt is unpredictable, injected content cannot spoof the wrapper boundaries.
|
|
15
|
+
|
|
16
|
+
## Quick Start
|
|
17
|
+
|
|
18
|
+
### Prerequisites
|
|
19
|
+
|
|
20
|
+
- Python 3.10+
|
|
21
|
+
- pip
|
|
22
|
+
|
|
23
|
+
### Install
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install fetch-guard
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
For JavaScript rendering (optional):
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install 'fetch-guard[js]' && playwright install chromium
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Configure Your MCP Client
|
|
36
|
+
|
|
37
|
+
Add the following to your MCP client config. Works with Claude Code, Claude Desktop, Cursor, or any MCP-compatible client.
|
|
38
|
+
|
|
39
|
+
**Via uvx (recommended):**
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"mcpServers": {
|
|
44
|
+
"fetch-guard": {
|
|
45
|
+
"command": "uvx",
|
|
46
|
+
"args": ["fetch-guard"]
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Via pip install:**
|
|
53
|
+
|
|
54
|
+
```json
|
|
55
|
+
{
|
|
56
|
+
"mcpServers": {
|
|
57
|
+
"fetch-guard": {
|
|
58
|
+
"command": "fetch-guard"
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**From source:**
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"mcpServers": {
|
|
69
|
+
"fetch-guard": {
|
|
70
|
+
"command": "python",
|
|
71
|
+
"args": ["-m", "fetch_guard.server"]
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Verify
|
|
78
|
+
|
|
79
|
+
Ask your AI assistant to fetch any URL. If it returns structured content with a status header, metadata, and risk assessment, you're connected.
|
|
80
|
+
|
|
81
|
+
### CLI
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
fetch-guard-cli <url> [options]
|
|
85
|
+
# or: python -m fetch_guard.cli <url> [options]
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
| Flag | Default | Description |
|
|
89
|
+
|---|---|---|
|
|
90
|
+
| `--timeout N` | 180 | Request timeout in seconds |
|
|
91
|
+
| `--max-words N` | none | Word cap on extracted body content |
|
|
92
|
+
| `--js` | off | Use Playwright for JS-rendered pages |
|
|
93
|
+
| `--strict` | off | Exit code 2 on high-risk injection |
|
|
94
|
+
| `--links MODE` | `domains` | `domains` for unique external domains, `full` for all URLs with anchor text |
|
|
95
|
+
|
|
96
|
+
### Claude Code Skill
|
|
97
|
+
|
|
98
|
+
Copy `resources/fetch-guard/` to `.claude/skills/fetch-guard/` in your project, or use the standalone command file `resources/fetch-guard.md` as a Claude Code command.
|
|
99
|
+
|
|
100
|
+
## What It Does
|
|
101
|
+
|
|
102
|
+
The pipeline runs a 13-step sequence from URL to structured output:
|
|
103
|
+
|
|
104
|
+
1. **`/llms.txt` preflight.** Checks the domain root for `/llms.txt` before the full fetch. If the requested URL is a domain root and `/llms.txt` exists, that content replaces the normal HTML pipeline entirely. This respects the emerging convention for LLM-friendly site summaries.
|
|
105
|
+
|
|
106
|
+
2. **Fetch.** Static HTTP request via `requests`, or Playwright-driven browser rendering if `--js` is set. No automatic fallback between the two: `--js` is explicit opt-in.
|
|
107
|
+
|
|
108
|
+
3. **Edge detection.** Classifies the response for bot blocks (Cloudflare challenges, 403/429/503 with block signatures, LinkedIn's custom 999), paywalls (subscription prompts, premium overlays), and login walls (sign-in redirects, members-only patterns).
|
|
109
|
+
|
|
110
|
+
4. **Automatic retry.** Bot blocks trigger one retry with a full Chrome User-Agent string before reporting. Paywalls and login walls are reported immediately with no retry.
|
|
111
|
+
|
|
112
|
+
5. **Content-type routing.** Non-HTML responses get a fast path: JSON is rendered as a fenced code block, RSS/Atom feeds are parsed into structured summaries, CSV becomes a markdown table (capped at 2,000 rows), and plain text passes through directly. Binary content types are rejected.
|
|
113
|
+
|
|
114
|
+
6. **HTML sanitization.** Strips hidden elements, off-screen positioned content, `aria-hidden` nodes, `<noscript>` tags, and non-printing Unicode. Returns a tally of everything removed.
|
|
115
|
+
|
|
116
|
+
7. **Content extraction.** trafilatura converts sanitized HTML to markdown with link preservation.
|
|
117
|
+
|
|
118
|
+
8. **Metadata extraction.** Pulls title, author, date, description, canonical URL, and image from three sources in priority order: JSON-LD, Open Graph, then meta tags.
|
|
119
|
+
|
|
120
|
+
9. **Link extraction.** Two modes: `domains` returns a sorted list of unique external domains, `full` returns all external URLs grouped by domain with anchor text.
|
|
121
|
+
|
|
122
|
+
10. **Injection scanning.** Runs all 15 patterns against the extracted markdown. Each match records the pattern name, severity (high/medium), and a 60-character context snippet.
|
|
123
|
+
|
|
124
|
+
11. **Truncation.** If `--max-words` is set, the body is truncated after extraction but before output wrapping.
|
|
125
|
+
|
|
126
|
+
12. **Salt wrapping.** The body gets wrapped in session-salted tags for defense-in-depth.
|
|
127
|
+
|
|
128
|
+
13. **Output formatting.** CLI produces five plaintext sections (status header, body, metadata, links, injection details). MCP server returns a structured JSON dict with the same data.
|
|
129
|
+
|
|
130
|
+
## Output
|
|
131
|
+
|
|
132
|
+
### CLI
|
|
133
|
+
|
|
134
|
+
Five sections, printed to stdout:
|
|
135
|
+
|
|
136
|
+
- **Status header:** URL, fetch timestamp, risk flag (`OK` or `INJECTION WARNING`), sanitization tally, edge case info if detected
|
|
137
|
+
- **Body:** clean markdown wrapped in `<fetch-content-{salt}>` tags
|
|
138
|
+
- **Metadata:** JSON block with title, author, date, description, canonical URL, image
|
|
139
|
+
- **External links:** domain list or full URL breakdown by domain
|
|
140
|
+
- **Injection details:** pattern name, severity, and context snippet for each match (only present when patterns detected)
|
|
141
|
+
|
|
142
|
+
### MCP Server
|
|
143
|
+
|
|
144
|
+
Returns a structured dict:
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
url, fetched_at, body, content_type, metadata, links, links_mode,
|
|
148
|
+
risk_level, injection_matches, edge_cases, sanitization,
|
|
149
|
+
llms_txt_available, llms_txt_replaced, js_rendered, js_hint,
|
|
150
|
+
retried, truncated_at
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
When `--strict` is set and the risk level is `HIGH`, the CLI exits with code 2 and the MCP server raises an error response. The full result is still available in both cases.
|
|
154
|
+
|
|
155
|
+
## Exit Codes
|
|
156
|
+
|
|
157
|
+
| Code | Meaning |
|
|
158
|
+
|---|---|
|
|
159
|
+
| 0 | Success |
|
|
160
|
+
| 1 | Fetch error (network failure, empty response, binary content) |
|
|
161
|
+
| 2 | High-risk injection detected (`--strict` only) |
|
|
162
|
+
|
|
163
|
+
## Architecture
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
fetch_guard/
|
|
167
|
+
├── pipeline.py # Core orchestration — 13-step sequence, shared by CLI and server
|
|
168
|
+
├── cli.py # CLI entry point — arg parsing, pipeline call, output
|
|
169
|
+
├── server.py # MCP server — FastMCP wrapper over the same pipeline
|
|
170
|
+
│
|
|
171
|
+
├── http/ # HTTP fetching layer
|
|
172
|
+
│ ├── client.py # Static HTTP fetch via requests
|
|
173
|
+
│ ├── playwright.py # JS rendering via Playwright (optional)
|
|
174
|
+
│ └── llms_txt.py # /llms.txt preflight check
|
|
175
|
+
│
|
|
176
|
+
├── extraction/ # Content extraction and edge detection
|
|
177
|
+
│ ├── content.py # trafilatura wrapper — HTML to markdown
|
|
178
|
+
│ ├── content_type.py # Non-HTML routing — JSON, XML/RSS, CSV, plain text
|
|
179
|
+
│ ├── edges.py # Bot block, paywall, login wall classification
|
|
180
|
+
│ ├── links.py # External link extraction (domain list or full URLs)
|
|
181
|
+
│ └── metadata.py # JSON-LD, Open Graph, meta tag extraction
|
|
182
|
+
│
|
|
183
|
+
├── security/ # Injection defense
|
|
184
|
+
│ ├── guard.py # Salt generation, content wrapping, pattern scanning
|
|
185
|
+
│ ├── patterns.py # 15 compiled regex patterns — single source of truth
|
|
186
|
+
│ └── sanitizer.py # Hidden element and non-printing character removal
|
|
187
|
+
│
|
|
188
|
+
└── output/ # Formatting
|
|
189
|
+
└── formatter.py # CLI output assembly
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Each module is a single-responsibility unit with a public function as its interface. `pipeline.py` is the shared core: both `cli.py` and `server.py` call `pipeline.run()` and handle the result in their own way.
|
|
193
|
+
|
|
194
|
+
## Development
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
# Run tests (217 unit tests, all mocked — no network calls)
|
|
198
|
+
pytest
|
|
199
|
+
|
|
200
|
+
# Run live integration tests (hits real URLs)
|
|
201
|
+
pytest -m live
|
|
202
|
+
|
|
203
|
+
# Lint
|
|
204
|
+
ruff check fetch_guard/ tests/
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
CI runs on push and PR to `main` via GitHub Actions, testing against Python 3.10, 3.12, and 3.13.
|
|
208
|
+
|
|
209
|
+
## Acknowledgements
|
|
210
|
+
|
|
211
|
+
Developed with [Claude Code](https://claude.ai/code).
|
|
212
|
+
|
|
213
|
+
## License
|
|
214
|
+
|
|
215
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Fetch Guard — LLM-ready web fetching with prompt injection defense."""
|