fetch-guard 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. fetch_guard-0.9.0/.github/workflows/ci.yml +31 -0
  2. fetch_guard-0.9.0/.github/workflows/publish.yml +28 -0
  3. fetch_guard-0.9.0/.gitignore +44 -0
  4. fetch_guard-0.9.0/LICENSE +21 -0
  5. fetch_guard-0.9.0/PKG-INFO +247 -0
  6. fetch_guard-0.9.0/README.md +215 -0
  7. fetch_guard-0.9.0/fetch_guard/__init__.py +1 -0
  8. fetch_guard-0.9.0/fetch_guard/cli.py +118 -0
  9. fetch_guard-0.9.0/fetch_guard/extraction/__init__.py +20 -0
  10. fetch_guard-0.9.0/fetch_guard/extraction/content.py +18 -0
  11. fetch_guard-0.9.0/fetch_guard/extraction/content_type.py +263 -0
  12. fetch_guard-0.9.0/fetch_guard/extraction/edges.py +141 -0
  13. fetch_guard-0.9.0/fetch_guard/extraction/links.py +54 -0
  14. fetch_guard-0.9.0/fetch_guard/extraction/metadata.py +133 -0
  15. fetch_guard-0.9.0/fetch_guard/http/__init__.py +15 -0
  16. fetch_guard-0.9.0/fetch_guard/http/client.py +60 -0
  17. fetch_guard-0.9.0/fetch_guard/http/llms_txt.py +48 -0
  18. fetch_guard-0.9.0/fetch_guard/http/playwright.py +58 -0
  19. fetch_guard-0.9.0/fetch_guard/output/__init__.py +5 -0
  20. fetch_guard-0.9.0/fetch_guard/output/formatter.py +100 -0
  21. fetch_guard-0.9.0/fetch_guard/pipeline.py +248 -0
  22. fetch_guard-0.9.0/fetch_guard/security/__init__.py +21 -0
  23. fetch_guard-0.9.0/fetch_guard/security/guard.py +55 -0
  24. fetch_guard-0.9.0/fetch_guard/security/patterns.py +100 -0
  25. fetch_guard-0.9.0/fetch_guard/security/sanitizer.py +113 -0
  26. fetch_guard-0.9.0/fetch_guard/server.py +118 -0
  27. fetch_guard-0.9.0/pyproject.toml +84 -0
  28. fetch_guard-0.9.0/resources/fetch-guard/SKILL.md +84 -0
  29. fetch_guard-0.9.0/resources/fetch-guard.md +84 -0
  30. fetch_guard-0.9.0/tests/__init__.py +0 -0
  31. fetch_guard-0.9.0/tests/test_client.py +183 -0
  32. fetch_guard-0.9.0/tests/test_content.py +41 -0
  33. fetch_guard-0.9.0/tests/test_content_type.py +228 -0
  34. fetch_guard-0.9.0/tests/test_edges.py +147 -0
  35. fetch_guard-0.9.0/tests/test_formatter.py +259 -0
  36. fetch_guard-0.9.0/tests/test_guard.py +104 -0
  37. fetch_guard-0.9.0/tests/test_links.py +91 -0
  38. fetch_guard-0.9.0/tests/test_live.py +247 -0
  39. fetch_guard-0.9.0/tests/test_llms_txt.py +102 -0
  40. fetch_guard-0.9.0/tests/test_metadata.py +168 -0
  41. fetch_guard-0.9.0/tests/test_patterns.py +29 -0
  42. fetch_guard-0.9.0/tests/test_pipeline.py +642 -0
  43. fetch_guard-0.9.0/tests/test_playwright.py +94 -0
  44. fetch_guard-0.9.0/tests/test_sanitizer.py +102 -0
  45. fetch_guard-0.9.0/tests/test_server.py +214 -0
@@ -0,0 +1,31 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v5
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v6
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: pip install -e ".[dev]"
26
+
27
+ - name: Lint
28
+ run: ruff check fetch_guard/ tests/
29
+
30
+ - name: Test
31
+ run: pytest -q
@@ -0,0 +1,28 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment: pypi
11
+ permissions:
12
+ id-token: write
13
+ steps:
14
+ - uses: actions/checkout@v5
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v6
18
+ with:
19
+ python-version: "3.13"
20
+
21
+ - name: Install uv
22
+ run: pip install uv
23
+
24
+ - name: Build
25
+ run: uv build
26
+
27
+ - name: Publish to PyPI
28
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,44 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+ *.whl
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # Testing
17
+ .pytest_cache/
18
+ .coverage
19
+ htmlcov/
20
+ *.cover
21
+
22
+ # Ruff
23
+ .ruff_cache/
24
+
25
+ # IDE
26
+ .vscode/
27
+ .idea/
28
+ *.swp
29
+ *.swo
30
+ *~
31
+
32
+ # OS
33
+ Thumbs.db
34
+ Desktop.ini
35
+ .DS_Store
36
+
37
+ # MCP local config
38
+ .mcp.json
39
+
40
+ # Plans / scope docs
41
+ plans/
42
+
43
+ # AI workspace
44
+ CLAUDE.md
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Owen Sterling
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.4
2
+ Name: fetch-guard
3
+ Version: 0.9.0
4
+ Summary: Fetch URLs and return clean, LLM-ready markdown with metadata and prompt injection defense
5
+ Project-URL: Homepage, https://github.com/Erodenn/fetch-guard
6
+ Project-URL: Repository, https://github.com/Erodenn/fetch-guard
7
+ Project-URL: Issues, https://github.com/Erodenn/fetch-guard/issues
8
+ Author-email: Owen Sterling <owen@erodenn.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Internet :: WWW/HTTP
19
+ Classifier: Topic :: Text Processing :: Markup :: HTML
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: beautifulsoup4
22
+ Requires-Dist: extruct
23
+ Requires-Dist: mcp
24
+ Requires-Dist: requests
25
+ Requires-Dist: trafilatura
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest; extra == 'dev'
28
+ Requires-Dist: ruff; extra == 'dev'
29
+ Provides-Extra: js
30
+ Requires-Dist: playwright; extra == 'js'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # Fetch Guard
34
+
35
+ [![License: MIT](https://badgen.net/github/license/Erodenn/fetch-guard)](https://github.com/Erodenn/fetch-guard/blob/main/LICENSE)
36
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/)
37
+
38
+ Fetch Guard is an [MCP](https://modelcontextprotocol.io/) server and CLI tool that fetches URLs and returns clean, LLM-ready markdown. It is neither a simple HTTP client nor a browser automation wrapper; it is a purpose-built extraction pipeline that sanitizes HTML, pulls structured metadata, detects prompt injection attempts, and handles the edge cases that break naive fetchers: bot blocks, paywalls, login walls, non-HTML content types, and pages that require JavaScript to render.
39
+
40
+ The core problem is straightforward: LLMs need web content, but raw HTML is noisy and potentially hostile. Fetched pages can contain hidden text, invisible Unicode, off-screen elements, and outright prompt injection attempts embedded in the content itself. This pipeline strips all of that before the content reaches the model.
41
+
42
+ Three layers handle the injection defense specifically:
43
+
44
+ 1. **Pre-extraction sanitization** removes hidden elements (`display:none`, `visibility:hidden`, `opacity:0`), off-screen positioned content, `aria-hidden` elements, `<noscript>` tags, and 18 categories of non-printing Unicode characters. This happens before content extraction, so trafilatura never sees the attack vectors.
45
+ 2. **Pattern scanning** runs 15 compiled regex patterns against the extracted text, covering system prompt overrides, ignore-previous instructions, role injection, fake conversation tags, hidden instruction markers (`[INST]`, `<<SYS>>`), and suspicious base64 blocks.
46
+ 3. **Session-salted output wrapping** generates a random 8-character hex salt per invocation and wraps the body in `<fetch-content-{salt}>` tags. Since the salt is unpredictable, injected content cannot spoof the wrapper boundaries.
47
+
48
+ ## Quick Start
49
+
50
+ ### Prerequisites
51
+
52
+ - Python 3.10+
53
+ - pip
54
+
55
+ ### Install
56
+
57
+ ```bash
58
+ pip install fetch-guard
59
+ ```
60
+
61
+ For JavaScript rendering (optional):
62
+
63
+ ```bash
64
+ pip install 'fetch-guard[js]' && playwright install chromium
65
+ ```
66
+
67
+ ### Configure Your MCP Client
68
+
69
+ Add the following to your MCP client config. Works with Claude Code, Claude Desktop, Cursor, or any MCP-compatible client.
70
+
71
+ **Via uvx (recommended):**
72
+
73
+ ```json
74
+ {
75
+ "mcpServers": {
76
+ "fetch-guard": {
77
+ "command": "uvx",
78
+ "args": ["fetch-guard"]
79
+ }
80
+ }
81
+ }
82
+ ```
83
+
84
+ **Via pip install:**
85
+
86
+ ```json
87
+ {
88
+ "mcpServers": {
89
+ "fetch-guard": {
90
+ "command": "fetch-guard"
91
+ }
92
+ }
93
+ }
94
+ ```
95
+
96
+ **From source:**
97
+
98
+ ```json
99
+ {
100
+ "mcpServers": {
101
+ "fetch-guard": {
102
+ "command": "python",
103
+ "args": ["-m", "fetch_guard.server"]
104
+ }
105
+ }
106
+ }
107
+ ```
108
+
109
+ ### Verify
110
+
111
+ Ask your AI assistant to fetch any URL. If it returns structured content with the page body, metadata, and a risk level, you're connected.
112
+
113
+ ### CLI
114
+
115
+ ```bash
116
+ fetch-guard-cli <url> [options]
117
+ # or: python -m fetch_guard.cli <url> [options]
118
+ ```
119
+
120
+ | Flag | Default | Description |
121
+ |---|---|---|
122
+ | `--timeout N` | 180 | Request timeout in seconds |
123
+ | `--max-words N` | none | Word cap on extracted body content |
124
+ | `--js` | off | Use Playwright for JS-rendered pages |
125
+ | `--strict` | off | Exit code 2 on high-risk injection |
126
+ | `--links MODE` | `domains` | `domains` for unique external domains, `full` for all URLs with anchor text |
127
+
128
+ ### Claude Code Skill
129
+
130
+ Copy `resources/fetch-guard/` to `.claude/skills/fetch-guard/` in your project, or use the standalone command file `resources/fetch-guard.md` as a Claude Code command.
131
+
132
+ ## What It Does
133
+
134
+ The pipeline runs a 13-step sequence from URL to structured output:
135
+
136
+ 1. **`/llms.txt` preflight.** Checks the domain root for `/llms.txt` before the full fetch. If the requested URL is a domain root and `/llms.txt` exists, that file's content is used as the body instead of running the normal HTML pipeline. This respects the emerging convention for LLM-friendly site summaries.
137
+
138
+ 2. **Fetch.** Static HTTP request via `requests`, or Playwright-driven browser rendering if `--js` is set. No automatic fallback between the two: `--js` is explicit opt-in.
139
+
140
+ 3. **Edge detection.** Classifies the response for bot blocks (Cloudflare challenges, 403/429/503 with block signatures, LinkedIn's custom 999), paywalls (subscription prompts, premium overlays), and login walls (sign-in redirects, members-only patterns).
141
+
142
+ 4. **Automatic retry.** Bot blocks trigger one retry with a full Chrome User-Agent string before reporting. Paywalls and login walls are reported immediately with no retry.
143
+
144
+ 5. **Content-type routing.** Non-HTML responses get a fast path: JSON is rendered as a fenced code block, RSS/Atom feeds are parsed into structured summaries, CSV becomes a markdown table (capped at 2,000 rows), and plain text passes through directly. Binary content types are rejected.
145
+
146
+ 6. **HTML sanitization.** Strips hidden elements, off-screen positioned content, `aria-hidden` nodes, `<noscript>` tags, and non-printing Unicode. Returns a tally of everything removed.
147
+
148
+ 7. **Content extraction.** trafilatura converts sanitized HTML to markdown with link preservation.
149
+
150
+ 8. **Metadata extraction.** Pulls title, author, date, description, canonical URL, and image from three sources in priority order: JSON-LD, Open Graph, then meta tags.
151
+
152
+ 9. **Link extraction.** Two modes: `domains` returns a sorted list of unique external domains, `full` returns all external URLs grouped by domain with anchor text.
153
+
154
+ 10. **Injection scanning.** Runs all 15 patterns against the extracted markdown. Each match records the pattern name, severity (high/medium), and a 60-character context snippet.
155
+
156
+ 11. **Truncation.** If `--max-words` is set, the body is truncated after extraction but before output wrapping.
157
+
158
+ 12. **Salt wrapping.** The body gets wrapped in session-salted tags for defense-in-depth.
159
+
160
+ 13. **Output formatting.** CLI produces five plaintext sections (status header, body, metadata, links, injection details). MCP server returns a structured JSON dict with the same data.
161
+
162
+ ## Output
163
+
164
+ ### CLI
165
+
166
+ Five sections, printed to stdout:
167
+
168
+ - **Status header:** URL, fetch timestamp, risk flag (`OK` or `INJECTION WARNING`), sanitization tally, edge case info if detected
169
+ - **Body:** clean markdown wrapped in `<fetch-content-{salt}>` tags
170
+ - **Metadata:** JSON block with title, author, date, description, canonical URL, image
171
+ - **External links:** domain list or full URL breakdown by domain
172
+ - **Injection details:** pattern name, severity, and context snippet for each match (only present when patterns detected)
173
+
174
+ ### MCP Server
175
+
176
+ Returns a structured dict:
177
+
178
+ ```
179
+ url, fetched_at, body, content_type, metadata, links, links_mode,
180
+ risk_level, injection_matches, edge_cases, sanitization,
181
+ llms_txt_available, llms_txt_replaced, js_rendered, js_hint,
182
+ retried, truncated_at
183
+ ```
184
+
185
+ When `--strict` is set and the risk level is `HIGH`, the CLI exits with code 2 and the MCP server raises an error response. The full result is still available in both cases.
186
+
187
+ ## Exit Codes
188
+
189
+ | Code | Meaning |
190
+ |---|---|
191
+ | 0 | Success |
192
+ | 1 | Fetch error (network failure, empty response, binary content) |
193
+ | 2 | High-risk injection detected (`--strict` only) |
194
+
195
+ ## Architecture
196
+
197
+ ```
198
+ fetch_guard/
199
+ ├── pipeline.py # Core orchestration — 13-step sequence, shared by CLI and server
200
+ ├── cli.py # CLI entry point — arg parsing, pipeline call, output
201
+ ├── server.py # MCP server — FastMCP wrapper over the same pipeline
202
+
203
+ ├── http/ # HTTP fetching layer
204
+ │ ├── client.py # Static HTTP fetch via requests
205
+ │ ├── playwright.py # JS rendering via Playwright (optional)
206
+ │ └── llms_txt.py # /llms.txt preflight check
207
+
208
+ ├── extraction/ # Content extraction and edge detection
209
+ │ ├── content.py # trafilatura wrapper — HTML to markdown
210
+ │ ├── content_type.py # Non-HTML routing — JSON, XML/RSS, CSV, plain text
211
+ │ ├── edges.py # Bot block, paywall, login wall classification
212
+ │ ├── links.py # External link extraction (domain list or full URLs)
213
+ │ └── metadata.py # JSON-LD, Open Graph, meta tag extraction
214
+
215
+ ├── security/ # Injection defense
216
+ │ ├── guard.py # Salt generation, content wrapping, pattern scanning
217
+ │ ├── patterns.py # 15 compiled regex patterns — single source of truth
218
+ │ └── sanitizer.py # Hidden element and non-printing character removal
219
+
220
+ └── output/ # Formatting
221
+ └── formatter.py # CLI output assembly
222
+ ```
223
+
224
+ Each module is a single-responsibility unit with a public function as its interface. `pipeline.py` is the shared core: both `cli.py` and `server.py` call `pipeline.run()` and handle the result in their own way.
225
+
226
+ ## Development
227
+
228
+ ```bash
229
+ # Run tests (217 unit tests, all mocked — no network calls)
230
+ pytest
231
+
232
+ # Run live integration tests (hits real URLs)
233
+ pytest -m live
234
+
235
+ # Lint
236
+ ruff check fetch_guard/ tests/
237
+ ```
238
+
239
+ CI runs on push and PR to `main` via GitHub Actions, testing against Python 3.10, 3.12, and 3.13.
240
+
241
+ ## Acknowledgements
242
+
243
+ Developed with [Claude Code](https://claude.ai/code).
244
+
245
+ ## License
246
+
247
+ [MIT](https://github.com/Erodenn/fetch-guard/blob/main/LICENSE)
@@ -0,0 +1,215 @@
1
+ # Fetch Guard
2
+
3
+ [![License: MIT](https://badgen.net/github/license/Erodenn/fetch-guard)](https://github.com/Erodenn/fetch-guard/blob/main/LICENSE)
4
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/)
5
+
6
+ Fetch Guard is an [MCP](https://modelcontextprotocol.io/) server and CLI tool that fetches URLs and returns clean, LLM-ready markdown. It is neither a simple HTTP client nor a browser automation wrapper; it is a purpose-built extraction pipeline that sanitizes HTML, pulls structured metadata, detects prompt injection attempts, and handles the edge cases that break naive fetchers: bot blocks, paywalls, login walls, non-HTML content types, and pages that require JavaScript to render.
7
+
8
+ The core problem is straightforward: LLMs need web content, but raw HTML is noisy and potentially hostile. Fetched pages can contain hidden text, invisible Unicode, off-screen elements, and outright prompt injection attempts embedded in the content itself. This pipeline strips all of that before the content reaches the model.
9
+
10
+ Three layers handle the injection defense specifically:
11
+
12
+ 1. **Pre-extraction sanitization** removes hidden elements (`display:none`, `visibility:hidden`, `opacity:0`), off-screen positioned content, `aria-hidden` elements, `<noscript>` tags, and 18 categories of non-printing Unicode characters. This happens before content extraction, so trafilatura never sees the attack vectors.
13
+ 2. **Pattern scanning** runs 15 compiled regex patterns against the extracted text, covering system prompt overrides, ignore-previous instructions, role injection, fake conversation tags, hidden instruction markers (`[INST]`, `<<SYS>>`), and suspicious base64 blocks.
14
+ 3. **Session-salted output wrapping** generates a random 8-character hex salt per invocation and wraps the body in `<fetch-content-{salt}>` tags. Since the salt is unpredictable, injected content cannot spoof the wrapper boundaries.
15
+
16
+ ## Quick Start
17
+
18
+ ### Prerequisites
19
+
20
+ - Python 3.10+
21
+ - pip
22
+
23
+ ### Install
24
+
25
+ ```bash
26
+ pip install fetch-guard
27
+ ```
28
+
29
+ For JavaScript rendering (optional):
30
+
31
+ ```bash
32
+ pip install 'fetch-guard[js]' && playwright install chromium
33
+ ```
34
+
35
+ ### Configure Your MCP Client
36
+
37
+ Add the following to your MCP client config. Works with Claude Code, Claude Desktop, Cursor, or any MCP-compatible client.
38
+
39
+ **Via uvx (recommended):**
40
+
41
+ ```json
42
+ {
43
+ "mcpServers": {
44
+ "fetch-guard": {
45
+ "command": "uvx",
46
+ "args": ["fetch-guard"]
47
+ }
48
+ }
49
+ }
50
+ ```
51
+
52
+ **Via pip install:**
53
+
54
+ ```json
55
+ {
56
+ "mcpServers": {
57
+ "fetch-guard": {
58
+ "command": "fetch-guard"
59
+ }
60
+ }
61
+ }
62
+ ```
63
+
64
+ **From source:**
65
+
66
+ ```json
67
+ {
68
+ "mcpServers": {
69
+ "fetch-guard": {
70
+ "command": "python",
71
+ "args": ["-m", "fetch_guard.server"]
72
+ }
73
+ }
74
+ }
75
+ ```
76
+
77
+ ### Verify
78
+
79
+ Ask your AI assistant to fetch any URL. If it returns structured content with the page body, metadata, and a risk level, you're connected.
80
+
81
+ ### CLI
82
+
83
+ ```bash
84
+ fetch-guard-cli <url> [options]
85
+ # or: python -m fetch_guard.cli <url> [options]
86
+ ```
87
+
88
+ | Flag | Default | Description |
89
+ |---|---|---|
90
+ | `--timeout N` | 180 | Request timeout in seconds |
91
+ | `--max-words N` | none | Word cap on extracted body content |
92
+ | `--js` | off | Use Playwright for JS-rendered pages |
93
+ | `--strict` | off | Exit code 2 on high-risk injection |
94
+ | `--links MODE` | `domains` | `domains` for unique external domains, `full` for all URLs with anchor text |
95
+
96
+ ### Claude Code Skill
97
+
98
+ Copy `resources/fetch-guard/` to `.claude/skills/fetch-guard/` in your project, or use the standalone command file `resources/fetch-guard.md` as a Claude Code command.
99
+
100
+ ## What It Does
101
+
102
+ The pipeline runs a 13-step sequence from URL to structured output:
103
+
104
+ 1. **`/llms.txt` preflight.** Checks the domain root for `/llms.txt` before the full fetch. If the requested URL is a domain root and `/llms.txt` exists, that file's content is used as the body instead of running the normal HTML pipeline. This respects the emerging convention for LLM-friendly site summaries.
105
+
106
+ 2. **Fetch.** Static HTTP request via `requests`, or Playwright-driven browser rendering if `--js` is set. No automatic fallback between the two: `--js` is explicit opt-in.
107
+
108
+ 3. **Edge detection.** Classifies the response for bot blocks (Cloudflare challenges, 403/429/503 with block signatures, LinkedIn's custom 999), paywalls (subscription prompts, premium overlays), and login walls (sign-in redirects, members-only patterns).
109
+
110
+ 4. **Automatic retry.** Bot blocks trigger one retry with a full Chrome User-Agent string before reporting. Paywalls and login walls are reported immediately with no retry.
111
+
112
+ 5. **Content-type routing.** Non-HTML responses get a fast path: JSON is rendered as a fenced code block, RSS/Atom feeds are parsed into structured summaries, CSV becomes a markdown table (capped at 2,000 rows), and plain text passes through directly. Binary content types are rejected.
113
+
114
+ 6. **HTML sanitization.** Strips hidden elements, off-screen positioned content, `aria-hidden` nodes, `<noscript>` tags, and non-printing Unicode. Returns a tally of everything removed.
115
+
116
+ 7. **Content extraction.** trafilatura converts sanitized HTML to markdown with link preservation.
117
+
118
+ 8. **Metadata extraction.** Pulls title, author, date, description, canonical URL, and image from three sources in priority order: JSON-LD, Open Graph, then meta tags.
119
+
120
+ 9. **Link extraction.** Two modes: `domains` returns a sorted list of unique external domains, `full` returns all external URLs grouped by domain with anchor text.
121
+
122
+ 10. **Injection scanning.** Runs all 15 patterns against the extracted markdown. Each match records the pattern name, severity (high/medium), and a 60-character context snippet.
123
+
124
+ 11. **Truncation.** If `--max-words` is set, the body is truncated after extraction but before output wrapping.
125
+
126
+ 12. **Salt wrapping.** The body gets wrapped in session-salted tags for defense-in-depth.
127
+
128
+ 13. **Output formatting.** CLI produces five plaintext sections (status header, body, metadata, links, injection details). MCP server returns a structured JSON dict with the same data.
129
+
130
+ ## Output
131
+
132
+ ### CLI
133
+
134
+ Five sections, printed to stdout:
135
+
136
+ - **Status header:** URL, fetch timestamp, risk flag (`OK` or `INJECTION WARNING`), sanitization tally, edge case info if detected
137
+ - **Body:** clean markdown wrapped in `<fetch-content-{salt}>` tags
138
+ - **Metadata:** JSON block with title, author, date, description, canonical URL, image
139
+ - **External links:** domain list or full URL breakdown by domain
140
+ - **Injection details:** pattern name, severity, and context snippet for each match (only present when patterns detected)
141
+
142
+ ### MCP Server
143
+
144
+ Returns a structured dict:
145
+
146
+ ```
147
+ url, fetched_at, body, content_type, metadata, links, links_mode,
148
+ risk_level, injection_matches, edge_cases, sanitization,
149
+ llms_txt_available, llms_txt_replaced, js_rendered, js_hint,
150
+ retried, truncated_at
151
+ ```
152
+
153
+ When `--strict` is set and the risk level is `HIGH`, the CLI exits with code 2 and the MCP server raises an error response. The full result is still available in both cases.
154
+
155
+ ## Exit Codes
156
+
157
+ | Code | Meaning |
158
+ |---|---|
159
+ | 0 | Success |
160
+ | 1 | Fetch error (network failure, empty response, binary content) |
161
+ | 2 | High-risk injection detected (`--strict` only) |
162
+
163
+ ## Architecture
164
+
165
+ ```
166
+ fetch_guard/
167
+ ├── pipeline.py # Core orchestration — 13-step sequence, shared by CLI and server
168
+ ├── cli.py # CLI entry point — arg parsing, pipeline call, output
169
+ ├── server.py # MCP server — FastMCP wrapper over the same pipeline
170
+
171
+ ├── http/ # HTTP fetching layer
172
+ │ ├── client.py # Static HTTP fetch via requests
173
+ │ ├── playwright.py # JS rendering via Playwright (optional)
174
+ │ └── llms_txt.py # /llms.txt preflight check
175
+
176
+ ├── extraction/ # Content extraction and edge detection
177
+ │ ├── content.py # trafilatura wrapper — HTML to markdown
178
+ │ ├── content_type.py # Non-HTML routing — JSON, XML/RSS, CSV, plain text
179
+ │ ├── edges.py # Bot block, paywall, login wall classification
180
+ │ ├── links.py # External link extraction (domain list or full URLs)
181
+ │ └── metadata.py # JSON-LD, Open Graph, meta tag extraction
182
+
183
+ ├── security/ # Injection defense
184
+ │ ├── guard.py # Salt generation, content wrapping, pattern scanning
185
+ │ ├── patterns.py # 15 compiled regex patterns — single source of truth
186
+ │ └── sanitizer.py # Hidden element and non-printing character removal
187
+
188
+ └── output/ # Formatting
189
+ └── formatter.py # CLI output assembly
190
+ ```
191
+
192
+ Each module is a single-responsibility unit with a public function as its interface. `pipeline.py` is the shared core: both `cli.py` and `server.py` call `pipeline.run()` and handle the result in their own way.
193
+
194
+ ## Development
195
+
196
+ ```bash
197
+ # Run tests (217 unit tests, all mocked — no network calls)
198
+ pytest
199
+
200
+ # Run live integration tests (hits real URLs)
201
+ pytest -m live
202
+
203
+ # Lint
204
+ ruff check fetch_guard/ tests/
205
+ ```
206
+
207
+ CI runs on push and PR to `main` via GitHub Actions, testing against Python 3.10, 3.12, and 3.13.
208
+
209
+ ## Acknowledgements
210
+
211
+ Developed with [Claude Code](https://claude.ai/code).
212
+
213
+ ## License
214
+
215
+ [MIT](https://github.com/Erodenn/fetch-guard/blob/main/LICENSE)
@@ -0,0 +1 @@
1
+ """Fetch Guard — LLM-ready web fetching with prompt injection defense."""