@j0hanz/fetch-url-mcp 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -110
- package/dist/AGENTS.md +119 -84
- package/dist/instructions.md +80 -27
- package/package.json +91 -91
package/README.md
CHANGED
|
@@ -1,38 +1,30 @@
|
|
|
1
|
-
<!-- markdownlint-disable MD033 -->
|
|
2
|
-
|
|
3
1
|
# Fetch URL MCP Server
|
|
4
2
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
[](https://www.npmjs.com/package/@j0hanz/fetch-url-mcp) [](https://opensource.org/licenses/MIT) [](https://nodejs.org) [](https://www.typescriptlang.org) [](https://modelcontextprotocol.io)
|
|
3
|
+
[npm package](https://www.npmjs.com/package/@j0hanz/fetch-url-mcp) [License: MIT](https://opensource.org/licenses/MIT) [Node.js](https://nodejs.org) [TypeScript](https://www.typescriptlang.org) [Model Context Protocol](https://modelcontextprotocol.io)
|
|
8
4
|
|
|
9
5
|
[Install in VS Code](https://insiders.vscode.dev/redirect?url=vscode%3Amcp%2Finstall%3F%7B%22name%22%3A%22fetch-url-mcp%22%2C%22command%22%3A%22npx%22%2C%22args%22%3A%5B%22-y%22%2C%22%40j0hanz%2Ffetch-url-mcp%40latest%22%2C%22--stdio%22%5D%7D) [Install in VS Code Insiders](https://insiders.vscode.dev/redirect?url=vscode-insiders%3Amcp%2Finstall%3F%7B%22name%22%3A%22fetch-url-mcp%22%2C%22command%22%3A%22npx%22%2C%22args%22%3A%5B%22-y%22%2C%22%40j0hanz%2Ffetch-url-mcp%40latest%22%2C%22--stdio%22%5D%7D) [Install in Cursor](https://cursor.com/install-mcp?name=fetch-url-mcp&config=eyJjb21tYW5kIjoibnB4IiwiYXJncyI6WyIteSIsIkBqMGhhbnovZmV0Y2gtdXJsLW1jcEBsYXRlc3QiLCItLXN0ZGlvIl19)
|
|
10
6
|
|
|
11
|
-
Fetch
|
|
7
|
+
Fetch public web pages and convert them into clean, AI-readable Markdown.
|
|
12
8
|
|
|
13
9
|
## Overview
|
|
14
10
|
|
|
15
|
-
Fetch URL is a [Model Context Protocol](https://modelcontextprotocol.io) (MCP) server that fetches public web pages, extracts meaningful content using Mozilla's Readability algorithm, and converts the result into clean Markdown optimized for LLM context windows. It handles noise removal, caching, SSRF protection, async task execution, and supports both stdio and Streamable HTTP transports.
|
|
11
|
+
Fetch URL is a [Model Context Protocol](https://modelcontextprotocol.io) (MCP) server that fetches public web pages, extracts meaningful content using Mozilla's Readability algorithm, and converts the result into clean Markdown optimized for LLM context windows. It handles noise removal, caching, SSRF protection, async task execution, and supports both **stdio** and **Streamable HTTP** transports.
|
|
16
12
|
|
|
17
|
-
|
|
13
|
+
> [!NOTE]
|
|
14
|
+
> Content extraction quality varies depending on the HTML structure and complexity of the source page. Fetch URL works best with standard article and documentation layouts. Pages relying on client-side JavaScript rendering may yield incomplete results.
|
|
18
15
|
|
|
19
|
-
|
|
20
|
-
- Raw content URL rewriting for GitHub, GitLab, Bitbucket, and Gist.
|
|
21
|
-
- In-memory LRU cache for faster repeat fetches.
|
|
22
|
-
- Stdio or Streamable HTTP transport with session management.
|
|
23
|
-
- SSRF protections: blocked private IP ranges and internal hostnames.
|
|
16
|
+
## Key Features
|
|
24
17
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
> approaches.
|
|
18
|
+
- **HTML to Markdown** — Content extraction via Mozilla Readability + node-html-markdown
|
|
19
|
+
- **Noise removal** — Strips navigation, ads, cookie banners, and other non-content elements
|
|
20
|
+
- **In-memory LRU cache** — Faster repeat fetches with configurable TTL (24 h default)
|
|
21
|
+
- **Raw URL rewriting** — Auto-converts GitHub, GitLab, Bitbucket, and Gist URLs to raw content endpoints
|
|
30
22
|
|
|
31
23
|
## Tech Stack
|
|
32
24
|
|
|
33
25
|
| Component | Technology |
|
|
34
26
|
| ------------------- | ----------------------------------- |
|
|
35
|
-
| Runtime | Node.js
|
|
27
|
+
| Runtime | Node.js >= 24 |
|
|
36
28
|
| Language | TypeScript 5.9 |
|
|
37
29
|
| MCP SDK | `@modelcontextprotocol/sdk` ^1.26.0 |
|
|
38
30
|
| Content Extraction | `@mozilla/readability` ^0.6.0 |
|
|
@@ -50,51 +42,31 @@ URL → Validate → DNS Preflight → HTTP Fetch → Decompress
|
|
|
50
42
|
```
|
|
51
43
|
|
|
52
44
|
1. **URL Validation** — Normalize, block private hosts, transform raw-content URLs (GitHub, GitLab, Bitbucket)
|
|
53
|
-
2. **Fetch** — HTTP request
|
|
45
|
+
2. **Fetch** — HTTP request with redirect following, DNS preflight SSRF checks, and size limits (10 MB)
|
|
54
46
|
3. **Transform** — Offloaded to worker threads: parse HTML with `linkedom`, extract with Readability, remove DOM noise, convert to Markdown
|
|
55
|
-
4. **Cleanup** — Multi-pass Markdown normalization (heading promotion, spacing, skip-link removal
|
|
56
|
-
5. **Cache + Respond** — Store result, apply inline content limits, return structured content
|
|
47
|
+
4. **Cleanup** — Multi-pass Markdown normalization (heading promotion, spacing, skip-link removal)
|
|
48
|
+
5. **Cache + Respond** — Store result in LRU cache, apply inline content limits, return structured content
|
|
57
49
|
|
|
58
50
|
## Repository Structure
|
|
59
51
|
|
|
60
52
|
```text
|
|
61
53
|
fetch-url-mcp/
|
|
62
|
-
├── assets/
|
|
63
|
-
|
|
64
|
-
├── scripts/
|
|
65
|
-
│ ├── tasks.mjs
|
|
66
|
-
│ └── validate-fetch.mjs
|
|
54
|
+
├── assets/ # Server icon (logo.svg)
|
|
55
|
+
├── scripts/ # Build & test orchestration
|
|
67
56
|
├── src/
|
|
68
|
-
│ ├── workers/
|
|
69
|
-
│
|
|
70
|
-
│
|
|
71
|
-
│ ├──
|
|
72
|
-
│ ├──
|
|
73
|
-
│ ├──
|
|
74
|
-
│ ├──
|
|
75
|
-
│ ├──
|
|
76
|
-
│ ├──
|
|
77
|
-
│ ├──
|
|
78
|
-
│ ├── http-native.ts
|
|
79
|
-
│
|
|
80
|
-
|
|
81
|
-
│ ├── ip-blocklist.ts
|
|
82
|
-
│ ├── json.ts
|
|
83
|
-
│ ├── language-detection.ts
|
|
84
|
-
│ ├── markdown-cleanup.ts
|
|
85
|
-
│ ├── mcp-validator.ts
|
|
86
|
-
│ ├── mcp.ts
|
|
87
|
-
│ ├── observability.ts
|
|
88
|
-
│ ├── server-tuning.ts
|
|
89
|
-
│ ├── session.ts
|
|
90
|
-
│ ├── tasks.ts
|
|
91
|
-
│ ├── timer-utils.ts
|
|
92
|
-
│ ├── tools.ts
|
|
93
|
-
│ ├── transform-types.ts
|
|
94
|
-
│ ├── transform.ts
|
|
95
|
-
│ └── type-guards.ts
|
|
96
|
-
├── tests/
|
|
97
|
-
│ └── *.test.ts
|
|
57
|
+
│ ├── workers/ # Worker-thread child for HTML transforms
|
|
58
|
+
│ ├── index.ts # CLI entrypoint, transport wiring, shutdown
|
|
59
|
+
│ ├── server.ts # McpServer lifecycle and registration
|
|
60
|
+
│ ├── tools.ts # fetch-url tool definition and pipeline
|
|
61
|
+
│ ├── fetch.ts # URL normalization, SSRF, HTTP fetch
|
|
62
|
+
│ ├── transform.ts # HTML-to-Markdown pipeline, worker pool
|
|
63
|
+
│ ├── config.ts # Env-driven configuration
|
|
64
|
+
│ ├── resources.ts # MCP resource/template registration
|
|
65
|
+
│ ├── prompts.ts # MCP prompt registration (get-help)
|
|
66
|
+
│ ├── mcp.ts # Task execution management
|
|
67
|
+
│ ├── http-native.ts # Streamable HTTP server, auth, sessions
|
|
68
|
+
│ └── instructions.md # Server instructions embedded at runtime
|
|
69
|
+
├── tests/ # Unit/integration tests (Node.js test runner)
|
|
98
70
|
├── package.json
|
|
99
71
|
├── tsconfig.json
|
|
100
72
|
└── AGENTS.md
|
|
@@ -102,7 +74,7 @@ fetch-url-mcp/
|
|
|
102
74
|
|
|
103
75
|
## Requirements
|
|
104
76
|
|
|
105
|
-
- **Node.js**
|
|
77
|
+
- **Node.js** >= 24
|
|
106
78
|
|
|
107
79
|
## Quickstart
|
|
108
80
|
|
|
@@ -150,6 +122,12 @@ npm run build
|
|
|
150
122
|
node dist/index.js --stdio
|
|
151
123
|
```
|
|
152
124
|
|
|
125
|
+
### Docker
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
docker compose up --build
|
|
129
|
+
```
|
|
130
|
+
|
|
153
131
|
## Configuration
|
|
154
132
|
|
|
155
133
|
### Runtime Modes
|
|
@@ -166,18 +144,23 @@ When no `--stdio` flag is passed, the server starts in **HTTP mode** (Streamable
|
|
|
166
144
|
|
|
167
145
|
#### Core Settings
|
|
168
146
|
|
|
169
|
-
| Variable
|
|
170
|
-
|
|
|
171
|
-
| `HOST`
|
|
172
|
-
| `PORT`
|
|
173
|
-
| `LOG_LEVEL`
|
|
174
|
-
| `FETCH_TIMEOUT_MS`
|
|
175
|
-
| `CACHE_ENABLED`
|
|
176
|
-
| `USER_AGENT`
|
|
177
|
-
| `ALLOW_REMOTE`
|
|
178
|
-
| `ALLOWED_HOSTS`
|
|
179
|
-
|
|
180
|
-
|
|
147
|
+
| Variable | Default | Description |
|
|
148
|
+
| ------------------ | ------------------------- | --------------------------------------------------- |
|
|
149
|
+
| `HOST` | `127.0.0.1` | HTTP server bind address |
|
|
150
|
+
| `PORT` | `3000` | HTTP server port (1024–65535) |
|
|
151
|
+
| `LOG_LEVEL` | `info` | Log level: `debug`, `info`, `warn`, `error` |
|
|
152
|
+
| `FETCH_TIMEOUT_MS` | `15000` | HTTP fetch timeout in ms (1000–60000) |
|
|
153
|
+
| `CACHE_ENABLED` | `true` | Enable/disable in-memory content cache |
|
|
154
|
+
| `USER_AGENT` | `fetch-url-mcp/{version}` | Custom User-Agent header |
|
|
155
|
+
| `ALLOW_REMOTE` | `false` | Allow remote connections in HTTP mode |
|
|
156
|
+
| `ALLOWED_HOSTS` | _(empty)_ | Comma-separated host/origin allowlist for HTTP mode |
|
|
157
|
+
|
|
158
|
+
#### Task Management
|
|
159
|
+
|
|
160
|
+
| Variable | Default | Description |
|
|
161
|
+
| --------------------- | ------- | ------------------------------------------------ |
|
|
162
|
+
| `TASKS_MAX_TOTAL` | `5000` | Maximum retained task records across all owners |
|
|
163
|
+
| `TASKS_MAX_PER_OWNER` | `1000` | Maximum retained task records per session/client |
|
|
181
164
|
|
|
182
165
|
#### Authentication (HTTP Mode)
|
|
183
166
|
|
|
@@ -282,6 +265,7 @@ Fetches a webpage and converts it to clean Markdown format optimized for LLM con
|
|
|
282
265
|
**Limitations:**
|
|
283
266
|
|
|
284
267
|
- Does not execute complex client-side JavaScript interactions
|
|
268
|
+
- Inline output may be truncated when `MAX_INLINE_CONTENT_CHARS` is set
|
|
285
269
|
|
|
286
270
|
##### Parameters
|
|
287
271
|
|
|
@@ -297,31 +281,43 @@ Fetches a webpage and converts it to clean Markdown format optimized for LLM con
|
|
|
297
281
|
```json
|
|
298
282
|
{
|
|
299
283
|
"url": "https://example.com",
|
|
284
|
+
"inputUrl": "https://example.com",
|
|
300
285
|
"resolvedUrl": "https://example.com",
|
|
301
286
|
"finalUrl": "https://example.com",
|
|
302
|
-
"inputUrl": "https://example.com",
|
|
303
287
|
"title": "Example Domain",
|
|
288
|
+
"metadata": {
|
|
289
|
+
"title": "Example Domain",
|
|
290
|
+
"description": "...",
|
|
291
|
+
"author": "...",
|
|
292
|
+
"image": "...",
|
|
293
|
+
"favicon": "...",
|
|
294
|
+
"publishedAt": "...",
|
|
295
|
+
"modifiedAt": "..."
|
|
296
|
+
},
|
|
304
297
|
"markdown": "# Example Domain\n\nThis domain is for use in illustrative examples...",
|
|
298
|
+
"fromCache": false,
|
|
299
|
+
"fetchedAt": "2026-02-11T12:00:00.000Z",
|
|
300
|
+
"contentSize": 1234,
|
|
305
301
|
"truncated": false
|
|
306
302
|
}
|
|
307
303
|
```
|
|
308
304
|
|
|
309
|
-
| Field | Type | Description
|
|
310
|
-
| ------------- | ---------- |
|
|
311
|
-
| `url` | `string` | The canonical URL (pre-raw-transform)
|
|
312
|
-
| `inputUrl` | `string
|
|
313
|
-
| `resolvedUrl` | `string
|
|
314
|
-
| `finalUrl` | `string?` | Final response URL after redirects
|
|
315
|
-
| `title` | `string?` | Extracted page title
|
|
316
|
-
| `metadata` | `object?` | Extracted metadata (title, description, author
|
|
317
|
-
| `markdown` | `string?` | Extracted content in Markdown format
|
|
318
|
-
| `fromCache` | `boolean?` | Whether the response was served from cache
|
|
319
|
-
| `fetchedAt` | `string?` | ISO timestamp for fetch/cache retrieval
|
|
320
|
-
| `contentSize` | `number?` | Full markdown size before inline truncation
|
|
321
|
-
| `truncated` | `boolean?` | Whether inline markdown was truncated
|
|
322
|
-
| `error` | `string?` | Error message if the request failed
|
|
323
|
-
| `statusCode` | `number?` | HTTP status code for failed requests
|
|
324
|
-
| `details` | `object?` | Additional error details
|
|
305
|
+
| Field | Type | Description |
|
|
306
|
+
| ------------- | ---------- | ---------------------------------------------------------------------------------------- |
|
|
307
|
+
| `url` | `string` | The canonical URL (pre-raw-transform) |
|
|
308
|
+
| `inputUrl` | `string?` | The original URL provided by the caller |
|
|
309
|
+
| `resolvedUrl` | `string?` | The normalized/transformed URL that was fetched |
|
|
310
|
+
| `finalUrl` | `string?` | Final response URL after redirects |
|
|
311
|
+
| `title` | `string?` | Extracted page title |
|
|
312
|
+
| `metadata` | `object?` | Extracted metadata (title, description, author, image, favicon, publishedAt, modifiedAt) |
|
|
313
|
+
| `markdown` | `string?` | Extracted content in Markdown format |
|
|
314
|
+
| `fromCache` | `boolean?` | Whether the response was served from cache |
|
|
315
|
+
| `fetchedAt` | `string?` | ISO timestamp for fetch/cache retrieval |
|
|
316
|
+
| `contentSize` | `number?` | Full markdown size before inline truncation |
|
|
317
|
+
| `truncated` | `boolean?` | Whether inline markdown was truncated |
|
|
318
|
+
| `error` | `string?` | Error message if the request failed |
|
|
319
|
+
| `statusCode` | `number?` | HTTP status code for failed requests |
|
|
320
|
+
| `details` | `object?` | Additional error details |
|
|
325
321
|
|
|
326
322
|
##### Annotations
|
|
327
323
|
|
|
@@ -334,7 +330,7 @@ Fetches a webpage and converts it to clean Markdown format optimized for LLM con
|
|
|
334
330
|
|
|
335
331
|
##### Async Task Execution
|
|
336
332
|
|
|
337
|
-
The `fetch-url` tool supports optional async task execution. Include a `task` field in the tool call to run the fetch in the background:
|
|
333
|
+
The `fetch-url` tool supports optional async task execution (`execution.taskSupport: "optional"`). Include a `task` field in the tool call to run the fetch in the background:
|
|
338
334
|
|
|
339
335
|
```json
|
|
340
336
|
{
|
|
@@ -351,9 +347,9 @@ Then poll `tasks/get` until the task status is `completed` or `failed`, and retr
|
|
|
351
347
|
|
|
352
348
|
### Prompts
|
|
353
349
|
|
|
354
|
-
| Name | Description
|
|
355
|
-
| ---------- |
|
|
356
|
-
| `get-help` | Returns server usage
|
|
350
|
+
| Name | Description |
|
|
351
|
+
| ---------- | --------------------------------- |
|
|
352
|
+
| `get-help` | Returns server usage instructions |
|
|
357
353
|
|
|
358
354
|
### Resources
|
|
359
355
|
|
|
@@ -362,12 +358,6 @@ Then poll `tasks/get` until the task status is `completed` or `failed`, and retr
|
|
|
362
358
|
| `internal://instructions` | `text/markdown` | Server instructions and usage guidance |
|
|
363
359
|
| `internal://cache/{namespace}/{hash}` | `text/markdown` | Cached markdown entries from prior `fetch-url` calls |
|
|
364
360
|
|
|
365
|
-
### Completions
|
|
366
|
-
|
|
367
|
-
- `completion/complete` supports `internal://cache/{namespace}/{hash}` template variables:
|
|
368
|
-
- `namespace`
|
|
369
|
-
- `hash` (optionally filtered by `context.arguments.namespace`)
|
|
370
|
-
|
|
371
361
|
### Tasks
|
|
372
362
|
|
|
373
363
|
The server declares full MCP task support:
|
|
@@ -479,6 +469,37 @@ Add to your Windsurf MCP configuration:
|
|
|
479
469
|
|
|
480
470
|
</details>
|
|
481
471
|
|
|
472
|
+
<details>
|
|
473
|
+
<summary>Docker</summary>
|
|
474
|
+
|
|
475
|
+
Use the published image from GitHub Container Registry:
|
|
476
|
+
|
|
477
|
+
```json
|
|
478
|
+
{
|
|
479
|
+
"mcpServers": {
|
|
480
|
+
"fetch-url-mcp": {
|
|
481
|
+
"command": "docker",
|
|
482
|
+
"args": [
|
|
483
|
+
"run",
|
|
484
|
+
"-i",
|
|
485
|
+
"--rm",
|
|
486
|
+
"ghcr.io/j0hanz/fetch-url-mcp:latest",
|
|
487
|
+
"--stdio"
|
|
488
|
+
]
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
Or build and run locally:
|
|
495
|
+
|
|
496
|
+
```bash
|
|
497
|
+
docker build -t fetch-url-mcp .
|
|
498
|
+
docker run -i --rm fetch-url-mcp --stdio
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
</details>
|
|
502
|
+
|
|
482
503
|
## Security
|
|
483
504
|
|
|
484
505
|
### SSRF Protection
|
|
@@ -486,7 +507,7 @@ Add to your Windsurf MCP configuration:
|
|
|
486
507
|
Fetch URL blocks requests to private and internal network addresses:
|
|
487
508
|
|
|
488
509
|
- **Blocked hosts**: `localhost`, `127.0.0.0/8`, `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`, `169.254.0.0/16`, `100.64.0.0/10`
|
|
489
|
-
- **Blocked IPv6**: `::1`, `fc00::/7`, `fe80::/10`, IPv4-mapped private addresses
|
|
510
|
+
- **Blocked IPv6**: `::1`, `fc00::/7`, `fe80::/10`, IPv4-mapped private addresses
|
|
490
511
|
- **Cloud metadata**: `169.254.169.254` (AWS), `metadata.google.internal`, `metadata.azure.com`, `100.100.100.200` (Alibaba Cloud)
|
|
491
512
|
|
|
492
513
|
DNS preflight checks run on every redirect hop to prevent DNS rebinding attacks.
|
|
@@ -532,12 +553,12 @@ npm install
|
|
|
532
553
|
## Build and Release
|
|
533
554
|
|
|
534
555
|
```bash
|
|
535
|
-
npm run build
|
|
556
|
+
npm run build # Clean → Compile → Copy Assets → chmod
|
|
536
557
|
npm run prepublishOnly # Lint → Type-Check → Build
|
|
537
|
-
npm publish
|
|
558
|
+
npm publish # Publish to npm
|
|
538
559
|
```
|
|
539
560
|
|
|
540
|
-
|
|
561
|
+
CI/CD is handled by a GitHub Actions workflow (`release.yml`) that runs lint, type-check, test, and build, then publishes to npm with version bumping.
|
|
541
562
|
|
|
542
563
|
## Troubleshooting
|
|
543
564
|
|
|
@@ -549,17 +570,15 @@ Use the built-in inspector to test the server interactively:
|
|
|
549
570
|
npm run inspector
|
|
550
571
|
```
|
|
551
572
|
|
|
552
|
-
This builds the project and launches `@modelcontextprotocol/inspector` pointing to the compiled server.
|
|
553
|
-
|
|
554
573
|
### Common Issues
|
|
555
574
|
|
|
556
|
-
| Issue | Solution
|
|
557
|
-
| ------------------------- |
|
|
558
|
-
| `VALIDATION_ERROR` on URL | URL is blocked (private IP/localhost) or malformed. Do not retry.
|
|
559
|
-
| `queue_full` error |
|
|
560
|
-
| Garbled output | Binary content (images, PDFs) cannot be converted. Ensure the URL serves HTML.
|
|
561
|
-
| No output in stdio mode | Ensure `--stdio` flag is passed. Without it, the server starts in HTTP mode.
|
|
562
|
-
| Auth errors in HTTP mode | Set `ACCESS_TOKENS` or `API_KEY` env var and pass as `Authorization: Bearer <token>`.
|
|
575
|
+
| Issue | Solution |
|
|
576
|
+
| ------------------------- | ------------------------------------------------------------------------------------- |
|
|
577
|
+
| `VALIDATION_ERROR` on URL | URL is blocked (private IP/localhost) or malformed. Do not retry. |
|
|
578
|
+
| `queue_full` error | Worker pool busy. Wait briefly, then retry or use async task mode. |
|
|
579
|
+
| Garbled output | Binary content (images, PDFs) cannot be converted. Ensure the URL serves HTML. |
|
|
580
|
+
| No output in stdio mode | Ensure `--stdio` flag is passed. Without it, the server starts in HTTP mode. |
|
|
581
|
+
| Auth errors in HTTP mode | Set `ACCESS_TOKENS` or `API_KEY` env var and pass as `Authorization: Bearer <token>`. |
|
|
563
582
|
|
|
564
583
|
### Stdout / Stderr Guidance
|
|
565
584
|
|
package/dist/AGENTS.md
CHANGED
|
@@ -4,112 +4,147 @@
|
|
|
4
4
|
|
|
5
5
|
## 1) Project Context
|
|
6
6
|
|
|
7
|
-
- **Domain:** MCP (Model Context Protocol) server that fetches web pages and converts HTML
|
|
7
|
+
- **Domain:** MCP (Model Context Protocol) server that fetches public web pages and converts HTML into clean, AI-readable Markdown — published as `@j0hanz/fetch-url-mcp` on npm and `io.github.j0hanz/fetch-url-mcp` on the MCP Registry (see `server.json`, `package.json`).
|
|
8
8
|
- **Tech Stack (Verified):**
|
|
9
|
-
- **
|
|
10
|
-
- **
|
|
9
|
+
- **Language:** TypeScript 5.9+ (see `package.json` `devDependencies`, `tsconfig.json` strict config)
|
|
10
|
+
- **Runtime:** Node.js >= 24 (see `package.json` `engines`, `.github/workflows/release.yml`)
|
|
11
|
+
- **Framework:** `@modelcontextprotocol/sdk` ^1.26.0 — MCP server SDK v1.x (see `package.json` `dependencies`)
|
|
11
12
|
- **Key Libraries:**
|
|
12
|
-
- `zod`
|
|
13
|
-
- `@mozilla/readability`
|
|
14
|
-
- `linkedom`
|
|
15
|
-
- `node-html-markdown`
|
|
16
|
-
- **Architecture:** Single-package MCP server
|
|
13
|
+
- `zod` ^4.3.6 — input/output schema validation (see `package.json`)
|
|
14
|
+
- `@mozilla/readability` ^0.6.0 — content extraction (see `package.json`)
|
|
15
|
+
- `linkedom` ^0.18.12 — server-side DOM (see `package.json`)
|
|
16
|
+
- `node-html-markdown` ^2.0.0 — HTML-to-Markdown conversion (see `package.json`)
|
|
17
|
+
- **Architecture:** Single-package MCP server exposing the `fetch-url` tool via **stdio** (selected with the `--stdio` flag) and **Streamable HTTP** (used when the flag is omitted) transports. Entrypoint at `src/index.ts` wires CLI parsing, signal handlers, and transport selection. Server lifecycle managed in `src/server.ts` with tool registration in `src/tools.ts`. HTML fetching, URL normalization, and security (IP blocklist, SSRF protection) in `src/fetch.ts`. HTML → Markdown transformation optionally offloaded to a worker-thread pool (`src/workers/`). In-memory LRU caching in `src/cache.ts`.
|
|
17
18
|
|
|
18
19
|
## 2) Repository Map (High-Level)
|
|
19
20
|
|
|
20
|
-
- `src/` —
|
|
21
|
-
- `index.ts` — CLI entrypoint
|
|
22
|
-
- `
|
|
23
|
-
- `tools.ts` — `fetch-url` tool definition,
|
|
24
|
-
- `
|
|
25
|
-
- `
|
|
26
|
-
- `
|
|
27
|
-
- `
|
|
28
|
-
- `
|
|
29
|
-
- `
|
|
30
|
-
- `
|
|
31
|
-
- `
|
|
32
|
-
- `
|
|
33
|
-
- `tests/` —
|
|
34
|
-
- `scripts/` — Build orchestration (`tasks.mjs`)
|
|
35
|
-
- `assets/` —
|
|
36
|
-
- `.github/workflows/` — CI/CD (publish to npm
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
> Ignore: `dist/`, `node_modules/`, `.tsbuildinfo`
|
|
21
|
+
- `src/` — TypeScript source (compiled to `dist/`); flat module structure, no subdirectories except `workers/` (see `tsconfig.json` `rootDir`)
|
|
22
|
+
- `src/index.ts` — CLI entrypoint with shebang, transport wiring, shutdown handlers
|
|
23
|
+
- `src/server.ts` — `McpServer` lifecycle: capabilities, icons, instructions, registration
|
|
24
|
+
- `src/tools.ts` — `fetch-url` tool definition, input/output schemas, fetch pipeline, progress reporting, inline truncation
|
|
25
|
+
- `src/fetch.ts` — URL normalization, SSRF protection, DNS validation, streaming HTTP fetch, raw-URL transforms (GitHub/GitLab/Bitbucket)
|
|
26
|
+
- `src/transform.ts` — HTML-to-Markdown pipeline, worker-pool management
|
|
27
|
+
- `src/workers/` — Worker-thread child for off-main-thread HTML transforms
|
|
28
|
+
- `src/config.ts` — Centralized env-driven configuration
|
|
29
|
+
- `src/errors.ts` — Error helpers (`FetchError`, `getErrorMessage`)
|
|
30
|
+
- `src/mcp.ts` — MCP protocol handlers, task execution management
|
|
31
|
+
- `src/resources.ts` — MCP resource/template registration (cache snapshots, instructions)
|
|
32
|
+
- `src/prompts.ts` — MCP prompt registration (`get-help`)
|
|
33
|
+
- `src/instructions.md` — Server instructions embedded at runtime
|
|
34
|
+
- `tests/` — Unit/integration tests (46+ test files) using Node.js built-in test runner
|
|
35
|
+
- `scripts/` — Build & test orchestration (`tasks.mjs`)
|
|
36
|
+
- `assets/` — Server icon (`logo.svg`)
|
|
37
|
+
- `.github/workflows/` — CI/CD (`release.yml`: lint → type-check → test → build → publish to npm, MCP Registry, Docker)
|
|
38
|
+
|
|
39
|
+
> Ignore: `dist/`, `node_modules/`, `coverage/`, `.cache/`, `.tsbuildinfo`
|
|
40
40
|
|
|
41
41
|
## 3) Operational Commands (Verified)
|
|
42
42
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
- **
|
|
46
|
-
- **
|
|
47
|
-
- **
|
|
48
|
-
- **
|
|
49
|
-
- **
|
|
50
|
-
- **
|
|
51
|
-
- **
|
|
52
|
-
- **
|
|
43
|
+
All commands verified from `.github/workflows/release.yml` (CI) and `package.json` scripts.
|
|
44
|
+
|
|
45
|
+
- **Environment:** Node.js >= 24 with npm; no additional runtime managers required (see `package.json` `engines`, `Dockerfile`)
|
|
46
|
+
- **Install:** `npm ci` (see `.github/workflows/release.yml` "Install & validate" step)
|
|
47
|
+
- **Dev:** `npm run dev` → `tsc --watch --preserveWatchOutput` (see `package.json`)
|
|
48
|
+
- **Dev (run):** `npm run dev:run` → `node --env-file=.env --watch dist/index.js` (see `package.json`)
|
|
49
|
+
- **Start:** `npm run start` → `node dist/index.js` (see `package.json`)
|
|
50
|
+
- **Build:** `npm run build` → `node scripts/tasks.mjs build` — cleans `dist/`, compiles TS, validates `instructions.md`, copies assets, sets executable bit (see `scripts/tasks.mjs`, `package.json`)
|
|
51
|
+
- **Type-check:** `npm run type-check` → `tsc -p tsconfig.json --noEmit` (see `scripts/tasks.mjs`, `.github/workflows/release.yml`)
|
|
52
|
+
- **Lint:** `npm run lint` → `eslint .` (see `package.json`, `.github/workflows/release.yml`)
|
|
53
|
+
- **Lint (fix):** `npm run lint:fix` → `eslint . --fix` (see `package.json`)
|
|
54
|
+
- **Format:** `npm run format` → `prettier --write .` (see `package.json`)
|
|
55
|
+
- **Test:** `npm run test` → `node scripts/tasks.mjs test` — builds first, then runs `node --test` on `tests/**/*.test.ts` (see `scripts/tasks.mjs`, `.github/workflows/release.yml`)
|
|
56
|
+
- **Test (coverage):** `npm run test:coverage` (see `package.json`)
|
|
57
|
+
- **Inspector:** `npm run inspector` → builds then launches MCP Inspector on stdio (see `package.json`)
|
|
58
|
+
- **Dead code:** `npm run knip` / `npm run knip:fix` (see `package.json`)
|
|
59
|
+
- **Docker:** `docker compose up --build` (see `docker-compose.yml`, `Dockerfile`)
|
|
53
60
|
|
|
54
61
|
## 4) Coding Standards (Style & Patterns)
|
|
55
62
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
- **
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
- **
|
|
68
|
-
- **
|
|
69
|
-
- **
|
|
70
|
-
- **
|
|
71
|
-
- **
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
63
|
+
### Naming (see `eslint.config.mjs` `@typescript-eslint/naming-convention`)
|
|
64
|
+
|
|
65
|
+
- **Default:** `camelCase` (leading `_` allowed)
|
|
66
|
+
- **Variables:** `camelCase`, `UPPER_CASE`, or `PascalCase`
|
|
67
|
+
- **Types/Interfaces:** `PascalCase`
|
|
68
|
+
- **Enum members:** `PascalCase` or `UPPER_CASE`
|
|
69
|
+
- **Properties:** unrestricted format
|
|
70
|
+
- **Imports:** `camelCase` or `PascalCase`
|
|
71
|
+
|
|
72
|
+
### Structure
|
|
73
|
+
|
|
74
|
+
- **Module system:** ESM (`"type": "module"` in `package.json`); use `.js` extensions in local imports (see `tsconfig.json` `module: "NodeNext"`, `.github/instructions/typescript-mcp-server.instructions.md`)
|
|
75
|
+
- **Exports:** Named exports only — no default exports (see `.github/instructions/typescript-mcp-server.instructions.md`)
|
|
76
|
+
- **Imports:** Type-only imports required (`import type { X }` / `import { type X }`) — enforced by `@typescript-eslint/consistent-type-imports` (see `eslint.config.mjs`)
|
|
77
|
+
- **Import order:** Automated via `@trivago/prettier-plugin-sort-imports` — `node:` → third-party → `@modelcontextprotocol` → `@mozilla` → local by layer (see `.prettierrc`)
|
|
78
|
+
- **No unused imports:** Enforced by `eslint-plugin-unused-imports` (see `eslint.config.mjs`)
|
|
79
|
+
|
|
80
|
+
### Typing/Strictness (see `tsconfig.json`)
|
|
81
|
+
|
|
82
|
+
- `strict: true`
|
|
83
|
+
- `noUncheckedIndexedAccess: true`
|
|
84
|
+
- `exactOptionalPropertyTypes: true`
|
|
85
|
+
- `verbatimModuleSyntax: true`
|
|
86
|
+
- `isolatedModules: true`
|
|
87
|
+
- `noImplicitReturns: true`
|
|
88
|
+
- `noFallthroughCasesInSwitch: true`
|
|
89
|
+
- `useUnknownInCatchVariables: true`
|
|
90
|
+
- ESLint extends `tseslint.configs.strictTypeChecked` + `stylisticTypeChecked` (see `eslint.config.mjs`)
|
|
91
|
+
|
|
92
|
+
### Formatting (see `.prettierrc`)
|
|
93
|
+
|
|
94
|
+
- 2-space indent, no tabs
|
|
95
|
+
- Single quotes, semicolons, trailing commas (`es5`)
|
|
96
|
+
- Print width: 80
|
|
97
|
+
- LF line endings
|
|
98
|
+
- Arrow parens: always
|
|
99
|
+
|
|
100
|
+
### Patterns Observed
|
|
101
|
+
|
|
102
|
+
- **Zod v4 strict schemas** for all tool inputs/outputs with `.describe()`, `.min()`/`.max()`, `z.strictObject()` (observed in `src/tools.ts`)
|
|
103
|
+
- **Structured + text content** responses: `structuredContent` always paired with `content: [{ type: 'text', text: JSON.stringify(structured) }]` for backward compatibility (observed in `src/tools.ts`)
|
|
104
|
+
- **Error handling:** Tool errors return `isError: true` in result — never throw uncaught; `FetchError` class with error codes (observed in `src/tools.ts`, `src/errors.ts`)
|
|
105
|
+
- **Class-based internal services** with injected dependencies (e.g., `IpBlocker`, `UrlNormalizer`, `RawUrlTransformer` in `src/fetch.ts`)
|
|
106
|
+
- **AsyncLocalStorage** for request-scoped context/observability (`runWithRequestContext` in `src/observability.ts`, used in `src/tools.ts`)
|
|
107
|
+
- **Worker-thread pool** for CPU-intensive HTML transforms with graceful scaling and shutdown (observed in `src/transform.ts`, `src/workers/`)
|
|
108
|
+
- **Explicit return types** on exported functions — enforced by `@typescript-eslint/explicit-function-return-type` (see `eslint.config.mjs`)
|
|
109
|
+
- **Shebang required:** `src/index.ts` must start with `#!/usr/bin/env node` (observed in `src/index.ts`, documented in `.github/instructions/typescript-mcp-server.instructions.md`)
|
|
110
|
+
- **Prefer arrow callbacks, const, template literals, destructuring, optional chaining, nullish coalescing** — all enforced via ESLint rules (see `eslint.config.mjs`)
|
|
77
111
|
|
|
78
112
|
## 5) Agent Behavioral Rules (Do Nots)
|
|
79
113
|
|
|
80
|
-
- Do not introduce new dependencies without updating
|
|
81
|
-
- Do not edit `package-lock.json` manually.
|
|
82
|
-
- Do not commit secrets; never print `.env` values
|
|
83
|
-
- Do not
|
|
84
|
-
- Do not
|
|
85
|
-
- Do not use
|
|
86
|
-
- Do not
|
|
87
|
-
- Do not
|
|
88
|
-
- Do not
|
|
89
|
-
- Do not
|
|
90
|
-
- Do not throw uncaught exceptions from tool handlers — return `isError: true`
|
|
114
|
+
- Do not introduce new dependencies without updating `package.json` and `package-lock.json` via `npm install`. (see `package-lock.json` presence, `.github/workflows/release.yml` uses `npm ci`)
|
|
115
|
+
- Do not edit `package-lock.json` manually. (see `package-lock.json`)
|
|
116
|
+
- Do not commit secrets; never print `.env` values. Use environment variables via `config.ts`. (see `.gitignore` excludes `.env*`)
|
|
117
|
+
- Do not write non-MCP output to **stdout** in server code — it corrupts JSON-RPC on stdio transport. Use `console.error()` or protocol logging. (see `.github/instructions/typescript-mcp-server.instructions.md`)
|
|
118
|
+
- Do not use default exports. Use named exports only. (see `.github/instructions/typescript-mcp-server.instructions.md`)
|
|
119
|
+
- Do not use `any` — enforced by `@typescript-eslint/no-explicit-any: 'error'`. (see `eslint.config.mjs`)
|
|
120
|
+
- Do not disable or bypass existing lint/type rules without explicit approval. (see `eslint.config.mjs`, `tsconfig.json`)
|
|
121
|
+
- Do not use `zod/v3` compat mode — standardize on Zod v4. (see `.github/instructions/typescript-mcp-server.instructions.md`, `package.json`)
|
|
122
|
+
- Do not omit `.js` extensions in local imports. (see `tsconfig.json` `module: "NodeNext"`)
|
|
123
|
+
- Do not remove the shebang line (`#!/usr/bin/env node`) from `src/index.ts`. (see `.github/instructions/typescript-mcp-server.instructions.md`)
|
|
124
|
+
- Do not throw uncaught exceptions from tool handlers — return `isError: true` instead. (see `.github/instructions/typescript-mcp-server.instructions.md`)
|
|
91
125
|
|
|
92
126
|
## 6) Testing Strategy (Verified)
|
|
93
127
|
|
|
94
|
-
- **Framework:**
|
|
95
|
-
- **Where tests live:** `tests/` directory
|
|
128
|
+
- **Framework:** Node.js built-in test runner (`node:test`) with `node:assert/strict` (see `scripts/tasks.mjs`, `tests/fetch-url-tool.test.ts`)
|
|
129
|
+
- **Where tests live:** `tests/` directory — 46+ `.test.ts` files (see repo tree)
|
|
130
|
+
- **Test patterns scanned:** `src/__tests__/**/*.test.ts`, `tests/**/*.test.ts` (see `scripts/tasks.mjs` `CONFIG.test.patterns`)
|
|
96
131
|
- **Approach:**
|
|
97
|
-
- Tests
|
|
98
|
-
- Unit tests with `t.mock.method()`
|
|
99
|
-
-
|
|
100
|
-
-
|
|
101
|
-
-
|
|
102
|
-
|
|
132
|
+
- Tests import from compiled `../dist/` — a full build runs before tests (see `scripts/tasks.mjs` `TestTasks.test`, `tests/fetch-url-tool.test.ts` imports)
|
|
133
|
+
- Unit tests with `globalThis.fetch` mocked via `t.mock.method()` (observed in `tests/fetch-url-tool.test.ts`)
|
|
134
|
+
- Config values temporarily overridden per test with `try/finally` cleanup (observed in `tests/fetch-url-tool.test.ts`)
|
|
135
|
+
- Worker pool shutdown in `after()` hooks for clean teardown (observed in `tests/fetch-url-tool.test.ts`)
|
|
136
|
+
- No external services (DB/containers) required for tests
|
|
137
|
+
- **CI validation order:** `lint` → `type-check` → `test` → `build` (see `.github/workflows/release.yml`)
|
|
103
138
|
|
|
104
|
-
## 7) Common Pitfalls (Verified)
|
|
139
|
+
## 7) Common Pitfalls (Verified Only)
|
|
105
140
|
|
|
106
|
-
-
|
|
107
|
-
-
|
|
108
|
-
-
|
|
109
|
-
-
|
|
110
|
-
- **Shebang line** — `src/index.ts` must keep `#!/usr/bin/env node` as the exact first line (no BOM, no blank lines before it).
|
|
141
|
+
- Tests run against compiled output (`dist/`), not source — always build before testing. The `npm run test` command handles this automatically. (see `scripts/tasks.mjs`)
|
|
142
|
+
- `src/instructions.md` must exist — the build validates its presence and copies it to `dist/`. Missing it will fail the build. (see `scripts/tasks.mjs` `BuildTasks.validate`)
|
|
143
|
+
- Worker pool state is process-global — tests that change `config.transform.maxWorkerScale` must call `shutdownTransformWorkerPool()` before and/or after to avoid stale pool state. (observed in `tests/fetch-url-tool.test.ts`)
|
|
144
|
+
- Import sorting is enforced by Prettier plugin — manual import reordering will be overwritten by `npm run format`. (see `.prettierrc` `importOrder`)
|
|
111
145
|
|
|
112
146
|
## 8) Evolution Rules
|
|
113
147
|
|
|
114
148
|
- If conventions change, include an `AGENTS.md` update in the same PR.
|
|
115
149
|
- If a command is corrected after failures, record the final verified command here.
|
|
150
|
+
- If a new critical path or pattern is discovered, add it to the relevant section with evidence.
|
package/dist/instructions.md
CHANGED
|
@@ -1,57 +1,110 @@
|
|
|
1
|
-
# FETCH
|
|
1
|
+
# FETCH-URL INSTRUCTIONS
|
|
2
2
|
|
|
3
|
-
Available as resource (internal://instructions) or prompt (get-help). Load when unsure about tool usage.
|
|
3
|
+
Available as resource (`internal://instructions`) or prompt (`get-help`). Load when unsure about tool usage.
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
|
|
7
7
|
## CORE CAPABILITY
|
|
8
8
|
|
|
9
9
|
- Domain: Fetch public web pages and convert HTML to clean, LLM-readable Markdown.
|
|
10
|
-
- Primary
|
|
11
|
-
- Tools: fetch-url (READ-ONLY; no write tools exist).
|
|
12
|
-
- Prompts: get-help.
|
|
10
|
+
- Primary Resources: Markdown content, cached snapshots (`internal://cache/{namespace}/{hash}`).
|
|
11
|
+
- Tools: `fetch-url` (READ-ONLY; no write tools exist).
|
|
13
12
|
|
|
14
13
|
---
|
|
15
14
|
|
|
16
|
-
##
|
|
15
|
+
## PROMPTS
|
|
16
|
+
|
|
17
|
+
- `get-help`: Returns these instructions for quick recall.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## RESOURCES & RESOURCE LINKS
|
|
22
|
+
|
|
23
|
+
- `internal://instructions`: This document.
|
|
24
|
+
- `internal://cache/{namespace}/{hash}`: Immutable cached Markdown snapshots from previous `fetch-url` calls. Ephemeral — lost when the server process restarts.
|
|
25
|
+
- If inline Markdown is truncated (ends with `...[truncated]`), the full content may be available via the cache resource. Use `resources/read` with the cache URI to retrieve it.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## PROGRESS & TASKS
|
|
30
|
+
|
|
31
|
+
- Include `_meta.progressToken` in requests to receive `notifications/progress` updates during fetch.
|
|
32
|
+
- Task-augmented tool calls are supported for `fetch-url`:
|
|
33
|
+
- This tool declares `execution.taskSupport: "optional"` — invoke it normally or as a task.
|
|
34
|
+
- Send `tools/call` with `task` to get a task id.
|
|
35
|
+
- Poll `tasks/get` and fetch results via `tasks/result`.
|
|
36
|
+
- Use `tasks/cancel` to abort.
|
|
37
|
+
- Task data is stored in memory and cleared on restart.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## THE "GOLDEN PATH" WORKFLOWS (CRITICAL)
|
|
17
42
|
|
|
18
43
|
### WORKFLOW A: STANDARD FETCH
|
|
19
44
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
45
|
+
1. Call `fetch-url` with `{ "url": "https://..." }`.
|
|
46
|
+
2. Read the `markdown` field from `structuredContent`.
|
|
47
|
+
3. If `truncated` is `true`: use the cache resource URI or paginated access to get full content.
|
|
48
|
+
NOTE: Never guess URIs; always use values returned in responses.
|
|
49
|
+
|
|
50
|
+
### WORKFLOW B: FRESH CONTENT (BYPASS CACHE)
|
|
51
|
+
|
|
52
|
+
1. Call `fetch-url` with `{ "url": "https://...", "forceRefresh": true }`.
|
|
53
|
+
2. Read the `markdown` field.
|
|
54
|
+
NOTE: Use `forceRefresh` only when stale content is suspected. Cached responses are faster.
|
|
23
55
|
|
|
24
|
-
### WORKFLOW
|
|
56
|
+
### WORKFLOW C: FULL-FIDELITY FETCH (PRESERVE NOISE)
|
|
25
57
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
58
|
+
1. Call `fetch-url` with `{ "url": "https://...", "skipNoiseRemoval": true }`.
|
|
59
|
+
2. Read the `markdown` field — navigation, footers, and sidebars are preserved.
|
|
60
|
+
NOTE: Use this when page structure (nav, footer) is relevant to the task.
|
|
61
|
+
|
|
62
|
+
### WORKFLOW D: ASYNC EXECUTION (LARGE SITES / TIMEOUTS)
|
|
63
|
+
|
|
64
|
+
1. Call `tools/call` with `task: { ttl: ... }` to start a background fetch.
|
|
65
|
+
2. Poll `tasks/get` until status is `completed` or `failed`.
|
|
66
|
+
3. Retrieve result via `tasks/result`.
|
|
29
67
|
|
|
30
68
|
---
|
|
31
69
|
|
|
32
70
|
## TOOL NUANCES & GOTCHAS
|
|
33
71
|
|
|
34
|
-
fetch-url
|
|
72
|
+
`fetch-url`
|
|
35
73
|
|
|
36
74
|
- Purpose: Fetch a URL and return Markdown.
|
|
37
|
-
- Input: {
|
|
38
|
-
-
|
|
39
|
-
-
|
|
40
|
-
-
|
|
41
|
-
-
|
|
42
|
-
-
|
|
75
|
+
- Input: `{ url, skipNoiseRemoval?, forceRefresh?, maxInlineChars? }`
|
|
76
|
+
- `url` (required): Must be `http://` or `https://`. Max 2048 chars.
|
|
77
|
+
- `skipNoiseRemoval` (bool): Keeps navigation, footers, and other elements normally filtered.
|
|
78
|
+
- `forceRefresh` (bool): Bypasses the cache and fetches live.
|
|
79
|
+
- `maxInlineChars` (int, 0–10485760): Per-call inline limit. `0` means unlimited. If a global limit is configured, the lower value wins.
|
|
80
|
+
- Output: `{ url, inputUrl, resolvedUrl, finalUrl, title, metadata, markdown, fromCache, fetchedAt, contentSize, truncated, error, statusCode, details }`
|
|
81
|
+
- `metadata`: Extracted page metadata — `title`, `description`, `author`, `image`, `favicon`, `publishedAt`, `modifiedAt`.
|
|
82
|
+
- `markdown`: The extracted content. May be absent on error.
|
|
83
|
+
- `truncated`: `true` when inline content was cut. The full content is stored in the cache.
|
|
84
|
+
- `resolvedUrl`: The normalized/raw-transformed URL actually fetched (GitHub/GitLab/Bitbucket URLs auto-convert to raw content URLs).
|
|
85
|
+
- `finalUrl`: The URL after following redirects.
|
|
86
|
+
- Side effects: None (read-only, idempotent). Populates the in-memory cache automatically.
|
|
87
|
+
- Gotcha: Inline Markdown may be truncated when `MAX_INLINE_CONTENT_CHARS` is configured. Check the `truncated` field and use the cache resource for full content.
|
|
88
|
+
- Gotcha: GitHub, GitLab, and Bitbucket URLs are auto-transformed to raw content endpoints. Check `resolvedUrl` to see the actual fetched URL.
|
|
89
|
+
- Gotcha: Does not execute client-side JavaScript. Content requiring JS rendering may be incomplete.
|
|
90
|
+
- Limits: HTML capped at 10 MB (`MAX_HTML_BYTES`). Inline content is unlimited by default; set the `MAX_INLINE_CONTENT_CHARS` env var to cap it.
|
|
43
91
|
|
|
44
92
|
---
|
|
45
93
|
|
|
46
|
-
##
|
|
94
|
+
## CONSTRAINTS & LIMITATIONS
|
|
47
95
|
|
|
48
|
-
-
|
|
49
|
-
-
|
|
50
|
-
-
|
|
96
|
+
- **Blocked URLs:** localhost, private IPs (`10.x`, `172.16–31.x`, `192.168.x`), cloud metadata endpoints (`169.254.169.254`, `metadata.google.internal`, etc.), `.local`/`.internal` suffixes.
|
|
97
|
+
- **Max HTML size:** 10 MB per fetch.
|
|
98
|
+
- **Cache:** In-memory LRU — max 100 entries, 50 MB total, 24-hour TTL. Lost on process restart.
|
|
99
|
+
- **No JavaScript execution:** Pages relying on client-side rendering may yield incomplete Markdown.
|
|
100
|
+
- **Binary files:** Not supported — only HTML content is processed.
|
|
101
|
+
- **Redirects:** Max 5 redirects followed automatically.
|
|
51
102
|
|
|
52
103
|
---
|
|
53
104
|
|
|
54
|
-
##
|
|
105
|
+
## ERROR HANDLING STRATEGY
|
|
55
106
|
|
|
56
|
-
-
|
|
57
|
-
-
|
|
107
|
+
- `VALIDATION_ERROR`: URL invalid or blocked (private IP, metadata endpoint). Do not retry — fix the URL.
|
|
108
|
+
- `FETCH_ERROR`: Network/upstream failure (DNS, connection refused, timeout). Retry once with backoff.
|
|
109
|
+
- `HTTP_{status}` (e.g. `HTTP_404`, `HTTP_500`): Upstream returned an HTTP error. Check `statusCode` and `details` fields. Retry only for 5xx errors.
|
|
110
|
+
- `queue_full`: Worker pool busy (concurrent transforms). Wait briefly, then retry or use the Task interface.
|
package/package.json
CHANGED
|
@@ -1,91 +1,91 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "@j0hanz/fetch-url-mcp",
|
|
3
|
-
"version": "0.0
|
|
4
|
-
"mcpName": "io.github.j0hanz/fetch-url-mcp",
|
|
5
|
-
"description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
|
|
6
|
-
"type": "module",
|
|
7
|
-
"main": "./dist/index.js",
|
|
8
|
-
"types": "./dist/index.d.ts",
|
|
9
|
-
"bin": {
|
|
10
|
-
"fetch-url-mcp": "dist/index.js"
|
|
11
|
-
},
|
|
12
|
-
"exports": {
|
|
13
|
-
".": {
|
|
14
|
-
"types": "./dist/index.d.ts",
|
|
15
|
-
"default": "./dist/index.js"
|
|
16
|
-
},
|
|
17
|
-
"./package.json": "./package.json"
|
|
18
|
-
},
|
|
19
|
-
"files": [
|
|
20
|
-
"dist",
|
|
21
|
-
"README.md"
|
|
22
|
-
],
|
|
23
|
-
"repository": {
|
|
24
|
-
"type": "git",
|
|
25
|
-
"url": "https://github.com/j0hanz/fetch-url-mcp.git"
|
|
26
|
-
},
|
|
27
|
-
"homepage": "https://github.com/j0hanz/fetch-url-mcp#readme",
|
|
28
|
-
"bugs": {
|
|
29
|
-
"url": "https://github.com/j0hanz/fetch-url-mcp/issues"
|
|
30
|
-
},
|
|
31
|
-
"author": "j0hanz",
|
|
32
|
-
"license": "MIT",
|
|
33
|
-
"keywords": [
|
|
34
|
-
"mcp",
|
|
35
|
-
"mcp-server",
|
|
36
|
-
"web-fetching",
|
|
37
|
-
"content-extraction",
|
|
38
|
-
"readability",
|
|
39
|
-
"markdown",
|
|
40
|
-
"ai-tools",
|
|
41
|
-
"model-context-protocol",
|
|
42
|
-
"fetch-url-mcp"
|
|
43
|
-
],
|
|
44
|
-
"scripts": {
|
|
45
|
-
"clean": "node scripts/tasks.mjs clean",
|
|
46
|
-
"validate:instructions": "node scripts/tasks.mjs validate:instructions",
|
|
47
|
-
"build": "node scripts/tasks.mjs build",
|
|
48
|
-
"copy:assets": "node scripts/tasks.mjs copy:assets",
|
|
49
|
-
"prepare": "npm run build",
|
|
50
|
-
"dev": "tsc --watch --preserveWatchOutput",
|
|
51
|
-
"dev:run": "node --env-file=.env --watch dist/index.js",
|
|
52
|
-
"start": "node dist/index.js",
|
|
53
|
-
"format": "prettier --write .",
|
|
54
|
-
"type-check": "node scripts/tasks.mjs type-check",
|
|
55
|
-
"type-check:diagnostics": "tsc --noEmit --extendedDiagnostics",
|
|
56
|
-
"type-check:trace": "node -e \"require('fs').rmSync('.ts-trace',{recursive:true,force:true})\" && tsc --noEmit --generateTrace .ts-trace",
|
|
57
|
-
"lint": "eslint .",
|
|
58
|
-
"lint:fix": "eslint . --fix",
|
|
59
|
-
"test": "node scripts/tasks.mjs test",
|
|
60
|
-
"test:coverage": "node scripts/tasks.mjs test --coverage",
|
|
61
|
-
"knip": "knip",
|
|
62
|
-
"knip:fix": "knip --fix",
|
|
63
|
-
"inspector": "npm run build && npx -y @modelcontextprotocol/inspector node dist/index.js --stdio",
|
|
64
|
-
"prepublishOnly": "npm run lint && npm run type-check && npm run build"
|
|
65
|
-
},
|
|
66
|
-
"dependencies": {
|
|
67
|
-
"@modelcontextprotocol/sdk": "^1.26.0",
|
|
68
|
-
"@mozilla/readability": "^0.6.0",
|
|
69
|
-
"linkedom": "^0.18.12",
|
|
70
|
-
"node-html-markdown": "^2.0.0",
|
|
71
|
-
"zod": "^4.3.6"
|
|
72
|
-
},
|
|
73
|
-
"devDependencies": {
|
|
74
|
-
"@eslint/js": "^9.39.2",
|
|
75
|
-
"@trivago/prettier-plugin-sort-imports": "^6.0.2",
|
|
76
|
-
"@types/node": "^24",
|
|
77
|
-
"eslint": "^9.23.2",
|
|
78
|
-
"eslint-config-prettier": "^10.1.8",
|
|
79
|
-
"eslint-plugin-de-morgan": "^2.0.0",
|
|
80
|
-
"eslint-plugin-depend": "^1.4.0",
|
|
81
|
-
"eslint-plugin-sonarjs": "^3.0.6",
|
|
82
|
-
"eslint-plugin-unused-imports": "^4.4.1",
|
|
83
|
-
"knip": "^5.83.1",
|
|
84
|
-
"prettier": "^3.8.1",
|
|
85
|
-
"typescript": "^5.9.3",
|
|
86
|
-
"typescript-eslint": "^8.55.0"
|
|
87
|
-
},
|
|
88
|
-
"engines": {
|
|
89
|
-
"node": ">=24"
|
|
90
|
-
}
|
|
91
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"name": "@j0hanz/fetch-url-mcp",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"mcpName": "io.github.j0hanz/fetch-url-mcp",
|
|
5
|
+
"description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"main": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"bin": {
|
|
10
|
+
"fetch-url-mcp": "dist/index.js"
|
|
11
|
+
},
|
|
12
|
+
"exports": {
|
|
13
|
+
".": {
|
|
14
|
+
"types": "./dist/index.d.ts",
|
|
15
|
+
"default": "./dist/index.js"
|
|
16
|
+
},
|
|
17
|
+
"./package.json": "./package.json"
|
|
18
|
+
},
|
|
19
|
+
"files": [
|
|
20
|
+
"dist",
|
|
21
|
+
"README.md"
|
|
22
|
+
],
|
|
23
|
+
"repository": {
|
|
24
|
+
"type": "git",
|
|
25
|
+
"url": "https://github.com/j0hanz/fetch-url-mcp.git"
|
|
26
|
+
},
|
|
27
|
+
"homepage": "https://github.com/j0hanz/fetch-url-mcp#readme",
|
|
28
|
+
"bugs": {
|
|
29
|
+
"url": "https://github.com/j0hanz/fetch-url-mcp/issues"
|
|
30
|
+
},
|
|
31
|
+
"author": "j0hanz",
|
|
32
|
+
"license": "MIT",
|
|
33
|
+
"keywords": [
|
|
34
|
+
"mcp",
|
|
35
|
+
"mcp-server",
|
|
36
|
+
"web-fetching",
|
|
37
|
+
"content-extraction",
|
|
38
|
+
"readability",
|
|
39
|
+
"markdown",
|
|
40
|
+
"ai-tools",
|
|
41
|
+
"model-context-protocol",
|
|
42
|
+
"fetch-url-mcp"
|
|
43
|
+
],
|
|
44
|
+
"scripts": {
|
|
45
|
+
"clean": "node scripts/tasks.mjs clean",
|
|
46
|
+
"validate:instructions": "node scripts/tasks.mjs validate:instructions",
|
|
47
|
+
"build": "node scripts/tasks.mjs build",
|
|
48
|
+
"copy:assets": "node scripts/tasks.mjs copy:assets",
|
|
49
|
+
"prepare": "npm run build",
|
|
50
|
+
"dev": "tsc --watch --preserveWatchOutput",
|
|
51
|
+
"dev:run": "node --env-file=.env --watch dist/index.js",
|
|
52
|
+
"start": "node dist/index.js",
|
|
53
|
+
"format": "prettier --write .",
|
|
54
|
+
"type-check": "node scripts/tasks.mjs type-check",
|
|
55
|
+
"type-check:diagnostics": "tsc --noEmit --extendedDiagnostics",
|
|
56
|
+
"type-check:trace": "node -e \"require('fs').rmSync('.ts-trace',{recursive:true,force:true})\" && tsc --noEmit --generateTrace .ts-trace",
|
|
57
|
+
"lint": "eslint .",
|
|
58
|
+
"lint:fix": "eslint . --fix",
|
|
59
|
+
"test": "node scripts/tasks.mjs test",
|
|
60
|
+
"test:coverage": "node scripts/tasks.mjs test --coverage",
|
|
61
|
+
"knip": "knip",
|
|
62
|
+
"knip:fix": "knip --fix",
|
|
63
|
+
"inspector": "npm run build && npx -y @modelcontextprotocol/inspector node dist/index.js --stdio",
|
|
64
|
+
"prepublishOnly": "npm run lint && npm run type-check && npm run build"
|
|
65
|
+
},
|
|
66
|
+
"dependencies": {
|
|
67
|
+
"@modelcontextprotocol/sdk": "^1.26.0",
|
|
68
|
+
"@mozilla/readability": "^0.6.0",
|
|
69
|
+
"linkedom": "^0.18.12",
|
|
70
|
+
"node-html-markdown": "^2.0.0",
|
|
71
|
+
"zod": "^4.3.6"
|
|
72
|
+
},
|
|
73
|
+
"devDependencies": {
|
|
74
|
+
"@eslint/js": "^9.39.2",
|
|
75
|
+
"@trivago/prettier-plugin-sort-imports": "^6.0.2",
|
|
76
|
+
"@types/node": "^24",
|
|
77
|
+
"eslint": "^9.23.2",
|
|
78
|
+
"eslint-config-prettier": "^10.1.8",
|
|
79
|
+
"eslint-plugin-de-morgan": "^2.0.0",
|
|
80
|
+
"eslint-plugin-depend": "^1.4.0",
|
|
81
|
+
"eslint-plugin-sonarjs": "^3.0.6",
|
|
82
|
+
"eslint-plugin-unused-imports": "^4.4.1",
|
|
83
|
+
"knip": "^5.83.1",
|
|
84
|
+
"prettier": "^3.8.1",
|
|
85
|
+
"typescript": "^5.9.3",
|
|
86
|
+
"typescript-eslint": "^8.55.0"
|
|
87
|
+
},
|
|
88
|
+
"engines": {
|
|
89
|
+
"node": ">=24"
|
|
90
|
+
}
|
|
91
|
+
}
|