mcp-web-reader 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -33
- package/dist/index.js +115 -115
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -1,39 +1,45 @@
|
|
|
1
1
|
# MCP Web Reader
|
|
2
2
|
|
|
3
|
-
A powerful MCP (Model Context Protocol) server that enables Claude and other LLMs to read and parse web content.
|
|
3
|
+
A powerful MCP (Model Context Protocol) server that enables Claude and other LLMs to read and parse web content. Bypasses access restrictions for WeChat articles, paywalled sites, and Cloudflare-protected pages.
|
|
4
|
+
|
|
5
|
+
[简体中文](./README_CN.md)
|
|
4
6
|
|
|
5
7
|
## Features
|
|
6
8
|
|
|
7
|
-
- 🚀 **Multi-engine
|
|
8
|
-
- 🔄 **
|
|
9
|
-
- 🌐 **Bypass restrictions**:
|
|
9
|
+
- 🚀 **Multi-engine**: Jina Reader API, local parser, and Playwright browser
|
|
10
|
+
- 🔄 **Smart fallback**: Auto-switches Jina → Local → Playwright browser
|
|
11
|
+
- 🌐 **Bypass restrictions**: Cloudflare, CAPTCHAs, access controls
|
|
10
12
|
- 📦 **Batch processing**: Fetch multiple URLs simultaneously
|
|
11
|
-
-
|
|
12
|
-
- 📝 **Markdown output**: Automatic conversion to clean Markdown format
|
|
13
|
+
- 📝 **Markdown output**: Automatic conversion to clean Markdown
|
|
13
14
|
|
|
14
15
|
## Installation
|
|
15
16
|
|
|
16
|
-
### Quick Install (Recommended)
|
|
17
|
-
|
|
18
17
|
```bash
|
|
19
18
|
npm install -g mcp-web-reader
|
|
20
19
|
```
|
|
21
20
|
|
|
22
|
-
|
|
21
|
+
> **Note**: Chromium browser (~100-200MB) will be automatically downloaded. This is required for:
|
|
22
|
+
> - WeChat articles (need browser rendering)
|
|
23
|
+
> - Cloudflare-protected sites
|
|
24
|
+
> - JavaScript-heavy sites
|
|
25
|
+
> - CAPTCHA/access restrictions
|
|
26
|
+
|
|
27
|
+
Download may take 1-5 minutes depending on network speed.
|
|
28
|
+
|
|
29
|
+
### From Source
|
|
23
30
|
|
|
24
31
|
```bash
|
|
25
32
|
git clone https://github.com/Gracker/mcp-web-reader.git
|
|
26
33
|
cd mcp-web-reader
|
|
27
34
|
npm install
|
|
28
35
|
npm run build
|
|
29
|
-
npx playwright install chromium
|
|
30
36
|
```
|
|
31
37
|
|
|
32
38
|
## Configuration
|
|
33
39
|
|
|
34
40
|
### Claude Desktop
|
|
35
41
|
|
|
36
|
-
Add to your Claude Desktop config file:
|
|
42
|
+
Add to your config file:
|
|
37
43
|
|
|
38
44
|
**Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
|
|
39
45
|
**macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
|
|
@@ -48,36 +54,27 @@ Add to your Claude Desktop config file:
|
|
|
48
54
|
}
|
|
49
55
|
```
|
|
50
56
|
|
|
51
|
-
### Claude Code
|
|
52
|
-
|
|
53
|
-
For Claude Code users, add the MCP server using the command line:
|
|
57
|
+
### Claude Code
|
|
54
58
|
|
|
55
59
|
```bash
|
|
56
60
|
claude mcp add web-reader -- mcp-web-reader
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
To verify the server is configured:
|
|
60
|
-
```bash
|
|
61
61
|
claude mcp list
|
|
62
62
|
```
|
|
63
63
|
|
|
64
64
|
## Usage
|
|
65
65
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
After configuration, use natural language commands:
|
|
69
|
-
|
|
66
|
+
In Claude:
|
|
70
67
|
- "Fetch content from https://example.com"
|
|
71
|
-
- "Get content using browser for https://mp.weixin.qq.com/..."
|
|
68
|
+
- "Get content using browser for https://mp.weixin.qq.com/..."
|
|
72
69
|
- "Fetch multiple URLs: [url1, url2, url3]"
|
|
73
70
|
|
|
74
71
|
## Supported Sites
|
|
75
72
|
|
|
76
|
-
-
|
|
77
|
-
-
|
|
78
|
-
-
|
|
79
|
-
-
|
|
80
|
-
-
|
|
73
|
+
- WeChat articles (mp.weixin.qq.com)
|
|
74
|
+
- Paywalled sites (NYT, Time Magazine, etc.)
|
|
75
|
+
- Cloudflare-protected sites
|
|
76
|
+
- JavaScript-heavy sites
|
|
77
|
+
- CAPTCHA-protected sites
|
|
81
78
|
|
|
82
79
|
## Tools
|
|
83
80
|
|
|
@@ -89,12 +86,12 @@ After configuration, use natural language commands:
|
|
|
89
86
|
|
|
90
87
|
## Architecture
|
|
91
88
|
|
|
92
|
-
Intelligent fallback
|
|
89
|
+
Intelligent fallback:
|
|
93
90
|
```
|
|
94
91
|
URL Request → Jina Reader → Local Parser → Playwright Browser
|
|
95
92
|
```
|
|
96
93
|
|
|
97
|
-
Auto-detects restrictions and switches to browser mode for:
|
|
94
|
+
Auto-detects restrictions and switches to browser for:
|
|
98
95
|
- HTTP status codes: 403, 429, 503, 520-524
|
|
99
96
|
- Keywords: Cloudflare, CAPTCHA, Access Denied
|
|
100
97
|
- Content patterns: Security checks, human verification
|
|
@@ -102,13 +99,11 @@ Auto-detects restrictions and switches to browser mode for:
|
|
|
102
99
|
## Development
|
|
103
100
|
|
|
104
101
|
```bash
|
|
105
|
-
npm run dev # Development
|
|
102
|
+
npm run dev # Development with auto-rebuild
|
|
106
103
|
npm run build # Build production version
|
|
107
104
|
npm start # Test run
|
|
108
|
-
npx playwright install chromium # Install browser (required)
|
|
109
105
|
```
|
|
110
106
|
|
|
111
107
|
## License
|
|
112
108
|
|
|
113
109
|
MIT License
|
|
114
|
-
|
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import fetch from "node-fetch";
|
|
|
5
5
|
import { JSDOM } from "jsdom";
|
|
6
6
|
import TurndownService from "turndown";
|
|
7
7
|
import { chromium } from "playwright";
|
|
8
|
-
//
|
|
8
|
+
// Create server instance
|
|
9
9
|
const server = new Server({
|
|
10
10
|
name: "web-reader",
|
|
11
11
|
version: "2.0.0",
|
|
@@ -14,19 +14,19 @@ const server = new Server({
|
|
|
14
14
|
tools: {},
|
|
15
15
|
},
|
|
16
16
|
});
|
|
17
|
-
//
|
|
17
|
+
// Initialize Turndown service (convert HTML to Markdown)
|
|
18
18
|
const turndownService = new TurndownService({
|
|
19
19
|
headingStyle: "atx",
|
|
20
20
|
codeBlockStyle: "fenced",
|
|
21
21
|
});
|
|
22
|
-
//
|
|
22
|
+
// Configure Turndown rules
|
|
23
23
|
turndownService.addRule("skipScripts", {
|
|
24
24
|
filter: ["script", "style", "noscript"],
|
|
25
25
|
replacement: () => "",
|
|
26
26
|
});
|
|
27
|
-
//
|
|
27
|
+
// Browser instance management
|
|
28
28
|
let browser = null;
|
|
29
|
-
//
|
|
29
|
+
// Get or create browser instance
|
|
30
30
|
async function getBrowser() {
|
|
31
31
|
if (!browser) {
|
|
32
32
|
browser = await chromium.launch({
|
|
@@ -34,7 +34,7 @@ async function getBrowser() {
|
|
|
34
34
|
args: [
|
|
35
35
|
'--no-sandbox',
|
|
36
36
|
'--disable-dev-shm-usage',
|
|
37
|
-
'--disable-blink-features=AutomationControlled', //
|
|
37
|
+
'--disable-blink-features=AutomationControlled', // Disable automation detection
|
|
38
38
|
'--disable-infobars',
|
|
39
39
|
'--window-size=1920,1080',
|
|
40
40
|
'--start-maximized',
|
|
@@ -43,14 +43,14 @@ async function getBrowser() {
|
|
|
43
43
|
}
|
|
44
44
|
return browser;
|
|
45
45
|
}
|
|
46
|
-
//
|
|
46
|
+
// Clean up browser instance
|
|
47
47
|
async function closeBrowser() {
|
|
48
48
|
if (browser) {
|
|
49
49
|
await browser.close();
|
|
50
50
|
browser = null;
|
|
51
51
|
}
|
|
52
52
|
}
|
|
53
|
-
// URL
|
|
53
|
+
// URL validation function
|
|
54
54
|
function isValidUrl(urlString) {
|
|
55
55
|
try {
|
|
56
56
|
const url = new URL(urlString);
|
|
@@ -60,18 +60,18 @@ function isValidUrl(urlString) {
|
|
|
60
60
|
return false;
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
|
-
//
|
|
63
|
+
// Check if it's a WeChat article link
|
|
64
64
|
function isWeixinUrl(url) {
|
|
65
65
|
return url.includes('mp.weixin.qq.com') || url.includes('weixin.qq.com');
|
|
66
66
|
}
|
|
67
|
-
//
|
|
67
|
+
// Check if browser mode is needed
|
|
68
68
|
function shouldUseBrowser(error, statusCode, content) {
|
|
69
69
|
const errorMessage = error.message.toLowerCase();
|
|
70
|
-
//
|
|
70
|
+
// Based on HTTP status codes
|
|
71
71
|
if (statusCode && [403, 429, 503, 520, 521, 522, 523, 524].includes(statusCode)) {
|
|
72
72
|
return true;
|
|
73
73
|
}
|
|
74
|
-
//
|
|
74
|
+
// Based on error messages
|
|
75
75
|
const browserTriggers = [
|
|
76
76
|
'cloudflare',
|
|
77
77
|
'access denied',
|
|
@@ -83,13 +83,13 @@ function shouldUseBrowser(error, statusCode, content) {
|
|
|
83
83
|
'blocked',
|
|
84
84
|
'protection',
|
|
85
85
|
'verification required',
|
|
86
|
-
'
|
|
87
|
-
'
|
|
86
|
+
'environment anomaly',
|
|
87
|
+
'verify'
|
|
88
88
|
];
|
|
89
89
|
if (browserTriggers.some(trigger => errorMessage.includes(trigger))) {
|
|
90
90
|
return true;
|
|
91
91
|
}
|
|
92
|
-
//
|
|
92
|
+
// Based on response content
|
|
93
93
|
if (content) {
|
|
94
94
|
const contentLower = content.toLowerCase();
|
|
95
95
|
const contentTriggers = [
|
|
@@ -99,10 +99,10 @@ function shouldUseBrowser(error, statusCode, content) {
|
|
|
99
99
|
'security check',
|
|
100
100
|
'human verification',
|
|
101
101
|
'captcha',
|
|
102
|
-
//
|
|
103
|
-
'
|
|
104
|
-
'
|
|
105
|
-
'
|
|
102
|
+
// WeChat-specific verification keywords
|
|
103
|
+
'environment anomaly',
|
|
104
|
+
'verify',
|
|
105
|
+
'complete verification to continue',
|
|
106
106
|
'verify'
|
|
107
107
|
];
|
|
108
108
|
if (contentTriggers.some(trigger => contentLower.includes(trigger))) {
|
|
@@ -111,12 +111,12 @@ function shouldUseBrowser(error, statusCode, content) {
|
|
|
111
111
|
}
|
|
112
112
|
return false;
|
|
113
113
|
}
|
|
114
|
-
//
|
|
114
|
+
// Fetch content using Jina Reader
|
|
115
115
|
async function fetchWithJinaReader(url) {
|
|
116
116
|
try {
|
|
117
117
|
// Jina Reader API URL
|
|
118
118
|
const jinaUrl = `https://r.jina.ai/${url}`;
|
|
119
|
-
//
|
|
119
|
+
// Create timeout controller
|
|
120
120
|
const controller = new AbortController();
|
|
121
121
|
const timeoutId = setTimeout(() => controller.abort(), 30000);
|
|
122
122
|
const response = await fetch(jinaUrl, {
|
|
@@ -131,9 +131,9 @@ async function fetchWithJinaReader(url) {
|
|
|
131
131
|
throw new Error(`Jina Reader API error! status: ${response.status}`);
|
|
132
132
|
}
|
|
133
133
|
const markdown = await response.text();
|
|
134
|
-
//
|
|
134
|
+
// Extract title from Markdown (usually the first # heading)
|
|
135
135
|
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
136
|
-
const title = titleMatch ? titleMatch[1] : "
|
|
136
|
+
const title = titleMatch ? titleMatch[1] : "No title";
|
|
137
137
|
return {
|
|
138
138
|
title,
|
|
139
139
|
content: markdown,
|
|
@@ -148,21 +148,21 @@ async function fetchWithJinaReader(url) {
|
|
|
148
148
|
catch (error) {
|
|
149
149
|
if (error instanceof Error) {
|
|
150
150
|
if (error.name === 'AbortError') {
|
|
151
|
-
throw new Error(`Jina Reader
|
|
151
|
+
throw new Error(`Jina Reader request timeout (30s)`);
|
|
152
152
|
}
|
|
153
|
-
throw new Error(`Jina Reader
|
|
153
|
+
throw new Error(`Jina Reader fetch failed: ${error.message}`);
|
|
154
154
|
}
|
|
155
|
-
throw new Error(`Jina Reader
|
|
155
|
+
throw new Error(`Jina Reader fetch failed: ${String(error)}`);
|
|
156
156
|
}
|
|
157
157
|
}
|
|
158
|
-
//
|
|
158
|
+
// Fetch web content using Playwright
|
|
159
159
|
async function fetchWithPlaywright(url) {
|
|
160
160
|
let page = null;
|
|
161
161
|
const isWeixin = isWeixinUrl(url);
|
|
162
162
|
try {
|
|
163
163
|
const browserInstance = await getBrowser();
|
|
164
164
|
page = await browserInstance.newPage();
|
|
165
|
-
//
|
|
165
|
+
// Set real User-Agent (simulate Chrome on Mac)
|
|
166
166
|
const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
167
167
|
await page.setExtraHTTPHeaders({
|
|
168
168
|
'User-Agent': userAgent,
|
|
@@ -174,7 +174,7 @@ async function fetchWithPlaywright(url) {
|
|
|
174
174
|
...(isWeixin ? { 'Referer': 'https://mp.weixin.qq.com/' } : {}),
|
|
175
175
|
});
|
|
176
176
|
await page.setViewportSize({ width: 1920, height: 1080 });
|
|
177
|
-
//
|
|
177
|
+
// WeChat articles need to load styles for correct rendering, filter for other sites
|
|
178
178
|
if (!isWeixin) {
|
|
179
179
|
await page.route('**/*', (route) => {
|
|
180
180
|
const resourceType = route.request().resourceType();
|
|
@@ -186,30 +186,30 @@ async function fetchWithPlaywright(url) {
|
|
|
186
186
|
}
|
|
187
187
|
});
|
|
188
188
|
}
|
|
189
|
-
//
|
|
189
|
+
// Navigate to page with longer timeout
|
|
190
190
|
await page.goto(url, {
|
|
191
191
|
timeout: 45000,
|
|
192
|
-
waitUntil: 'networkidle' //
|
|
192
|
+
waitUntil: 'networkidle' // Wait for network idle to ensure JS execution
|
|
193
193
|
});
|
|
194
|
-
//
|
|
194
|
+
// WeChat articles need longer wait time
|
|
195
195
|
const waitTime = isWeixin ? 5000 : 2000;
|
|
196
196
|
await page.waitForTimeout(waitTime);
|
|
197
|
-
//
|
|
198
|
-
const title = await page.title() || "
|
|
199
|
-
//
|
|
197
|
+
// Get page title
|
|
198
|
+
const title = await page.title() || "No title";
|
|
199
|
+
// Remove unwanted elements
|
|
200
200
|
await page.evaluate(() => {
|
|
201
201
|
const elementsToRemove = document.querySelectorAll('script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .comments, .social-share');
|
|
202
202
|
elementsToRemove.forEach(el => el.remove());
|
|
203
203
|
});
|
|
204
|
-
//
|
|
204
|
+
// Get main content (WeChat articles have specific DOM structure)
|
|
205
205
|
const htmlContent = await page.evaluate(() => {
|
|
206
|
-
//
|
|
206
|
+
// WeChat article specific selectors
|
|
207
207
|
const weixinContent = document.querySelector('#js_content') ||
|
|
208
208
|
document.querySelector('.rich_media_content');
|
|
209
209
|
if (weixinContent) {
|
|
210
210
|
return weixinContent.innerHTML;
|
|
211
211
|
}
|
|
212
|
-
//
|
|
212
|
+
// Common selectors
|
|
213
213
|
const mainContent = document.querySelector('main') ||
|
|
214
214
|
document.querySelector('article') ||
|
|
215
215
|
document.querySelector('[role="main"]') ||
|
|
@@ -220,9 +220,9 @@ async function fetchWithPlaywright(url) {
|
|
|
220
220
|
document.body;
|
|
221
221
|
return mainContent ? mainContent.innerHTML : document.body.innerHTML;
|
|
222
222
|
});
|
|
223
|
-
//
|
|
223
|
+
// Convert to Markdown
|
|
224
224
|
const markdown = turndownService.turndown(htmlContent);
|
|
225
|
-
//
|
|
225
|
+
// Clean content
|
|
226
226
|
const cleanedContent = markdown
|
|
227
227
|
.replace(/\n{3,}/g, "\n\n")
|
|
228
228
|
.replace(/^\s+$/gm, "")
|
|
@@ -240,9 +240,9 @@ async function fetchWithPlaywright(url) {
|
|
|
240
240
|
}
|
|
241
241
|
catch (error) {
|
|
242
242
|
if (error instanceof Error) {
|
|
243
|
-
throw new Error(`Playwright
|
|
243
|
+
throw new Error(`Playwright fetch failed: ${error.message}`);
|
|
244
244
|
}
|
|
245
|
-
throw new Error(`Playwright
|
|
245
|
+
throw new Error(`Playwright fetch failed: ${String(error)}`);
|
|
246
246
|
}
|
|
247
247
|
finally {
|
|
248
248
|
if (page) {
|
|
@@ -250,13 +250,13 @@ async function fetchWithPlaywright(url) {
|
|
|
250
250
|
}
|
|
251
251
|
}
|
|
252
252
|
}
|
|
253
|
-
//
|
|
253
|
+
// Local web content extraction function
|
|
254
254
|
async function fetchWithLocalParser(url) {
|
|
255
255
|
try {
|
|
256
|
-
//
|
|
256
|
+
// Create timeout controller
|
|
257
257
|
const controller = new AbortController();
|
|
258
258
|
const timeoutId = setTimeout(() => controller.abort(), 30000);
|
|
259
|
-
//
|
|
259
|
+
// Send HTTP request
|
|
260
260
|
const response = await fetch(url, {
|
|
261
261
|
headers: {
|
|
262
262
|
"User-Agent": "Mozilla/5.0 (compatible; MCP-URLFetcher/2.0)",
|
|
@@ -267,17 +267,17 @@ async function fetchWithLocalParser(url) {
|
|
|
267
267
|
if (!response.ok) {
|
|
268
268
|
throw new Error(`HTTP error! status: ${response.status}`);
|
|
269
269
|
}
|
|
270
|
-
//
|
|
270
|
+
// Get HTML content
|
|
271
271
|
const html = await response.text();
|
|
272
|
-
//
|
|
272
|
+
// Parse HTML with JSDOM
|
|
273
273
|
const dom = new JSDOM(html);
|
|
274
274
|
const document = dom.window.document;
|
|
275
|
-
//
|
|
276
|
-
const title = document.querySelector("title")?.textContent || "
|
|
277
|
-
//
|
|
275
|
+
// Get title
|
|
276
|
+
const title = document.querySelector("title")?.textContent || "No title";
|
|
277
|
+
// Remove unwanted elements
|
|
278
278
|
const elementsToRemove = document.querySelectorAll("script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .comments");
|
|
279
279
|
elementsToRemove.forEach(el => el.remove());
|
|
280
|
-
//
|
|
280
|
+
// Get main content area
|
|
281
281
|
const mainContent = document.querySelector("main") ||
|
|
282
282
|
document.querySelector("article") ||
|
|
283
283
|
document.querySelector('[role="main"]') ||
|
|
@@ -286,9 +286,9 @@ async function fetchWithLocalParser(url) {
|
|
|
286
286
|
document.querySelector(".post") ||
|
|
287
287
|
document.querySelector(".entry-content") ||
|
|
288
288
|
document.body;
|
|
289
|
-
//
|
|
289
|
+
// Convert to Markdown
|
|
290
290
|
const markdown = turndownService.turndown(mainContent.innerHTML);
|
|
291
|
-
//
|
|
291
|
+
// Clean extra whitespace
|
|
292
292
|
const cleanedContent = markdown
|
|
293
293
|
.replace(/\n{3,}/g, "\n\n")
|
|
294
294
|
.replace(/^\s+$/gm, "")
|
|
@@ -307,62 +307,62 @@ async function fetchWithLocalParser(url) {
|
|
|
307
307
|
catch (error) {
|
|
308
308
|
if (error instanceof Error) {
|
|
309
309
|
if (error.name === 'AbortError') {
|
|
310
|
-
throw new Error(
|
|
310
|
+
throw new Error(`Local parser request timeout (30s)`);
|
|
311
311
|
}
|
|
312
|
-
throw new Error(
|
|
312
|
+
throw new Error(`Local parser failed: ${error.message}`);
|
|
313
313
|
}
|
|
314
|
-
throw new Error(
|
|
314
|
+
throw new Error(`Local parser failed: ${String(error)}`);
|
|
315
315
|
}
|
|
316
316
|
}
|
|
317
|
-
//
|
|
318
|
-
//
|
|
317
|
+
// Smart web content fetching (three-tier fallback: Jina → Local → Playwright)
|
|
318
|
+
// For known sites requiring browser (like WeChat), use browser mode directly
|
|
319
319
|
async function fetchWebContent(url, preferJina = true) {
|
|
320
|
-
//
|
|
320
|
+
// WeChat articles use browser mode directly as other methods cannot bypass verification
|
|
321
321
|
if (isWeixinUrl(url)) {
|
|
322
|
-
console.error("
|
|
322
|
+
console.error("Detected WeChat article, using Playwright browser mode");
|
|
323
323
|
return await fetchWithPlaywright(url);
|
|
324
324
|
}
|
|
325
325
|
if (preferJina) {
|
|
326
|
-
//
|
|
326
|
+
// Tier 1: Try Jina Reader
|
|
327
327
|
try {
|
|
328
328
|
return await fetchWithJinaReader(url);
|
|
329
329
|
}
|
|
330
330
|
catch (jinaError) {
|
|
331
|
-
console.error("Jina Reader
|
|
332
|
-
//
|
|
331
|
+
console.error("Jina Reader failed, trying local parser:", jinaError instanceof Error ? jinaError.message : String(jinaError));
|
|
332
|
+
// Tier 2: Try local parser
|
|
333
333
|
try {
|
|
334
334
|
return await fetchWithLocalParser(url);
|
|
335
335
|
}
|
|
336
336
|
catch (localError) {
|
|
337
|
-
console.error("
|
|
338
|
-
//
|
|
337
|
+
console.error("Local parser failed, checking if browser mode needed:", localError instanceof Error ? localError.message : String(localError));
|
|
338
|
+
// Check if browser mode is needed
|
|
339
339
|
const jinaErr = jinaError instanceof Error ? jinaError : new Error(String(jinaError));
|
|
340
340
|
const localErr = localError instanceof Error ? localError : new Error(String(localError));
|
|
341
341
|
if (shouldUseBrowser(jinaErr) || shouldUseBrowser(localErr)) {
|
|
342
|
-
console.error("
|
|
342
|
+
console.error("Detected access restrictions, using Playwright browser mode");
|
|
343
343
|
try {
|
|
344
|
-
//
|
|
344
|
+
// Tier 3: Use Playwright browser
|
|
345
345
|
return await fetchWithPlaywright(url);
|
|
346
346
|
}
|
|
347
347
|
catch (browserError) {
|
|
348
|
-
throw new Error(
|
|
348
|
+
throw new Error(`All methods failed. Jina: ${jinaErr.message}, Local: ${localErr.message}, Browser: ${browserError instanceof Error ? browserError.message : String(browserError)}`);
|
|
349
349
|
}
|
|
350
350
|
}
|
|
351
351
|
else {
|
|
352
|
-
throw new Error(`Jina
|
|
352
|
+
throw new Error(`Jina and local parser both failed. Jina: ${jinaErr.message}, Local: ${localErr.message}`);
|
|
353
353
|
}
|
|
354
354
|
}
|
|
355
355
|
}
|
|
356
356
|
}
|
|
357
357
|
else {
|
|
358
|
-
//
|
|
358
|
+
// If not prioritizing Jina, start with local parser
|
|
359
359
|
try {
|
|
360
360
|
return await fetchWithLocalParser(url);
|
|
361
361
|
}
|
|
362
362
|
catch (localError) {
|
|
363
363
|
const localErr = localError instanceof Error ? localError : new Error(String(localError));
|
|
364
364
|
if (shouldUseBrowser(localErr)) {
|
|
365
|
-
console.error("
|
|
365
|
+
console.error("Local parser failed, detected access restrictions, using Playwright browser mode");
|
|
366
366
|
return await fetchWithPlaywright(url);
|
|
367
367
|
}
|
|
368
368
|
else {
|
|
@@ -371,23 +371,23 @@ async function fetchWebContent(url, preferJina = true) {
|
|
|
371
371
|
}
|
|
372
372
|
}
|
|
373
373
|
}
|
|
374
|
-
//
|
|
374
|
+
// Handle tool list requests
|
|
375
375
|
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
376
376
|
return {
|
|
377
377
|
tools: [
|
|
378
378
|
{
|
|
379
379
|
name: "fetch_url",
|
|
380
|
-
description: "
|
|
380
|
+
description: "Fetch web content from specified URL and convert to Markdown format. Uses Jina Reader by default, automatically falls back to local parser on failure",
|
|
381
381
|
inputSchema: {
|
|
382
382
|
type: "object",
|
|
383
383
|
properties: {
|
|
384
384
|
url: {
|
|
385
385
|
type: "string",
|
|
386
|
-
description: "
|
|
386
|
+
description: "Webpage URL to fetch (must be http or https protocol)",
|
|
387
387
|
},
|
|
388
388
|
preferJina: {
|
|
389
389
|
type: "boolean",
|
|
390
|
-
description: "
|
|
390
|
+
description: "Whether to prioritize Jina Reader (default: true)",
|
|
391
391
|
default: true,
|
|
392
392
|
},
|
|
393
393
|
},
|
|
@@ -396,7 +396,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
396
396
|
},
|
|
397
397
|
{
|
|
398
398
|
name: "fetch_multiple_urls",
|
|
399
|
-
description: "
|
|
399
|
+
description: "Batch fetch web content from multiple URLs",
|
|
400
400
|
inputSchema: {
|
|
401
401
|
type: "object",
|
|
402
402
|
properties: {
|
|
@@ -405,12 +405,12 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
405
405
|
items: {
|
|
406
406
|
type: "string",
|
|
407
407
|
},
|
|
408
|
-
description: "
|
|
409
|
-
maxItems: 10, //
|
|
408
|
+
description: "List of webpage URLs to fetch",
|
|
409
|
+
maxItems: 10, // Limit to 10 URLs
|
|
410
410
|
},
|
|
411
411
|
preferJina: {
|
|
412
412
|
type: "boolean",
|
|
413
|
-
description: "
|
|
413
|
+
description: "Whether to prioritize Jina Reader (default: true)",
|
|
414
414
|
default: true,
|
|
415
415
|
},
|
|
416
416
|
},
|
|
@@ -419,13 +419,13 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
419
419
|
},
|
|
420
420
|
{
|
|
421
421
|
name: "fetch_url_with_jina",
|
|
422
|
-
description: "
|
|
422
|
+
description: "Force fetch using Jina Reader (suitable for complex webpages)",
|
|
423
423
|
inputSchema: {
|
|
424
424
|
type: "object",
|
|
425
425
|
properties: {
|
|
426
426
|
url: {
|
|
427
427
|
type: "string",
|
|
428
|
-
description: "
|
|
428
|
+
description: "Webpage URL to fetch",
|
|
429
429
|
},
|
|
430
430
|
},
|
|
431
431
|
required: ["url"],
|
|
@@ -433,13 +433,13 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
433
433
|
},
|
|
434
434
|
{
|
|
435
435
|
name: "fetch_url_local",
|
|
436
|
-
description: "
|
|
436
|
+
description: "Force fetch using local parser (suitable for simple webpages or when Jina is unavailable)",
|
|
437
437
|
inputSchema: {
|
|
438
438
|
type: "object",
|
|
439
439
|
properties: {
|
|
440
440
|
url: {
|
|
441
441
|
type: "string",
|
|
442
|
-
description: "
|
|
442
|
+
description: "Webpage URL to fetch",
|
|
443
443
|
},
|
|
444
444
|
},
|
|
445
445
|
required: ["url"],
|
|
@@ -447,13 +447,13 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
447
447
|
},
|
|
448
448
|
{
|
|
449
449
|
name: "fetch_url_with_browser",
|
|
450
|
-
description: "
|
|
450
|
+
description: "Force fetch using Playwright browser (suitable for websites with access restrictions, such as Cloudflare protection, CAPTCHA, etc.)",
|
|
451
451
|
inputSchema: {
|
|
452
452
|
type: "object",
|
|
453
453
|
properties: {
|
|
454
454
|
url: {
|
|
455
455
|
type: "string",
|
|
456
|
-
description: "
|
|
456
|
+
description: "Webpage URL to fetch",
|
|
457
457
|
},
|
|
458
458
|
},
|
|
459
459
|
required: ["url"],
|
|
@@ -462,23 +462,23 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
462
462
|
],
|
|
463
463
|
};
|
|
464
464
|
});
|
|
465
|
-
//
|
|
465
|
+
// Handle tool call requests
|
|
466
466
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
467
467
|
const { name, arguments: args } = request.params;
|
|
468
468
|
try {
|
|
469
469
|
if (name === "fetch_url") {
|
|
470
470
|
const { url, preferJina = true } = args;
|
|
471
|
-
//
|
|
471
|
+
// Validate URL
|
|
472
472
|
if (!isValidUrl(url)) {
|
|
473
|
-
throw new McpError(ErrorCode.InvalidParams, "
|
|
473
|
+
throw new McpError(ErrorCode.InvalidParams, "Invalid URL format, please provide http or https protocol URL");
|
|
474
474
|
}
|
|
475
|
-
//
|
|
475
|
+
// Fetch web content
|
|
476
476
|
const result = await fetchWebContent(url, preferJina);
|
|
477
477
|
return {
|
|
478
478
|
content: [
|
|
479
479
|
{
|
|
480
480
|
type: "text",
|
|
481
|
-
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n
|
|
481
|
+
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n**Fetched At**: ${result.metadata.fetchedAt}\n**Content Length**: ${result.metadata.contentLength} characters\n**Method**: ${result.metadata.method}\n\n---\n\n${result.content}`,
|
|
482
482
|
},
|
|
483
483
|
],
|
|
484
484
|
};
|
|
@@ -486,14 +486,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
486
486
|
else if (name === "fetch_url_with_jina") {
|
|
487
487
|
const { url } = args;
|
|
488
488
|
if (!isValidUrl(url)) {
|
|
489
|
-
throw new McpError(ErrorCode.InvalidParams, "
|
|
489
|
+
throw new McpError(ErrorCode.InvalidParams, "Invalid URL format");
|
|
490
490
|
}
|
|
491
491
|
const result = await fetchWithJinaReader(url);
|
|
492
492
|
return {
|
|
493
493
|
content: [
|
|
494
494
|
{
|
|
495
495
|
type: "text",
|
|
496
|
-
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n
|
|
496
|
+
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n**Fetched At**: ${result.metadata.fetchedAt}\n**Content Length**: ${result.metadata.contentLength} characters\n**Method**: Jina Reader\n\n---\n\n${result.content}`,
|
|
497
497
|
},
|
|
498
498
|
],
|
|
499
499
|
};
|
|
@@ -501,42 +501,42 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
501
501
|
else if (name === "fetch_url_local") {
|
|
502
502
|
const { url } = args;
|
|
503
503
|
if (!isValidUrl(url)) {
|
|
504
|
-
throw new McpError(ErrorCode.InvalidParams, "
|
|
504
|
+
throw new McpError(ErrorCode.InvalidParams, "Invalid URL format");
|
|
505
505
|
}
|
|
506
506
|
const result = await fetchWithLocalParser(url);
|
|
507
507
|
return {
|
|
508
508
|
content: [
|
|
509
509
|
{
|
|
510
510
|
type: "text",
|
|
511
|
-
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n
|
|
511
|
+
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n**Fetched At**: ${result.metadata.fetchedAt}\n**Content Length**: ${result.metadata.contentLength} characters\n**Method**: Local Parser\n\n---\n\n${result.content}`,
|
|
512
512
|
},
|
|
513
513
|
],
|
|
514
514
|
};
|
|
515
515
|
}
|
|
516
516
|
else if (name === "fetch_multiple_urls") {
|
|
517
517
|
const { urls, preferJina = true } = args;
|
|
518
|
-
//
|
|
518
|
+
// Validate all URLs
|
|
519
519
|
const invalidUrls = urls.filter(url => !isValidUrl(url));
|
|
520
520
|
if (invalidUrls.length > 0) {
|
|
521
|
-
throw new McpError(ErrorCode.InvalidParams,
|
|
521
|
+
throw new McpError(ErrorCode.InvalidParams, `The following URLs have invalid format: ${invalidUrls.join(", ")}`);
|
|
522
522
|
}
|
|
523
|
-
//
|
|
523
|
+
// Fetch all URLs concurrently
|
|
524
524
|
const results = await Promise.allSettled(urls.map(url => fetchWebContent(url, preferJina)));
|
|
525
|
-
//
|
|
526
|
-
let combinedContent = "#
|
|
525
|
+
// Combine results
|
|
526
|
+
let combinedContent = "# Batch URL Content Fetch Results\n\n";
|
|
527
527
|
results.forEach((result, index) => {
|
|
528
528
|
const url = urls[index];
|
|
529
529
|
combinedContent += `## ${index + 1}. ${url}\n\n`;
|
|
530
530
|
if (result.status === "fulfilled") {
|
|
531
531
|
const { title, content, metadata } = result.value;
|
|
532
|
-
combinedContent +=
|
|
533
|
-
combinedContent +=
|
|
534
|
-
combinedContent +=
|
|
535
|
-
combinedContent +=
|
|
536
|
-
combinedContent += `###
|
|
532
|
+
combinedContent += `**Title**: ${title}\n`;
|
|
533
|
+
combinedContent += `**Fetched At**: ${metadata.fetchedAt}\n`;
|
|
534
|
+
combinedContent += `**Content Length**: ${metadata.contentLength} characters\n`;
|
|
535
|
+
combinedContent += `**Method**: ${metadata.method}\n\n`;
|
|
536
|
+
combinedContent += `### Content\n\n${content}\n\n`;
|
|
537
537
|
}
|
|
538
538
|
else {
|
|
539
|
-
combinedContent +=
|
|
539
|
+
combinedContent += `**Error**: ${result.reason}\n\n`;
|
|
540
540
|
}
|
|
541
541
|
combinedContent += "---\n\n";
|
|
542
542
|
});
|
|
@@ -552,47 +552,47 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
552
552
|
else if (name === "fetch_url_with_browser") {
|
|
553
553
|
const { url } = args;
|
|
554
554
|
if (!isValidUrl(url)) {
|
|
555
|
-
throw new McpError(ErrorCode.InvalidParams, "
|
|
555
|
+
throw new McpError(ErrorCode.InvalidParams, "Invalid URL format");
|
|
556
556
|
}
|
|
557
557
|
const result = await fetchWithPlaywright(url);
|
|
558
558
|
return {
|
|
559
559
|
content: [
|
|
560
560
|
{
|
|
561
561
|
type: "text",
|
|
562
|
-
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n
|
|
562
|
+
text: `# ${result.title}\n\n**URL**: ${result.metadata.url}\n**Fetched At**: ${result.metadata.fetchedAt}\n**Content Length**: ${result.metadata.contentLength} characters\n**Method**: Playwright Browser\n\n---\n\n${result.content}`,
|
|
563
563
|
},
|
|
564
564
|
],
|
|
565
565
|
};
|
|
566
566
|
}
|
|
567
567
|
else {
|
|
568
|
-
throw new McpError(ErrorCode.MethodNotFound,
|
|
568
|
+
throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${name}`);
|
|
569
569
|
}
|
|
570
570
|
}
|
|
571
571
|
catch (error) {
|
|
572
572
|
if (error instanceof McpError) {
|
|
573
573
|
throw error;
|
|
574
574
|
}
|
|
575
|
-
throw new McpError(ErrorCode.InternalError,
|
|
575
|
+
throw new McpError(ErrorCode.InternalError, `Tool execution failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
576
576
|
}
|
|
577
577
|
});
|
|
578
|
-
//
|
|
578
|
+
// Start server
|
|
579
579
|
async function main() {
|
|
580
580
|
const transport = new StdioServerTransport();
|
|
581
581
|
await server.connect(transport);
|
|
582
|
-
console.error("MCP Web Reader v2.0
|
|
582
|
+
console.error("MCP Web Reader v2.0 started (with Jina Reader + Playwright support)");
|
|
583
583
|
}
|
|
584
|
-
//
|
|
584
|
+
// Graceful shutdown handling
|
|
585
585
|
process.on('SIGINT', async () => {
|
|
586
|
-
console.error("
|
|
586
|
+
console.error("Received SIGINT signal, closing browser...");
|
|
587
587
|
await closeBrowser();
|
|
588
588
|
process.exit(0);
|
|
589
589
|
});
|
|
590
590
|
process.on('SIGTERM', async () => {
|
|
591
|
-
console.error("
|
|
591
|
+
console.error("Received SIGTERM signal, closing browser...");
|
|
592
592
|
await closeBrowser();
|
|
593
593
|
process.exit(0);
|
|
594
594
|
});
|
|
595
595
|
main().catch((error) => {
|
|
596
|
-
console.error("
|
|
596
|
+
console.error("Server startup failed:", error);
|
|
597
597
|
process.exit(1);
|
|
598
598
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mcp-web-reader",
|
|
3
|
-
"version": "2.0.2",
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"description": "MCP server for reading web content with Jina Reader and local parser support",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
"build": "tsc",
|
|
12
12
|
"start": "node dist/index.js",
|
|
13
13
|
"dev": "tsc --watch",
|
|
14
|
-
"claude-code": "node dist/index.js"
|
|
14
|
+
"claude-code": "node dist/index.js",
|
|
15
|
+
"postinstall": "npx playwright install chromium"
|
|
15
16
|
},
|
|
16
17
|
"repository": {
|
|
17
18
|
"type": "git",
|