@just-every/mcp-read-website-fast 0.1.16 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/index.js +16 -20
- package/dist/internal/fetchMarkdown.d.ts +1 -0
- package/dist/internal/fetchMarkdown.js +58 -16
- package/dist/serve.js +10 -12
- package/dist/utils/extractMarkdownLinks.d.ts +2 -0
- package/dist/utils/extractMarkdownLinks.js +43 -0
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -5,6 +5,10 @@ Fast, token-efficient web content extraction for AI agents - converts websites t
|
|
|
5
5
|
[](https://www.npmjs.com/package/@just-every/mcp-read-website-fast)
|
|
6
6
|
[](https://github.com/just-every/mcp-read-website-fast/actions)
|
|
7
7
|
|
|
8
|
+
<a href="https://glama.ai/mcp/servers/@just-every/mcp-read-website-fast">
|
|
9
|
+
<img width="380" height="200" src="https://glama.ai/mcp/servers/@just-every/mcp-read-website-fast/badge" alt="read-website-fast MCP server" />
|
|
10
|
+
</a>
|
|
11
|
+
|
|
8
12
|
## Overview
|
|
9
13
|
|
|
10
14
|
Existing MCP web crawlers are slow and consume large quantities of tokens. This pauses the development process and provides incomplete results as LLMs need to parse whole web pages.
|
|
@@ -91,8 +95,7 @@ Drop this into your client’s mcp.json (e.g. .vscode/mcp.json, ~/.cursor/mcp.js
|
|
|
91
95
|
- `read_website` - Fetches a webpage and converts it to clean markdown
|
|
92
96
|
- Parameters:
|
|
93
97
|
- `url` (required): The HTTP/HTTPS URL to fetch
|
|
94
|
-
- `
|
|
95
|
-
- `respectRobots` (optional): Whether to respect robots.txt
|
|
98
|
+
- `pages` (optional): Maximum number of pages to crawl (default: 1, max: 100)
|
|
96
99
|
|
|
97
100
|
### Available Resources
|
|
98
101
|
|
|
@@ -132,7 +135,7 @@ npm run dev fetch https://example.com --output both
|
|
|
132
135
|
|
|
133
136
|
### CLI Options
|
|
134
137
|
|
|
135
|
-
- `-
|
|
138
|
+
- `-p, --pages <number>` - Maximum number of pages to crawl (default: 1)
|
|
136
139
|
- `-c, --concurrency <number>` - Max concurrent requests (default: 3)
|
|
137
140
|
- `--no-robots` - Ignore robots.txt
|
|
138
141
|
- `--all-origins` - Allow cross-origin crawling
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { Command } from 'commander';
|
|
3
3
|
import { fetch } from '@just-every/crawl';
|
|
4
|
+
import { fetchMarkdown } from './internal/fetchMarkdown.js';
|
|
4
5
|
import { readFileSync } from 'fs';
|
|
5
6
|
import { fileURLToPath } from 'url';
|
|
6
7
|
import { dirname, join } from 'path';
|
|
@@ -15,7 +16,7 @@ program
|
|
|
15
16
|
program
|
|
16
17
|
.command('fetch <url>')
|
|
17
18
|
.description('Fetch a URL and convert to Markdown')
|
|
18
|
-
.option('-
|
|
19
|
+
.option('-p, --pages <number>', 'Maximum number of pages to crawl', '1')
|
|
19
20
|
.option('-c, --concurrency <number>', 'Max concurrent requests', '3')
|
|
20
21
|
.option('--no-robots', 'Ignore robots.txt')
|
|
21
22
|
.option('--all-origins', 'Allow cross-origin crawling')
|
|
@@ -25,8 +26,10 @@ program
|
|
|
25
26
|
.option('-o, --output <format>', 'Output format: json, markdown, or both', 'markdown')
|
|
26
27
|
.action(async (url, options) => {
|
|
27
28
|
try {
|
|
29
|
+
const pages = parseInt(options.pages, 10);
|
|
30
|
+
const depth = pages > 1 ? 1 : 0;
|
|
28
31
|
const crawlOptions = {
|
|
29
|
-
depth:
|
|
32
|
+
depth: depth,
|
|
30
33
|
maxConcurrency: parseInt(options.concurrency, 10),
|
|
31
34
|
respectRobots: options.robots,
|
|
32
35
|
sameOriginOnly: !options.allOrigins,
|
|
@@ -35,27 +38,24 @@ program
|
|
|
35
38
|
timeout: parseInt(options.timeout, 10),
|
|
36
39
|
};
|
|
37
40
|
console.error(`Fetching ${url}...`);
|
|
38
|
-
const results = await fetch(url, crawlOptions);
|
|
39
41
|
if (options.output === 'json') {
|
|
42
|
+
const results = await fetch(url, crawlOptions);
|
|
40
43
|
console.log(JSON.stringify(results, null, 2));
|
|
41
44
|
}
|
|
42
45
|
else if (options.output === 'markdown') {
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
if (results.length > 1) {
|
|
47
|
-
console.log('\n---\n');
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
if (result.error && result.markdown) {
|
|
51
|
-
console.error(`Warning for ${result.url}: ${result.error}`);
|
|
52
|
-
}
|
|
53
|
-
else if (result.error && !result.markdown) {
|
|
54
|
-
console.error(`Error for ${result.url}: ${result.error}`);
|
|
55
|
-
}
|
|
46
|
+
const result = await fetchMarkdown(url, {
|
|
47
|
+
...crawlOptions,
|
|
48
|
+
maxPages: pages,
|
|
56
49
|
});
|
|
50
|
+
if (result.markdown) {
|
|
51
|
+
console.log(result.markdown);
|
|
52
|
+
}
|
|
53
|
+
if (result.error) {
|
|
54
|
+
console.error(`Error: ${result.error}`);
|
|
55
|
+
}
|
|
57
56
|
}
|
|
58
57
|
else if (options.output === 'both') {
|
|
58
|
+
const results = await fetch(url, crawlOptions);
|
|
59
59
|
results.forEach(result => {
|
|
60
60
|
console.log(`\n## URL: ${result.url}\n`);
|
|
61
61
|
if (result.markdown) {
|
|
@@ -66,10 +66,6 @@ program
|
|
|
66
66
|
}
|
|
67
67
|
});
|
|
68
68
|
}
|
|
69
|
-
const hasFatalErrors = results.some(r => r.error && !r.markdown);
|
|
70
|
-
if (hasFatalErrors) {
|
|
71
|
-
process.exit(1);
|
|
72
|
-
}
|
|
73
69
|
}
|
|
74
70
|
catch (error) {
|
|
75
71
|
console.error('Error:', error instanceof Error ? error.message : error);
|
|
@@ -1,28 +1,70 @@
|
|
|
1
1
|
import { fetch } from '@just-every/crawl';
|
|
2
|
+
import { extractMarkdownLinks, filterSameOriginLinks } from '../utils/extractMarkdownLinks.js';
|
|
2
3
|
export async function fetchMarkdown(url, options = {}) {
|
|
3
4
|
try {
|
|
4
|
-
const
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
5
|
+
const maxPages = options.maxPages ?? 1;
|
|
6
|
+
const visited = new Set();
|
|
7
|
+
const toVisit = [url];
|
|
8
|
+
const allResults = [];
|
|
9
|
+
while (toVisit.length > 0 && allResults.length < maxPages) {
|
|
10
|
+
const currentUrl = toVisit.shift();
|
|
11
|
+
if (visited.has(currentUrl))
|
|
12
|
+
continue;
|
|
13
|
+
visited.add(currentUrl);
|
|
14
|
+
const crawlOptions = {
|
|
15
|
+
depth: 0,
|
|
16
|
+
maxConcurrency: options.maxConcurrency ?? 3,
|
|
17
|
+
respectRobots: options.respectRobots ?? true,
|
|
18
|
+
sameOriginOnly: options.sameOriginOnly ?? true,
|
|
19
|
+
userAgent: options.userAgent,
|
|
20
|
+
cacheDir: options.cacheDir ?? '.cache',
|
|
21
|
+
timeout: options.timeout ?? 30000,
|
|
22
|
+
};
|
|
23
|
+
const results = await fetch(currentUrl, crawlOptions);
|
|
24
|
+
if (results && results.length > 0) {
|
|
25
|
+
const result = results[0];
|
|
26
|
+
allResults.push(result);
|
|
27
|
+
if (allResults.length < maxPages && result.markdown) {
|
|
28
|
+
const links = extractMarkdownLinks(result.markdown, currentUrl);
|
|
29
|
+
const filteredLinks = options.sameOriginOnly !== false
|
|
30
|
+
? filterSameOriginLinks(links, currentUrl)
|
|
31
|
+
: links;
|
|
32
|
+
for (const link of filteredLinks) {
|
|
33
|
+
if (!visited.has(link) && !toVisit.includes(link)) {
|
|
34
|
+
toVisit.push(link);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
if (allResults.length === 0) {
|
|
16
41
|
return {
|
|
17
42
|
markdown: '',
|
|
18
43
|
error: 'No results returned',
|
|
19
44
|
};
|
|
20
45
|
}
|
|
46
|
+
const pagesToReturn = allResults;
|
|
47
|
+
const combinedMarkdown = pagesToReturn
|
|
48
|
+
.map((result, index) => {
|
|
49
|
+
if (result.error) {
|
|
50
|
+
return `<!-- Error fetching ${result.url}: ${result.error} -->`;
|
|
51
|
+
}
|
|
52
|
+
let pageContent = '';
|
|
53
|
+
if (pagesToReturn.length > 1 && index > 0) {
|
|
54
|
+
pageContent += '\n\n---\n\n';
|
|
55
|
+
}
|
|
56
|
+
pageContent += `<!-- Source: ${result.url} -->\n`;
|
|
57
|
+
pageContent += result.markdown || '';
|
|
58
|
+
return pageContent;
|
|
59
|
+
})
|
|
60
|
+
.join('\n');
|
|
21
61
|
return {
|
|
22
|
-
markdown:
|
|
23
|
-
title:
|
|
24
|
-
links:
|
|
25
|
-
error:
|
|
62
|
+
markdown: combinedMarkdown,
|
|
63
|
+
title: pagesToReturn[0].title,
|
|
64
|
+
links: pagesToReturn.flatMap(r => r.links || []),
|
|
65
|
+
error: pagesToReturn.some(r => r.error)
|
|
66
|
+
? `Some pages had errors: ${pagesToReturn.filter(r => r.error).map(r => r.url).join(', ')}`
|
|
67
|
+
: undefined,
|
|
26
68
|
};
|
|
27
69
|
}
|
|
28
70
|
catch (error) {
|
package/dist/serve.js
CHANGED
|
@@ -37,15 +37,12 @@ const READ_WEBSITE_TOOL = {
|
|
|
37
37
|
type: 'string',
|
|
38
38
|
description: 'HTTP/HTTPS URL to fetch and convert to markdown',
|
|
39
39
|
},
|
|
40
|
-
|
|
40
|
+
pages: {
|
|
41
41
|
type: 'number',
|
|
42
|
-
description: '
|
|
43
|
-
default:
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
type: 'boolean',
|
|
47
|
-
description: 'Whether to respect robots.txt',
|
|
48
|
-
default: true,
|
|
42
|
+
description: 'Maximum number of pages to crawl (default: 1)',
|
|
43
|
+
default: 1,
|
|
44
|
+
minimum: 1,
|
|
45
|
+
maximum: 100,
|
|
49
46
|
},
|
|
50
47
|
},
|
|
51
48
|
required: ['url'],
|
|
@@ -101,13 +98,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
101
98
|
logger.info(`Processing read request for URL: ${args.url}`);
|
|
102
99
|
logger.debug('Read parameters:', {
|
|
103
100
|
url: args.url,
|
|
104
|
-
|
|
105
|
-
respectRobots: args.respectRobots,
|
|
101
|
+
pages: args.pages,
|
|
106
102
|
});
|
|
107
103
|
logger.debug('Calling fetchMarkdown...');
|
|
104
|
+
const depth = args.pages > 1 ? 1 : 0;
|
|
108
105
|
const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
|
|
109
|
-
depth:
|
|
110
|
-
respectRobots:
|
|
106
|
+
depth: depth,
|
|
107
|
+
respectRobots: false,
|
|
108
|
+
maxPages: args.pages ?? 1,
|
|
111
109
|
});
|
|
112
110
|
logger.info('Content fetched successfully');
|
|
113
111
|
if (result.error && result.markdown) {
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
 * Collect the unique absolute HTTP(S) URLs referenced by a markdown document.
 *
 * Two sources are scanned, in this order: inline markdown links
 * (`[text](target)`) and bare `http://` / `https://` URLs in the text.
 * Relative targets are resolved against `baseUrl`; targets that already start
 * with `http://` or `https://` are returned unchanged.
 *
 * @param {string} markdown - Markdown text to scan.
 * @param {string} baseUrl - URL used to resolve relative link targets.
 * @returns {string[]} De-duplicated absolute URLs in first-seen order.
 */
export function extractMarkdownLinks(markdown, baseUrl) {
    if (typeof markdown !== 'string' || markdown.length === 0) {
        return [];
    }

    const candidates = [];

    for (const match of markdown.matchAll(/\[([^\]]+)\]\(([^)]+)\)/g)) {
        const target = match[2].trim();
        // A link target may carry a title (`(/docs "Title")`) or be wrapped in
        // angle brackets (`(<url>)`); only the destination is a URL.
        const destination = target.startsWith('<') && target.includes('>')
            ? target.slice(1, target.indexOf('>'))
            : target.split(/\s+/)[0];
        // Pure in-page anchors would resolve back to the current page.
        if (destination && !destination.startsWith('#')) {
            candidates.push(destination);
        }
    }

    for (const match of markdown.matchAll(/https?:\/\/[^\s<>)\]]+/g)) {
        // Sentence punctuation directly after a bare URL is not part of it.
        candidates.push(match[0].replace(/[.,;:!?'"]+$/, ''));
    }

    const absoluteLinks = [];
    for (const candidate of candidates) {
        const isAbsolute = candidate.startsWith('http://') || candidate.startsWith('https://');
        try {
            // Absolute links are parsed without the base so that an invalid
            // baseUrl does not discard them.
            const parsed = isAbsolute ? new URL(candidate) : new URL(candidate, baseUrl);
            // Drops mailto:, tel:, javascript:, data: and any other scheme
            // that cannot be fetched as a web page.
            if (parsed.protocol === 'http:' || parsed.protocol === 'https:') {
                absoluteLinks.push(isAbsolute ? candidate : parsed.href);
            }
        }
        catch {
            // Best-effort extraction: an unparseable target is simply skipped.
        }
    }

    return [...new Set(absoluteLinks)];
}
|
|
28
|
+
/**
 * Keep only the links whose origin (scheme + host + port) equals that of
 * `baseUrl`.
 *
 * @param {string[]} links - Absolute URLs to filter.
 * @param {string} baseUrl - URL whose origin the links are compared against.
 * @returns {string[]} The same-origin links, in their original order. Returns
 *   an empty array when `baseUrl` cannot be parsed, has an opaque origin, or
 *   `links` is not an array.
 */
export function filterSameOriginLinks(links, baseUrl) {
    if (!Array.isArray(links)) {
        return [];
    }

    let baseOrigin;
    try {
        baseOrigin = new URL(baseUrl).origin;
    }
    catch {
        // Without a parseable base there is no origin to compare against.
        return [];
    }

    // Schemes such as file:, data: and mailto: have an opaque origin that
    // serializes to the string "null". Comparing those strings would treat
    // unrelated opaque-origin URLs as same-origin, so nothing can match.
    if (baseOrigin === 'null') {
        return [];
    }

    return links.filter(link => {
        try {
            return new URL(link).origin === baseOrigin;
        }
        catch {
            // An unparseable link cannot be same-origin.
            return false;
        }
    });
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@just-every/mcp-read-website-fast",
|
|
3
|
-
"version": "0.1.16",
|
|
3
|
+
"version": "0.1.18",
|
|
4
4
|
"description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -50,7 +50,7 @@
|
|
|
50
50
|
"homepage": "https://github.com/just-every/mcp-read-website-fast#readme",
|
|
51
51
|
"license": "MIT",
|
|
52
52
|
"dependencies": {
|
|
53
|
-
"@just-every/crawl": "^1.0.
|
|
53
|
+
"@just-every/crawl": "^1.0.6",
|
|
54
54
|
"@modelcontextprotocol/sdk": "^1.12.3",
|
|
55
55
|
"commander": "^14.0.0"
|
|
56
56
|
},
|