@just-every/mcp-read-website-fast 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache/disk.js +1 -1
- package/dist/crawler/fetch.js +8 -7
- package/dist/crawler/queue.js +16 -10
- package/dist/crawler/robots.js +3 -3
- package/dist/index.js +14 -11
- package/dist/internal/fetchMarkdown.js +4 -4
- package/dist/parser/article.js +25 -15
- package/dist/parser/dom.js +11 -10
- package/dist/parser/markdown.js +27 -14
- package/dist/serve.js +104 -56
- package/dist/utils/chunker.js +15 -13
- package/package.json +4 -1
package/dist/cache/disk.js
CHANGED
package/dist/crawler/fetch.js
CHANGED
|
@@ -1,24 +1,25 @@
|
|
|
1
1
|
import { fetch } from 'undici';
|
|
2
2
|
export async function fetchStream(url, options = {}) {
|
|
3
|
-
const { userAgent = 'MCP/0.1 (+https://github.com/just-every/mcp-read-website-fast)', timeout = 30000, maxRedirections = 5 } = options;
|
|
3
|
+
const { userAgent = 'MCP/0.1 (+https://github.com/just-every/mcp-read-website-fast)', timeout = 30000, maxRedirections = 5, } = options;
|
|
4
4
|
try {
|
|
5
5
|
const response = await fetch(url, {
|
|
6
6
|
headers: {
|
|
7
7
|
'User-Agent': userAgent,
|
|
8
|
-
|
|
8
|
+
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
9
9
|
'Accept-Language': 'en-US,en;q=0.5',
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
'Upgrade-Insecure-Requests': '1'
|
|
10
|
+
DNT: '1',
|
|
11
|
+
Connection: 'keep-alive',
|
|
12
|
+
'Upgrade-Insecure-Requests': '1',
|
|
13
13
|
},
|
|
14
14
|
redirect: maxRedirections > 0 ? 'follow' : 'manual',
|
|
15
|
-
signal: AbortSignal.timeout(timeout)
|
|
15
|
+
signal: AbortSignal.timeout(timeout),
|
|
16
16
|
});
|
|
17
17
|
if (!response.ok) {
|
|
18
18
|
throw new Error(`HTTP ${response.status} for ${url}`);
|
|
19
19
|
}
|
|
20
20
|
const contentType = response.headers.get('content-type');
|
|
21
|
-
if (contentType &&
|
|
21
|
+
if (contentType &&
|
|
22
|
+
!contentType.includes('text/html') &&
|
|
22
23
|
!contentType.includes('application/xhtml+xml')) {
|
|
23
24
|
throw new Error(`Non-HTML content type: ${contentType} for ${url}`);
|
|
24
25
|
}
|
package/dist/crawler/queue.js
CHANGED
|
@@ -21,7 +21,7 @@ export class CrawlQueue {
|
|
|
21
21
|
sameOriginOnly: options.sameOriginOnly ?? true,
|
|
22
22
|
userAgent: options.userAgent ?? 'MCP/0.1',
|
|
23
23
|
cacheDir: options.cacheDir ?? '.cache',
|
|
24
|
-
timeout: options.timeout ?? 30000
|
|
24
|
+
timeout: options.timeout ?? 30000,
|
|
25
25
|
};
|
|
26
26
|
this.limit = pLimit(this.options.maxConcurrency);
|
|
27
27
|
this.cache = new DiskCache(this.options.cacheDir);
|
|
@@ -60,7 +60,7 @@ export class CrawlQueue {
|
|
|
60
60
|
this.results.push({
|
|
61
61
|
url: normalizedUrl,
|
|
62
62
|
markdown: cached.markdown,
|
|
63
|
-
title: cached.title
|
|
63
|
+
title: cached.title,
|
|
64
64
|
});
|
|
65
65
|
return;
|
|
66
66
|
}
|
|
@@ -70,7 +70,7 @@ export class CrawlQueue {
|
|
|
70
70
|
this.results.push({
|
|
71
71
|
url: normalizedUrl,
|
|
72
72
|
markdown: '',
|
|
73
|
-
error: 'Blocked by robots.txt'
|
|
73
|
+
error: 'Blocked by robots.txt',
|
|
74
74
|
});
|
|
75
75
|
return;
|
|
76
76
|
}
|
|
@@ -81,13 +81,13 @@ export class CrawlQueue {
|
|
|
81
81
|
}
|
|
82
82
|
const html = await fetchStream(normalizedUrl, {
|
|
83
83
|
userAgent: this.options.userAgent,
|
|
84
|
-
timeout: this.options.timeout
|
|
84
|
+
timeout: this.options.timeout,
|
|
85
85
|
});
|
|
86
86
|
if (!html || html.trim().length === 0) {
|
|
87
87
|
this.results.push({
|
|
88
88
|
url: normalizedUrl,
|
|
89
89
|
markdown: '',
|
|
90
|
-
error: 'Empty response from server'
|
|
90
|
+
error: 'Empty response from server',
|
|
91
91
|
});
|
|
92
92
|
return;
|
|
93
93
|
}
|
|
@@ -97,15 +97,21 @@ export class CrawlQueue {
|
|
|
97
97
|
this.results.push({
|
|
98
98
|
url: normalizedUrl,
|
|
99
99
|
markdown: '',
|
|
100
|
-
error: 'Failed to extract article content'
|
|
100
|
+
error: 'Failed to extract article content',
|
|
101
101
|
});
|
|
102
102
|
return;
|
|
103
103
|
}
|
|
104
104
|
if (!article.content || article.content.trim().length < 50) {
|
|
105
|
+
const fallbackMarkdown = `# ${article.title || 'Page Content'}\n\n` +
|
|
106
|
+
`*Note: This page appears to be JavaScript-rendered. Limited content extracted.*\n\n` +
|
|
107
|
+
(article.textContent
|
|
108
|
+
? article.textContent.substring(0, 1000) + '...'
|
|
109
|
+
: 'No text content available');
|
|
105
110
|
this.results.push({
|
|
106
111
|
url: normalizedUrl,
|
|
107
|
-
markdown:
|
|
108
|
-
|
|
112
|
+
markdown: fallbackMarkdown,
|
|
113
|
+
title: article.title || normalizedUrl,
|
|
114
|
+
error: 'Limited content extracted (JavaScript-rendered page)',
|
|
109
115
|
});
|
|
110
116
|
return;
|
|
111
117
|
}
|
|
@@ -128,14 +134,14 @@ export class CrawlQueue {
|
|
|
128
134
|
url: normalizedUrl,
|
|
129
135
|
markdown,
|
|
130
136
|
title: article.title,
|
|
131
|
-
links: links.length > 0 ? links : undefined
|
|
137
|
+
links: links.length > 0 ? links : undefined,
|
|
132
138
|
});
|
|
133
139
|
}
|
|
134
140
|
catch (error) {
|
|
135
141
|
this.results.push({
|
|
136
142
|
url: normalizedUrl,
|
|
137
143
|
markdown: '',
|
|
138
|
-
error: error instanceof Error ? error.message : 'Unknown error'
|
|
144
|
+
error: error instanceof Error ? error.message : 'Unknown error',
|
|
139
145
|
});
|
|
140
146
|
}
|
|
141
147
|
}
|
package/dist/crawler/robots.js
CHANGED
|
@@ -8,9 +8,9 @@ export async function getRobotsChecker(origin, userAgent = '*') {
|
|
|
8
8
|
const robotsUrl = new URL('/robots.txt', origin).href;
|
|
9
9
|
const robotsTxt = await fetchStream(robotsUrl, {
|
|
10
10
|
timeout: 5000,
|
|
11
|
-
userAgent
|
|
11
|
+
userAgent,
|
|
12
12
|
});
|
|
13
|
-
const robotsParserModule = await import('robots-parser');
|
|
13
|
+
const robotsParserModule = (await import('robots-parser'));
|
|
14
14
|
const robotsParser = robotsParserModule.default || robotsParserModule;
|
|
15
15
|
const robots = robotsParser(robotsUrl, robotsTxt);
|
|
16
16
|
robotsCache.set(origin, robots);
|
|
@@ -19,7 +19,7 @@ export async function getRobotsChecker(origin, userAgent = '*') {
|
|
|
19
19
|
catch {
|
|
20
20
|
const permissive = {
|
|
21
21
|
isAllowed: () => true,
|
|
22
|
-
getCrawlDelay: () => undefined
|
|
22
|
+
getCrawlDelay: () => undefined,
|
|
23
23
|
};
|
|
24
24
|
robotsCache.set(origin, permissive);
|
|
25
25
|
return permissive;
|
package/dist/index.js
CHANGED
|
@@ -32,7 +32,7 @@ program
|
|
|
32
32
|
sameOriginOnly: !options.allOrigins,
|
|
33
33
|
userAgent: options.userAgent,
|
|
34
34
|
cacheDir: options.cacheDir,
|
|
35
|
-
timeout: parseInt(options.timeout, 10)
|
|
35
|
+
timeout: parseInt(options.timeout, 10),
|
|
36
36
|
};
|
|
37
37
|
const queue = new CrawlQueue(crawlOptions);
|
|
38
38
|
await queue.init();
|
|
@@ -43,30 +43,33 @@ program
|
|
|
43
43
|
}
|
|
44
44
|
else if (options.output === 'markdown') {
|
|
45
45
|
results.forEach(result => {
|
|
46
|
-
if (result.
|
|
47
|
-
console.error(`Error for ${result.url}: ${result.error}`);
|
|
48
|
-
}
|
|
49
|
-
else if (result.markdown) {
|
|
46
|
+
if (result.markdown) {
|
|
50
47
|
console.log(result.markdown);
|
|
51
48
|
if (results.length > 1) {
|
|
52
49
|
console.log('\n---\n');
|
|
53
50
|
}
|
|
54
51
|
}
|
|
52
|
+
if (result.error && result.markdown) {
|
|
53
|
+
console.error(`Warning for ${result.url}: ${result.error}`);
|
|
54
|
+
}
|
|
55
|
+
else if (result.error && !result.markdown) {
|
|
56
|
+
console.error(`Error for ${result.url}: ${result.error}`);
|
|
57
|
+
}
|
|
55
58
|
});
|
|
56
59
|
}
|
|
57
60
|
else if (options.output === 'both') {
|
|
58
61
|
results.forEach(result => {
|
|
59
62
|
console.log(`\n## URL: ${result.url}\n`);
|
|
60
|
-
if (result.
|
|
61
|
-
console.error(`Error: ${result.error}`);
|
|
62
|
-
}
|
|
63
|
-
else {
|
|
63
|
+
if (result.markdown) {
|
|
64
64
|
console.log(result.markdown);
|
|
65
65
|
}
|
|
66
|
+
if (result.error) {
|
|
67
|
+
console.error(`${result.markdown ? 'Warning' : 'Error'}: ${result.error}`);
|
|
68
|
+
}
|
|
66
69
|
});
|
|
67
70
|
}
|
|
68
|
-
const
|
|
69
|
-
if (
|
|
71
|
+
const hasFatalErrors = results.some(r => r.error && !r.markdown);
|
|
72
|
+
if (hasFatalErrors) {
|
|
70
73
|
process.exit(1);
|
|
71
74
|
}
|
|
72
75
|
}
|
package/dist/internal/fetchMarkdown.js
CHANGED
|
@@ -8,7 +8,7 @@ export async function fetchMarkdown(url, options = {}) {
|
|
|
8
8
|
sameOriginOnly: options.sameOriginOnly ?? true,
|
|
9
9
|
userAgent: options.userAgent,
|
|
10
10
|
cacheDir: options.cacheDir ?? '.cache',
|
|
11
|
-
timeout: options.timeout ?? 30000
|
|
11
|
+
timeout: options.timeout ?? 30000,
|
|
12
12
|
};
|
|
13
13
|
const queue = new CrawlQueue(crawlOptions);
|
|
14
14
|
await queue.init();
|
|
@@ -17,20 +17,20 @@ export async function fetchMarkdown(url, options = {}) {
|
|
|
17
17
|
if (!mainResult) {
|
|
18
18
|
return {
|
|
19
19
|
markdown: '',
|
|
20
|
-
error: 'No results returned'
|
|
20
|
+
error: 'No results returned',
|
|
21
21
|
};
|
|
22
22
|
}
|
|
23
23
|
return {
|
|
24
24
|
markdown: mainResult.markdown,
|
|
25
25
|
title: mainResult.title,
|
|
26
26
|
links: mainResult.links,
|
|
27
|
-
error: mainResult.error
|
|
27
|
+
error: mainResult.error,
|
|
28
28
|
};
|
|
29
29
|
}
|
|
30
30
|
catch (error) {
|
|
31
31
|
return {
|
|
32
32
|
markdown: '',
|
|
33
|
-
error: error instanceof Error ? error.message : 'Unknown error'
|
|
33
|
+
error: error instanceof Error ? error.message : 'Unknown error',
|
|
34
34
|
};
|
|
35
35
|
}
|
|
36
36
|
}
|
package/dist/parser/article.js
CHANGED
|
@@ -4,10 +4,12 @@ export function extractArticle(dom) {
|
|
|
4
4
|
const baseUrl = dom.window.location.href;
|
|
5
5
|
const articleParagraph = document.querySelector('article p');
|
|
6
6
|
const hasStrongArticleIndicators = (document.querySelector('article') !== null &&
|
|
7
|
-
articleParagraph?.textContent &&
|
|
7
|
+
articleParagraph?.textContent &&
|
|
8
|
+
articleParagraph.textContent.length > 200) ||
|
|
8
9
|
document.querySelector('[itemtype*="BlogPosting"]') !== null ||
|
|
9
10
|
document.querySelector('[itemtype*="NewsArticle"]') !== null ||
|
|
10
|
-
document.querySelector('meta[property="article:published_time"]') !==
|
|
11
|
+
document.querySelector('meta[property="article:published_time"]') !==
|
|
12
|
+
null;
|
|
11
13
|
if (hasStrongArticleIndicators) {
|
|
12
14
|
const documentClone = document.cloneNode(true);
|
|
13
15
|
const reader = new Readability(documentClone);
|
|
@@ -24,7 +26,7 @@ export function extractArticle(dom) {
|
|
|
24
26
|
lang: article.lang || null,
|
|
25
27
|
siteName: article.siteName || null,
|
|
26
28
|
publishedTime: article.publishedTime || null,
|
|
27
|
-
baseUrl
|
|
29
|
+
baseUrl,
|
|
28
30
|
};
|
|
29
31
|
}
|
|
30
32
|
}
|
|
@@ -36,10 +38,16 @@ function extractContentManually(dom) {
|
|
|
36
38
|
const baseUrl = dom.window.location.href;
|
|
37
39
|
const title = document.querySelector('title')?.textContent ||
|
|
38
40
|
document.querySelector('h1')?.textContent ||
|
|
39
|
-
document
|
|
40
|
-
|
|
41
|
+
document
|
|
42
|
+
.querySelector('meta[property="og:title"]')
|
|
43
|
+
?.getAttribute('content') ||
|
|
44
|
+
document
|
|
45
|
+
.querySelector('meta[name="title"]')
|
|
46
|
+
?.getAttribute('content') ||
|
|
41
47
|
'Untitled Page';
|
|
42
|
-
const byline = document
|
|
48
|
+
const byline = document
|
|
49
|
+
.querySelector('meta[name="author"]')
|
|
50
|
+
?.getAttribute('content') ||
|
|
43
51
|
document.querySelector('[rel="author"]')?.textContent ||
|
|
44
52
|
document.querySelector('.author')?.textContent ||
|
|
45
53
|
null;
|
|
@@ -56,18 +64,18 @@ function extractContentManually(dom) {
|
|
|
56
64
|
siteName: null,
|
|
57
65
|
textContent: document.documentElement?.textContent || '',
|
|
58
66
|
publishedTime: null,
|
|
59
|
-
baseUrl
|
|
67
|
+
baseUrl,
|
|
60
68
|
};
|
|
61
69
|
}
|
|
62
70
|
const contentClone = document.body.cloneNode(true);
|
|
63
|
-
const selectorsToRemove = [
|
|
64
|
-
'script', 'style', 'noscript', 'template'
|
|
65
|
-
];
|
|
71
|
+
const selectorsToRemove = ['script', 'style', 'noscript', 'template'];
|
|
66
72
|
selectorsToRemove.forEach(selector => {
|
|
67
73
|
try {
|
|
68
|
-
contentClone
|
|
74
|
+
contentClone
|
|
75
|
+
.querySelectorAll(selector)
|
|
76
|
+
.forEach(el => el.remove());
|
|
69
77
|
}
|
|
70
|
-
catch
|
|
78
|
+
catch {
|
|
71
79
|
}
|
|
72
80
|
});
|
|
73
81
|
const mainContent = contentClone;
|
|
@@ -83,14 +91,16 @@ function extractContentManually(dom) {
|
|
|
83
91
|
siteName: null,
|
|
84
92
|
textContent: mainContent.textContent || '',
|
|
85
93
|
publishedTime: null,
|
|
86
|
-
baseUrl
|
|
94
|
+
baseUrl,
|
|
87
95
|
};
|
|
88
96
|
}
|
|
89
97
|
catch (error) {
|
|
90
98
|
console.error('Error in manual extraction:', error);
|
|
91
99
|
return {
|
|
92
100
|
title: 'Error extracting content',
|
|
93
|
-
content: dom.window.document.body?.innerHTML ||
|
|
101
|
+
content: dom.window.document.body?.innerHTML ||
|
|
102
|
+
dom.window.document.documentElement?.innerHTML ||
|
|
103
|
+
'',
|
|
94
104
|
byline: null,
|
|
95
105
|
excerpt: '',
|
|
96
106
|
dir: null,
|
|
@@ -99,7 +109,7 @@ function extractContentManually(dom) {
|
|
|
99
109
|
siteName: null,
|
|
100
110
|
textContent: dom.window.document.body?.textContent || '',
|
|
101
111
|
publishedTime: null,
|
|
102
|
-
baseUrl: dom.window.location.href
|
|
112
|
+
baseUrl: dom.window.location.href,
|
|
103
113
|
};
|
|
104
114
|
}
|
|
105
115
|
}
|
package/dist/parser/dom.js
CHANGED
|
@@ -5,24 +5,25 @@ export function htmlToDom(html, url) {
|
|
|
5
5
|
url,
|
|
6
6
|
contentType: 'text/html',
|
|
7
7
|
includeNodeLocations: false,
|
|
8
|
-
runScripts:
|
|
9
|
-
resources:
|
|
10
|
-
pretendToBeVisual: true
|
|
8
|
+
runScripts: undefined,
|
|
9
|
+
resources: undefined,
|
|
10
|
+
pretendToBeVisual: true,
|
|
11
|
+
virtualConsole: new JSDOM.VirtualConsole().sendTo(console, { omitJSDOMErrors: true }),
|
|
11
12
|
});
|
|
12
13
|
}
|
|
13
|
-
catch
|
|
14
|
-
console.error('Error parsing HTML with JSDOM, trying with minimal options:', error);
|
|
14
|
+
catch {
|
|
15
15
|
try {
|
|
16
16
|
return new JSDOM(html, {
|
|
17
17
|
url,
|
|
18
|
-
contentType: 'text/html'
|
|
18
|
+
contentType: 'text/html',
|
|
19
|
+
virtualConsole: new JSDOM.VirtualConsole().sendTo(console, { omitJSDOMErrors: true }),
|
|
19
20
|
});
|
|
20
21
|
}
|
|
21
|
-
catch
|
|
22
|
-
console.error('Fallback parsing also failed:', fallbackError);
|
|
22
|
+
catch {
|
|
23
23
|
return new JSDOM(`<!DOCTYPE html><html><body>${html}</body></html>`, {
|
|
24
24
|
url,
|
|
25
|
-
contentType: 'text/html'
|
|
25
|
+
contentType: 'text/html',
|
|
26
|
+
virtualConsole: new JSDOM.VirtualConsole().sendTo(console, { omitJSDOMErrors: true }),
|
|
26
27
|
});
|
|
27
28
|
}
|
|
28
29
|
}
|
|
@@ -32,7 +33,7 @@ export function extractLinks(dom) {
|
|
|
32
33
|
const links = [];
|
|
33
34
|
const baseUrl = dom.window.location.href;
|
|
34
35
|
const anchorElements = document.querySelectorAll('a[href]');
|
|
35
|
-
anchorElements.forEach(
|
|
36
|
+
anchorElements.forEach(element => {
|
|
36
37
|
try {
|
|
37
38
|
const href = element.getAttribute('href');
|
|
38
39
|
if (!href)
|
package/dist/parser/markdown.js
CHANGED
|
@@ -7,34 +7,41 @@ function convertRelativeUrls(html, baseUrl) {
|
|
|
7
7
|
const document = dom.window.document;
|
|
8
8
|
document.querySelectorAll('a[href]').forEach(link => {
|
|
9
9
|
const href = link.getAttribute('href');
|
|
10
|
-
if (href &&
|
|
11
|
-
!href.startsWith('
|
|
12
|
-
!href.startsWith('
|
|
10
|
+
if (href &&
|
|
11
|
+
!href.startsWith('http://') &&
|
|
12
|
+
!href.startsWith('https://') &&
|
|
13
|
+
!href.startsWith('//') &&
|
|
14
|
+
!href.startsWith('mailto:') &&
|
|
15
|
+
!href.startsWith('tel:') &&
|
|
16
|
+
!href.startsWith('javascript:') &&
|
|
13
17
|
!href.startsWith('#')) {
|
|
14
18
|
try {
|
|
15
19
|
const absoluteUrl = new URL(href, baseUrl).href;
|
|
16
20
|
link.setAttribute('href', absoluteUrl);
|
|
17
21
|
}
|
|
18
|
-
catch
|
|
22
|
+
catch {
|
|
19
23
|
}
|
|
20
24
|
}
|
|
21
25
|
});
|
|
22
26
|
document.querySelectorAll('img[src]').forEach(img => {
|
|
23
27
|
const src = img.getAttribute('src');
|
|
24
|
-
if (src &&
|
|
25
|
-
!src.startsWith('
|
|
28
|
+
if (src &&
|
|
29
|
+
!src.startsWith('http://') &&
|
|
30
|
+
!src.startsWith('https://') &&
|
|
31
|
+
!src.startsWith('//') &&
|
|
32
|
+
!src.startsWith('data:')) {
|
|
26
33
|
try {
|
|
27
34
|
const absoluteUrl = new URL(src, baseUrl).href;
|
|
28
35
|
img.setAttribute('src', absoluteUrl);
|
|
29
36
|
}
|
|
30
|
-
catch
|
|
37
|
+
catch {
|
|
31
38
|
}
|
|
32
39
|
}
|
|
33
40
|
});
|
|
34
41
|
const bodyElement = document.body || document.documentElement;
|
|
35
42
|
return bodyElement ? bodyElement.innerHTML : html;
|
|
36
43
|
}
|
|
37
|
-
catch
|
|
44
|
+
catch {
|
|
38
45
|
return html;
|
|
39
46
|
}
|
|
40
47
|
}
|
|
@@ -55,7 +62,7 @@ export function createTurndownService() {
|
|
|
55
62
|
},
|
|
56
63
|
defaultReplacement: (content, node) => {
|
|
57
64
|
return node.isBlock ? '\n\n' + content + '\n\n' : content;
|
|
58
|
-
}
|
|
65
|
+
},
|
|
59
66
|
});
|
|
60
67
|
turndown.use(gfm);
|
|
61
68
|
turndown.addRule('media', {
|
|
@@ -63,12 +70,14 @@ export function createTurndownService() {
|
|
|
63
70
|
replacement: (_content, node) => {
|
|
64
71
|
const element = node;
|
|
65
72
|
const src = element.getAttribute('src') || element.getAttribute('data-src');
|
|
66
|
-
const title = element.getAttribute('title') ||
|
|
73
|
+
const title = element.getAttribute('title') ||
|
|
74
|
+
element.getAttribute('alt') ||
|
|
75
|
+
'media';
|
|
67
76
|
if (src) {
|
|
68
77
|
return `\n\n[${title}](${src})\n\n`;
|
|
69
78
|
}
|
|
70
79
|
return '';
|
|
71
|
-
}
|
|
80
|
+
},
|
|
72
81
|
});
|
|
73
82
|
turndown.addRule('figure', {
|
|
74
83
|
filter: 'figure',
|
|
@@ -80,7 +89,7 @@ export function createTurndownService() {
|
|
|
80
89
|
return `\n\n${content.trim()}\n*${captionText}*\n\n`;
|
|
81
90
|
}
|
|
82
91
|
return `\n\n${content.trim()}\n\n`;
|
|
83
|
-
}
|
|
92
|
+
},
|
|
84
93
|
});
|
|
85
94
|
return turndown;
|
|
86
95
|
}
|
|
@@ -119,7 +128,9 @@ export function formatArticleMarkdown(article) {
|
|
|
119
128
|
markdown += tempDiv.textContent || article.content;
|
|
120
129
|
}
|
|
121
130
|
else {
|
|
122
|
-
markdown += article.content
|
|
131
|
+
markdown += article.content
|
|
132
|
+
.replace(/<[^>]*>/g, ' ')
|
|
133
|
+
.replace(/\s+/g, ' ');
|
|
123
134
|
}
|
|
124
135
|
}
|
|
125
136
|
return markdown
|
|
@@ -129,6 +140,8 @@ export function formatArticleMarkdown(article) {
|
|
|
129
140
|
}
|
|
130
141
|
catch (error) {
|
|
131
142
|
console.error('Fatal error in formatArticleMarkdown:', error);
|
|
132
|
-
return article.title
|
|
143
|
+
return article.title
|
|
144
|
+
? `# ${article.title}\n\n[Content extraction failed]`
|
|
145
|
+
: '[Content extraction failed]';
|
|
133
146
|
}
|
|
134
147
|
}
|
package/dist/serve.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { Server } from
|
|
3
|
-
import { StdioServerTransport } from
|
|
4
|
-
import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from
|
|
2
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
3
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
4
|
+
import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
5
5
|
let fetchMarkdownModule;
|
|
6
6
|
let fsPromises;
|
|
7
7
|
let pathModule;
|
|
8
8
|
const server = new Server({
|
|
9
|
-
name:
|
|
10
|
-
version:
|
|
9
|
+
name: 'read-website-fast',
|
|
10
|
+
version: '0.1.0',
|
|
11
11
|
}, {
|
|
12
12
|
capabilities: {
|
|
13
13
|
tools: {},
|
|
@@ -15,64 +15,83 @@ const server = new Server({
|
|
|
15
15
|
},
|
|
16
16
|
});
|
|
17
17
|
const READ_WEBSITE_TOOL = {
|
|
18
|
-
name:
|
|
19
|
-
description:
|
|
18
|
+
name: 'read_website_fast',
|
|
19
|
+
description: 'Quickly reads webpages and converts to markdown for fast, token efficient web scraping',
|
|
20
20
|
inputSchema: {
|
|
21
|
-
type:
|
|
21
|
+
type: 'object',
|
|
22
22
|
properties: {
|
|
23
23
|
url: {
|
|
24
|
-
type:
|
|
25
|
-
description:
|
|
24
|
+
type: 'string',
|
|
25
|
+
description: 'HTTP/HTTPS URL to fetch and convert to markdown',
|
|
26
26
|
},
|
|
27
27
|
depth: {
|
|
28
|
-
type:
|
|
29
|
-
description:
|
|
28
|
+
type: 'number',
|
|
29
|
+
description: 'Crawl depth (0 = single page)',
|
|
30
30
|
default: 0,
|
|
31
31
|
},
|
|
32
32
|
respectRobots: {
|
|
33
|
-
type:
|
|
34
|
-
description:
|
|
33
|
+
type: 'boolean',
|
|
34
|
+
description: 'Whether to respect robots.txt',
|
|
35
35
|
default: true,
|
|
36
36
|
},
|
|
37
37
|
},
|
|
38
|
-
required: [
|
|
38
|
+
required: ['url'],
|
|
39
39
|
},
|
|
40
40
|
};
|
|
41
41
|
const RESOURCES = [
|
|
42
42
|
{
|
|
43
|
-
uri:
|
|
44
|
-
name:
|
|
45
|
-
mimeType:
|
|
46
|
-
description:
|
|
43
|
+
uri: 'read-website-fast://status',
|
|
44
|
+
name: 'Cache Status',
|
|
45
|
+
mimeType: 'application/json',
|
|
46
|
+
description: 'Get cache status information',
|
|
47
47
|
},
|
|
48
48
|
{
|
|
49
|
-
uri:
|
|
50
|
-
name:
|
|
51
|
-
mimeType:
|
|
52
|
-
description:
|
|
49
|
+
uri: 'read-website-fast://clear-cache',
|
|
50
|
+
name: 'Clear Cache',
|
|
51
|
+
mimeType: 'application/json',
|
|
52
|
+
description: 'Clear the cache directory',
|
|
53
53
|
},
|
|
54
54
|
];
|
|
55
55
|
server.setRequestHandler(ListToolsRequestSchema, async () => ({
|
|
56
56
|
tools: [READ_WEBSITE_TOOL],
|
|
57
57
|
}));
|
|
58
58
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
59
|
-
if (request.params.name !==
|
|
59
|
+
if (request.params.name !== 'read_website_fast') {
|
|
60
60
|
throw new Error(`Unknown tool: ${request.params.name}`);
|
|
61
61
|
}
|
|
62
|
-
|
|
63
|
-
|
|
62
|
+
try {
|
|
63
|
+
if (!fetchMarkdownModule) {
|
|
64
|
+
fetchMarkdownModule = await import('./internal/fetchMarkdown.js');
|
|
65
|
+
}
|
|
66
|
+
const args = request.params.arguments;
|
|
67
|
+
if (!args.url || typeof args.url !== 'string') {
|
|
68
|
+
throw new Error('URL parameter is required and must be a string');
|
|
69
|
+
}
|
|
70
|
+
const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
|
|
71
|
+
depth: args.depth ?? 0,
|
|
72
|
+
respectRobots: args.respectRobots ?? true,
|
|
73
|
+
});
|
|
74
|
+
if (result.error && result.markdown) {
|
|
75
|
+
return {
|
|
76
|
+
content: [
|
|
77
|
+
{
|
|
78
|
+
type: 'text',
|
|
79
|
+
text: `${result.markdown}\n\n---\n*Note: ${result.error}*`,
|
|
80
|
+
},
|
|
81
|
+
],
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
if (result.error && !result.markdown) {
|
|
85
|
+
throw new Error(result.error);
|
|
86
|
+
}
|
|
87
|
+
return {
|
|
88
|
+
content: [{ type: 'text', text: result.markdown }],
|
|
89
|
+
};
|
|
64
90
|
}
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
respectRobots: args.respectRobots ?? true,
|
|
69
|
-
});
|
|
70
|
-
if (result.error) {
|
|
71
|
-
throw new Error(result.error);
|
|
91
|
+
catch (error) {
|
|
92
|
+
console.error('Tool execution error:', error);
|
|
93
|
+
throw new Error(`Failed to fetch content: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
72
94
|
}
|
|
73
|
-
return {
|
|
74
|
-
content: [{ type: "text", text: result.markdown }],
|
|
75
|
-
};
|
|
76
95
|
});
|
|
77
96
|
server.setRequestHandler(ListResourcesRequestSchema, async () => ({
|
|
78
97
|
resources: RESOURCES,
|
|
@@ -80,14 +99,14 @@ server.setRequestHandler(ListResourcesRequestSchema, async () => ({
|
|
|
80
99
|
server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
81
100
|
const uri = request.params.uri;
|
|
82
101
|
if (!fsPromises) {
|
|
83
|
-
fsPromises = await import(
|
|
102
|
+
fsPromises = await import('fs/promises');
|
|
84
103
|
}
|
|
85
104
|
if (!pathModule) {
|
|
86
|
-
pathModule = await import(
|
|
105
|
+
pathModule = await import('path');
|
|
87
106
|
}
|
|
88
|
-
if (uri ===
|
|
107
|
+
if (uri === 'read-website-fast://status') {
|
|
89
108
|
try {
|
|
90
|
-
const cacheDir =
|
|
109
|
+
const cacheDir = '.cache';
|
|
91
110
|
const files = await fsPromises.readdir(cacheDir).catch(() => []);
|
|
92
111
|
let totalSize = 0;
|
|
93
112
|
for (const file of files) {
|
|
@@ -102,7 +121,7 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
|
102
121
|
contents: [
|
|
103
122
|
{
|
|
104
123
|
uri,
|
|
105
|
-
mimeType:
|
|
124
|
+
mimeType: 'application/json',
|
|
106
125
|
text: JSON.stringify({
|
|
107
126
|
cacheSize: totalSize,
|
|
108
127
|
cacheFiles: files.length,
|
|
@@ -117,27 +136,29 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
|
117
136
|
contents: [
|
|
118
137
|
{
|
|
119
138
|
uri,
|
|
120
|
-
mimeType:
|
|
139
|
+
mimeType: 'application/json',
|
|
121
140
|
text: JSON.stringify({
|
|
122
|
-
error:
|
|
123
|
-
message: error instanceof Error
|
|
141
|
+
error: 'Failed to get cache status',
|
|
142
|
+
message: error instanceof Error
|
|
143
|
+
? error.message
|
|
144
|
+
: 'Unknown error',
|
|
124
145
|
}, null, 2),
|
|
125
146
|
},
|
|
126
147
|
],
|
|
127
148
|
};
|
|
128
149
|
}
|
|
129
150
|
}
|
|
130
|
-
if (uri ===
|
|
151
|
+
if (uri === 'read-website-fast://clear-cache') {
|
|
131
152
|
try {
|
|
132
|
-
await fsPromises.rm(
|
|
153
|
+
await fsPromises.rm('.cache', { recursive: true, force: true });
|
|
133
154
|
return {
|
|
134
155
|
contents: [
|
|
135
156
|
{
|
|
136
157
|
uri,
|
|
137
|
-
mimeType:
|
|
158
|
+
mimeType: 'application/json',
|
|
138
159
|
text: JSON.stringify({
|
|
139
|
-
status:
|
|
140
|
-
message:
|
|
160
|
+
status: 'success',
|
|
161
|
+
message: 'Cache cleared successfully',
|
|
141
162
|
}, null, 2),
|
|
142
163
|
},
|
|
143
164
|
],
|
|
@@ -148,10 +169,12 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
|
148
169
|
contents: [
|
|
149
170
|
{
|
|
150
171
|
uri,
|
|
151
|
-
mimeType:
|
|
172
|
+
mimeType: 'application/json',
|
|
152
173
|
text: JSON.stringify({
|
|
153
|
-
status:
|
|
154
|
-
message: error instanceof Error
|
|
174
|
+
status: 'error',
|
|
175
|
+
message: error instanceof Error
|
|
176
|
+
? error.message
|
|
177
|
+
: 'Failed to clear cache',
|
|
155
178
|
}, null, 2),
|
|
156
179
|
},
|
|
157
180
|
],
|
|
@@ -162,10 +185,35 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
|
162
185
|
});
|
|
163
186
|
async function runServer() {
|
|
164
187
|
const transport = new StdioServerTransport();
|
|
165
|
-
|
|
166
|
-
|
|
188
|
+
process.on('SIGINT', async () => {
|
|
189
|
+
console.error('Received SIGINT, shutting down gracefully...');
|
|
190
|
+
await server.close();
|
|
191
|
+
process.exit(0);
|
|
192
|
+
});
|
|
193
|
+
process.on('SIGTERM', async () => {
|
|
194
|
+
console.error('Received SIGTERM, shutting down gracefully...');
|
|
195
|
+
await server.close();
|
|
196
|
+
process.exit(0);
|
|
197
|
+
});
|
|
198
|
+
process.on('uncaughtException', error => {
|
|
199
|
+
console.error('Uncaught exception:', error);
|
|
200
|
+
process.exit(1);
|
|
201
|
+
});
|
|
202
|
+
process.on('unhandledRejection', (reason, promise) => {
|
|
203
|
+
console.error('Unhandled rejection at:', promise, 'reason:', reason);
|
|
204
|
+
process.exit(1);
|
|
205
|
+
});
|
|
206
|
+
try {
|
|
207
|
+
await server.connect(transport);
|
|
208
|
+
console.error('read-website-fast MCP server running');
|
|
209
|
+
process.stdin.resume();
|
|
210
|
+
}
|
|
211
|
+
catch (error) {
|
|
212
|
+
console.error('Failed to start server:', error);
|
|
213
|
+
process.exit(1);
|
|
214
|
+
}
|
|
167
215
|
}
|
|
168
|
-
runServer().catch(
|
|
169
|
-
console.error(
|
|
216
|
+
runServer().catch(error => {
|
|
217
|
+
console.error('Server initialization error:', error);
|
|
170
218
|
process.exit(1);
|
|
171
219
|
});
|
package/dist/utils/chunker.js
CHANGED
|
@@ -5,7 +5,7 @@ export class MarkdownChunker {
|
|
|
5
5
|
maxTokens: options.maxTokens ?? 0,
|
|
6
6
|
maxChars: options.maxChars ?? 4000,
|
|
7
7
|
splitOn: options.splitOn ?? 'heading',
|
|
8
|
-
overlap: options.overlap ?? 200
|
|
8
|
+
overlap: options.overlap ?? 200,
|
|
9
9
|
};
|
|
10
10
|
}
|
|
11
11
|
chunk(markdown) {
|
|
@@ -36,8 +36,8 @@ export class MarkdownChunker {
|
|
|
36
36
|
metadata: {
|
|
37
37
|
headings: [...currentHeadings],
|
|
38
38
|
startLine,
|
|
39
|
-
endLine: i - 1
|
|
40
|
-
}
|
|
39
|
+
endLine: i - 1,
|
|
40
|
+
},
|
|
41
41
|
});
|
|
42
42
|
const overlapLines = this.getOverlapLines(currentChunk);
|
|
43
43
|
currentChunk = [...overlapLines, line];
|
|
@@ -58,8 +58,8 @@ export class MarkdownChunker {
|
|
|
58
58
|
metadata: {
|
|
59
59
|
headings: [...currentHeadings],
|
|
60
60
|
startLine,
|
|
61
|
-
endLine: i
|
|
62
|
-
}
|
|
61
|
+
endLine: i,
|
|
62
|
+
},
|
|
63
63
|
});
|
|
64
64
|
const overlapLines = this.getOverlapLines(currentChunk);
|
|
65
65
|
currentChunk = [...overlapLines];
|
|
@@ -74,8 +74,8 @@ export class MarkdownChunker {
|
|
|
74
74
|
metadata: {
|
|
75
75
|
headings: currentHeadings,
|
|
76
76
|
startLine,
|
|
77
|
-
endLine: lines.length - 1
|
|
78
|
-
}
|
|
77
|
+
endLine: lines.length - 1,
|
|
78
|
+
},
|
|
79
79
|
});
|
|
80
80
|
}
|
|
81
81
|
return chunks;
|
|
@@ -85,11 +85,12 @@ export class MarkdownChunker {
|
|
|
85
85
|
const paragraphs = markdown.split(/\n\n+/);
|
|
86
86
|
let currentChunk = [];
|
|
87
87
|
for (const paragraph of paragraphs) {
|
|
88
|
-
const wouldExceedLimit = currentChunk.join('\n\n').length + paragraph.length >
|
|
88
|
+
const wouldExceedLimit = currentChunk.join('\n\n').length + paragraph.length >
|
|
89
|
+
this.options.maxChars;
|
|
89
90
|
if (wouldExceedLimit && currentChunk.length > 0) {
|
|
90
91
|
chunks.push({
|
|
91
92
|
content: currentChunk.join('\n\n').trim(),
|
|
92
|
-
index: chunks.length
|
|
93
|
+
index: chunks.length,
|
|
93
94
|
});
|
|
94
95
|
currentChunk = [];
|
|
95
96
|
}
|
|
@@ -98,7 +99,7 @@ export class MarkdownChunker {
|
|
|
98
99
|
if (currentChunk.length > 0) {
|
|
99
100
|
chunks.push({
|
|
100
101
|
content: currentChunk.join('\n\n').trim(),
|
|
101
|
-
index: chunks.length
|
|
102
|
+
index: chunks.length,
|
|
102
103
|
});
|
|
103
104
|
}
|
|
104
105
|
return chunks;
|
|
@@ -108,11 +109,12 @@ export class MarkdownChunker {
|
|
|
108
109
|
const sentences = markdown.match(/[^.!?]+[.!?]+/g) || [markdown];
|
|
109
110
|
let currentChunk = [];
|
|
110
111
|
for (const sentence of sentences) {
|
|
111
|
-
const wouldExceedLimit = currentChunk.join(' ').length + sentence.length >
|
|
112
|
+
const wouldExceedLimit = currentChunk.join(' ').length + sentence.length >
|
|
113
|
+
this.options.maxChars;
|
|
112
114
|
if (wouldExceedLimit && currentChunk.length > 0) {
|
|
113
115
|
chunks.push({
|
|
114
116
|
content: currentChunk.join(' ').trim(),
|
|
115
|
-
index: chunks.length
|
|
117
|
+
index: chunks.length,
|
|
116
118
|
});
|
|
117
119
|
currentChunk = [];
|
|
118
120
|
}
|
|
@@ -121,7 +123,7 @@ export class MarkdownChunker {
|
|
|
121
123
|
if (currentChunk.length > 0) {
|
|
122
124
|
chunks.push({
|
|
123
125
|
content: currentChunk.join(' ').trim(),
|
|
124
|
-
index: chunks.length
|
|
126
|
+
index: chunks.length,
|
|
125
127
|
});
|
|
126
128
|
}
|
|
127
129
|
return chunks;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@just-every/mcp-read-website-fast",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.7",
|
|
4
4
|
"description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -67,8 +67,11 @@
|
|
|
67
67
|
"@typescript-eslint/eslint-plugin": "^8.34.0",
|
|
68
68
|
"@typescript-eslint/parser": "^8.34.0",
|
|
69
69
|
"eslint": "^9.28.0",
|
|
70
|
+
"eslint-config-prettier": "^10.1.5",
|
|
71
|
+
"eslint-plugin-prettier": "^5.4.1",
|
|
70
72
|
"tsx": "^4.7.0",
|
|
71
73
|
"typescript": "^5.3.3",
|
|
74
|
+
"typescript-eslint": "^8.34.0",
|
|
72
75
|
"vitest": "^3.2.3"
|
|
73
76
|
},
|
|
74
77
|
"engines": {
|