@just-every/mcp-read-website-fast 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +165 -0
- package/bin/mcp-read-website.js +49 -0
- package/dist/cache/disk.d.ts +12 -0
- package/dist/cache/disk.js +54 -0
- package/dist/cache/normalize.d.ts +2 -0
- package/dist/cache/normalize.js +31 -0
- package/dist/crawler/fetch.d.ts +8 -0
- package/dist/crawler/fetch.js +42 -0
- package/dist/crawler/queue.d.ts +14 -0
- package/dist/crawler/queue.js +142 -0
- package/dist/crawler/robots.d.ts +8 -0
- package/dist/crawler/robots.js +47 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +99 -0
- package/dist/internal/fetchMarkdown.d.ts +16 -0
- package/dist/internal/fetchMarkdown.js +36 -0
- package/dist/parser/article.d.ts +4 -0
- package/dist/parser/article.js +115 -0
- package/dist/parser/dom.d.ts +3 -0
- package/dist/parser/dom.js +53 -0
- package/dist/parser/markdown.d.ts +9 -0
- package/dist/parser/markdown.js +134 -0
- package/dist/serve.d.ts +2 -0
- package/dist/serve.js +171 -0
- package/dist/utils/chunker.d.ts +26 -0
- package/dist/utils/chunker.js +146 -0
- package/dist/utils/logger.d.ts +18 -0
- package/dist/utils/logger.js +52 -0
- package/package.json +71 -0
- package/tsconfig.json +24 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
#!/usr/bin/env node
// CLI entry point: exposes fetch / clear-cache / serve subcommands around
// the crawler. Progress and errors go to stderr; converted output to stdout.
import { Command } from 'commander';
import { CrawlQueue } from './crawler/queue.js';
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
// Resolve package.json relative to this module so --version tracks the
// installed package version.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const packageJson = JSON.parse(readFileSync(join(__dirname, '../package.json'), 'utf-8'));
const program = new Command();
program
    .name('mcp')
    .description('Markdown Content Preprocessor - Extract and convert web content to clean Markdown')
    .version(packageJson.version);
program
    .command('fetch <url>')
    .description('Fetch a URL and convert to Markdown')
    .option('-d, --depth <number>', 'Crawl depth (0 = single page)', '0')
    .option('-c, --concurrency <number>', 'Max concurrent requests', '3')
    .option('--no-robots', 'Ignore robots.txt')
    .option('--all-origins', 'Allow cross-origin crawling')
    .option('-u, --user-agent <string>', 'Custom user agent')
    .option('--cache-dir <path>', 'Cache directory', '.cache')
    .option('-t, --timeout <ms>', 'Request timeout in milliseconds', '30000')
    .option('-o, --output <format>', 'Output format: json, markdown, or both', 'markdown')
    .action(async (url, options) => {
        try {
            const crawlOptions = {
                depth: parseInt(options.depth, 10),
                maxConcurrency: parseInt(options.concurrency, 10),
                // commander maps --no-robots onto options.robots === false
                respectRobots: options.robots,
                sameOriginOnly: !options.allOrigins,
                userAgent: options.userAgent,
                cacheDir: options.cacheDir,
                timeout: parseInt(options.timeout, 10)
            };
            const queue = new CrawlQueue(crawlOptions);
            await queue.init();
            // Progress goes to stderr so stdout stays clean for piping.
            console.error(`Fetching ${url}...`);
            const results = await queue.crawl(url);
            if (options.output === 'json') {
                console.log(JSON.stringify(results, null, 2));
            }
            else if (options.output === 'markdown') {
                results.forEach((result, index) => {
                    if (result.error) {
                        console.error(`Error for ${result.url}: ${result.error}`);
                    }
                    else if (result.markdown) {
                        console.log(result.markdown);
                        // BUGFIX: emit the "---" separator only BETWEEN pages.
                        // Previously it was also printed after the final page,
                        // leaving a trailing horizontal rule in the output.
                        if (results.length > 1 && index < results.length - 1) {
                            console.log('\n---\n');
                        }
                    }
                });
            }
            else if (options.output === 'both') {
                results.forEach(result => {
                    console.log(`\n## URL: ${result.url}\n`);
                    if (result.error) {
                        console.error(`Error: ${result.error}`);
                    }
                    else {
                        console.log(result.markdown);
                    }
                });
            }
            // Non-zero exit when any page failed so shell scripts can detect it.
            const hasErrors = results.some(r => r.error);
            if (hasErrors) {
                process.exit(1);
            }
        }
        catch (error) {
            console.error('Error:', error instanceof Error ? error.message : error);
            process.exit(1);
        }
    });
program
    .command('clear-cache')
    .description('Clear the cache directory')
    .option('--cache-dir <path>', 'Cache directory', '.cache')
    .action(async (options) => {
        try {
            const { rm } = await import('fs/promises');
            await rm(options.cacheDir, { recursive: true, force: true });
            console.log(`Cache cleared: ${options.cacheDir}`);
        }
        catch (error) {
            console.error('Error clearing cache:', error);
            process.exit(1);
        }
    });
program
    .command('serve')
    .description('Run as an MCP server')
    .action(async () => {
        // serve.js starts the MCP stdio server on import.
        await import('./serve.js');
    });
program.parse();
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/** Options accepted by {@link fetchMarkdown}. Defaults mirror fetchMarkdown.js. */
export interface FetchMarkdownOptions {
    /** Crawl depth; 0 (the default) fetches only the given page. */
    depth?: number;
    /** Maximum number of concurrent requests (default 3). */
    maxConcurrency?: number;
    /** Whether to honour robots.txt (default true). */
    respectRobots?: boolean;
    /** Restrict crawling to the starting page's origin (default true). */
    sameOriginOnly?: boolean;
    /** Custom User-Agent header for outgoing requests. */
    userAgent?: string;
    /** Directory used for the on-disk cache (default ".cache"). */
    cacheDir?: string;
    /** Per-request timeout in milliseconds (default 30000). */
    timeout?: number;
}
/** Result of a {@link fetchMarkdown} call; failures are reported via `error`. */
export interface FetchMarkdownResult {
    /** Converted Markdown; empty string when the fetch failed. */
    markdown: string;
    /** Extracted page title, when one was found. */
    title?: string;
    /** Absolute URLs discovered on the page. */
    links?: string[];
    /** Error message when fetching or conversion failed. */
    error?: string;
}
/** Fetch `url` and convert its main content to Markdown. Never rejects. */
export declare function fetchMarkdown(url: string, options?: FetchMarkdownOptions): Promise<FetchMarkdownResult>;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { CrawlQueue } from '../crawler/queue.js';
/**
 * Fetch a single URL (optionally crawling linked pages) and return its
 * Markdown conversion. Failures are reported through the `error` field of
 * the result; this function never throws.
 */
export async function fetchMarkdown(url, options = {}) {
    try {
        // Fill in defaults for anything the caller left nullish.
        const queue = new CrawlQueue({
            depth: options.depth ?? 0,
            maxConcurrency: options.maxConcurrency ?? 3,
            respectRobots: options.respectRobots ?? true,
            sameOriginOnly: options.sameOriginOnly ?? true,
            userAgent: options.userAgent,
            cacheDir: options.cacheDir ?? '.cache',
            timeout: options.timeout ?? 30000
        });
        await queue.init();
        // Only the first (root) result is surfaced to the caller.
        const [first] = await queue.crawl(url);
        if (!first) {
            return { markdown: '', error: 'No results returned' };
        }
        return {
            markdown: first.markdown,
            title: first.title,
            links: first.links,
            error: first.error
        };
    }
    catch (error) {
        return {
            markdown: '',
            error: error instanceof Error ? error.message : 'Unknown error'
        };
    }
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { Readability } from '@mozilla/readability';
|
|
2
|
+
/**
 * Extract the main article from a JSDOM instance.
 *
 * Mozilla Readability is only attempted when the page shows strong article
 * signals; when it is skipped, or yields fewer than ~500 characters of
 * content, we fall back to manual extraction of the whole body.
 */
export function extractArticle(dom) {
    const doc = dom.window.document;
    const baseUrl = dom.window.location.href;
    if (looksLikeArticle(doc)) {
        // Readability mutates its input, so parse a clone.
        const clone = doc.cloneNode(true);
        const parsed = new Readability(clone).parse();
        if (parsed && parsed.content && parsed.content.trim().length > 500) {
            return {
                title: parsed.title || 'Untitled',
                content: parsed.content || '',
                textContent: parsed.textContent || '',
                length: parsed.length || 0,
                excerpt: parsed.excerpt || '',
                byline: parsed.byline || null,
                dir: parsed.dir || null,
                lang: parsed.lang || null,
                siteName: parsed.siteName || null,
                publishedTime: parsed.publishedTime || null,
                baseUrl
            };
        }
    }
    return extractContentManually(dom);
}
// True when the document carries strong signals of long-form article content:
// a substantial <article> paragraph, schema.org article markup, or an
// article:published_time meta tag.
function looksLikeArticle(doc) {
    const firstParagraph = doc.querySelector('article p');
    const hasLongArticleText = doc.querySelector('article') !== null &&
        !!firstParagraph?.textContent && firstParagraph.textContent.length > 200;
    return hasLongArticleText ||
        doc.querySelector('[itemtype*="BlogPosting"]') !== null ||
        doc.querySelector('[itemtype*="NewsArticle"]') !== null ||
        doc.querySelector('meta[property="article:published_time"]') !== null;
}
|
|
33
|
+
// Fallback extractor used when Readability is skipped or yields too little:
// returns the page <body> (with scripts/styles stripped) as the "article",
// with best-effort title and byline pulled from common markup conventions.
function extractContentManually(dom) {
    try {
        const document = dom.window.document;
        const baseUrl = dom.window.location.href;
        // Title fallback chain: <title>, first <h1>, OpenGraph, then meta name.
        const title = document.querySelector('title')?.textContent ||
            document.querySelector('h1')?.textContent ||
            document.querySelector('meta[property="og:title"]')?.getAttribute('content') ||
            document.querySelector('meta[name="title"]')?.getAttribute('content') ||
            'Untitled Page';
        // Author fallback chain: meta author, rel=author link, .author element.
        const byline = document.querySelector('meta[name="author"]')?.getAttribute('content') ||
            document.querySelector('[rel="author"]')?.textContent ||
            document.querySelector('.author')?.textContent ||
            null;
        // Degenerate documents without a <body>: use the whole document element.
        if (!document.body) {
            const html = document.documentElement?.innerHTML || '';
            return {
                title: title.trim(),
                content: html,
                byline,
                excerpt: '',
                dir: null,
                lang: document.documentElement?.lang || null,
                length: html.length,
                siteName: null,
                textContent: document.documentElement?.textContent || '',
                publishedTime: null,
                baseUrl
            };
        }
        // Work on a clone so the live DOM is left untouched.
        const contentClone = document.body.cloneNode(true);
        // Elements that never contribute readable content.
        const selectorsToRemove = [
            'script', 'style', 'noscript', 'template'
        ];
        selectorsToRemove.forEach(selector => {
            try {
                contentClone.querySelectorAll(selector).forEach(el => el.remove());
            }
            catch (e) {
                // Removal is best-effort; ignore selector failures.
            }
        });
        const mainContent = contentClone;
        const content = mainContent.innerHTML || mainContent.textContent || '';
        return {
            title: title.trim(),
            content,
            byline,
            excerpt: '',
            dir: null,
            lang: document.documentElement?.lang || null,
            length: content.length,
            siteName: null,
            textContent: mainContent.textContent || '',
            publishedTime: null,
            baseUrl
        };
    }
    catch (error) {
        // Last resort: surface the raw body/document HTML with an error title
        // rather than failing the whole crawl.
        console.error('Error in manual extraction:', error);
        return {
            title: 'Error extracting content',
            content: dom.window.document.body?.innerHTML || dom.window.document.documentElement?.innerHTML || '',
            byline: null,
            excerpt: '',
            dir: null,
            lang: null,
            length: 0,
            siteName: null,
            textContent: dom.window.document.body?.textContent || '',
            publishedTime: null,
            baseUrl: dom.window.location.href
        };
    }
}
|
|
106
|
+
/**
 * Heuristic check that an HTML string carries real, renderable content.
 *
 * Pages that contain <noscript> but no <article>/<main> landmark are treated
 * as JavaScript-only shells and rejected outright; otherwise the page must
 * have more than 100 characters of text once tags are stripped.
 */
export function hasContent(html) {
    const lowered = html.toLowerCase();
    const looksLikeJsShell =
        lowered.includes('<noscript>') &&
        !lowered.includes('<article') &&
        !lowered.includes('<main');
    if (looksLikeJsShell) {
        return false;
    }
    const visibleText = html.replace(/<[^>]*>/g, '').trim();
    return visibleText.length > 100;
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { JSDOM } from 'jsdom';
|
|
2
|
+
/**
 * Parse an HTML string into a JSDOM instance rooted at `url`.
 *
 * Tries a full-featured configuration first; if that throws, retries with a
 * minimal configuration, and as a last resort wraps the input in a bare HTML
 * skeleton so callers always get a DOM back.
 */
export function htmlToDom(html, url) {
    try {
        return new JSDOM(html, {
            url,
            contentType: 'text/html',
            includeNodeLocations: false,
            runScripts: 'outside-only',
            resources: 'usable',
            pretendToBeVisual: true
        });
    }
    catch (error) {
        console.error('Error parsing HTML with JSDOM, trying with minimal options:', error);
    }
    // First parse threw — retry without the optional features.
    try {
        return new JSDOM(html, {
            url,
            contentType: 'text/html'
        });
    }
    catch (fallbackError) {
        console.error('Fallback parsing also failed:', fallbackError);
        // Wrap the raw input so even malformed fragments produce a document.
        return new JSDOM(`<!DOCTYPE html><html><body>${html}</body></html>`, {
            url,
            contentType: 'text/html'
        });
    }
}
|
|
30
|
+
/**
 * Collect every crawlable absolute URL from the document's anchors.
 *
 * Skips mailto:/tel:/javascript:/fragment links, resolves relative hrefs
 * against the page URL, and de-duplicates while preserving first-seen order.
 */
export function extractLinks(dom) {
    const baseUrl = dom.window.location.href;
    const seen = new Set();
    const isSkippable = (href) =>
        href.startsWith('mailto:') ||
        href.startsWith('tel:') ||
        href.startsWith('javascript:') ||
        href.startsWith('#');
    dom.window.document.querySelectorAll('a[href]').forEach((anchor) => {
        const href = anchor.getAttribute('href');
        if (!href || isSkippable(href)) {
            return;
        }
        try {
            seen.add(new URL(href, baseUrl).href);
        }
        catch {
            // Malformed href — ignore and continue with the rest.
        }
    });
    return [...seen];
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
/** Create a Turndown instance configured for GFM-flavoured Markdown output. */
export declare function createTurndownService(): TurndownService;
/** Convert an HTML string to normalized Markdown (collapsed blank lines, trimmed). */
export declare function htmlToMarkdown(html: string): string;
/**
 * Render an extracted article as a complete Markdown document:
 * "# title", optional "*By byline*" header, then the converted content.
 * Relative URLs in `content` are resolved against `baseUrl` when provided.
 */
export declare function formatArticleMarkdown(article: {
    title: string;
    content: string;
    byline?: string | null;
    baseUrl?: string;
}): string;
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
import { gfm } from 'turndown-plugin-gfm';
|
|
3
|
+
import { JSDOM } from 'jsdom';
|
|
4
|
+
/**
 * Rewrite relative link (<a href>) and image (<img src>) URLs in an HTML
 * fragment to absolute URLs resolved against `baseUrl`. Returns the input
 * unchanged if the fragment cannot be parsed.
 */
function convertRelativeUrls(html, baseUrl) {
    // hrefs in these schemes (or already absolute/protocol-relative) are
    // left alone.
    const hrefIsAbsolute = (value) =>
        value.startsWith('http://') || value.startsWith('https://') ||
        value.startsWith('//') || value.startsWith('mailto:') ||
        value.startsWith('tel:') || value.startsWith('javascript:') ||
        value.startsWith('#');
    const srcIsAbsolute = (value) =>
        value.startsWith('http://') || value.startsWith('https://') ||
        value.startsWith('//') || value.startsWith('data:');
    try {
        const dom = new JSDOM(html, { url: baseUrl });
        const doc = dom.window.document;
        for (const link of doc.querySelectorAll('a[href]')) {
            const href = link.getAttribute('href');
            if (href && !hrefIsAbsolute(href)) {
                try {
                    link.setAttribute('href', new URL(href, baseUrl).href);
                }
                catch (e) {
                    // Unresolvable href — leave it as-is.
                }
            }
        }
        for (const img of doc.querySelectorAll('img[src]')) {
            const src = img.getAttribute('src');
            if (src && !srcIsAbsolute(src)) {
                try {
                    img.setAttribute('src', new URL(src, baseUrl).href);
                }
                catch (e) {
                    // Unresolvable src — leave it as-is.
                }
            }
        }
        const root = doc.body || doc.documentElement;
        return root ? root.innerHTML : html;
    }
    catch (e) {
        // Parsing failed entirely — fall back to the original fragment.
        return html;
    }
}
|
|
41
|
+
/**
 * Build a Turndown converter configured for clean, GFM-flavoured Markdown.
 * Adds custom rules for embedded media (iframe/video/audio/embed) and for
 * <figure>/<figcaption> pairs.
 */
export function createTurndownService() {
    const turndown = new TurndownService({
        headingStyle: 'atx', // "# Heading" rather than setext underlines
        codeBlockStyle: 'fenced', // ``` fences instead of 4-space indents
        linkStyle: 'inlined',
        emDelimiter: '_',
        bulletListMarker: '-',
        strongDelimiter: '**',
        hr: '---',
        // Preserve paragraph breaks for blank/kept/unknown block nodes
        // instead of silently dropping them.
        blankReplacement: (_content, node) => {
            return node.isBlock ? '\n\n' : '';
        },
        keepReplacement: (content, node) => {
            return node.isBlock ? '\n\n' + content + '\n\n' : content;
        },
        defaultReplacement: (content, node) => {
            return node.isBlock ? '\n\n' + content + '\n\n' : content;
        }
    });
    // GitHub-Flavoured Markdown: tables, strikethrough, task lists, etc.
    turndown.use(gfm);
    // Represent embedded media as a plain Markdown link to its source
    // (checking data-src for lazily-loaded elements).
    turndown.addRule('media', {
        filter: ['iframe', 'video', 'audio', 'embed'],
        replacement: (_content, node) => {
            const element = node;
            const src = element.getAttribute('src') || element.getAttribute('data-src');
            const title = element.getAttribute('title') || element.getAttribute('alt') || 'media';
            if (src) {
                return `\n\n[${title}](${src})\n\n`;
            }
            return '';
        }
    });
    // Render <figure> content followed by an italicised <figcaption>.
    turndown.addRule('figure', {
        filter: 'figure',
        replacement: (content, node) => {
            const figure = node;
            const caption = figure.querySelector('figcaption');
            if (caption) {
                const captionText = caption.textContent || '';
                return `\n\n${content.trim()}\n*${captionText}*\n\n`;
            }
            return `\n\n${content.trim()}\n\n`;
        }
    });
    return turndown;
}
|
|
87
|
+
/**
 * Convert an HTML string to normalized Markdown: runs of 3+ newlines are
 * collapsed to a blank line, trailing whitespace is stripped per line, and
 * the whole result is trimmed.
 */
export function htmlToMarkdown(html) {
    const raw = createTurndownService().turndown(html);
    return raw
        .replace(/\n{3,}/g, '\n\n')
        .replace(/\s+$/gm, '')
        .trim();
}
|
|
96
|
+
/**
 * Render an extracted article object as a Markdown document:
 * "# title", optional "*By byline*" header, then the converted body.
 * Never throws — degrades first to plain text, then to a failure placeholder.
 */
export function formatArticleMarkdown(article) {
    try {
        const turndown = createTurndownService();
        let markdown = '';
        if (article.title && article.title.trim()) {
            markdown = `# ${article.title}\n\n`;
        }
        if (article.byline) {
            markdown += `*By ${article.byline}*\n\n---\n\n`;
        }
        try {
            // Make links/images absolute first when the page URL is known.
            const processedContent = article.baseUrl
                ? convertRelativeUrls(article.content, article.baseUrl)
                : article.content;
            markdown += turndown.turndown(processedContent);
        }
        catch (conversionError) {
            console.error('Error converting HTML to markdown:', conversionError);
            // Degrade to plain text. A DOM `document` global only exists in
            // browser-like environments; otherwise strip tags with a regex.
            const tempDiv = typeof document !== 'undefined'
                ? document.createElement('div')
                : null;
            if (tempDiv) {
                tempDiv.innerHTML = article.content;
                markdown += tempDiv.textContent || article.content;
            }
            else {
                markdown += article.content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ');
            }
        }
        // Normalize whitespace: collapse blank runs, strip trailing spaces.
        return markdown
            .replace(/\n{3,}/g, '\n\n')
            .replace(/\s+$/gm, '')
            .trim();
    }
    catch (error) {
        console.error('Fatal error in formatArticleMarkdown:', error);
        return article.title ? `# ${article.title}\n\n[Content extraction failed]` : '[Content extraction failed]';
    }
}
|
package/dist/serve.d.ts
ADDED
package/dist/serve.js
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
#!/usr/bin/env node
// MCP (Model Context Protocol) server exposing the crawler over stdio.
// Registers one tool (read_website_fast) and two cache-management resources.
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
// Lazily-imported modules: loaded on first use to keep server startup fast.
let fetchMarkdownModule;
let fsPromises;
let pathModule;
// NOTE(review): version "0.1.0" is hard-coded here while index.js reads its
// version from package.json — confirm and consider doing the same here.
const server = new Server({
    name: "read-website-fast",
    version: "0.1.0",
}, {
    capabilities: {
        tools: {},
        resources: {},
    },
});
// Tool definition advertised to MCP clients via tools/list.
const READ_WEBSITE_TOOL = {
    name: "read_website_fast",
    description: "Quickly reads webpages and converts to markdown for fast, token efficient web scraping",
    inputSchema: {
        type: "object",
        properties: {
            url: {
                type: "string",
                description: "HTTP/HTTPS URL to fetch and convert to markdown",
            },
            depth: {
                type: "number",
                description: "Crawl depth (0 = single page)",
                default: 0,
            },
            respectRobots: {
                type: "boolean",
                description: "Whether to respect robots.txt",
                default: true,
            },
        },
        required: ["url"],
    },
};
// Resources for cache inspection and maintenance, addressed by custom URIs.
const RESOURCES = [
    {
        uri: "read-website-fast://status",
        name: "Cache Status",
        mimeType: "application/json",
        description: "Get cache status information",
    },
    {
        uri: "read-website-fast://clear-cache",
        name: "Clear Cache",
        mimeType: "application/json",
        description: "Clear the cache directory",
    },
];
server.setRequestHandler(ListToolsRequestSchema, async () => ({
    tools: [READ_WEBSITE_TOOL],
}));
// tools/call: run the crawler and return the page as Markdown text.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    if (request.params.name !== "read_website_fast") {
        throw new Error(`Unknown tool: ${request.params.name}`);
    }
    if (!fetchMarkdownModule) {
        fetchMarkdownModule = await import("./internal/fetchMarkdown.js");
    }
    const args = request.params.arguments;
    const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
        depth: args.depth ?? 0,
        respectRobots: args.respectRobots ?? true,
    });
    // fetchMarkdown reports failures via result.error rather than throwing;
    // re-raise so the MCP client sees a tool error.
    if (result.error) {
        throw new Error(result.error);
    }
    return {
        content: [{ type: "text", text: result.markdown }],
    };
});
server.setRequestHandler(ListResourcesRequestSchema, async () => ({
    resources: RESOURCES,
}));
// resources/read: serve cache status or clear the cache on demand.
server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
    const uri = request.params.uri;
    if (!fsPromises) {
        fsPromises = await import("fs/promises");
    }
    if (!pathModule) {
        pathModule = await import("path");
    }
    if (uri === "read-website-fast://status") {
        try {
            const cacheDir = ".cache";
            // Missing cache dir is treated as an empty cache, not an error.
            const files = await fsPromises.readdir(cacheDir).catch(() => []);
            let totalSize = 0;
            for (const file of files) {
                const stats = await fsPromises
                    .stat(pathModule.join(cacheDir, file))
                    .catch(() => null);
                if (stats) {
                    totalSize += stats.size;
                }
            }
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            cacheSize: totalSize,
                            cacheFiles: files.length,
                            cacheSizeFormatted: `${(totalSize / 1024 / 1024).toFixed(2)} MB`,
                        }, null, 2),
                    },
                ],
            };
        }
        catch (error) {
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            error: "Failed to get cache status",
                            message: error instanceof Error ? error.message : "Unknown error",
                        }, null, 2),
                    },
                ],
            };
        }
    }
    if (uri === "read-website-fast://clear-cache") {
        try {
            await fsPromises.rm(".cache", { recursive: true, force: true });
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            status: "success",
                            message: "Cache cleared successfully",
                        }, null, 2),
                    },
                ],
            };
        }
        catch (error) {
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            status: "error",
                            message: error instanceof Error ? error.message : "Failed to clear cache",
                        }, null, 2),
                    },
                ],
            };
        }
    }
    throw new Error(`Unknown resource: ${uri}`);
});
// Connect the server to stdio; logging goes to stderr because stdout
// carries the MCP protocol stream.
async function runServer() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    console.error("read-website-fast MCP server running");
}
runServer().catch((error) => {
    console.error("Server error:", error);
    process.exit(1);
});
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/** Tuning options for {@link MarkdownChunker}. */
export interface ChunkOptions {
    /** Cap on estimated tokens per chunk. */
    maxTokens?: number;
    /** Cap on characters per chunk. */
    maxChars?: number;
    /** Boundary granularity used when splitting the document. */
    splitOn?: 'heading' | 'paragraph' | 'sentence';
    /** Overlap carried between adjacent chunks — presumably in lines, given getOverlapLines; confirm against chunker.js. */
    overlap?: number;
}
/** One piece of a chunked Markdown document. */
export interface Chunk {
    /** The chunk's Markdown text. */
    content: string;
    /** Position of the chunk within the document. */
    index: number;
    /** Estimated token count, when computed. */
    tokens?: number;
    metadata?: {
        /** Headings associated with this chunk. */
        headings?: string[];
        /** First source line covered by the chunk, when tracked. */
        startLine?: number;
        /** Last source line covered by the chunk, when tracked. */
        endLine?: number;
    };
}
/** Splits Markdown into size-bounded chunks along structural boundaries. */
export declare class MarkdownChunker {
    private options;
    constructor(options?: ChunkOptions);
    /** Split `markdown` into chunks according to the configured options. */
    chunk(markdown: string): Chunk[];
    private chunkByHeading;
    private chunkByParagraph;
    private chunkBySentence;
    private getOverlapLines;
    /** Rough token estimate for `text` — heuristic; see chunker.js for the formula. */
    estimateTokens(text: string): number;
}
|