@just-every/mcp-read-website-fast 0.1.10 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/bin/mcp-read-website.js +2 -2
- package/dist/index.js +2 -4
- package/dist/internal/fetchMarkdown.js +2 -4
- package/dist/serve-restart.d.ts +2 -0
- package/dist/serve-restart.js +105 -0
- package/dist/serve.js +130 -48
- package/dist/utils/logger.js +2 -2
- package/package.json +5 -11
- package/dist/cache/disk.d.ts +0 -12
- package/dist/cache/disk.js +0 -54
- package/dist/cache/normalize.d.ts +0 -2
- package/dist/cache/normalize.js +0 -31
- package/dist/crawler/fetch.d.ts +0 -8
- package/dist/crawler/fetch.js +0 -43
- package/dist/crawler/queue.d.ts +0 -14
- package/dist/crawler/queue.js +0 -148
- package/dist/crawler/robots.d.ts +0 -8
- package/dist/crawler/robots.js +0 -47
- package/dist/parser/article.d.ts +0 -4
- package/dist/parser/article.js +0 -125
- package/dist/parser/dom.d.ts +0 -3
- package/dist/parser/dom.js +0 -60
- package/dist/parser/markdown.d.ts +0 -9
- package/dist/parser/markdown.js +0 -147
package/README.md
CHANGED
|
@@ -11,6 +11,8 @@ Existing MCP web crawlers are slow and consume large quantities of tokens. This
|
|
|
11
11
|
|
|
12
12
|
This MCP package fetches web pages locally, strips noise, and converts content to clean Markdown while preserving links. Designed for Claude Code, IDEs and LLM pipelines with minimal token footprint. Crawl sites locally with minimal dependencies.
|
|
13
13
|
|
|
14
|
+
**Note:** This package now uses [@just-every/crawl](https://www.npmjs.com/package/@just-every/crawl) for its core crawling and markdown conversion functionality.
|
|
15
|
+
|
|
14
16
|
## Features
|
|
15
17
|
|
|
16
18
|
- **Fast startup** using official MCP SDK with lazy loading for optimal performance
|
|
@@ -86,7 +88,7 @@ Drop this into your client’s mcp.json (e.g. .vscode/mcp.json, ~/.cursor/mcp.js
|
|
|
86
88
|
|
|
87
89
|
### Available Tools
|
|
88
90
|
|
|
89
|
-
- `
|
|
91
|
+
- `read_website` - Fetches a webpage and converts it to clean markdown
|
|
90
92
|
- Parameters:
|
|
91
93
|
- `url` (required): The HTTP/HTTPS URL to fetch
|
|
92
94
|
- `depth` (optional): Crawl depth (0 = single page)
|
|
@@ -144,6 +146,22 @@ npm run dev fetch https://example.com --output both
|
|
|
144
146
|
npm run dev clear-cache
|
|
145
147
|
```
|
|
146
148
|
|
|
149
|
+
## Auto-Restart Feature
|
|
150
|
+
|
|
151
|
+
The MCP server includes automatic restart capability by default for improved reliability:
|
|
152
|
+
|
|
153
|
+
- Automatically restarts the server if it crashes
|
|
154
|
+
- Handles unhandled exceptions and promise rejections
|
|
155
|
+
- Implements exponential backoff (max 10 attempts in 1 minute)
|
|
156
|
+
- Logs all restart attempts for monitoring
|
|
157
|
+
- Gracefully handles shutdown signals (SIGINT, SIGTERM)
|
|
158
|
+
|
|
159
|
+
For development/debugging without auto-restart:
|
|
160
|
+
```bash
|
|
161
|
+
# Run directly without restart wrapper
|
|
162
|
+
npm run serve:dev
|
|
163
|
+
```
|
|
164
|
+
|
|
147
165
|
## Architecture
|
|
148
166
|
|
|
149
167
|
```
|
|
@@ -153,7 +171,9 @@ mcp/
|
|
|
153
171
|
│ ├── parser/ # DOM parsing, Readability, Turndown conversion
|
|
154
172
|
│ ├── cache/ # Disk-based caching with SHA-256 keys
|
|
155
173
|
│ ├── utils/ # Logger, chunker utilities
|
|
156
|
-
│
|
|
174
|
+
│ ├── index.ts # CLI entry point
|
|
175
|
+
│ ├── serve.ts # MCP server entry point
|
|
176
|
+
│ └── serve-restart.ts # Auto-restart wrapper
|
|
157
177
|
```
|
|
158
178
|
|
|
159
179
|
## Development
|
package/bin/mcp-read-website.js
CHANGED
|
@@ -18,7 +18,7 @@ async function main() {
|
|
|
18
18
|
if (distExists) {
|
|
19
19
|
// Use compiled JavaScript for production (fast startup)
|
|
20
20
|
if (command === 'serve') {
|
|
21
|
-
const servePath = join(__dirname, '..', 'dist', 'serve.js');
|
|
21
|
+
const servePath = join(__dirname, '..', 'dist', 'serve-restart.js');
|
|
22
22
|
await import(servePath);
|
|
23
23
|
} else {
|
|
24
24
|
const cliPath = join(__dirname, '..', 'dist', 'index.js');
|
|
@@ -30,7 +30,7 @@ async function main() {
|
|
|
30
30
|
await import('tsx/esm');
|
|
31
31
|
|
|
32
32
|
if (command === 'serve') {
|
|
33
|
-
const servePath = join(__dirname, '..', 'src', 'serve.ts');
|
|
33
|
+
const servePath = join(__dirname, '..', 'src', 'serve-restart.ts');
|
|
34
34
|
await import(servePath);
|
|
35
35
|
} else {
|
|
36
36
|
const cliPath = join(__dirname, '..', 'src', 'index.ts');
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { Command } from 'commander';
|
|
3
|
-
import { CrawlQueue } from './crawler/queue.js';
|
|
3
|
+
import { fetch } from '@just-every/crawl';
|
|
4
4
|
import { readFileSync } from 'fs';
|
|
5
5
|
import { fileURLToPath } from 'url';
|
|
6
6
|
import { dirname, join } from 'path';
|
|
@@ -34,10 +34,8 @@ program
|
|
|
34
34
|
cacheDir: options.cacheDir,
|
|
35
35
|
timeout: parseInt(options.timeout, 10),
|
|
36
36
|
};
|
|
37
|
-
const queue = new CrawlQueue(crawlOptions);
|
|
38
|
-
await queue.init();
|
|
39
37
|
console.error(`Fetching ${url}...`);
|
|
40
|
-
const results = await queue.crawl(url);
|
|
38
|
+
const results = await fetch(url, crawlOptions);
|
|
41
39
|
if (options.output === 'json') {
|
|
42
40
|
console.log(JSON.stringify(results, null, 2));
|
|
43
41
|
}
|
package/dist/internal/fetchMarkdown.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { CrawlQueue } from '../crawler/queue.js';
|
|
1
|
+
import { fetch } from '@just-every/crawl';
|
|
2
2
|
export async function fetchMarkdown(url, options = {}) {
|
|
3
3
|
try {
|
|
4
4
|
const crawlOptions = {
|
|
@@ -10,9 +10,7 @@ export async function fetchMarkdown(url, options = {}) {
|
|
|
10
10
|
cacheDir: options.cacheDir ?? '.cache',
|
|
11
11
|
timeout: options.timeout ?? 30000,
|
|
12
12
|
};
|
|
13
|
-
const queue = new CrawlQueue(crawlOptions);
|
|
14
|
-
await queue.init();
|
|
15
|
-
const results = await queue.crawl(url);
|
|
13
|
+
const results = await fetch(url, crawlOptions);
|
|
16
14
|
const mainResult = results[0];
|
|
17
15
|
if (!mainResult) {
|
|
18
16
|
return {
|
package/dist/serve-restart.js
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { spawn } from 'child_process';
|
|
3
|
+
import { dirname, join } from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
5
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
6
|
+
const MAX_RESTART_ATTEMPTS = 10;
|
|
7
|
+
const RESTART_WINDOW_MS = 60000;
|
|
8
|
+
const INITIAL_BACKOFF_MS = 1000;
|
|
9
|
+
const MAX_BACKOFF_MS = 30000;
|
|
10
|
+
let restartAttempts = [];
|
|
11
|
+
let currentBackoff = INITIAL_BACKOFF_MS;
|
|
12
|
+
const log = (level, message, ...args) => {
|
|
13
|
+
const timestamp = new Date().toISOString();
|
|
14
|
+
console.error(`[${timestamp}] [${level}] [restart-wrapper]`, message, ...args);
|
|
15
|
+
};
|
|
16
|
+
const cleanupRestartAttempts = () => {
|
|
17
|
+
const now = Date.now();
|
|
18
|
+
restartAttempts = restartAttempts.filter(timestamp => now - timestamp < RESTART_WINDOW_MS);
|
|
19
|
+
};
|
|
20
|
+
const shouldRestart = () => {
|
|
21
|
+
cleanupRestartAttempts();
|
|
22
|
+
if (restartAttempts.length >= MAX_RESTART_ATTEMPTS) {
|
|
23
|
+
log('ERROR', `Reached maximum restart attempts (${MAX_RESTART_ATTEMPTS}) within ${RESTART_WINDOW_MS}ms`);
|
|
24
|
+
return false;
|
|
25
|
+
}
|
|
26
|
+
return true;
|
|
27
|
+
};
|
|
28
|
+
const getBackoffDelay = () => {
|
|
29
|
+
const delay = Math.min(currentBackoff, MAX_BACKOFF_MS);
|
|
30
|
+
currentBackoff = Math.min(currentBackoff * 2, MAX_BACKOFF_MS);
|
|
31
|
+
return delay;
|
|
32
|
+
};
|
|
33
|
+
const resetBackoff = () => {
|
|
34
|
+
currentBackoff = INITIAL_BACKOFF_MS;
|
|
35
|
+
};
|
|
36
|
+
const startServer = () => {
|
|
37
|
+
log('INFO', 'Starting MCP server...');
|
|
38
|
+
const serverPath = join(__dirname, 'serve.js');
|
|
39
|
+
const child = spawn(process.execPath, [serverPath], {
|
|
40
|
+
stdio: 'inherit',
|
|
41
|
+
env: process.env,
|
|
42
|
+
});
|
|
43
|
+
let shuttingDown = false;
|
|
44
|
+
let restartTimer = null;
|
|
45
|
+
let startupTimer = setTimeout(() => {
|
|
46
|
+
log('INFO', 'Server started successfully');
|
|
47
|
+
resetBackoff();
|
|
48
|
+
}, 5000);
|
|
49
|
+
child.on('exit', (code, signal) => {
|
|
50
|
+
clearTimeout(startupTimer);
|
|
51
|
+
if (shuttingDown) {
|
|
52
|
+
log('INFO', 'Server stopped gracefully');
|
|
53
|
+
process.exit(0);
|
|
54
|
+
return;
|
|
55
|
+
}
|
|
56
|
+
if (code === 0) {
|
|
57
|
+
log('INFO', 'Server exited cleanly');
|
|
58
|
+
process.exit(0);
|
|
59
|
+
return;
|
|
60
|
+
}
|
|
61
|
+
log('WARN', `Server exited with code ${code}, signal ${signal}`);
|
|
62
|
+
if (!shouldRestart()) {
|
|
63
|
+
log('ERROR', 'Too many restart attempts, giving up');
|
|
64
|
+
process.exit(1);
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
const backoffDelay = getBackoffDelay();
|
|
68
|
+
restartAttempts.push(Date.now());
|
|
69
|
+
log('INFO', `Restarting server in ${backoffDelay}ms (attempt ${restartAttempts.length}/${MAX_RESTART_ATTEMPTS})...`);
|
|
70
|
+
restartTimer = setTimeout(() => {
|
|
71
|
+
startServer();
|
|
72
|
+
}, backoffDelay);
|
|
73
|
+
});
|
|
74
|
+
child.on('error', (error) => {
|
|
75
|
+
log('ERROR', 'Failed to start server:', error);
|
|
76
|
+
process.exit(1);
|
|
77
|
+
});
|
|
78
|
+
const shutdown = (signal) => {
|
|
79
|
+
if (shuttingDown)
|
|
80
|
+
return;
|
|
81
|
+
shuttingDown = true;
|
|
82
|
+
log('INFO', `Received ${signal}, shutting down...`);
|
|
83
|
+
if (restartTimer) {
|
|
84
|
+
clearTimeout(restartTimer);
|
|
85
|
+
}
|
|
86
|
+
child.kill(signal);
|
|
87
|
+
setTimeout(() => {
|
|
88
|
+
log('WARN', 'Force killing child process');
|
|
89
|
+
child.kill('SIGKILL');
|
|
90
|
+
}, 5000);
|
|
91
|
+
};
|
|
92
|
+
process.on('SIGINT', () => shutdown('SIGINT'));
|
|
93
|
+
process.on('SIGTERM', () => shutdown('SIGTERM'));
|
|
94
|
+
};
|
|
95
|
+
process.on('uncaughtException', (error) => {
|
|
96
|
+
log('ERROR', 'Uncaught exception in restart wrapper:', error);
|
|
97
|
+
process.exit(1);
|
|
98
|
+
});
|
|
99
|
+
process.on('unhandledRejection', (reason) => {
|
|
100
|
+
log('ERROR', 'Unhandled rejection in restart wrapper:', reason);
|
|
101
|
+
process.exit(1);
|
|
102
|
+
});
|
|
103
|
+
log('INFO', 'MCP server restart wrapper starting...');
|
|
104
|
+
log('INFO', `Configuration: max attempts=${MAX_RESTART_ATTEMPTS}, window=${RESTART_WINDOW_MS}ms`);
|
|
105
|
+
startServer();
|
package/dist/serve.js
CHANGED
|
@@ -1,13 +1,23 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
console.error('[serve.ts] Process started, PID:', process.pid);
|
|
3
|
+
console.error('[serve.ts] Node version:', process.version);
|
|
4
|
+
console.error('[serve.ts] Current directory:', process.cwd());
|
|
2
5
|
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
3
6
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
4
7
|
import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
8
|
+
import { logger, LogLevel } from './utils/logger.js';
|
|
9
|
+
logger.setLevel(LogLevel.DEBUG);
|
|
10
|
+
logger.info('MCP Server starting up...');
|
|
11
|
+
logger.debug('Node version:', process.version);
|
|
12
|
+
logger.debug('Working directory:', process.cwd());
|
|
13
|
+
logger.debug('Environment:', { LOG_LEVEL: process.env.LOG_LEVEL });
|
|
5
14
|
process.stdin.on('error', () => { });
|
|
6
15
|
process.stdout.on('error', () => { });
|
|
7
16
|
process.stderr.on('error', () => { });
|
|
8
17
|
let fetchMarkdownModule;
|
|
9
18
|
let fsPromises;
|
|
10
19
|
let pathModule;
|
|
20
|
+
logger.debug('Creating MCP server instance...');
|
|
11
21
|
const server = new Server({
|
|
12
22
|
name: 'read-website-fast',
|
|
13
23
|
version: '0.1.0',
|
|
@@ -17,12 +27,13 @@ const server = new Server({
|
|
|
17
27
|
resources: {},
|
|
18
28
|
},
|
|
19
29
|
});
|
|
30
|
+
logger.info('MCP server instance created successfully');
|
|
20
31
|
server.onerror = error => {
|
|
21
|
-
|
|
32
|
+
logger.error('MCP Server Error:', error);
|
|
22
33
|
};
|
|
23
34
|
const READ_WEBSITE_TOOL = {
|
|
24
|
-
name: '
|
|
25
|
-
description: '
|
|
35
|
+
name: 'read_website',
|
|
36
|
+
description: 'Fast, token-efficient web content extraction - ideal for reading documentation, analyzing content, and gathering information from websites. Converts to clean Markdown while preserving links and structure.',
|
|
26
37
|
inputSchema: {
|
|
27
38
|
type: 'object',
|
|
28
39
|
properties: {
|
|
@@ -43,6 +54,13 @@ const READ_WEBSITE_TOOL = {
|
|
|
43
54
|
},
|
|
44
55
|
required: ['url'],
|
|
45
56
|
},
|
|
57
|
+
annotations: {
|
|
58
|
+
title: 'Read Website',
|
|
59
|
+
readOnlyHint: true,
|
|
60
|
+
destructiveHint: false,
|
|
61
|
+
idempotentHint: true,
|
|
62
|
+
openWorldHint: true,
|
|
63
|
+
},
|
|
46
64
|
};
|
|
47
65
|
const RESOURCES = [
|
|
48
66
|
{
|
|
@@ -58,25 +76,44 @@ const RESOURCES = [
|
|
|
58
76
|
description: 'Clear the cache directory',
|
|
59
77
|
},
|
|
60
78
|
];
|
|
61
|
-
server.setRequestHandler(ListToolsRequestSchema, async () =>
|
|
62
|
-
|
|
63
|
-
|
|
79
|
+
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
80
|
+
logger.debug('Received ListTools request');
|
|
81
|
+
const response = {
|
|
82
|
+
tools: [READ_WEBSITE_TOOL],
|
|
83
|
+
};
|
|
84
|
+
logger.debug('Returning tools:', response.tools.map(t => t.name));
|
|
85
|
+
return response;
|
|
86
|
+
});
|
|
64
87
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
65
|
-
|
|
66
|
-
|
|
88
|
+
logger.info('Received CallTool request:', request.params.name);
|
|
89
|
+
logger.debug('Request params:', JSON.stringify(request.params, null, 2));
|
|
90
|
+
if (request.params.name !== 'read_website') {
|
|
91
|
+
const error = `Unknown tool: ${request.params.name}`;
|
|
92
|
+
logger.error(error);
|
|
93
|
+
throw new Error(error);
|
|
67
94
|
}
|
|
68
95
|
try {
|
|
69
96
|
if (!fetchMarkdownModule) {
|
|
97
|
+
logger.debug('Lazy loading fetchMarkdown module...');
|
|
70
98
|
fetchMarkdownModule = await import('./internal/fetchMarkdown.js');
|
|
99
|
+
logger.info('fetchMarkdown module loaded successfully');
|
|
71
100
|
}
|
|
72
101
|
const args = request.params.arguments;
|
|
73
102
|
if (!args.url || typeof args.url !== 'string') {
|
|
74
103
|
throw new Error('URL parameter is required and must be a string');
|
|
75
104
|
}
|
|
105
|
+
logger.info(`Processing read request for URL: ${args.url}`);
|
|
106
|
+
logger.debug('Read parameters:', {
|
|
107
|
+
url: args.url,
|
|
108
|
+
depth: args.depth,
|
|
109
|
+
respectRobots: args.respectRobots,
|
|
110
|
+
});
|
|
111
|
+
logger.debug('Calling fetchMarkdown...');
|
|
76
112
|
const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
|
|
77
113
|
depth: args.depth ?? 0,
|
|
78
114
|
respectRobots: args.respectRobots ?? true,
|
|
79
115
|
});
|
|
116
|
+
logger.info('Content fetched successfully');
|
|
80
117
|
if (result.error && result.markdown) {
|
|
81
118
|
return {
|
|
82
119
|
content: [
|
|
@@ -95,14 +132,24 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
95
132
|
};
|
|
96
133
|
}
|
|
97
134
|
catch (error) {
|
|
98
|
-
|
|
135
|
+
logger.error('Error fetching content:', error.message);
|
|
136
|
+
logger.debug('Error stack:', error.stack);
|
|
137
|
+
logger.debug('Error details:', {
|
|
138
|
+
name: error.name,
|
|
139
|
+
code: error.code,
|
|
140
|
+
...error,
|
|
141
|
+
});
|
|
99
142
|
throw new Error(`Failed to fetch content: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
100
143
|
}
|
|
101
144
|
});
|
|
102
|
-
server.setRequestHandler(ListResourcesRequestSchema, async () =>
|
|
103
|
-
|
|
104
|
-
|
|
145
|
+
server.setRequestHandler(ListResourcesRequestSchema, async () => {
|
|
146
|
+
logger.debug('Received ListResources request');
|
|
147
|
+
return {
|
|
148
|
+
resources: RESOURCES,
|
|
149
|
+
};
|
|
150
|
+
});
|
|
105
151
|
server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
152
|
+
logger.debug('Received ReadResource request:', request.params);
|
|
106
153
|
const uri = request.params.uri;
|
|
107
154
|
if (!fsPromises) {
|
|
108
155
|
fsPromises = await import('fs/promises');
|
|
@@ -190,49 +237,84 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
|
190
237
|
throw new Error(`Unknown resource: ${uri}`);
|
|
191
238
|
});
|
|
192
239
|
async function runServer() {
|
|
193
|
-
const transport = new StdioServerTransport();
|
|
194
|
-
transport.onerror = error => {
|
|
195
|
-
console.error('[Transport Error]', error);
|
|
196
|
-
};
|
|
197
|
-
process.on('SIGINT', async () => {
|
|
198
|
-
console.error('Received SIGINT, shutting down gracefully...');
|
|
199
|
-
await server.close();
|
|
200
|
-
process.exit(0);
|
|
201
|
-
});
|
|
202
|
-
process.on('SIGTERM', async () => {
|
|
203
|
-
console.error('Received SIGTERM, shutting down gracefully...');
|
|
204
|
-
await server.close();
|
|
205
|
-
process.exit(0);
|
|
206
|
-
});
|
|
207
|
-
process.on('uncaughtException', error => {
|
|
208
|
-
console.error('Uncaught exception:', error);
|
|
209
|
-
if (error && error.message && error.message.includes('EPIPE')) {
|
|
210
|
-
console.error('Pipe error detected, keeping server alive');
|
|
211
|
-
return;
|
|
212
|
-
}
|
|
213
|
-
process.exit(1);
|
|
214
|
-
});
|
|
215
|
-
process.on('unhandledRejection', (reason, promise) => {
|
|
216
|
-
console.error('Unhandled rejection at:', promise, 'reason:', reason);
|
|
217
|
-
});
|
|
218
|
-
process.stdin.on('end', () => {
|
|
219
|
-
console.error('Stdin closed, shutting down...');
|
|
220
|
-
process.exit(0);
|
|
221
|
-
});
|
|
222
|
-
process.stdin.on('error', error => {
|
|
223
|
-
console.error('Stdin error:', error);
|
|
224
|
-
});
|
|
225
240
|
try {
|
|
241
|
+
logger.info('Starting MCP server...');
|
|
242
|
+
logger.debug('Creating StdioServerTransport...');
|
|
243
|
+
const transport = new StdioServerTransport();
|
|
244
|
+
logger.debug('Transport created, connecting to server...');
|
|
245
|
+
transport.onerror = error => {
|
|
246
|
+
logger.error('Transport Error:', error);
|
|
247
|
+
if (error?.message?.includes('Connection closed')) {
|
|
248
|
+
logger.info('Connection closed by client');
|
|
249
|
+
process.exit(0);
|
|
250
|
+
}
|
|
251
|
+
};
|
|
252
|
+
const cleanup = async (signal) => {
|
|
253
|
+
logger.info(`Received ${signal}, shutting down gracefully...`);
|
|
254
|
+
try {
|
|
255
|
+
await server.close();
|
|
256
|
+
logger.info('Server closed successfully');
|
|
257
|
+
process.exit(0);
|
|
258
|
+
}
|
|
259
|
+
catch (error) {
|
|
260
|
+
logger.error('Error during cleanup:', error);
|
|
261
|
+
process.exit(1);
|
|
262
|
+
}
|
|
263
|
+
};
|
|
264
|
+
process.on('SIGINT', () => cleanup('SIGINT'));
|
|
265
|
+
process.on('SIGTERM', () => cleanup('SIGTERM'));
|
|
266
|
+
process.on('uncaughtException', error => {
|
|
267
|
+
logger.error('Uncaught exception:', error.message);
|
|
268
|
+
logger.error('Stack trace:', error.stack);
|
|
269
|
+
logger.debug('Full error object:', error);
|
|
270
|
+
if (error && error.message && error.message.includes('EPIPE')) {
|
|
271
|
+
logger.warn('Pipe error detected, keeping server alive');
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
process.exit(1);
|
|
275
|
+
});
|
|
276
|
+
process.on('unhandledRejection', (reason, promise) => {
|
|
277
|
+
logger.error('Unhandled Rejection at:', promise);
|
|
278
|
+
logger.error('Rejection reason:', reason);
|
|
279
|
+
logger.debug('Full rejection details:', { reason, promise });
|
|
280
|
+
});
|
|
281
|
+
process.on('exit', code => {
|
|
282
|
+
logger.info(`Process exiting with code: ${code}`);
|
|
283
|
+
});
|
|
284
|
+
process.on('warning', warning => {
|
|
285
|
+
logger.warn('Process warning:', warning.message);
|
|
286
|
+
logger.debug('Warning details:', warning);
|
|
287
|
+
});
|
|
288
|
+
process.stdin.on('end', () => {
|
|
289
|
+
logger.info('Stdin closed, shutting down...');
|
|
290
|
+
setTimeout(() => process.exit(0), 100);
|
|
291
|
+
});
|
|
292
|
+
process.stdin.on('error', error => {
|
|
293
|
+
logger.warn('Stdin error:', error);
|
|
294
|
+
});
|
|
226
295
|
await server.connect(transport);
|
|
227
|
-
|
|
296
|
+
logger.info('MCP server connected and running successfully!');
|
|
297
|
+
logger.info('Ready to receive requests');
|
|
298
|
+
logger.debug('Server details:', {
|
|
299
|
+
name: 'read-website-fast',
|
|
300
|
+
version: '0.1.0',
|
|
301
|
+
pid: process.pid,
|
|
302
|
+
});
|
|
303
|
+
setInterval(() => {
|
|
304
|
+
logger.debug('Server heartbeat - still running...');
|
|
305
|
+
}, 30000);
|
|
228
306
|
process.stdin.resume();
|
|
229
307
|
}
|
|
230
308
|
catch (error) {
|
|
231
|
-
|
|
232
|
-
|
|
309
|
+
logger.error('Failed to start server:', error.message);
|
|
310
|
+
logger.debug('Startup error details:', error);
|
|
311
|
+
throw error;
|
|
233
312
|
}
|
|
234
313
|
}
|
|
314
|
+
logger.info('Initializing MCP server...');
|
|
235
315
|
runServer().catch(error => {
|
|
236
|
-
|
|
316
|
+
logger.error('Fatal server error:', error.message);
|
|
317
|
+
logger.error('Stack trace:', error.stack);
|
|
318
|
+
logger.debug('Full error:', error);
|
|
237
319
|
process.exit(1);
|
|
238
320
|
});
|
package/dist/utils/logger.js
CHANGED
|
@@ -23,10 +23,10 @@ export class Logger {
|
|
|
23
23
|
console.error(prefix, message, ...args);
|
|
24
24
|
break;
|
|
25
25
|
case LogLevel.WARN:
|
|
26
|
-
console.
|
|
26
|
+
console.error(prefix, message, ...args);
|
|
27
27
|
break;
|
|
28
28
|
default:
|
|
29
|
-
console.
|
|
29
|
+
console.error(prefix, message, ...args);
|
|
30
30
|
}
|
|
31
31
|
}
|
|
32
32
|
error(message, ...args) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@just-every/mcp-read-website-fast",
|
|
3
|
-
"version": "0.1.10",
|
|
3
|
+
"version": "0.1.13",
|
|
4
4
|
"description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
"build:dev": "tsc",
|
|
19
19
|
"dev": "tsx src/index.ts",
|
|
20
20
|
"start": "node dist/index.js",
|
|
21
|
-
"serve": "
|
|
21
|
+
"serve": "node dist/serve-restart.js",
|
|
22
22
|
"serve:dev": "tsx src/serve.ts",
|
|
23
23
|
"test": "vitest",
|
|
24
24
|
"test:deploy": "vitest run test/deployment.test.ts",
|
|
@@ -50,15 +50,9 @@
|
|
|
50
50
|
"homepage": "https://github.com/just-every/mcp-read-website-fast#readme",
|
|
51
51
|
"license": "MIT",
|
|
52
52
|
"dependencies": {
|
|
53
|
-
"@
|
|
54
|
-
"@
|
|
55
|
-
"commander": "^14.0.0",
|
|
56
|
-
"jsdom": "^26.1.0",
|
|
57
|
-
"p-limit": "^6.2.0",
|
|
58
|
-
"robots-parser": "^3.0.1",
|
|
59
|
-
"turndown": "^7.1.3",
|
|
60
|
-
"turndown-plugin-gfm": "^1.0.2",
|
|
61
|
-
"undici": "^7.10.0"
|
|
53
|
+
"@just-every/crawl": "^1.0.2",
|
|
54
|
+
"@modelcontextprotocol/sdk": "^1.12.3",
|
|
55
|
+
"commander": "^14.0.0"
|
|
62
56
|
},
|
|
63
57
|
"devDependencies": {
|
|
64
58
|
"@types/jsdom": "^21.1.6",
|
package/dist/cache/disk.d.ts
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import { CacheEntry } from '../types.js';
|
|
2
|
-
export declare class DiskCache {
|
|
3
|
-
private cacheDir;
|
|
4
|
-
constructor(cacheDir?: string);
|
|
5
|
-
init(): Promise<void>;
|
|
6
|
-
private getCacheKey;
|
|
7
|
-
private getCachePath;
|
|
8
|
-
has(url: string): Promise<boolean>;
|
|
9
|
-
get(url: string): Promise<CacheEntry | null>;
|
|
10
|
-
put(url: string, markdown: string, title?: string): Promise<void>;
|
|
11
|
-
getAge(url: string): Promise<number | null>;
|
|
12
|
-
}
|
package/dist/cache/disk.js
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import { createHash } from 'crypto';
|
|
2
|
-
import { mkdir, readFile, writeFile, access } from 'fs/promises';
|
|
3
|
-
import { join } from 'path';
|
|
4
|
-
export class DiskCache {
|
|
5
|
-
cacheDir;
|
|
6
|
-
constructor(cacheDir = '.cache') {
|
|
7
|
-
this.cacheDir = cacheDir;
|
|
8
|
-
}
|
|
9
|
-
async init() {
|
|
10
|
-
await mkdir(this.cacheDir, { recursive: true });
|
|
11
|
-
}
|
|
12
|
-
getCacheKey(url) {
|
|
13
|
-
return createHash('sha256').update(url).digest('hex');
|
|
14
|
-
}
|
|
15
|
-
getCachePath(url) {
|
|
16
|
-
const key = this.getCacheKey(url);
|
|
17
|
-
return join(this.cacheDir, `${key}.json`);
|
|
18
|
-
}
|
|
19
|
-
async has(url) {
|
|
20
|
-
try {
|
|
21
|
-
await access(this.getCachePath(url));
|
|
22
|
-
return true;
|
|
23
|
-
}
|
|
24
|
-
catch {
|
|
25
|
-
return false;
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
async get(url) {
|
|
29
|
-
try {
|
|
30
|
-
const path = this.getCachePath(url);
|
|
31
|
-
const data = await readFile(path, 'utf-8');
|
|
32
|
-
return JSON.parse(data);
|
|
33
|
-
}
|
|
34
|
-
catch {
|
|
35
|
-
return null;
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
async put(url, markdown, title) {
|
|
39
|
-
const entry = {
|
|
40
|
-
url,
|
|
41
|
-
markdown,
|
|
42
|
-
timestamp: Date.now(),
|
|
43
|
-
title,
|
|
44
|
-
};
|
|
45
|
-
const path = this.getCachePath(url);
|
|
46
|
-
await writeFile(path, JSON.stringify(entry, null, 2));
|
|
47
|
-
}
|
|
48
|
-
async getAge(url) {
|
|
49
|
-
const entry = await this.get(url);
|
|
50
|
-
if (!entry)
|
|
51
|
-
return null;
|
|
52
|
-
return Date.now() - entry.timestamp;
|
|
53
|
-
}
|
|
54
|
-
}
|
package/dist/cache/normalize.js
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
export function normalizeUrl(url) {
|
|
2
|
-
try {
|
|
3
|
-
const parsed = new URL(url);
|
|
4
|
-
if (parsed.pathname !== '/' && parsed.pathname.endsWith('/')) {
|
|
5
|
-
parsed.pathname = parsed.pathname.slice(0, -1);
|
|
6
|
-
}
|
|
7
|
-
const params = Array.from(parsed.searchParams.entries());
|
|
8
|
-
params.sort(([a], [b]) => a.localeCompare(b));
|
|
9
|
-
parsed.search = '';
|
|
10
|
-
params.forEach(([key, value]) => parsed.searchParams.append(key, value));
|
|
11
|
-
if ((parsed.protocol === 'http:' && parsed.port === '80') ||
|
|
12
|
-
(parsed.protocol === 'https:' && parsed.port === '443')) {
|
|
13
|
-
parsed.port = '';
|
|
14
|
-
}
|
|
15
|
-
parsed.hash = '';
|
|
16
|
-
return parsed.href;
|
|
17
|
-
}
|
|
18
|
-
catch {
|
|
19
|
-
return url;
|
|
20
|
-
}
|
|
21
|
-
}
|
|
22
|
-
export function isSameOrigin(url1, url2) {
|
|
23
|
-
try {
|
|
24
|
-
const u1 = new URL(url1);
|
|
25
|
-
const u2 = new URL(url2);
|
|
26
|
-
return u1.origin === u2.origin;
|
|
27
|
-
}
|
|
28
|
-
catch {
|
|
29
|
-
return false;
|
|
30
|
-
}
|
|
31
|
-
}
|
package/dist/crawler/fetch.d.ts
DELETED
package/dist/crawler/fetch.js
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import { fetch } from 'undici';
|
|
2
|
-
export async function fetchStream(url, options = {}) {
|
|
3
|
-
const { userAgent = 'MCP/0.1 (+https://github.com/just-every/mcp-read-website-fast)', timeout = 30000, maxRedirections = 5, } = options;
|
|
4
|
-
try {
|
|
5
|
-
const response = await fetch(url, {
|
|
6
|
-
headers: {
|
|
7
|
-
'User-Agent': userAgent,
|
|
8
|
-
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
9
|
-
'Accept-Language': 'en-US,en;q=0.5',
|
|
10
|
-
DNT: '1',
|
|
11
|
-
Connection: 'keep-alive',
|
|
12
|
-
'Upgrade-Insecure-Requests': '1',
|
|
13
|
-
},
|
|
14
|
-
redirect: maxRedirections > 0 ? 'follow' : 'manual',
|
|
15
|
-
signal: AbortSignal.timeout(timeout),
|
|
16
|
-
});
|
|
17
|
-
if (!response.ok) {
|
|
18
|
-
throw new Error(`HTTP ${response.status} for ${url}`);
|
|
19
|
-
}
|
|
20
|
-
const contentType = response.headers.get('content-type');
|
|
21
|
-
if (contentType &&
|
|
22
|
-
!contentType.includes('text/html') &&
|
|
23
|
-
!contentType.includes('application/xhtml+xml')) {
|
|
24
|
-
throw new Error(`Non-HTML content type: ${contentType} for ${url}`);
|
|
25
|
-
}
|
|
26
|
-
return await response.text();
|
|
27
|
-
}
|
|
28
|
-
catch (error) {
|
|
29
|
-
if (error instanceof Error) {
|
|
30
|
-
throw new Error(`Failed to fetch ${url}: ${error.message}`);
|
|
31
|
-
}
|
|
32
|
-
throw error;
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
export function isValidUrl(url) {
|
|
36
|
-
try {
|
|
37
|
-
const parsed = new URL(url);
|
|
38
|
-
return parsed.protocol === 'http:' || parsed.protocol === 'https:';
|
|
39
|
-
}
|
|
40
|
-
catch {
|
|
41
|
-
return false;
|
|
42
|
-
}
|
|
43
|
-
}
|
package/dist/crawler/queue.d.ts
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import { CrawlOptions, CrawlResult } from '../types.js';
|
|
2
|
-
export declare class CrawlQueue {
|
|
3
|
-
private visited;
|
|
4
|
-
private queue;
|
|
5
|
-
private limit;
|
|
6
|
-
private cache;
|
|
7
|
-
private options;
|
|
8
|
-
private results;
|
|
9
|
-
constructor(options?: CrawlOptions);
|
|
10
|
-
init(): Promise<void>;
|
|
11
|
-
crawl(startUrl: string): Promise<CrawlResult[]>;
|
|
12
|
-
private processQueue;
|
|
13
|
-
private processUrl;
|
|
14
|
-
}
|
package/dist/crawler/queue.js
DELETED
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
import pLimit from 'p-limit';
|
|
2
|
-
import { normalizeUrl, isSameOrigin } from '../cache/normalize.js';
|
|
3
|
-
import { DiskCache } from '../cache/disk.js';
|
|
4
|
-
import { fetchStream, isValidUrl } from './fetch.js';
|
|
5
|
-
import { isAllowedByRobots, getCrawlDelay } from './robots.js';
|
|
6
|
-
import { htmlToDom, extractLinks } from '../parser/dom.js';
|
|
7
|
-
import { extractArticle } from '../parser/article.js';
|
|
8
|
-
import { formatArticleMarkdown } from '../parser/markdown.js';
|
|
9
|
-
export class CrawlQueue {
|
|
10
|
-
visited = new Set();
|
|
11
|
-
queue = [];
|
|
12
|
-
limit;
|
|
13
|
-
cache;
|
|
14
|
-
options;
|
|
15
|
-
results = [];
|
|
16
|
-
constructor(options = {}) {
|
|
17
|
-
this.options = {
|
|
18
|
-
depth: options.depth ?? 0,
|
|
19
|
-
maxConcurrency: options.maxConcurrency ?? 3,
|
|
20
|
-
respectRobots: options.respectRobots ?? true,
|
|
21
|
-
sameOriginOnly: options.sameOriginOnly ?? true,
|
|
22
|
-
userAgent: options.userAgent ?? 'MCP/0.1',
|
|
23
|
-
cacheDir: options.cacheDir ?? '.cache',
|
|
24
|
-
timeout: options.timeout ?? 30000,
|
|
25
|
-
};
|
|
26
|
-
this.limit = pLimit(this.options.maxConcurrency);
|
|
27
|
-
this.cache = new DiskCache(this.options.cacheDir);
|
|
28
|
-
}
|
|
29
|
-
async init() {
|
|
30
|
-
await this.cache.init();
|
|
31
|
-
}
|
|
32
|
-
async crawl(startUrl) {
|
|
33
|
-
const normalizedUrl = normalizeUrl(startUrl);
|
|
34
|
-
if (!isValidUrl(normalizedUrl)) {
|
|
35
|
-
throw new Error(`Invalid URL: ${startUrl}`);
|
|
36
|
-
}
|
|
37
|
-
this.queue.push(normalizedUrl);
|
|
38
|
-
await this.processQueue(0);
|
|
39
|
-
return this.results;
|
|
40
|
-
}
|
|
41
|
-
async processQueue(currentDepth) {
|
|
42
|
-
if (currentDepth > this.options.depth)
|
|
43
|
-
return;
|
|
44
|
-
const urls = [...this.queue];
|
|
45
|
-
this.queue = [];
|
|
46
|
-
const tasks = urls.map(url => this.limit(() => this.processUrl(url, currentDepth)));
|
|
47
|
-
await Promise.all(tasks);
|
|
48
|
-
if (this.queue.length > 0) {
|
|
49
|
-
await this.processQueue(currentDepth + 1);
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
async processUrl(url, depth) {
|
|
53
|
-
const normalizedUrl = normalizeUrl(url);
|
|
54
|
-
if (this.visited.has(normalizedUrl))
|
|
55
|
-
return;
|
|
56
|
-
this.visited.add(normalizedUrl);
|
|
57
|
-
try {
|
|
58
|
-
const cached = await this.cache.get(normalizedUrl);
|
|
59
|
-
if (cached) {
|
|
60
|
-
this.results.push({
|
|
61
|
-
url: normalizedUrl,
|
|
62
|
-
markdown: cached.markdown,
|
|
63
|
-
title: cached.title,
|
|
64
|
-
});
|
|
65
|
-
return;
|
|
66
|
-
}
|
|
67
|
-
if (this.options.respectRobots) {
|
|
68
|
-
const allowed = await isAllowedByRobots(normalizedUrl, this.options.userAgent);
|
|
69
|
-
if (!allowed) {
|
|
70
|
-
this.results.push({
|
|
71
|
-
url: normalizedUrl,
|
|
72
|
-
markdown: '',
|
|
73
|
-
error: 'Blocked by robots.txt',
|
|
74
|
-
});
|
|
75
|
-
return;
|
|
76
|
-
}
|
|
77
|
-
const delay = await getCrawlDelay(normalizedUrl, this.options.userAgent);
|
|
78
|
-
if (delay > 0) {
|
|
79
|
-
await new Promise(resolve => setTimeout(resolve, delay * 1000));
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
const html = await fetchStream(normalizedUrl, {
|
|
83
|
-
userAgent: this.options.userAgent,
|
|
84
|
-
timeout: this.options.timeout,
|
|
85
|
-
});
|
|
86
|
-
if (!html || html.trim().length === 0) {
|
|
87
|
-
this.results.push({
|
|
88
|
-
url: normalizedUrl,
|
|
89
|
-
markdown: '',
|
|
90
|
-
error: 'Empty response from server',
|
|
91
|
-
});
|
|
92
|
-
return;
|
|
93
|
-
}
|
|
94
|
-
const dom = htmlToDom(html, normalizedUrl);
|
|
95
|
-
const article = extractArticle(dom);
|
|
96
|
-
if (!article) {
|
|
97
|
-
this.results.push({
|
|
98
|
-
url: normalizedUrl,
|
|
99
|
-
markdown: '',
|
|
100
|
-
error: 'Failed to extract article content',
|
|
101
|
-
});
|
|
102
|
-
return;
|
|
103
|
-
}
|
|
104
|
-
if (!article.content || article.content.trim().length < 50) {
|
|
105
|
-
const fallbackMarkdown = `# ${article.title || 'Page Content'}\n\n` +
|
|
106
|
-
`*Note: This page appears to be JavaScript-rendered. Limited content extracted.*\n\n` +
|
|
107
|
-
(article.textContent
|
|
108
|
-
? article.textContent.substring(0, 1000) + '...'
|
|
109
|
-
: 'No text content available');
|
|
110
|
-
this.results.push({
|
|
111
|
-
url: normalizedUrl,
|
|
112
|
-
markdown: fallbackMarkdown,
|
|
113
|
-
title: article.title || normalizedUrl,
|
|
114
|
-
error: 'Limited content extracted (JavaScript-rendered page)',
|
|
115
|
-
});
|
|
116
|
-
return;
|
|
117
|
-
}
|
|
118
|
-
const markdown = formatArticleMarkdown(article);
|
|
119
|
-
await this.cache.put(normalizedUrl, markdown, article.title);
|
|
120
|
-
let links = [];
|
|
121
|
-
if (depth < this.options.depth) {
|
|
122
|
-
links = extractLinks(dom);
|
|
123
|
-
if (this.options.sameOriginOnly) {
|
|
124
|
-
links = links.filter(link => isSameOrigin(normalizedUrl, link));
|
|
125
|
-
}
|
|
126
|
-
links.forEach(link => {
|
|
127
|
-
const normalized = normalizeUrl(link);
|
|
128
|
-
if (!this.visited.has(normalized)) {
|
|
129
|
-
this.queue.push(normalized);
|
|
130
|
-
}
|
|
131
|
-
});
|
|
132
|
-
}
|
|
133
|
-
this.results.push({
|
|
134
|
-
url: normalizedUrl,
|
|
135
|
-
markdown,
|
|
136
|
-
title: article.title,
|
|
137
|
-
links: links.length > 0 ? links : undefined,
|
|
138
|
-
});
|
|
139
|
-
}
|
|
140
|
-
catch (error) {
|
|
141
|
-
this.results.push({
|
|
142
|
-
url: normalizedUrl,
|
|
143
|
-
markdown: '',
|
|
144
|
-
error: error instanceof Error ? error.message : 'Unknown error',
|
|
145
|
-
});
|
|
146
|
-
}
|
|
147
|
-
}
|
|
148
|
-
}
|
package/dist/crawler/robots.d.ts
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
/** Shape of the object resolved by `getRobotsChecker`. */
interface RobotsChecker {
    /** Reports whether `url` may be crawled, optionally for a given user agent. */
    isAllowed(url: string, userAgent?: string): boolean;
    /** Crawl delay for the user agent, or `undefined` when there is none. */
    getCrawlDelay(userAgent?: string): number | undefined;
}
|
|
5
|
-
/** Resolves the robots.txt checker for an origin. */
export declare function getRobotsChecker(origin: string, userAgent?: string): Promise<RobotsChecker>;

/** Resolves whether `url` may be crawled by `userAgent`. */
export declare function isAllowedByRobots(url: string, userAgent?: string): Promise<boolean>;

/** Resolves the crawl delay that applies to `url` for `userAgent`. */
export declare function getCrawlDelay(url: string, userAgent?: string): Promise<number>;

export {};
|
package/dist/crawler/robots.js
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import { fetchStream } from './fetch.js';
|
|
2
|
-
const robotsCache = new Map();
|
|
3
|
-
/**
 * Returns the robots.txt checker for `origin`, loading it on first use.
 *
 * The checker is stored in `robotsCache` keyed by origin. If fetching or
 * parsing `/robots.txt` throws, a permissive checker (everything allowed,
 * no crawl delay) is cached and returned instead.
 *
 * @param {string} origin - Origin that `/robots.txt` is resolved against.
 * @param {string} [userAgent='*'] - User agent passed to the robots.txt fetch.
 * @returns {Promise<{isAllowed: Function, getCrawlDelay: Function}>}
 */
export async function getRobotsChecker(origin, userAgent = '*') {
    const known = robotsCache.get(origin);
    if (known) {
        return known;
    }

    let checker;
    try {
        const robotsUrl = new URL('/robots.txt', origin).href;
        const robotsTxt = await fetchStream(robotsUrl, { timeout: 5000, userAgent });
        // The parser is imported on demand; use the default export when there is one.
        const parserModule = await import('robots-parser');
        const parse = parserModule.default || parserModule;
        checker = parse(robotsUrl, robotsTxt);
    }
    catch {
        // Deliberate fail-open: an unreachable or unparsable robots.txt allows everything.
        checker = {
            isAllowed: () => true,
            getCrawlDelay: () => undefined,
        };
    }

    robotsCache.set(origin, checker);
    return checker;
}
|
|
28
|
-
/**
 * Asks the robots.txt checker of the URL's origin whether `url` may be crawled.
 *
 * Any error (for example an unparsable `url`) results in `true`.
 *
 * @param {string} url - Absolute URL to check.
 * @param {string} [userAgent='*'] - User agent the rules are evaluated for.
 * @returns {Promise<boolean>} The checker's answer, or `true` on error.
 */
export async function isAllowedByRobots(url, userAgent = '*') {
    let verdict;
    try {
        const checker = await getRobotsChecker(new URL(url).origin, userAgent);
        verdict = checker.isAllowed(url, userAgent);
    }
    catch {
        // Fail open so a robots.txt problem never blocks the crawl.
        verdict = true;
    }
    return verdict;
}
|
|
38
|
-
/**
 * Looks up the crawl delay that the URL's origin declares for `userAgent`.
 *
 * @param {string} url - Absolute URL whose origin is consulted.
 * @param {string} [userAgent='*'] - User agent the delay is looked up for.
 * @returns {Promise<number>} The checker's delay, or `0` when it is falsy
 *   or any error occurs.
 */
export async function getCrawlDelay(url, userAgent = '*') {
    let delay = 0;
    try {
        const checker = await getRobotsChecker(new URL(url).origin, userAgent);
        delay = checker.getCrawlDelay(userAgent) || 0;
    }
    catch {
        // Errors are treated as "no delay".
        delay = 0;
    }
    return delay;
}
|
package/dist/parser/article.d.ts
DELETED
package/dist/parser/article.js
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
import { Readability } from '@mozilla/readability';
|
|
2
|
-
/**
 * Extracts the main content of a parsed page.
 *
 * Readability is attempted only when the page carries a strong article
 * signal: an `<article>` whose first matched paragraph has more than 200
 * characters of text, BlogPosting/NewsArticle microdata, or an
 * `article:published_time` meta tag. Its result is accepted only when the
 * trimmed content exceeds 500 characters. Otherwise the page goes through
 * `extractContentManually`.
 *
 * @param {object} dom - Object exposing `window.document` and `window.location.href`.
 * @returns {object} Article record (title, content, textContent, length,
 *   excerpt, byline, dir, lang, siteName, publishedTime, baseUrl).
 */
export function extractArticle(dom) {
    const { document, location } = dom.window;
    const baseUrl = location.href;

    const has = selector => document.querySelector(selector) !== null;
    const leadText = document.querySelector('article p')?.textContent;
    const hasLongArticleLead = has('article') && Boolean(leadText) && leadText.length > 200;
    const looksLikeArticle = hasLongArticleLead ||
        has('[itemtype*="BlogPosting"]') ||
        has('[itemtype*="NewsArticle"]') ||
        has('meta[property="article:published_time"]');

    if (looksLikeArticle) {
        // Readability runs on a clone of the document, not the document itself.
        const parsed = new Readability(document.cloneNode(true)).parse();
        if (parsed && parsed.content && parsed.content.trim().length > 500) {
            return {
                title: parsed.title || 'Untitled',
                content: parsed.content || '',
                textContent: parsed.textContent || '',
                length: parsed.length || 0,
                excerpt: parsed.excerpt || '',
                byline: parsed.byline || null,
                dir: parsed.dir || null,
                lang: parsed.lang || null,
                siteName: parsed.siteName || null,
                publishedTime: parsed.publishedTime || null,
                baseUrl,
            };
        }
    }

    return extractContentManually(dom);
}
|
|
35
|
-
/**
 * Fallback extraction that returns the page body with script, style,
 * noscript and template elements removed.
 *
 * Title is taken from the first available of `<title>`, `<h1>`, `og:title`,
 * `meta[name="title"]`, else 'Untitled Page'. Byline comes from
 * `meta[name="author"]`, `[rel="author"]` or `.author`, else `null`.
 * When the document has no body, the document element's markup is used.
 * Any exception is logged and a minimal error record is returned.
 *
 * @param {object} dom - Object exposing `window.document` and `window.location.href`.
 * @returns {object} Article record with the same fields as `extractArticle`.
 */
function extractContentManually(dom) {
    try {
        const { document, location } = dom.window;
        const baseUrl = location.href;

        const textOf = selector => document.querySelector(selector)?.textContent;
        const metaContent = selector => document.querySelector(selector)?.getAttribute('content');

        const title = textOf('title') ||
            textOf('h1') ||
            metaContent('meta[property="og:title"]') ||
            metaContent('meta[name="title"]') ||
            'Untitled Page';
        const byline = metaContent('meta[name="author"]') ||
            textOf('[rel="author"]') ||
            textOf('.author') ||
            null;

        // Both branches below share every field except content and text.
        const buildRecord = (content, textContent) => ({
            title: title.trim(),
            content,
            byline,
            excerpt: '',
            dir: null,
            lang: document.documentElement?.lang || null,
            length: content.length,
            siteName: null,
            textContent,
            publishedTime: null,
            baseUrl,
        });

        if (!document.body) {
            const rootMarkup = document.documentElement?.innerHTML || '';
            return buildRecord(rootMarkup, document.documentElement?.textContent || '');
        }

        const bodyCopy = document.body.cloneNode(true);
        for (const selector of ['script', 'style', 'noscript', 'template']) {
            try {
                bodyCopy.querySelectorAll(selector).forEach(node => node.remove());
            }
            catch {
                // Best effort: a failure for one selector must not abort extraction.
            }
        }

        const content = bodyCopy.innerHTML || bodyCopy.textContent || '';
        return buildRecord(content, bodyCopy.textContent || '');
    }
    catch (error) {
        console.error('Error in manual extraction:', error);
        const failedDocument = dom.window.document;
        return {
            title: 'Error extracting content',
            content: failedDocument.body?.innerHTML ||
                failedDocument.documentElement?.innerHTML ||
                '',
            byline: null,
            excerpt: '',
            dir: null,
            lang: null,
            length: 0,
            siteName: null,
            textContent: failedDocument.body?.textContent || '',
            publishedTime: null,
            baseUrl: dom.window.location.href,
        };
    }
}
|
|
116
|
-
/**
 * Heuristic check that raw HTML carries a meaningful amount of text.
 *
 * Returns `false` when the markup contains a `<noscript>` tag but neither an
 * `<article` nor a `<main` tag. Otherwise tags are stripped and the result
 * is `true` only when more than 100 characters of trimmed text remain.
 *
 * @param {string} html - Raw HTML source.
 * @returns {boolean} Whether the page appears to have real content.
 */
export function hasContent(html) {
    const lowered = html.toLowerCase();
    const isScriptOnlyShell = lowered.includes('<noscript>') &&
        !lowered.includes('<article') &&
        !lowered.includes('<main');
    if (isScriptOnlyShell) {
        return false;
    }

    const visibleText = html.replace(/<[^>]*>/g, '').trim();
    return visibleText.length > 100;
}
|
package/dist/parser/dom.d.ts
DELETED
package/dist/parser/dom.js
DELETED
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
import { JSDOM, VirtualConsole } from 'jsdom';
|
|
2
|
-
/**
 * Parses HTML into a JSDOM instance, retrying with progressively simpler
 * settings when construction throws.
 *
 * Attempt 1 uses the full option set, attempt 2 only the URL, content type
 * and console, and attempt 3 wraps the input in a minimal HTML skeleton.
 * An exception from the third attempt propagates to the caller.
 *
 * @param {string} html - HTML source to parse.
 * @param {string} url - URL assigned to the resulting document.
 * @returns {JSDOM} The parsed document.
 */
export function htmlToDom(html, url) {
    // Each attempt gets its own virtual console that forwards to `console`.
    const minimalOptions = () => ({
        url,
        contentType: 'text/html',
        virtualConsole: new VirtualConsole().sendTo(console, {
            omitJSDOMErrors: true,
        }),
    });

    try {
        return new JSDOM(html, {
            ...minimalOptions(),
            includeNodeLocations: false,
            runScripts: undefined,
            resources: undefined,
            pretendToBeVisual: true,
        });
    }
    catch {
        // Fall through to the reduced option set.
    }

    try {
        return new JSDOM(html, minimalOptions());
    }
    catch {
        // Fall through to the wrapped-document attempt.
    }

    return new JSDOM(`<!DOCTYPE html><html><body>${html}</body></html>`, minimalOptions());
}
|
|
37
|
-
/**
 * Collects the unique absolute URLs referenced by `<a href>` elements.
 *
 * Each href is resolved against the document URL. Fragment-only links and
 * links whose resolved scheme is `mailto:`, `tel:` or `javascript:` are
 * skipped. The scheme test uses the parsed URL's `protocol`, so mixed-case
 * or whitespace-prefixed values such as `JavaScript:void(0)` are filtered
 * too; a plain case-sensitive prefix test on the raw attribute lets them
 * through. Hrefs that cannot be resolved are ignored.
 *
 * @param {object} dom - Object exposing `window.document` and `window.location.href`.
 * @returns {string[]} De-duplicated absolute URLs in document order.
 */
export function extractLinks(dom) {
    const { document, location } = dom.window;
    const baseUrl = location.href;
    const skippedProtocols = new Set(['mailto:', 'tel:', 'javascript:']);
    const links = new Set();

    for (const anchor of document.querySelectorAll('a[href]')) {
        try {
            const href = anchor.getAttribute('href');
            if (!href || href.trim().startsWith('#')) {
                continue;
            }
            // URL parsing lower-cases the scheme and drops surrounding spaces.
            const resolved = new URL(href, baseUrl);
            if (skippedProtocols.has(resolved.protocol)) {
                continue;
            }
            links.add(resolved.href);
        }
        catch {
            // An href that cannot be resolved to a URL is not crawlable; skip it.
        }
    }

    return [...links];
}
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import TurndownService from 'turndown';
|
|
2
|
-
/** Builds a configured Turndown converter. */
export declare function createTurndownService(): TurndownService;

/** Converts an HTML string to Markdown. */
export declare function htmlToMarkdown(html: string): string;

/** Renders an extracted article (title, optional byline, content) as Markdown. */
export declare function formatArticleMarkdown(article: {
    title: string;
    content: string;
    byline?: string | null;
    baseUrl?: string;
}): string;
|
package/dist/parser/markdown.js
DELETED
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
import TurndownService from 'turndown';
|
|
2
|
-
import { gfm } from 'turndown-plugin-gfm';
|
|
3
|
-
import { JSDOM } from 'jsdom';
|
|
4
|
-
/**
 * Rewrites relative `a[href]` and `img[src]` values to absolute URLs.
 *
 * The fragment is parsed with JSDOM using `baseUrl` as the document URL.
 * Attribute values that already start with one of the skip prefixes are left
 * untouched. The result is the `innerHTML` of the parsed body (or of the
 * document element when there is no body). If anything throws, the input
 * is returned unchanged.
 *
 * @param {string} html - HTML fragment to process.
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @returns {string} HTML with absolute link and image URLs.
 */
function convertRelativeUrls(html, baseUrl) {
    const linkSkipPrefixes = ['http://', 'https://', '//', 'mailto:', 'tel:', 'javascript:', '#'];
    const imageSkipPrefixes = ['http://', 'https://', '//', 'data:'];

    try {
        const { document } = new JSDOM(html, { url: baseUrl }).window;

        const absolutize = (selector, attribute, skipPrefixes) => {
            document.querySelectorAll(selector).forEach(node => {
                const value = node.getAttribute(attribute);
                if (!value || skipPrefixes.some(prefix => value.startsWith(prefix))) {
                    return;
                }
                try {
                    node.setAttribute(attribute, new URL(value, baseUrl).href);
                }
                catch {
                    // Leave values that cannot be resolved exactly as they were.
                }
            });
        };

        absolutize('a[href]', 'href', linkSkipPrefixes);
        absolutize('img[src]', 'src', imageSkipPrefixes);

        const container = document.body || document.documentElement;
        return container ? container.innerHTML : html;
    }
    catch {
        return html;
    }
}
|
|
48
|
-
/**
 * Creates a Turndown converter with this project's Markdown conventions:
 * ATX headings, fenced code blocks, inlined links, `_` emphasis, `-`
 * bullets, `**` strong and `---` rules, plus the GFM plugin.
 *
 * Two custom rules are registered:
 * - `media`: iframe/video/audio/embed become a link to their `src` (or
 *   `data-src`), or nothing when neither attribute is set.
 * - `figure`: the figure content followed by its caption in italics.
 *
 * @returns {TurndownService} The configured converter.
 */
export function createTurndownService() {
    // Block nodes are padded with blank lines; inline nodes pass through as-is.
    const padBlocks = (content, node) => (node.isBlock ? '\n\n' + content + '\n\n' : content);

    const service = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        linkStyle: 'inlined',
        emDelimiter: '_',
        bulletListMarker: '-',
        strongDelimiter: '**',
        hr: '---',
        blankReplacement: (_content, node) => (node.isBlock ? '\n\n' : ''),
        keepReplacement: padBlocks,
        defaultReplacement: padBlocks,
    });

    service.use(gfm);

    service.addRule('media', {
        filter: ['iframe', 'video', 'audio', 'embed'],
        replacement: (_content, node) => {
            const src = node.getAttribute('src') || node.getAttribute('data-src');
            if (!src) {
                return '';
            }
            const title = node.getAttribute('title') || node.getAttribute('alt') || 'media';
            return `\n\n[${title}](${src})\n\n`;
        },
    });

    service.addRule('figure', {
        filter: 'figure',
        replacement: (content, node) => {
            const body = content.trim();
            const caption = node.querySelector('figcaption');
            if (!caption) {
                return `\n\n${body}\n\n`;
            }
            return `\n\n${body}\n*${caption.textContent || ''}*\n\n`;
        },
    });

    return service;
}
|
|
96
|
-
/**
 * Converts HTML to Markdown and tidies the whitespace.
 *
 * Runs of three or more newlines collapse to one blank line, trailing
 * whitespace is removed from every line, and the result is trimmed.
 *
 * @param {string} html - HTML to convert.
 * @returns {string} Cleaned Markdown.
 */
export function htmlToMarkdown(html) {
    const rawMarkdown = createTurndownService().turndown(html);
    const collapsed = rawMarkdown.replace(/\n{3,}/g, '\n\n');
    const withoutTrailingSpace = collapsed.replace(/\s+$/gm, '');
    return withoutTrailingSpace.trim();
}
|
|
105
|
-
/**
 * Renders an extracted article as Markdown.
 *
 * Output is an optional `# title` heading, an optional `*By …*` line with a
 * rule beneath it, then the converted content. When `article.baseUrl` is
 * set, relative links and images are made absolute before conversion.
 * If conversion throws, the content falls back to plain text: via a
 * temporary element when a global `document` exists, otherwise by stripping
 * tags with a regular expression. Any other failure yields a
 * '[Content extraction failed]' placeholder.
 *
 * @param {{title: string, content: string, byline?: string|null, baseUrl?: string}} article
 * @returns {string} Markdown for the article.
 */
export function formatArticleMarkdown(article) {
    try {
        const turndown = createTurndownService();
        const parts = [];

        if (article.title && article.title.trim()) {
            parts.push(`# ${article.title}\n\n`);
        }
        if (article.byline) {
            parts.push(`*By ${article.byline}*\n\n---\n\n`);
        }

        try {
            const processedContent = article.baseUrl
                ? convertRelativeUrls(article.content, article.baseUrl)
                : article.content;
            parts.push(turndown.turndown(processedContent));
        }
        catch (conversionError) {
            console.error('Error converting HTML to markdown:', conversionError);
            if (typeof document !== 'undefined') {
                // A global DOM is available: let it extract the text.
                const scratch = document.createElement('div');
                scratch.innerHTML = article.content;
                parts.push(scratch.textContent || article.content);
            }
            else {
                // No DOM available: strip tags and collapse whitespace by regex.
                parts.push(article.content
                    .replace(/<[^>]*>/g, ' ')
                    .replace(/\s+/g, ' '));
            }
        }

        return parts
            .join('')
            .replace(/\n{3,}/g, '\n\n')
            .replace(/\s+$/gm, '')
            .trim();
    }
    catch (error) {
        console.error('Fatal error in formatArticleMarkdown:', error);
        if (article.title) {
            return `# ${article.title}\n\n[Content extraction failed]`;
        }
        return '[Content extraction failed]';
    }
}
|