@just-every/mcp-read-website-fast 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +165 -0
- package/bin/mcp-read-website.js +49 -0
- package/dist/cache/disk.d.ts +12 -0
- package/dist/cache/disk.js +54 -0
- package/dist/cache/normalize.d.ts +2 -0
- package/dist/cache/normalize.js +31 -0
- package/dist/crawler/fetch.d.ts +8 -0
- package/dist/crawler/fetch.js +42 -0
- package/dist/crawler/queue.d.ts +14 -0
- package/dist/crawler/queue.js +142 -0
- package/dist/crawler/robots.d.ts +8 -0
- package/dist/crawler/robots.js +47 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +99 -0
- package/dist/internal/fetchMarkdown.d.ts +16 -0
- package/dist/internal/fetchMarkdown.js +36 -0
- package/dist/parser/article.d.ts +4 -0
- package/dist/parser/article.js +115 -0
- package/dist/parser/dom.d.ts +3 -0
- package/dist/parser/dom.js +53 -0
- package/dist/parser/markdown.d.ts +9 -0
- package/dist/parser/markdown.js +134 -0
- package/dist/serve.d.ts +2 -0
- package/dist/serve.js +171 -0
- package/dist/utils/chunker.d.ts +26 -0
- package/dist/utils/chunker.js +146 -0
- package/dist/utils/logger.d.ts +18 -0
- package/dist/utils/logger.js +52 -0
- package/package.json +71 -0
- package/tsconfig.json +24 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Context
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# MCP Read JustEvery Website
|
|
2
|
+
|
|
3
|
+
Existing MCP web crawlers are slow and consume large quantities of tokens. This slows the development process and yields incomplete results, because LLMs must parse entire web pages.
|
|
4
|
+
|
|
5
|
+
This MCP package fetches web pages locally, strips noise, and converts content to clean Markdown while preserving links. Designed for Claude Code, IDEs and LLM pipelines with minimal token footprint. Crawl sites locally with minimal dependencies.
|
|
6
|
+
|
|
7
|
+
## MCP Server Configuration
|
|
8
|
+
|
|
9
|
+
This tool can be used as an MCP (Model Context Protocol) server with Claude Desktop, Cursor, VS Code, and other compatible clients.
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
### Claude Code
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
claude mcp add read-website-fast -s user -- npx -y @just-every/mcp-read-website-fast
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### VS Code
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
code --add-mcp '{"name":"read-website-fast","command":"npx","args":["-y","@just-every/mcp-read-website-fast"]}'
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Cursor
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
cursor://anysphere.cursor-deeplink/mcp/install?name=read-website-fast&config=eyJyZWFkLXdlYnNpdGUtZmFzdCI6eyJjb21tYW5kIjoibnB4IiwiYXJncyI6WyIteSIsIkBqdXN0LWV2ZXJ5L21jcC1yZWFkLXdlYnNpdGUtZmFzdCJdfX0=
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### JetBrains IDEs
|
|
32
|
+
|
|
33
|
+
Settings → Tools → AI Assistant → Model Context Protocol (MCP) → Add
|
|
34
|
+
|
|
35
|
+
Choose “As JSON” and paste:
|
|
36
|
+
|
|
37
|
+
```json
|
|
38
|
+
{"command":"npx","args":["-y","@just-every/mcp-read-website-fast"]}
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or, in the chat window, type /add and fill in the same JSON — both methods register the server in a single step.
|
|
42
|
+
|
|
43
|
+
### Raw JSON (works in any MCP client)
|
|
44
|
+
|
|
45
|
+
```json
|
|
46
|
+
{
|
|
47
|
+
"mcpServers": {
|
|
48
|
+
"read-website-fast": {
|
|
49
|
+
"command": "npx",
|
|
50
|
+
"args": ["-y", "@just-every/mcp-read-website-fast"]
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Drop this into your client’s mcp.json (e.g. .vscode/mcp.json, ~/.cursor/mcp.json, or .mcp.json for Claude).
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
## Features
|
|
61
|
+
|
|
62
|
+
- **Fast startup** using official MCP SDK with lazy loading for optimal performance
|
|
63
|
+
- **Content extraction** using Mozilla Readability (same as Firefox Reader View)
|
|
64
|
+
- **HTML to Markdown** conversion with Turndown + GFM support
|
|
65
|
+
- **Smart caching** with SHA-256 hashed URLs
|
|
66
|
+
- **Polite crawling** with robots.txt support and rate limiting
|
|
67
|
+
- **Concurrent fetching** with configurable depth crawling
|
|
68
|
+
- **Stream-first design** for low memory usage
|
|
69
|
+
- **Link preservation** for knowledge graphs
|
|
70
|
+
- **Optional chunking** for downstream processing
|
|
71
|
+
|
|
72
|
+
### Available Tools
|
|
73
|
+
|
|
74
|
+
- `read_website_fast` - Fetches a webpage and converts it to clean markdown
|
|
75
|
+
- Parameters:
|
|
76
|
+
- `url` (required): The HTTP/HTTPS URL to fetch
|
|
77
|
+
- `depth` (optional): Crawl depth (0 = single page)
|
|
78
|
+
- `respectRobots` (optional): Whether to respect robots.txt
|
|
79
|
+
|
|
80
|
+
### Available Resources
|
|
81
|
+
|
|
82
|
+
- `read-website-fast://status` - Get cache statistics
|
|
83
|
+
- `read-website-fast://clear-cache` - Clear the cache directory
|
|
84
|
+
|
|
85
|
+
## Development Usage
|
|
86
|
+
|
|
87
|
+
### Install
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
npm install
|
|
91
|
+
npm run build
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Single page fetch
|
|
95
|
+
```bash
|
|
96
|
+
npm run dev fetch https://example.com/article
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Crawl with depth
|
|
100
|
+
```bash
|
|
101
|
+
npm run dev fetch https://example.com --depth 2 --concurrency 5
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Output formats
|
|
105
|
+
```bash
|
|
106
|
+
# Markdown only (default)
|
|
107
|
+
npm run dev fetch https://example.com
|
|
108
|
+
|
|
109
|
+
# JSON output with metadata
|
|
110
|
+
npm run dev fetch https://example.com --output json
|
|
111
|
+
|
|
112
|
+
# Both URL and markdown
|
|
113
|
+
npm run dev fetch https://example.com --output both
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### CLI Options
|
|
117
|
+
|
|
118
|
+
- `-d, --depth <number>` - Crawl depth (0 = single page, default: 0)
|
|
119
|
+
- `-c, --concurrency <number>` - Max concurrent requests (default: 3)
|
|
120
|
+
- `--no-robots` - Ignore robots.txt
|
|
121
|
+
- `--all-origins` - Allow cross-origin crawling
|
|
122
|
+
- `-u, --user-agent <string>` - Custom user agent
|
|
123
|
+
- `--cache-dir <path>` - Cache directory (default: .cache)
|
|
124
|
+
- `-t, --timeout <ms>` - Request timeout in milliseconds (default: 30000)
|
|
125
|
+
- `-o, --output <format>` - Output format: json, markdown, or both (default: markdown)
|
|
126
|
+
|
|
127
|
+
### Clear cache
|
|
128
|
+
```bash
|
|
129
|
+
npm run dev clear-cache
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Architecture
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
mcp/
|
|
136
|
+
├── src/
|
|
137
|
+
│ ├── crawler/ # URL fetching, queue management, robots.txt
|
|
138
|
+
│ ├── parser/ # DOM parsing, Readability, Turndown conversion
|
|
139
|
+
│ ├── cache/ # Disk-based caching with SHA-256 keys
|
|
140
|
+
│ ├── utils/ # Logger, chunker utilities
|
|
141
|
+
│ └── index.ts # CLI entry point
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Development
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
# Run in development mode
|
|
148
|
+
npm run dev fetch https://example.com
|
|
149
|
+
|
|
150
|
+
# Build for production
|
|
151
|
+
npm run build
|
|
152
|
+
|
|
153
|
+
# Run tests
|
|
154
|
+
npm test
|
|
155
|
+
|
|
156
|
+
# Type checking
|
|
157
|
+
npm run typecheck
|
|
158
|
+
|
|
159
|
+
# Linting
|
|
160
|
+
npm run lint
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## License
|
|
164
|
+
|
|
165
|
+
MIT
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
import { dirname, join } from 'path';
|
|
5
|
+
import { existsSync } from 'fs';
|
|
6
|
+
|
|
7
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
8
|
+
const __dirname = dirname(__filename);
|
|
9
|
+
const args = process.argv.slice(2);
|
|
10
|
+
|
|
11
|
+
async function main() {
|
|
12
|
+
// Default to 'serve' if no arguments provided (for MCP usage)
|
|
13
|
+
const command = args[0] || 'serve';
|
|
14
|
+
|
|
15
|
+
// Check if compiled dist exists
|
|
16
|
+
const distExists = existsSync(join(__dirname, '..', 'dist'));
|
|
17
|
+
|
|
18
|
+
if (distExists) {
|
|
19
|
+
// Use compiled JavaScript for production (fast startup)
|
|
20
|
+
if (command === 'serve') {
|
|
21
|
+
const servePath = join(__dirname, '..', 'dist', 'serve.js');
|
|
22
|
+
await import(servePath);
|
|
23
|
+
} else {
|
|
24
|
+
const cliPath = join(__dirname, '..', 'dist', 'index.js');
|
|
25
|
+
await import(cliPath);
|
|
26
|
+
}
|
|
27
|
+
} else {
|
|
28
|
+
// Fall back to TypeScript with tsx for development
|
|
29
|
+
try {
|
|
30
|
+
await import('tsx/esm');
|
|
31
|
+
|
|
32
|
+
if (command === 'serve') {
|
|
33
|
+
const servePath = join(__dirname, '..', 'src', 'serve.ts');
|
|
34
|
+
await import(servePath);
|
|
35
|
+
} else {
|
|
36
|
+
const cliPath = join(__dirname, '..', 'src', 'index.ts');
|
|
37
|
+
await import(cliPath);
|
|
38
|
+
}
|
|
39
|
+
} catch (error) {
|
|
40
|
+
console.error('Error: Development dependencies not installed. Please run "npm install" first.');
|
|
41
|
+
process.exit(1);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
main().catch(err => {
|
|
47
|
+
console.error('Error:', err);
|
|
48
|
+
process.exit(1);
|
|
49
|
+
});
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { CacheEntry } from '../types.js';
/**
 * Disk-backed cache mapping URLs to previously rendered markdown.
 * Each entry is stored as a JSON file named by the SHA-256 hash of the URL
 * inside `cacheDir`.
 */
export declare class DiskCache {
    /** Directory holding the per-URL JSON cache files. */
    private cacheDir;
    constructor(cacheDir?: string);
    /** Creates the cache directory (recursively) if it does not exist. */
    init(): Promise<void>;
    /** Derives the cache key for a URL (SHA-256 hex digest). */
    private getCacheKey;
    /** Resolves the on-disk JSON path backing a URL's entry. */
    private getCachePath;
    /** Resolves true when a cache file exists for the URL. */
    has(url: string): Promise<boolean>;
    /** Returns the cached entry, or null when missing or unreadable. */
    get(url: string): Promise<CacheEntry | null>;
    /** Writes a new entry (url, markdown, timestamp, optional title). */
    put(url: string, markdown: string, title?: string): Promise<void>;
    /** Milliseconds since the entry was cached, or null when not cached. */
    getAge(url: string): Promise<number | null>;
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { createHash } from 'crypto';
|
|
2
|
+
import { mkdir, readFile, writeFile, access } from 'fs/promises';
|
|
3
|
+
import { join } from 'path';
|
|
4
|
+
export class DiskCache {
|
|
5
|
+
cacheDir;
|
|
6
|
+
constructor(cacheDir = '.cache') {
|
|
7
|
+
this.cacheDir = cacheDir;
|
|
8
|
+
}
|
|
9
|
+
async init() {
|
|
10
|
+
await mkdir(this.cacheDir, { recursive: true });
|
|
11
|
+
}
|
|
12
|
+
getCacheKey(url) {
|
|
13
|
+
return createHash('sha256').update(url).digest('hex');
|
|
14
|
+
}
|
|
15
|
+
getCachePath(url) {
|
|
16
|
+
const key = this.getCacheKey(url);
|
|
17
|
+
return join(this.cacheDir, `${key}.json`);
|
|
18
|
+
}
|
|
19
|
+
async has(url) {
|
|
20
|
+
try {
|
|
21
|
+
await access(this.getCachePath(url));
|
|
22
|
+
return true;
|
|
23
|
+
}
|
|
24
|
+
catch {
|
|
25
|
+
return false;
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
async get(url) {
|
|
29
|
+
try {
|
|
30
|
+
const path = this.getCachePath(url);
|
|
31
|
+
const data = await readFile(path, 'utf-8');
|
|
32
|
+
return JSON.parse(data);
|
|
33
|
+
}
|
|
34
|
+
catch {
|
|
35
|
+
return null;
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
async put(url, markdown, title) {
|
|
39
|
+
const entry = {
|
|
40
|
+
url,
|
|
41
|
+
markdown,
|
|
42
|
+
timestamp: Date.now(),
|
|
43
|
+
title
|
|
44
|
+
};
|
|
45
|
+
const path = this.getCachePath(url);
|
|
46
|
+
await writeFile(path, JSON.stringify(entry, null, 2));
|
|
47
|
+
}
|
|
48
|
+
async getAge(url) {
|
|
49
|
+
const entry = await this.get(url);
|
|
50
|
+
if (!entry)
|
|
51
|
+
return null;
|
|
52
|
+
return Date.now() - entry.timestamp;
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * Canonicalize a URL for use as a cache/dedup key: trailing slash trimmed
 * (except for the bare root path), query parameters re-ordered by key,
 * explicit default ports dropped, and the fragment removed.
 * Returns the input unchanged when it cannot be parsed as a URL.
 */
export function normalizeUrl(url) {
    try {
        const u = new URL(url);
        // Trim a trailing slash, but keep the bare root path as-is.
        const { pathname } = u;
        if (pathname.endsWith('/') && pathname !== '/') {
            u.pathname = pathname.slice(0, -1);
        }
        // Rebuild the query string with keys in sorted order.
        const entries = [...u.searchParams.entries()].sort(([a], [b]) => a.localeCompare(b));
        u.search = '';
        for (const [key, value] of entries) {
            u.searchParams.append(key, value);
        }
        // Drop explicit default ports (http:80, https:443).
        const hasDefaultPort =
            (u.protocol === 'http:' && u.port === '80') ||
            (u.protocol === 'https:' && u.port === '443');
        if (hasDefaultPort) {
            u.port = '';
        }
        u.hash = '';
        return u.href;
    }
    catch {
        return url;
    }
}
|
|
22
|
+
/**
 * True when both strings parse as URLs sharing the same origin
 * (scheme + host + port). Any parse failure yields false.
 */
export function isSameOrigin(url1, url2) {
    try {
        return new URL(url1).origin === new URL(url2).origin;
    }
    catch {
        return false;
    }
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { fetch } from 'undici';
|
|
2
|
+
/**
 * Fetch a URL and return its body as text.
 * Rejects non-2xx responses and non-HTML content types; every failure is
 * re-thrown as an Error whose message starts with `Failed to fetch <url>:`.
 *
 * options: { userAgent, timeout (ms, default 30000), maxRedirections
 * (default 5; 0 disables redirect following) }.
 */
export async function fetchStream(url, options = {}) {
    const {
        userAgent = 'MCP/0.1 (+https://github.com/just-every/mcp-read-website-fast)',
        timeout = 30000,
        maxRedirections = 5
    } = options;
    const headers = {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    };
    try {
        const res = await fetch(url, {
            headers,
            redirect: maxRedirections > 0 ? 'follow' : 'manual',
            signal: AbortSignal.timeout(timeout)
        });
        if (!res.ok) {
            throw new Error(`HTTP ${res.status} for ${url}`);
        }
        // Accept responses with no content-type header; reject non-HTML ones.
        const contentType = res.headers.get('content-type');
        const looksLikeHtml = !contentType ||
            contentType.includes('text/html') ||
            contentType.includes('application/xhtml+xml');
        if (!looksLikeHtml) {
            throw new Error(`Non-HTML content type: ${contentType} for ${url}`);
        }
        return await res.text();
    }
    catch (error) {
        // Wrap Error instances with the URL for context; rethrow anything else.
        if (error instanceof Error) {
            throw new Error(`Failed to fetch ${url}: ${error.message}`);
        }
        throw error;
    }
}
|
|
34
|
+
/** True only for strings that parse as http:// or https:// URLs. */
export function isValidUrl(url) {
    try {
        const { protocol } = new URL(url);
        return ['http:', 'https:'].includes(protocol);
    }
    catch {
        return false;
    }
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { CrawlOptions, CrawlResult } from '../types.js';
/**
 * Breadth-first site crawler with bounded concurrency, robots.txt checks,
 * same-origin filtering, and a disk cache for rendered markdown.
 * Per-page failures are recorded as result entries rather than thrown.
 */
export declare class CrawlQueue {
    /** Normalized URLs already handed off for processing (prevents re-visits). */
    private visited;
    /** URLs collected for the next depth level. */
    private queue;
    /** Concurrency gate bounding simultaneous page fetches. */
    private limit;
    /** Disk-backed markdown cache. */
    private cache;
    /** Crawl options with defaults applied. */
    private options;
    /** Accumulated per-URL crawl results (successes and errors). */
    private results;
    constructor(options?: CrawlOptions);
    /** Prepares the cache directory; call before crawl(). */
    init(): Promise<void>;
    /** Crawls from startUrl up to the configured depth; resolves all results. */
    crawl(startUrl: string): Promise<CrawlResult[]>;
    /** Processes one depth level, then recurses into newly discovered links. */
    private processQueue;
    /** Fetches, parses, and caches a single URL; records one result entry. */
    private processUrl;
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import pLimit from 'p-limit';
|
|
2
|
+
import { normalizeUrl, isSameOrigin } from '../cache/normalize.js';
|
|
3
|
+
import { DiskCache } from '../cache/disk.js';
|
|
4
|
+
import { fetchStream, isValidUrl } from './fetch.js';
|
|
5
|
+
import { isAllowedByRobots, getCrawlDelay } from './robots.js';
|
|
6
|
+
import { htmlToDom, extractLinks } from '../parser/dom.js';
|
|
7
|
+
import { extractArticle } from '../parser/article.js';
|
|
8
|
+
import { formatArticleMarkdown } from '../parser/markdown.js';
|
|
9
|
+
/**
 * Breadth-first site crawler: fetches pages level by level (bounded by a
 * concurrency limiter), converts each page to markdown, caches results on
 * disk, and collects per-URL results — recording failures as result entries
 * with an `error` field instead of throwing.
 */
export class CrawlQueue {
    // Normalized URLs already handed off for processing (prevents re-visits).
    visited = new Set();
    // URLs collected for the next depth level.
    queue = [];
    // p-limit concurrency gate for simultaneous page fetches.
    limit;
    // Disk-backed markdown cache.
    cache;
    // Crawl options with defaults applied.
    options;
    // Accumulated results, one entry per processed URL.
    results = [];
    constructor(options = {}) {
        // ?? preserves explicit falsy values (0, false) passed by callers.
        this.options = {
            depth: options.depth ?? 0,
            maxConcurrency: options.maxConcurrency ?? 3,
            respectRobots: options.respectRobots ?? true,
            sameOriginOnly: options.sameOriginOnly ?? true,
            userAgent: options.userAgent ?? 'MCP/0.1',
            cacheDir: options.cacheDir ?? '.cache',
            timeout: options.timeout ?? 30000
        };
        this.limit = pLimit(this.options.maxConcurrency);
        this.cache = new DiskCache(this.options.cacheDir);
    }
    // Ensure the cache directory exists; call before crawl().
    async init() {
        await this.cache.init();
    }
    /**
     * Crawl starting from startUrl up to options.depth levels deep.
     * Throws only for an invalid start URL; per-page failures are recorded
     * as result entries with an `error` field.
     */
    async crawl(startUrl) {
        const normalizedUrl = normalizeUrl(startUrl);
        if (!isValidUrl(normalizedUrl)) {
            throw new Error(`Invalid URL: ${startUrl}`);
        }
        this.queue.push(normalizedUrl);
        await this.processQueue(0);
        return this.results;
    }
    // Process one depth level: drain the queue, fetch every URL through the
    // limiter, then recurse into the next level if new links were discovered.
    // URLs queued beyond options.depth are dropped by the guard below.
    async processQueue(currentDepth) {
        if (currentDepth > this.options.depth)
            return;
        const urls = [...this.queue];
        this.queue = [];
        const tasks = urls.map(url => this.limit(() => this.processUrl(url, currentDepth)));
        await Promise.all(tasks);
        if (this.queue.length > 0) {
            await this.processQueue(currentDepth + 1);
        }
    }
    // Fetch, parse, and cache a single URL. Pushes exactly one result entry
    // (success or error) and, when depth allows, enqueues discovered links.
    async processUrl(url, depth) {
        const normalizedUrl = normalizeUrl(url);
        if (this.visited.has(normalizedUrl))
            return;
        this.visited.add(normalizedUrl);
        try {
            // Serve from cache when possible. NOTE(review): cache hits skip
            // link extraction, so cached pages never contribute links to a
            // depth > 0 crawl — confirm this is intended.
            const cached = await this.cache.get(normalizedUrl);
            if (cached) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: cached.markdown,
                    title: cached.title
                });
                return;
            }
            if (this.options.respectRobots) {
                const allowed = await isAllowedByRobots(normalizedUrl, this.options.userAgent);
                if (!allowed) {
                    this.results.push({
                        url: normalizedUrl,
                        markdown: '',
                        error: 'Blocked by robots.txt'
                    });
                    return;
                }
                // Honor robots.txt crawl-delay (value is in seconds).
                const delay = await getCrawlDelay(normalizedUrl, this.options.userAgent);
                if (delay > 0) {
                    await new Promise(resolve => setTimeout(resolve, delay * 1000));
                }
            }
            const html = await fetchStream(normalizedUrl, {
                userAgent: this.options.userAgent,
                timeout: this.options.timeout
            });
            if (!html || html.trim().length === 0) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: '',
                    error: 'Empty response from server'
                });
                return;
            }
            const dom = htmlToDom(html, normalizedUrl);
            const article = extractArticle(dom);
            if (!article) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: '',
                    error: 'Failed to extract article content'
                });
                return;
            }
            // Reject near-empty extractions (< 50 chars of content).
            if (!article.content || article.content.trim().length < 50) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: '',
                    error: 'Page contains minimal extractable content'
                });
                return;
            }
            const markdown = formatArticleMarkdown(article);
            await this.cache.put(normalizedUrl, markdown, article.title);
            let links = [];
            // Only expand links when another depth level remains.
            if (depth < this.options.depth) {
                links = extractLinks(dom);
                if (this.options.sameOriginOnly) {
                    links = links.filter(link => isSameOrigin(normalizedUrl, link));
                }
                links.forEach(link => {
                    const normalized = normalizeUrl(link);
                    if (!this.visited.has(normalized)) {
                        this.queue.push(normalized);
                    }
                });
            }
            this.results.push({
                url: normalizedUrl,
                markdown,
                title: article.title,
                links: links.length > 0 ? links : undefined
            });
        }
        catch (error) {
            // Record the failure instead of aborting the whole crawl.
            this.results.push({
                url: normalizedUrl,
                markdown: '',
                error: error instanceof Error ? error.message : 'Unknown error'
            });
        }
    }
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/** Minimal checker interface satisfied by robots-parser and the permissive fallback. */
interface RobotsChecker {
    isAllowed(url: string, userAgent?: string): boolean;
    getCrawlDelay(userAgent?: string): number | undefined;
}
/** Fetches, parses, and caches a robots.txt checker per origin (permissive fallback on failure). */
export declare function getRobotsChecker(origin: string, userAgent?: string): Promise<RobotsChecker>;
/** True when robots.txt permits fetching `url` for `userAgent`; errors default to allowed. */
export declare function isAllowedByRobots(url: string, userAgent?: string): Promise<boolean>;
/** Crawl-delay (seconds) for the URL's origin; 0 when unspecified or on error. */
export declare function getCrawlDelay(url: string, userAgent?: string): Promise<number>;
export {};
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { fetchStream } from './fetch.js';
|
|
2
|
+
// Per-origin cache of robots.txt checkers (populated by getRobotsChecker).
const robotsCache = new Map();
|
|
3
|
+
/**
 * Return a robots.txt checker for an origin, caching one checker per origin.
 * When robots.txt cannot be fetched or parsed, a permissive checker that
 * allows everything (and reports no crawl delay) is cached and returned.
 */
export async function getRobotsChecker(origin, userAgent = '*') {
    const existing = robotsCache.get(origin);
    if (existing)
        return existing;
    let checker;
    try {
        const robotsUrl = new URL('/robots.txt', origin).href;
        const robotsTxt = await fetchStream(robotsUrl, {
            timeout: 5000,
            userAgent
        });
        // robots-parser may surface as CJS or ESM depending on the loader.
        const robotsParserModule = await import('robots-parser');
        const robotsParser = robotsParserModule.default || robotsParserModule;
        checker = robotsParser(robotsUrl, robotsTxt);
    }
    catch {
        // Fail open: a missing/unreachable robots.txt allows everything.
        checker = {
            isAllowed: () => true,
            getCrawlDelay: () => undefined
        };
    }
    robotsCache.set(origin, checker);
    return checker;
}
|
|
28
|
+
/** Whether robots.txt allows fetching `url` for `userAgent`; errors default to allowed. */
export async function isAllowedByRobots(url, userAgent = '*') {
    try {
        const origin = new URL(url).origin;
        const checker = await getRobotsChecker(origin, userAgent);
        return checker.isAllowed(url, userAgent);
    }
    catch {
        return true;
    }
}
|
|
38
|
+
/** Crawl-delay (seconds) from robots.txt for the URL's origin; 0 when unset or on error. */
export async function getCrawlDelay(url, userAgent = '*') {
    try {
        const origin = new URL(url).origin;
        const checker = await getRobotsChecker(origin, userAgent);
        return checker.getCrawlDelay(userAgent) || 0;
    }
    catch {
        return 0;
    }
}
|
package/dist/index.d.ts
ADDED