@just-every/mcp-read-website-fast 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Context
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,165 @@
1
+ # MCP Read Website Fast
2
+
3
+ Existing MCP web crawlers are slow and consume large quantities of tokens. This slows the development process and produces incomplete results, as LLMs must parse entire web pages.
4
+
5
+ This MCP package fetches web pages locally, strips noise, and converts content to clean Markdown while preserving links. Designed for Claude Code, IDEs and LLM pipelines with minimal token footprint. Crawl sites locally with minimal dependencies.
6
+
7
+ ## MCP Server Configuration
8
+
9
+ This tool can be used as an MCP (Model Context Protocol) server with Claude Desktop, Cursor, VS Code, and other compatible clients.
10
+
11
+ ## Installation
12
+
13
+ ### Claude Code
14
+
15
+ ```bash
16
+ claude mcp add read-website-fast -s user -- npx -y @just-every/mcp-read-website-fast
17
+ ```
18
+
19
+ ### VS Code
20
+
21
+ ```bash
22
+ code --add-mcp '{"name":"read-website-fast","command":"npx","args":["-y","@just-every/mcp-read-website-fast"]}'
23
+ ```
24
+
25
+ ### Cursor
26
+
27
+ ```bash
28
+ cursor://anysphere.cursor-deeplink/mcp/install?name=read-website-fast&config=eyJyZWFkLXdlYnNpdGUtZmFzdCI6eyJjb21tYW5kIjoibnB4IiwiYXJncyI6WyIteSIsIkBqdXN0LWV2ZXJ5L21jcC1yZWFkLXdlYnNpdGUtZmFzdCJdfX0=
29
+ ```
30
+
31
+ ### JetBrains IDEs
32
+
33
+ Settings → Tools → AI Assistant → Model Context Protocol (MCP) → Add
34
+
35
+ Choose “As JSON” and paste:
36
+
37
+ ```json
38
+ {"command":"npx","args":["-y","@just-every/mcp-read-website-fast"]}
39
+ ```
40
+
41
+ Or, in the chat window, type /add and fill in the same JSON—both paths land the server in a single step. 
42
+
43
+ ### Raw JSON (works in any MCP client)
44
+
45
+ ```json
46
+ {
47
+ "mcpServers": {
48
+ "read-website-fast": {
49
+ "command": "npx",
50
+ "args": ["-y", "@just-every/mcp-read-website-fast"]
51
+ }
52
+ }
53
+ }
54
+ ```
55
+
56
+ Drop this into your client’s mcp.json (e.g. .vscode/mcp.json, ~/.cursor/mcp.json, or .mcp.json for Claude).
57
+
58
+
59
+
60
+ ## Features
61
+
62
+ - **Fast startup** using official MCP SDK with lazy loading for optimal performance
63
+ - **Content extraction** using Mozilla Readability (same as Firefox Reader View)
64
+ - **HTML to Markdown** conversion with Turndown + GFM support
65
+ - **Smart caching** with SHA-256 hashed URLs
66
+ - **Polite crawling** with robots.txt support and rate limiting
67
+ - **Concurrent fetching** with configurable depth crawling
68
+ - **Stream-first design** for low memory usage
69
+ - **Link preservation** for knowledge graphs
70
+ - **Optional chunking** for downstream processing
71
+
72
+ ### Available Tools
73
+
74
+ - `read_website_fast` - Fetches a webpage and converts it to clean markdown
75
+ - Parameters:
76
+ - `url` (required): The HTTP/HTTPS URL to fetch
77
+ - `depth` (optional): Crawl depth (0 = single page)
78
+ - `respectRobots` (optional): Whether to respect robots.txt
79
+
80
+ ### Available Resources
81
+
82
+ - `read-website-fast://status` - Get cache statistics
83
+ - `read-website-fast://clear-cache` - Clear the cache directory
84
+
85
+ ## Development Usage
86
+
87
+ ### Install
88
+
89
+ ```bash
90
+ npm install
91
+ npm run build
92
+ ```
93
+
94
+ ### Single page fetch
95
+ ```bash
96
+ npm run dev fetch https://example.com/article
97
+ ```
98
+
99
+ ### Crawl with depth
100
+ ```bash
101
+ npm run dev fetch https://example.com --depth 2 --concurrency 5
102
+ ```
103
+
104
+ ### Output formats
105
+ ```bash
106
+ # Markdown only (default)
107
+ npm run dev fetch https://example.com
108
+
109
+ # JSON output with metadata
110
+ npm run dev fetch https://example.com --output json
111
+
112
+ # Both URL and markdown
113
+ npm run dev fetch https://example.com --output both
114
+ ```
115
+
116
+ ### CLI Options
117
+
118
+ - `-d, --depth <number>` - Crawl depth (0 = single page, default: 0)
119
+ - `-c, --concurrency <number>` - Max concurrent requests (default: 3)
120
+ - `--no-robots` - Ignore robots.txt
121
+ - `--all-origins` - Allow cross-origin crawling
122
+ - `-u, --user-agent <string>` - Custom user agent
123
+ - `--cache-dir <path>` - Cache directory (default: .cache)
124
+ - `-t, --timeout <ms>` - Request timeout in milliseconds (default: 30000)
125
+ - `-o, --output <format>` - Output format: json, markdown, or both (default: markdown)
126
+
127
+ ### Clear cache
128
+ ```bash
129
+ npm run dev clear-cache
130
+ ```
131
+
132
+ ## Architecture
133
+
134
+ ```
135
+ mcp/
136
+ ├── src/
137
+ │ ├── crawler/ # URL fetching, queue management, robots.txt
138
+ │ ├── parser/ # DOM parsing, Readability, Turndown conversion
139
+ │ ├── cache/ # Disk-based caching with SHA-256 keys
140
+ │ ├── utils/ # Logger, chunker utilities
141
+ │ └── index.ts # CLI entry point
142
+ ```
143
+
144
+ ## Development
145
+
146
+ ```bash
147
+ # Run in development mode
148
+ npm run dev fetch https://example.com
149
+
150
+ # Build for production
151
+ npm run build
152
+
153
+ # Run tests
154
+ npm test
155
+
156
+ # Type checking
157
+ npm run typecheck
158
+
159
+ # Linting
160
+ npm run lint
161
+ ```
162
+
163
+ ## License
164
+
165
+ MIT
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { fileURLToPath } from 'url';
4
+ import { dirname, join } from 'path';
5
+ import { existsSync } from 'fs';
6
+
7
+ const __filename = fileURLToPath(import.meta.url);
8
+ const __dirname = dirname(__filename);
9
+ const args = process.argv.slice(2);
10
+
11
+ async function main() {
12
+ // Default to 'serve' if no arguments provided (for MCP usage)
13
+ const command = args[0] || 'serve';
14
+
15
+ // Check if compiled dist exists
16
+ const distExists = existsSync(join(__dirname, '..', 'dist'));
17
+
18
+ if (distExists) {
19
+ // Use compiled JavaScript for production (fast startup)
20
+ if (command === 'serve') {
21
+ const servePath = join(__dirname, '..', 'dist', 'serve.js');
22
+ await import(servePath);
23
+ } else {
24
+ const cliPath = join(__dirname, '..', 'dist', 'index.js');
25
+ await import(cliPath);
26
+ }
27
+ } else {
28
+ // Fall back to TypeScript with tsx for development
29
+ try {
30
+ await import('tsx/esm');
31
+
32
+ if (command === 'serve') {
33
+ const servePath = join(__dirname, '..', 'src', 'serve.ts');
34
+ await import(servePath);
35
+ } else {
36
+ const cliPath = join(__dirname, '..', 'src', 'index.ts');
37
+ await import(cliPath);
38
+ }
39
+ } catch (error) {
40
+ console.error('Error: Development dependencies not installed. Please run "npm install" first.');
41
+ process.exit(1);
42
+ }
43
+ }
44
+ }
45
+
46
+ main().catch(err => {
47
+ console.error('Error:', err);
48
+ process.exit(1);
49
+ });
@@ -0,0 +1,12 @@
1
import { CacheEntry } from '../types.js';
/**
 * Disk-backed cache mapping URLs to previously converted Markdown.
 * Entries are stored as JSON files named by a hash of the URL
 * (see the private getCacheKey/getCachePath helpers).
 */
export declare class DiskCache {
    /** Directory holding the per-URL cache files. */
    private cacheDir;
    /** @param cacheDir Target directory; the implementation defaults to '.cache'. */
    constructor(cacheDir?: string);
    /** Creates the cache directory (recursively) if it does not exist. */
    init(): Promise<void>;
    private getCacheKey;
    private getCachePath;
    /** Resolves true when a cache file exists for the URL. */
    has(url: string): Promise<boolean>;
    /** Returns the cached entry, or null when absent or unreadable. */
    get(url: string): Promise<CacheEntry | null>;
    /** Writes (or overwrites) the entry for the URL. */
    put(url: string, markdown: string, title?: string): Promise<void>;
    /** Milliseconds since the entry was written, or null when not cached. */
    getAge(url: string): Promise<number | null>;
}
@@ -0,0 +1,54 @@
1
import { createHash } from 'crypto';
import { mkdir, readFile, writeFile, access } from 'fs/promises';
import { join } from 'path';

/**
 * Disk-backed cache that persists converted Markdown per URL.
 * Each entry lives in its own JSON file whose name is the SHA-256
 * hex digest of the URL, so keys never collide with path syntax.
 */
export class DiskCache {
    cacheDir;

    constructor(cacheDir = '.cache') {
        this.cacheDir = cacheDir;
    }

    /** Ensure the cache directory exists (idempotent). */
    async init() {
        await mkdir(this.cacheDir, { recursive: true });
    }

    /** SHA-256 hex digest of the URL, used as the file name stem. */
    getCacheKey(url) {
        return createHash('sha256').update(url).digest('hex');
    }

    /** Path of the JSON file backing this URL. */
    getCachePath(url) {
        return join(this.cacheDir, `${this.getCacheKey(url)}.json`);
    }

    /** True when a cache file exists for the URL. */
    async has(url) {
        try {
            await access(this.getCachePath(url));
            return true;
        }
        catch {
            return false;
        }
    }

    /** Parsed cache entry, or null when missing or unreadable. */
    async get(url) {
        try {
            const raw = await readFile(this.getCachePath(url), 'utf-8');
            return JSON.parse(raw);
        }
        catch {
            return null;
        }
    }

    /** Store (or overwrite) the Markdown for a URL, stamped with the write time. */
    async put(url, markdown, title) {
        const entry = { url, markdown, timestamp: Date.now(), title };
        await writeFile(this.getCachePath(url), JSON.stringify(entry, null, 2));
    }

    /** Age of the entry in milliseconds, or null when not cached. */
    async getAge(url) {
        const entry = await this.get(url);
        return entry ? Date.now() - entry.timestamp : null;
    }
}
@@ -0,0 +1,2 @@
1
/** Canonicalizes a URL: sorted query params, no fragment, trimmed trailing slash, no default port. Unparsable input is returned unchanged. */
export declare function normalizeUrl(url: string): string;
/** True when both URLs share an origin (scheme + host + port); false when either is unparsable. */
export declare function isSameOrigin(url1: string, url2: string): boolean;
@@ -0,0 +1,31 @@
1
/**
 * Canonicalize a URL so trivially different spellings map to the same
 * cache key: drops the fragment, strips a single trailing slash (except
 * on the root path), removes default ports, and sorts query parameters
 * alphabetically. Unparsable input is returned verbatim.
 */
export function normalizeUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return url; // not a URL — leave untouched
    }
    // Trim one trailing slash, but keep the bare root path "/".
    const { pathname } = parsed;
    if (pathname.endsWith('/') && pathname !== '/') {
        parsed.pathname = pathname.slice(0, -1);
    }
    // Rebuild the query string with keys in deterministic order.
    const entries = [...parsed.searchParams.entries()].sort(([a], [b]) => a.localeCompare(b));
    parsed.search = '';
    for (const [key, value] of entries) {
        parsed.searchParams.append(key, value);
    }
    // Default ports are redundant with the scheme.
    const defaultPort = (parsed.protocol === 'http:' && parsed.port === '80') ||
        (parsed.protocol === 'https:' && parsed.port === '443');
    if (defaultPort) {
        parsed.port = '';
    }
    parsed.hash = '';
    return parsed.href;
}
22
/**
 * Report whether two URLs share an origin (scheme + host + port).
 * Any parse failure is treated as "not same origin".
 */
export function isSameOrigin(url1, url2) {
    try {
        return new URL(url1).origin === new URL(url2).origin;
    }
    catch {
        return false;
    }
}
@@ -0,0 +1,8 @@
1
/** Options accepted by fetchStream. */
interface FetchOptions {
    /** User-Agent header sent with the request. */
    userAgent?: string;
    /** Abort timeout in milliseconds. */
    timeout?: number;
    /** When > 0, redirects are followed; otherwise handled manually. */
    maxRedirections?: number;
}
/** Fetches a page and resolves with its body text; rejects on HTTP errors or non-HTML content types. */
export declare function fetchStream(url: string, options?: FetchOptions): Promise<string>;
/** True only for parsable http: / https: URLs. */
export declare function isValidUrl(url: string): boolean;
export {};
@@ -0,0 +1,42 @@
1
import { fetch } from 'undici';

/**
 * Download a page body as text.
 *
 * Sends browser-like headers, aborts after `timeout` ms, and rejects when
 * the response is an HTTP error or the content type is neither text/html
 * nor application/xhtml+xml. NOTE(review): `maxRedirections` only toggles
 * follow/manual redirect handling here; the count itself is not enforced.
 */
export async function fetchStream(url, options = {}) {
    const {
        userAgent = 'MCP/0.1 (+https://github.com/just-every/mcp-read-website-fast)',
        timeout = 30000,
        maxRedirections = 5
    } = options;
    const requestHeaders = {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    };
    try {
        const response = await fetch(url, {
            headers: requestHeaders,
            redirect: maxRedirections > 0 ? 'follow' : 'manual',
            signal: AbortSignal.timeout(timeout)
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status} for ${url}`);
        }
        // Accept missing content-type; reject anything declared non-HTML.
        const contentType = response.headers.get('content-type');
        const looksLikeHtml = !contentType ||
            contentType.includes('text/html') ||
            contentType.includes('application/xhtml+xml');
        if (!looksLikeHtml) {
            throw new Error(`Non-HTML content type: ${contentType} for ${url}`);
        }
        return await response.text();
    }
    catch (error) {
        // Re-wrap so the failing URL is always part of the message.
        if (error instanceof Error) {
            throw new Error(`Failed to fetch ${url}: ${error.message}`);
        }
        throw error;
    }
}
34
/**
 * True when the string parses as a URL whose scheme is http or https.
 */
export function isValidUrl(url) {
    let protocol;
    try {
        ({ protocol } = new URL(url));
    }
    catch {
        return false;
    }
    return protocol === 'http:' || protocol === 'https:';
}
@@ -0,0 +1,14 @@
1
import { CrawlOptions, CrawlResult } from '../types.js';
/**
 * Breadth-first crawler: starts from one URL, fetches pages level by
 * level up to a configured depth with bounded concurrency, converts
 * each page to Markdown, and collects one CrawlResult per visited URL.
 */
export declare class CrawlQueue {
    /** Normalized URLs already processed, to avoid re-fetching. */
    private visited;
    /** URLs queued for the next depth level. */
    private queue;
    /** Concurrency limiter gating in-flight page processing. */
    private limit;
    /** Disk cache consulted before any network fetch. */
    private cache;
    /** Effective crawl options after defaults are applied. */
    private options;
    /** Accumulated results (successes and per-URL errors). */
    private results;
    constructor(options?: CrawlOptions);
    /** Prepares the disk cache; call once before crawl(). */
    init(): Promise<void>;
    /** Crawls from startUrl and resolves with all collected results. */
    crawl(startUrl: string): Promise<CrawlResult[]>;
    private processQueue;
    private processUrl;
}
@@ -0,0 +1,142 @@
1
import pLimit from 'p-limit';
import { normalizeUrl, isSameOrigin } from '../cache/normalize.js';
import { DiskCache } from '../cache/disk.js';
import { fetchStream, isValidUrl } from './fetch.js';
import { isAllowedByRobots, getCrawlDelay } from './robots.js';
import { htmlToDom, extractLinks } from '../parser/dom.js';
import { extractArticle } from '../parser/article.js';
import { formatArticleMarkdown } from '../parser/markdown.js';
// Breadth-first crawler: fetches pages level by level up to options.depth,
// converting each page to Markdown and recording one result (or error) per URL.
export class CrawlQueue {
    visited = new Set(); // normalized URLs already handled this run
    queue = []; // URLs discovered for the next depth level
    limit; // p-limit concurrency gate for page processing
    cache; // DiskCache consulted before hitting the network
    options; // crawl options with defaults applied
    results = []; // one entry per processed URL (success or error)
    constructor(options = {}) {
        // Fill every option with a conservative default; robots.txt and
        // same-origin restriction are opt-out rather than opt-in.
        this.options = {
            depth: options.depth ?? 0,
            maxConcurrency: options.maxConcurrency ?? 3,
            respectRobots: options.respectRobots ?? true,
            sameOriginOnly: options.sameOriginOnly ?? true,
            userAgent: options.userAgent ?? 'MCP/0.1',
            cacheDir: options.cacheDir ?? '.cache',
            timeout: options.timeout ?? 30000
        };
        this.limit = pLimit(this.options.maxConcurrency);
        this.cache = new DiskCache(this.options.cacheDir);
    }
    // Prepare the on-disk cache directory; call once before crawl().
    async init() {
        await this.cache.init();
    }
    // Entry point: validate the start URL, seed the queue, walk the levels,
    // and resolve with every accumulated result.
    async crawl(startUrl) {
        const normalizedUrl = normalizeUrl(startUrl);
        if (!isValidUrl(normalizedUrl)) {
            throw new Error(`Invalid URL: ${startUrl}`);
        }
        this.queue.push(normalizedUrl);
        await this.processQueue(0);
        return this.results;
    }
    // Process one depth level: drain the current queue, handle all of its
    // URLs concurrently (bounded by this.limit), then recurse into whatever
    // links those pages pushed for the next level.
    async processQueue(currentDepth) {
        if (currentDepth > this.options.depth)
            return;
        const urls = [...this.queue];
        this.queue = [];
        const tasks = urls.map(url => this.limit(() => this.processUrl(url, currentDepth)));
        await Promise.all(tasks);
        if (this.queue.length > 0) {
            await this.processQueue(currentDepth + 1);
        }
    }
    // Fetch and convert a single URL, pushing exactly one result entry.
    // Errors never propagate: they are recorded on the result instead.
    async processUrl(url, depth) {
        const normalizedUrl = normalizeUrl(url);
        // Mark as visited up front so concurrent tasks never double-process.
        if (this.visited.has(normalizedUrl))
            return;
        this.visited.add(normalizedUrl);
        try {
            // 1. Serve from cache when possible — no network, no robots check.
            const cached = await this.cache.get(normalizedUrl);
            if (cached) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: cached.markdown,
                    title: cached.title
                });
                return;
            }
            // 2. Honor robots.txt (and its crawl-delay) unless disabled.
            if (this.options.respectRobots) {
                const allowed = await isAllowedByRobots(normalizedUrl, this.options.userAgent);
                if (!allowed) {
                    this.results.push({
                        url: normalizedUrl,
                        markdown: '',
                        error: 'Blocked by robots.txt'
                    });
                    return;
                }
                const delay = await getCrawlDelay(normalizedUrl, this.options.userAgent);
                if (delay > 0) {
                    // crawl-delay is in seconds; setTimeout wants milliseconds.
                    await new Promise(resolve => setTimeout(resolve, delay * 1000));
                }
            }
            // 3. Fetch the page HTML.
            const html = await fetchStream(normalizedUrl, {
                userAgent: this.options.userAgent,
                timeout: this.options.timeout
            });
            if (!html || html.trim().length === 0) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: '',
                    error: 'Empty response from server'
                });
                return;
            }
            // 4. Parse the DOM and extract the readable article portion.
            const dom = htmlToDom(html, normalizedUrl);
            const article = extractArticle(dom);
            if (!article) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: '',
                    error: 'Failed to extract article content'
                });
                return;
            }
            if (!article.content || article.content.trim().length < 50) {
                // 50-char threshold filters out boilerplate-only pages.
                this.results.push({
                    url: normalizedUrl,
                    markdown: '',
                    error: 'Page contains minimal extractable content'
                });
                return;
            }
            // 5. Convert to Markdown and cache for future runs.
            const markdown = formatArticleMarkdown(article);
            await this.cache.put(normalizedUrl, markdown, article.title);
            // 6. Queue outbound links for the next level while depth remains.
            let links = [];
            if (depth < this.options.depth) {
                links = extractLinks(dom);
                if (this.options.sameOriginOnly) {
                    links = links.filter(link => isSameOrigin(normalizedUrl, link));
                }
                links.forEach(link => {
                    const normalized = normalizeUrl(link);
                    if (!this.visited.has(normalized)) {
                        this.queue.push(normalized);
                    }
                });
            }
            this.results.push({
                url: normalizedUrl,
                markdown,
                title: article.title,
                links: links.length > 0 ? links : undefined
            });
        }
        catch (error) {
            // Any failure is recorded against the URL rather than thrown.
            this.results.push({
                url: normalizedUrl,
                markdown: '',
                error: error instanceof Error ? error.message : 'Unknown error'
            });
        }
    }
}
@@ -0,0 +1,8 @@
1
/** Minimal view over a parsed robots.txt (shape matches robots-parser). */
interface RobotsChecker {
    /** Whether the given URL may be fetched by userAgent. */
    isAllowed(url: string, userAgent?: string): boolean;
    /** Declared crawl-delay in seconds for userAgent, if any. */
    getCrawlDelay(userAgent?: string): number | undefined;
}
/** Fetches and caches (per origin) the robots.txt checker; permissive on failure. */
export declare function getRobotsChecker(origin: string, userAgent?: string): Promise<RobotsChecker>;
/** True when robots.txt allows the URL, or when it cannot be determined. */
export declare function isAllowedByRobots(url: string, userAgent?: string): Promise<boolean>;
/** Declared crawl-delay in seconds, or 0 when none/unknown. */
export declare function getCrawlDelay(url: string, userAgent?: string): Promise<number>;
export {};
@@ -0,0 +1,47 @@
1
import { fetchStream } from './fetch.js';

// One checker per origin for the lifetime of the process.
// NOTE(review): the cache is keyed by origin only, so the first userAgent
// used for an origin is the one robots.txt was fetched with — confirm intended.
const robotsCache = new Map();

/**
 * Resolve a robots.txt checker for an origin, fetching and parsing the
 * file on first use. Any failure (network, parse, module load) yields a
 * permissive checker that allows everything and declares no crawl delay.
 */
export async function getRobotsChecker(origin, userAgent = '*') {
    const existing = robotsCache.get(origin);
    if (existing) {
        return existing;
    }
    let checker;
    try {
        const robotsUrl = new URL('/robots.txt', origin).href;
        const robotsTxt = await fetchStream(robotsUrl, { timeout: 5000, userAgent });
        // robots-parser is loaded lazily so startup stays fast.
        const robotsParserModule = await import('robots-parser');
        const robotsParser = robotsParserModule.default || robotsParserModule;
        checker = robotsParser(robotsUrl, robotsTxt);
    }
    catch {
        // Unreachable or unparsable robots.txt — treat the site as open.
        checker = {
            isAllowed: () => true,
            getCrawlDelay: () => undefined
        };
    }
    robotsCache.set(origin, checker);
    return checker;
}

/** True when robots.txt permits the URL for userAgent (or on any error). */
export async function isAllowedByRobots(url, userAgent = '*') {
    try {
        const checker = await getRobotsChecker(new URL(url).origin, userAgent);
        return checker.isAllowed(url, userAgent);
    }
    catch {
        return true;
    }
}

/** Crawl-delay in seconds for userAgent, or 0 when none is declared. */
export async function getCrawlDelay(url, userAgent = '*') {
    try {
        const checker = await getRobotsChecker(new URL(url).origin, userAgent);
        return checker.getCrawlDelay(userAgent) || 0;
    }
    catch {
        return 0;
    }
}
@@ -0,0 +1,2 @@
1
#!/usr/bin/env node
// Marks this file as an ES module; it has no runtime exports.
// NOTE(review): presumably an emitted stub whose logic lives elsewhere in
// the build output — confirm against the compiler configuration.
export {};