@just-every/mcp-read-website-fast 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,6 +5,10 @@ Fast, token-efficient web content extraction for AI agents - converts websites t
5
5
  [![npm version](https://badge.fury.io/js/@just-every%2Fmcp-read-website-fast.svg)](https://www.npmjs.com/package/@just-every/mcp-read-website-fast)
6
6
  [![GitHub Actions](https://github.com/just-every/mcp-read-website-fast/workflows/Release/badge.svg)](https://github.com/just-every/mcp-read-website-fast/actions)
7
7
 
8
+ <a href="https://glama.ai/mcp/servers/@just-every/mcp-read-website-fast">
9
+ <img width="380" height="200" src="https://glama.ai/mcp/servers/@just-every/mcp-read-website-fast/badge" alt="read-website-fast MCP server" />
10
+ </a>
11
+
8
12
  ## Overview
9
13
 
10
14
  Existing MCP web crawlers are slow and consume large quantities of tokens. This pauses the development process and provides incomplete results as LLMs need to parse whole web pages.
@@ -91,8 +95,7 @@ Drop this into your client’s mcp.json (e.g. .vscode/mcp.json, ~/.cursor/mcp.js
91
95
  - `read_website` - Fetches a webpage and converts it to clean markdown
92
96
  - Parameters:
93
97
  - `url` (required): The HTTP/HTTPS URL to fetch
94
- - `depth` (optional): Crawl depth (0 = single page)
95
- - `respectRobots` (optional): Whether to respect robots.txt
98
+ - `pages` (optional): Maximum number of pages to crawl (default: 1, max: 100)
96
99
 
97
100
  ### Available Resources
98
101
 
@@ -132,7 +135,7 @@ npm run dev fetch https://example.com --output both
132
135
 
133
136
  ### CLI Options
134
137
 
135
- - `-d, --depth <number>` - Crawl depth (0 = single page, default: 0)
138
+ - `-p, --pages <number>` - Maximum number of pages to crawl (default: 1)
136
139
  - `-c, --concurrency <number>` - Max concurrent requests (default: 3)
137
140
  - `--no-robots` - Ignore robots.txt
138
141
  - `--all-origins` - Allow cross-origin crawling
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import { Command } from 'commander';
3
3
  import { fetch } from '@just-every/crawl';
4
+ import { fetchMarkdown } from './internal/fetchMarkdown.js';
4
5
  import { readFileSync } from 'fs';
5
6
  import { fileURLToPath } from 'url';
6
7
  import { dirname, join } from 'path';
@@ -15,7 +16,7 @@ program
15
16
  program
16
17
  .command('fetch <url>')
17
18
  .description('Fetch a URL and convert to Markdown')
18
- .option('-d, --depth <number>', 'Crawl depth (0 = single page)', '0')
19
+ .option('-p, --pages <number>', 'Maximum number of pages to crawl', '1')
19
20
  .option('-c, --concurrency <number>', 'Max concurrent requests', '3')
20
21
  .option('--no-robots', 'Ignore robots.txt')
21
22
  .option('--all-origins', 'Allow cross-origin crawling')
@@ -25,8 +26,10 @@ program
25
26
  .option('-o, --output <format>', 'Output format: json, markdown, or both', 'markdown')
26
27
  .action(async (url, options) => {
27
28
  try {
29
+ const pages = parseInt(options.pages, 10);
30
+ const depth = pages > 1 ? 1 : 0;
28
31
  const crawlOptions = {
29
- depth: parseInt(options.depth, 10),
32
+ depth: depth,
30
33
  maxConcurrency: parseInt(options.concurrency, 10),
31
34
  respectRobots: options.robots,
32
35
  sameOriginOnly: !options.allOrigins,
@@ -35,27 +38,24 @@ program
35
38
  timeout: parseInt(options.timeout, 10),
36
39
  };
37
40
  console.error(`Fetching ${url}...`);
38
- const results = await fetch(url, crawlOptions);
39
41
  if (options.output === 'json') {
42
+ const results = await fetch(url, crawlOptions);
40
43
  console.log(JSON.stringify(results, null, 2));
41
44
  }
42
45
  else if (options.output === 'markdown') {
43
- results.forEach(result => {
44
- if (result.markdown) {
45
- console.log(result.markdown);
46
- if (results.length > 1) {
47
- console.log('\n---\n');
48
- }
49
- }
50
- if (result.error && result.markdown) {
51
- console.error(`Warning for ${result.url}: ${result.error}`);
52
- }
53
- else if (result.error && !result.markdown) {
54
- console.error(`Error for ${result.url}: ${result.error}`);
55
- }
46
+ const result = await fetchMarkdown(url, {
47
+ ...crawlOptions,
48
+ maxPages: pages,
56
49
  });
50
+ if (result.markdown) {
51
+ console.log(result.markdown);
52
+ }
53
+ if (result.error) {
54
+ console.error(`Error: ${result.error}`);
55
+ }
57
56
  }
58
57
  else if (options.output === 'both') {
58
+ const results = await fetch(url, crawlOptions);
59
59
  results.forEach(result => {
60
60
  console.log(`\n## URL: ${result.url}\n`);
61
61
  if (result.markdown) {
@@ -66,10 +66,6 @@ program
66
66
  }
67
67
  });
68
68
  }
69
- const hasFatalErrors = results.some(r => r.error && !r.markdown);
70
- if (hasFatalErrors) {
71
- process.exit(1);
72
- }
73
69
  }
74
70
  catch (error) {
75
71
  console.error('Error:', error instanceof Error ? error.message : error);
@@ -6,6 +6,7 @@ export interface FetchMarkdownOptions {
6
6
  userAgent?: string;
7
7
  cacheDir?: string;
8
8
  timeout?: number;
9
+ maxPages?: number;
9
10
  }
10
11
  export interface FetchMarkdownResult {
11
12
  markdown: string;
@@ -1,28 +1,70 @@
1
1
  import { fetch } from '@just-every/crawl';
2
+ import { extractMarkdownLinks, filterSameOriginLinks } from '../utils/extractMarkdownLinks.js';
2
3
  export async function fetchMarkdown(url, options = {}) {
3
4
  try {
4
- const crawlOptions = {
5
- depth: options.depth ?? 0,
6
- maxConcurrency: options.maxConcurrency ?? 3,
7
- respectRobots: options.respectRobots ?? true,
8
- sameOriginOnly: options.sameOriginOnly ?? true,
9
- userAgent: options.userAgent,
10
- cacheDir: options.cacheDir ?? '.cache',
11
- timeout: options.timeout ?? 30000,
12
- };
13
- const results = await fetch(url, crawlOptions);
14
- const mainResult = results[0];
15
- if (!mainResult) {
5
+ const maxPages = options.maxPages ?? 1;
6
+ const visited = new Set();
7
+ const toVisit = [url];
8
+ const allResults = [];
9
+ while (toVisit.length > 0 && allResults.length < maxPages) {
10
+ const currentUrl = toVisit.shift();
11
+ if (visited.has(currentUrl))
12
+ continue;
13
+ visited.add(currentUrl);
14
+ const crawlOptions = {
15
+ depth: 0,
16
+ maxConcurrency: options.maxConcurrency ?? 3,
17
+ respectRobots: options.respectRobots ?? true,
18
+ sameOriginOnly: options.sameOriginOnly ?? true,
19
+ userAgent: options.userAgent,
20
+ cacheDir: options.cacheDir ?? '.cache',
21
+ timeout: options.timeout ?? 30000,
22
+ };
23
+ const results = await fetch(currentUrl, crawlOptions);
24
+ if (results && results.length > 0) {
25
+ const result = results[0];
26
+ allResults.push(result);
27
+ if (allResults.length < maxPages && result.markdown) {
28
+ const links = extractMarkdownLinks(result.markdown, currentUrl);
29
+ const filteredLinks = options.sameOriginOnly !== false
30
+ ? filterSameOriginLinks(links, currentUrl)
31
+ : links;
32
+ for (const link of filteredLinks) {
33
+ if (!visited.has(link) && !toVisit.includes(link)) {
34
+ toVisit.push(link);
35
+ }
36
+ }
37
+ }
38
+ }
39
+ }
40
+ if (allResults.length === 0) {
16
41
  return {
17
42
  markdown: '',
18
43
  error: 'No results returned',
19
44
  };
20
45
  }
46
+ const pagesToReturn = allResults;
47
+ const combinedMarkdown = pagesToReturn
48
+ .map((result, index) => {
49
+ if (result.error) {
50
+ return `<!-- Error fetching ${result.url}: ${result.error} -->`;
51
+ }
52
+ let pageContent = '';
53
+ if (pagesToReturn.length > 1 && index > 0) {
54
+ pageContent += '\n\n---\n\n';
55
+ }
56
+ pageContent += `<!-- Source: ${result.url} -->\n`;
57
+ pageContent += result.markdown || '';
58
+ return pageContent;
59
+ })
60
+ .join('\n');
21
61
  return {
22
- markdown: mainResult.markdown,
23
- title: mainResult.title,
24
- links: mainResult.links,
25
- error: mainResult.error,
62
+ markdown: combinedMarkdown,
63
+ title: pagesToReturn[0].title,
64
+ links: pagesToReturn.flatMap(r => r.links || []),
65
+ error: pagesToReturn.some(r => r.error)
66
+ ? `Some pages had errors: ${pagesToReturn.filter(r => r.error).map(r => r.url).join(', ')}`
67
+ : undefined,
26
68
  };
27
69
  }
28
70
  catch (error) {
package/dist/serve.js CHANGED
@@ -37,15 +37,12 @@ const READ_WEBSITE_TOOL = {
37
37
  type: 'string',
38
38
  description: 'HTTP/HTTPS URL to fetch and convert to markdown',
39
39
  },
40
- depth: {
40
+ pages: {
41
41
  type: 'number',
42
- description: 'Crawl depth (0 = single page)',
43
- default: 0,
44
- },
45
- respectRobots: {
46
- type: 'boolean',
47
- description: 'Whether to respect robots.txt',
48
- default: true,
42
+ description: 'Maximum number of pages to crawl (default: 1)',
43
+ default: 1,
44
+ minimum: 1,
45
+ maximum: 100,
49
46
  },
50
47
  },
51
48
  required: ['url'],
@@ -101,13 +98,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
101
98
  logger.info(`Processing read request for URL: ${args.url}`);
102
99
  logger.debug('Read parameters:', {
103
100
  url: args.url,
104
- depth: args.depth,
105
- respectRobots: args.respectRobots,
101
+ pages: args.pages,
106
102
  });
107
103
  logger.debug('Calling fetchMarkdown...');
104
+ const depth = args.pages > 1 ? 1 : 0;
108
105
  const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
109
- depth: args.depth ?? 0,
110
- respectRobots: args.respectRobots ?? true,
106
+ depth: depth,
107
+ respectRobots: false,
108
+ maxPages: args.pages ?? 1,
111
109
  });
112
110
  logger.info('Content fetched successfully');
113
111
  if (result.error && result.markdown) {
@@ -0,0 +1,2 @@
1
+ export declare function extractMarkdownLinks(markdown: string, baseUrl: string): string[];
2
+ export declare function filterSameOriginLinks(links: string[], baseUrl: string): string[];
@@ -0,0 +1,43 @@
1
+ export function extractMarkdownLinks(markdown, baseUrl) {
2
+ const links = [];
3
+ const markdownLinkRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
4
+ const bareUrlRegex = /https?:\/\/[^\s<>)\]]+/g;
5
+ let match;
6
+ while ((match = markdownLinkRegex.exec(markdown)) !== null) {
7
+ const url = match[2];
8
+ if (url && !url.startsWith('#') && !url.startsWith('mailto:') && !url.startsWith('tel:')) {
9
+ links.push(url);
10
+ }
11
+ }
12
+ while ((match = bareUrlRegex.exec(markdown)) !== null) {
13
+ links.push(match[0]);
14
+ }
15
+ const absoluteLinks = links.map(link => {
16
+ try {
17
+ if (link.startsWith('http://') || link.startsWith('https://')) {
18
+ return link;
19
+ }
20
+ return new URL(link, baseUrl).href;
21
+ }
22
+ catch {
23
+ return null;
24
+ }
25
+ }).filter(Boolean);
26
+ return [...new Set(absoluteLinks)];
27
+ }
28
+ export function filterSameOriginLinks(links, baseUrl) {
29
+ try {
30
+ const baseOrigin = new URL(baseUrl).origin;
31
+ return links.filter(link => {
32
+ try {
33
+ return new URL(link).origin === baseOrigin;
34
+ }
35
+ catch {
36
+ return false;
37
+ }
38
+ });
39
+ }
40
+ catch {
41
+ return [];
42
+ }
43
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@just-every/mcp-read-website-fast",
3
- "version": "0.1.16",
3
+ "version": "0.1.18",
4
4
  "description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
@@ -50,7 +50,7 @@
50
50
  "homepage": "https://github.com/just-every/mcp-read-website-fast#readme",
51
51
  "license": "MIT",
52
52
  "dependencies": {
53
- "@just-every/crawl": "^1.0.4",
53
+ "@just-every/crawl": "^1.0.6",
54
54
  "@modelcontextprotocol/sdk": "^1.12.3",
55
55
  "commander": "^14.0.0"
56
56
  },