npm - @just-every/mcp-read-website-fast - Versions diffs - 0.1.16 → 0.1.18 - Mend

@just-every/mcp-read-website-fast 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md +6 -3
package/dist/index.js +16 -20
package/dist/internal/fetchMarkdown.d.ts +1 -0
package/dist/internal/fetchMarkdown.js +58 -16
package/dist/serve.js +10 -12
package/dist/utils/extractMarkdownLinks.d.ts +2 -0
package/dist/utils/extractMarkdownLinks.js +43 -0
package/package.json +2 -2

package/README.md CHANGED Viewed

@@ -5,6 +5,10 @@ Fast, token-efficient web content extraction for AI agents - converts websites t
 [![npm version](https://badge.fury.io/js/@just-every%2Fmcp-read-website-fast.svg)](https://www.npmjs.com/package/@just-every/mcp-read-website-fast)
 [![GitHub Actions](https://github.com/just-every/mcp-read-website-fast/workflows/Release/badge.svg)](https://github.com/just-every/mcp-read-website-fast/actions)
+<a href="https://glama.ai/mcp/servers/@just-every/mcp-read-website-fast">
+  <img width="380" height="200" src="https://glama.ai/mcp/servers/@just-every/mcp-read-website-fast/badge" alt="read-website-fast MCP server" />
+</a>
 ## Overview
 Existing MCP web crawlers are slow and consume large quantities of tokens. This pauses the development process and provides incomplete results as LLMs need to parse whole web pages.
@@ -91,8 +95,7 @@ Drop this into your client’s mcp.json (e.g. .vscode/mcp.json, ~/.cursor/mcp.js
 - `read_website` - Fetches a webpage and converts it to clean markdown
   - Parameters:
     - `url` (required): The HTTP/HTTPS URL to fetch
-    - `depth` (optional): Crawl depth (0 = single page)
-    - `respectRobots` (optional): Whether to respect robots.txt
+    - `pages` (optional): Maximum number of pages to crawl (default: 1, max: 100)
 ### Available Resources
@@ -132,7 +135,7 @@ npm run dev fetch https://example.com --output both
 ### CLI Options
-- `-d, --depth <number>` - Crawl depth (0 = single page, default: 0)
+- `-p, --pages <number>` - Maximum number of pages to crawl (default: 1)
 - `-c, --concurrency <number>` - Max concurrent requests (default: 3)
 - `--no-robots` - Ignore robots.txt
 - `--all-origins` - Allow cross-origin crawling

package/dist/index.js CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 import { Command } from 'commander';
 import { fetch } from '@just-every/crawl';
+import { fetchMarkdown } from './internal/fetchMarkdown.js';
 import { readFileSync } from 'fs';
 import { fileURLToPath } from 'url';
 import { dirname, join } from 'path';
@@ -15,7 +16,7 @@ program
 program
     .command('fetch <url>')
     .description('Fetch a URL and convert to Markdown')
-    .option('-d, --depth <number>', 'Crawl depth (0 = single page)', '0')
+    .option('-p, --pages <number>', 'Maximum number of pages to crawl', '1')
     .option('-c, --concurrency <number>', 'Max concurrent requests', '3')
     .option('--no-robots', 'Ignore robots.txt')
     .option('--all-origins', 'Allow cross-origin crawling')
@@ -25,8 +26,10 @@ program
     .option('-o, --output <format>', 'Output format: json, markdown, or both', 'markdown')
     .action(async (url, options) => {
     try {
+        const pages = parseInt(options.pages, 10);
+        const depth = pages > 1 ? 1 : 0;
         const crawlOptions = {
-            depth: parseInt(options.depth, 10),
+            depth: depth,
             maxConcurrency: parseInt(options.concurrency, 10),
             respectRobots: options.robots,
             sameOriginOnly: !options.allOrigins,
@@ -35,27 +38,24 @@ program
             timeout: parseInt(options.timeout, 10),
         };
         console.error(`Fetching ${url}...`);
-        const results = await fetch(url, crawlOptions);
         if (options.output === 'json') {
+            const results = await fetch(url, crawlOptions);
             console.log(JSON.stringify(results, null, 2));
         }
         else if (options.output === 'markdown') {
-            results.forEach(result => {
-                if (result.markdown) {
-                    console.log(result.markdown);
-                    if (results.length > 1) {
-                        console.log('\n---\n');
-                    }
-                }
-                if (result.error && result.markdown) {
-                    console.error(`Warning for ${result.url}: ${result.error}`);
-                }
-                else if (result.error && !result.markdown) {
-                    console.error(`Error for ${result.url}: ${result.error}`);
-                }
+            const result = await fetchMarkdown(url, {
+                ...crawlOptions,
+                maxPages: pages,
             });
+            if (result.markdown) {
+                console.log(result.markdown);
+            }
+            if (result.error) {
+                console.error(`Error: ${result.error}`);
+            }
         }
         else if (options.output === 'both') {
+            const results = await fetch(url, crawlOptions);
             results.forEach(result => {
                 console.log(`\n## URL: ${result.url}\n`);
                 if (result.markdown) {
@@ -66,10 +66,6 @@ program
                 }
             });
         }
-        const hasFatalErrors = results.some(r => r.error && !r.markdown);
-        if (hasFatalErrors) {
-            process.exit(1);
-        }
     }
     catch (error) {
         console.error('Error:', error instanceof Error ? error.message : error);

package/dist/internal/fetchMarkdown.d.ts CHANGED Viewed

@@ -6,6 +6,7 @@ export interface FetchMarkdownOptions {
     userAgent?: string;
     cacheDir?: string;
     timeout?: number;
+    maxPages?: number;
 }
 export interface FetchMarkdownResult {
     markdown: string;

package/dist/internal/fetchMarkdown.js CHANGED Viewed

@@ -1,28 +1,70 @@
 import { fetch } from '@just-every/crawl';
+import { extractMarkdownLinks, filterSameOriginLinks } from '../utils/extractMarkdownLinks.js';
 export async function fetchMarkdown(url, options = {}) {
     try {
-        const crawlOptions = {
-            depth: options.depth ?? 0,
-            maxConcurrency: options.maxConcurrency ?? 3,
-            respectRobots: options.respectRobots ?? true,
-            sameOriginOnly: options.sameOriginOnly ?? true,
-            userAgent: options.userAgent,
-            cacheDir: options.cacheDir ?? '.cache',
-            timeout: options.timeout ?? 30000,
-        };
-        const results = await fetch(url, crawlOptions);
-        const mainResult = results[0];
-        if (!mainResult) {
+        const maxPages = options.maxPages ?? 1;
+        const visited = new Set();
+        const toVisit = [url];
+        const allResults = [];
+        while (toVisit.length > 0 && allResults.length < maxPages) {
+            const currentUrl = toVisit.shift();
+            if (visited.has(currentUrl))
+                continue;
+            visited.add(currentUrl);
+            const crawlOptions = {
+                depth: 0,
+                maxConcurrency: options.maxConcurrency ?? 3,
+                respectRobots: options.respectRobots ?? true,
+                sameOriginOnly: options.sameOriginOnly ?? true,
+                userAgent: options.userAgent,
+                cacheDir: options.cacheDir ?? '.cache',
+                timeout: options.timeout ?? 30000,
+            };
+            const results = await fetch(currentUrl, crawlOptions);
+            if (results && results.length > 0) {
+                const result = results[0];
+                allResults.push(result);
+                if (allResults.length < maxPages && result.markdown) {
+                    const links = extractMarkdownLinks(result.markdown, currentUrl);
+                    const filteredLinks = options.sameOriginOnly !== false
+                        ? filterSameOriginLinks(links, currentUrl)
+                        : links;
+                    for (const link of filteredLinks) {
+                        if (!visited.has(link) && !toVisit.includes(link)) {
+                            toVisit.push(link);
+                        }
+                    }
+                }
+            }
+        }
+        if (allResults.length === 0) {
             return {
                 markdown: '',
                 error: 'No results returned',
             };
         }
+        const pagesToReturn = allResults;
+        const combinedMarkdown = pagesToReturn
+            .map((result, index) => {
+            if (result.error) {
+                return `<!-- Error fetching ${result.url}: ${result.error} -->`;
+            }
+            let pageContent = '';
+            if (pagesToReturn.length > 1 && index > 0) {
+                pageContent += '\n\n---\n\n';
+            }
+            pageContent += `<!-- Source: ${result.url} -->\n`;
+            pageContent += result.markdown || '';
+            return pageContent;
+        })
+            .join('\n');
         return {
-            markdown: mainResult.markdown,
-            title: mainResult.title,
-            links: mainResult.links,
-            error: mainResult.error,
+            markdown: combinedMarkdown,
+            title: pagesToReturn[0].title,
+            links: pagesToReturn.flatMap(r => r.links || []),
+            error: pagesToReturn.some(r => r.error)
+                ? `Some pages had errors: ${pagesToReturn.filter(r => r.error).map(r => r.url).join(', ')}`
+                : undefined,
         };
     }
     catch (error) {

package/dist/serve.js CHANGED Viewed

@@ -37,15 +37,12 @@ const READ_WEBSITE_TOOL = {
                 type: 'string',
                 description: 'HTTP/HTTPS URL to fetch and convert to markdown',
             },
-            depth: {
+            pages: {
                 type: 'number',
-                description: 'Crawl depth (0 = single page)',
-                default: 0,
-            },
-            respectRobots: {
-                type: 'boolean',
-                description: 'Whether to respect robots.txt',
-                default: true,
+                description: 'Maximum number of pages to crawl (default: 1)',
+                default: 1,
+                minimum: 1,
+                maximum: 100,
             },
         },
         required: ['url'],
@@ -101,13 +98,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
         logger.info(`Processing read request for URL: ${args.url}`);
         logger.debug('Read parameters:', {
             url: args.url,
-            depth: args.depth,
-            respectRobots: args.respectRobots,
+            pages: args.pages,
         });
         logger.debug('Calling fetchMarkdown...');
+        const depth = args.pages > 1 ? 1 : 0;
         const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
-            depth: args.depth ?? 0,
-            respectRobots: args.respectRobots ?? true,
+            depth: depth,
+            respectRobots: false,
+            maxPages: args.pages ?? 1,
         });
         logger.info('Content fetched successfully');
         if (result.error && result.markdown) {

package/dist/utils/extractMarkdownLinks.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+export declare function extractMarkdownLinks(markdown: string, baseUrl: string): string[];
+export declare function filterSameOriginLinks(links: string[], baseUrl: string): string[];

package/dist/utils/extractMarkdownLinks.js ADDED Viewed

@@ -0,0 +1,43 @@
+export function extractMarkdownLinks(markdown, baseUrl) {
+    const links = [];
+    const markdownLinkRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
+    const bareUrlRegex = /https?:\/\/[^\s<>)\]]+/g;
+    let match;
+    while ((match = markdownLinkRegex.exec(markdown)) !== null) {
+        const url = match[2];
+        if (url && !url.startsWith('#') && !url.startsWith('mailto:') && !url.startsWith('tel:')) {
+            links.push(url);
+        }
+    }
+    while ((match = bareUrlRegex.exec(markdown)) !== null) {
+        links.push(match[0]);
+    }
+    const absoluteLinks = links.map(link => {
+        try {
+            if (link.startsWith('http://') || link.startsWith('https://')) {
+                return link;
+            }
+            return new URL(link, baseUrl).href;
+        }
+        catch {
+            return null;
+        }
+    }).filter(Boolean);
+    return [...new Set(absoluteLinks)];
+}
+export function filterSameOriginLinks(links, baseUrl) {
+    try {
+        const baseOrigin = new URL(baseUrl).origin;
+        return links.filter(link => {
+            try {
+                return new URL(link).origin === baseOrigin;
+            }
+            catch {
+                return false;
+            }
+        });
+    }
+    catch {
+        return [];
+    }
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@just-every/mcp-read-website-fast",
-  "version": "0.1.16",
+  "version": "0.1.18",
   "description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
   "main": "dist/index.js",
   "bin": {
@@ -50,7 +50,7 @@
   "homepage": "https://github.com/just-every/mcp-read-website-fast#readme",
   "license": "MIT",
   "dependencies": {
-    "@just-every/crawl": "^1.0.4",
+    "@just-every/crawl": "^1.0.6",
     "@modelcontextprotocol/sdk": "^1.12.3",
     "commander": "^14.0.0"
   },