@lukaszraczylo/cloudflare-crawl-mcp 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +9 -0
- package/FUNDING.yml +2 -0
- package/LICENSE +21 -0
- package/README.md +302 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +339 -0
- package/dist/index.test.d.ts +1 -0
- package/dist/index.test.js +628 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +134 -0
- package/eslint.config.js +20 -0
- package/package.json +36 -0
- package/semver.yaml +15 -0
package/.env.example
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Cloudflare API Token (get from https://dash.cloudflare.com/profile/api-tokens)
|
|
2
|
+
# Required permissions: Account > Browser Rendering > Edit
|
|
3
|
+
CF_API_TOKEN=your_cloudflare_api_token
|
|
4
|
+
|
|
5
|
+
# Cloudflare Account ID (get from https://dash.cloudflare.com/_/account)
|
|
6
|
+
CF_ACCOUNT_ID=your_cloudflare_account_id
|
|
7
|
+
|
|
8
|
+
# Rate limit: REST API requests per minute (default: 6 for Free, 600 for Paid)
|
|
9
|
+
# CF_RATE_LIMIT=6
|
package/FUNDING.yml
ADDED
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Lukasz Raczylo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
# @lukaszraczylo/cloudflare-crawl-mcp
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<a href="https://www.npmjs.com/package/@lukaszraczylo/cloudflare-crawl-mcp">
|
|
5
|
+
<img src="https://img.shields.io/npm/v/@lukaszraczylo/cloudflare-crawl-mcp" alt="NPM Version">
|
|
6
|
+
</a>
|
|
7
|
+
<a href="https://github.com/lukaszraczylo/cloudflare-crawl-mcp/blob/main/LICENSE">
|
|
8
|
+
<img src="https://img.shields.io/badge/license-MIT-blue.svg" alt="License">
|
|
9
|
+
</a>
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
MCP server for crawling websites using Cloudflare Browser Rendering API. Supports multiple output formats including Markdown, HTML, and JSON.
|
|
13
|
+
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
- **Multiple Output Formats**: Choose between Markdown, HTML, or JSON output
|
|
17
|
+
- **Configurable Crawling**: Control depth, page limits, and link following
|
|
18
|
+
- **Pattern Filtering**: Include/exclude URLs using wildcard patterns
|
|
19
|
+
- **JavaScript Rendering**: Execute JavaScript for dynamic content (or disable for static content)
|
|
20
|
+
- **Environment-Based Secrets**: Securely manage credentials via environment variables
|
|
21
|
+
|
|
22
|
+
## Prerequisites
|
|
23
|
+
|
|
24
|
+
- Node.js 18+
|
|
25
|
+
- Cloudflare account with Browser Rendering API access
|
|
26
|
+
- Cloudflare API Token with `Browser Rendering` permissions
|
|
27
|
+
- Cloudflare Account ID
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# Clone and setup
|
|
33
|
+
npm install
|
|
34
|
+
npm run build
|
|
35
|
+
|
|
36
|
+
# Run with environment variables
|
|
37
|
+
CF_API_TOKEN=your_token CF_ACCOUNT_ID=your_account_id npm start
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
### 1. Clone the Repository
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
git clone https://github.com/lukaszraczylo/cloudflare-crawl-mcp.git
|
|
46
|
+
cd cloudflare-crawl-mcp
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### 2. Install Dependencies
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
npm install
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### 3. Build the Server
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
npm run build
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 4. Configure Environment Variables
|
|
62
|
+
|
|
63
|
+
Copy the example environment file and add your credentials:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
cp .env.example .env
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Edit `.env` with your Cloudflare credentials:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
CF_API_TOKEN=your_cloudflare_api_token
|
|
73
|
+
CF_ACCOUNT_ID=your_cloudflare_account_id
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
#### Getting Cloudflare Credentials
|
|
77
|
+
|
|
78
|
+
1. **Account ID**: Find it at https://dash.cloudflare.com/_/account
|
|
79
|
+
2. **API Token**: Create one at https://dash.cloudflare.com/profile/api-tokens with these permissions:
|
|
80
|
+
- `Account` > `Browser Rendering` > `Edit`
|
|
81
|
+
|
|
82
|
+
## MCP Configuration
|
|
83
|
+
|
|
84
|
+
### Claude Desktop (macOS)
|
|
85
|
+
|
|
86
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"mcpServers": {
|
|
91
|
+
"cloudflare-crawl": {
|
|
92
|
+
"command": "npm",
|
|
93
|
+
"args": ["start"],
|
|
94
|
+
"env": {
|
|
95
|
+
"CF_API_TOKEN": "your_api_token",
|
|
96
|
+
"CF_ACCOUNT_ID": "your_account_id"
|
|
97
|
+
},
|
|
98
|
+
"path": "/path/to/cloudflare-crawl-mcp"
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Claude Code (CLI)
|
|
105
|
+
|
|
106
|
+
```json
|
|
107
|
+
{
|
|
108
|
+
"mcpServers": {
|
|
109
|
+
"cloudflare-crawl": {
|
|
110
|
+
"command": "npm",
|
|
111
|
+
"args": ["start"],
|
|
112
|
+
"env": {
|
|
113
|
+
"CF_API_TOKEN": "your_api_token",
|
|
114
|
+
"CF_ACCOUNT_ID": "your_account_id"
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Cursor
|
|
122
|
+
|
|
123
|
+
Add to `~/.cursor/settings.json` (MCP configuration):
|
|
124
|
+
|
|
125
|
+
```json
|
|
126
|
+
{
|
|
127
|
+
"mcpServers": {
|
|
128
|
+
"cloudflare-crawl": {
|
|
129
|
+
"command": "npm",
|
|
130
|
+
"args": ["start"],
|
|
131
|
+
"env": {
|
|
132
|
+
"CF_API_TOKEN": "your_api_token",
|
|
133
|
+
"CF_ACCOUNT_ID": "your_account_id"
|
|
134
|
+
},
|
|
135
|
+
"path": "/path/to/cloudflare-crawl-mcp"
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Available Tools
|
|
142
|
+
|
|
143
|
+
### crawl_url_markdown
|
|
144
|
+
|
|
145
|
+
Crawl a website and return content in **Markdown** format.
|
|
146
|
+
|
|
147
|
+
```typescript
|
|
148
|
+
{
|
|
149
|
+
"name": "crawl_url_markdown",
|
|
150
|
+
"arguments": {
|
|
151
|
+
"url": "https://example.com/docs",
|
|
152
|
+
"limit": 50,
|
|
153
|
+
"depth": 2,
|
|
154
|
+
"includePatterns": ["https://example.com/docs/**"],
|
|
155
|
+
"excludePatterns": ["https://example.com/docs/archive/**"],
|
|
156
|
+
"render": true
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### crawl_url_html
|
|
162
|
+
|
|
163
|
+
Crawl a website and return content in **HTML** format.
|
|
164
|
+
|
|
165
|
+
```typescript
|
|
166
|
+
{
|
|
167
|
+
"name": "crawl_url_html",
|
|
168
|
+
"arguments": {
|
|
169
|
+
"url": "https://example.com",
|
|
170
|
+
"limit": 10
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### crawl_url_json
|
|
176
|
+
|
|
177
|
+
Crawl a website and return content in **JSON** format (uses Workers AI for data extraction).
|
|
178
|
+
|
|
179
|
+
```typescript
|
|
180
|
+
{
|
|
181
|
+
"name": "crawl_url_json",
|
|
182
|
+
"arguments": {
|
|
183
|
+
"url": "https://example.com/products",
|
|
184
|
+
"limit": 20
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Parameters
|
|
190
|
+
|
|
191
|
+
| Parameter | Type | Default | Description |
|
|
192
|
+
|-----------|------|---------|-------------|
|
|
193
|
+
| `url` | string | required | Starting URL to crawl |
|
|
194
|
+
| `limit` | number | 10 | Maximum pages to crawl (max: 100,000) |
|
|
195
|
+
| `depth` | number | 1 | Maximum link depth from starting URL |
|
|
196
|
+
| `includeSubdomains` | boolean | false | Follow links to subdomains |
|
|
197
|
+
| `includeExternalLinks` | boolean | false | Follow links to external domains |
|
|
198
|
+
| `includePatterns` | string[] | [] | Wildcard patterns to include |
|
|
199
|
+
| `excludePatterns` | string[] | [] | Wildcard patterns to exclude |
|
|
200
|
+
| `render` | boolean | true | Execute JavaScript (false = faster static fetch) |
|
|
201
|
+
|
|
202
|
+
### Pattern Syntax
|
|
203
|
+
|
|
204
|
+
- `*` - Matches any characters except `/`
|
|
205
|
+
- `**` - Matches any characters including `/`
|
|
206
|
+
|
|
207
|
+
Examples:
|
|
208
|
+
- `https://example.com/docs/**` - All URLs under /docs
|
|
209
|
+
- `https://example.com/*.html` - All HTML files directly in root
|
|
210
|
+
|
|
211
|
+
## Development
|
|
212
|
+
|
|
213
|
+
### Commands
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
npm install # Install dependencies
|
|
217
|
+
npm run typecheck # Type check with tsc
|
|
218
|
+
npm run lint # Lint with ESLint
|
|
219
|
+
npm run build # Build TypeScript
|
|
220
|
+
npm start # Run server
|
|
221
|
+
npm test # Run tests
|
|
222
|
+
npm run test:watch # Run tests in watch mode
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
CI runs typecheck, lint, build and test.
|
|
226
|
+
|
|
227
|
+
### Testing
|
|
228
|
+
|
|
229
|
+
The project includes comprehensive tests covering:
|
|
230
|
+
|
|
231
|
+
- Environment variable handling
|
|
232
|
+
- Crawl options building
|
|
233
|
+
- Result formatting (Markdown, HTML, JSON)
|
|
234
|
+
- Error handling
|
|
235
|
+
- API integration
|
|
236
|
+
|
|
237
|
+
Run tests:
|
|
238
|
+
```bash
|
|
239
|
+
npm test
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## Architecture
|
|
243
|
+
|
|
244
|
+
```
|
|
245
|
+
src/
|
|
246
|
+
├── index.ts # Main MCP server implementation
|
|
247
|
+
│
|
|
248
|
+
├── API Layer
|
|
249
|
+
│ ├── initiateCrawl() # POST to /crawl endpoint
|
|
250
|
+
│ ├── waitForCrawl() # Poll for job completion
|
|
251
|
+
│ └── getCrawlResults() # Fetch final results
|
|
252
|
+
│
|
|
253
|
+
├── Formatters
|
|
254
|
+
│ ├── formatMarkdownResult()
|
|
255
|
+
│ ├── formatHtmlResult()
|
|
256
|
+
│ └── formatJsonResult()
|
|
257
|
+
│
|
|
258
|
+
└── MCP Handlers
|
|
259
|
+
├── ListToolsRequestSchema # Tool registration
|
|
260
|
+
└── CallToolRequestSchema # Tool execution
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## Cloudflare Limits
|
|
264
|
+
|
|
265
|
+
- **Max crawl duration**: 7 days
|
|
266
|
+
- **Results available**: 14 days after completion
|
|
267
|
+
- **Max pages per job**: 100,000
|
|
268
|
+
- **Free plan**: 10 minutes of browser time per day
|
|
269
|
+
|
|
270
|
+
See [Cloudflare Browser Rendering Limits](https://developers.cloudflare.com/browser-rendering/limits/) for details.
|
|
271
|
+
|
|
272
|
+
## Troubleshooting
|
|
273
|
+
|
|
274
|
+
### Crawl returns no results
|
|
275
|
+
|
|
276
|
+
- Check `robots.txt` blocking (use `render: false` to bypass)
|
|
277
|
+
- Verify `includePatterns` match actual URLs
|
|
278
|
+
- Try increasing `depth` or disabling pattern filters
|
|
279
|
+
|
|
280
|
+
### Job cancelled due to limits
|
|
281
|
+
|
|
282
|
+
- Upgrade to Workers Paid plan
|
|
283
|
+
- Use `render: false` for static content
|
|
284
|
+
- Reduce `limit` parameter
|
|
285
|
+
|
|
286
|
+
### Authentication errors
|
|
287
|
+
|
|
288
|
+
- Verify API Token has Browser Rendering permissions
|
|
289
|
+
- Confirm Account ID is correct
|
|
290
|
+
|
|
291
|
+
## License
|
|
292
|
+
|
|
293
|
+
MIT License - see [LICENSE](LICENSE) file.
|
|
294
|
+
|
|
295
|
+
## Contributing
|
|
296
|
+
|
|
297
|
+
Contributions are welcome! Please read the contributing guidelines in the repository (https://github.com/lukaszraczylo/cloudflare-crawl-mcp) before submitting a pull request.
|
|
298
|
+
|
|
299
|
+
## Support
|
|
300
|
+
|
|
301
|
+
- Open an issue at https://github.com/lukaszraczylo/cloudflare-crawl-mcp/issues
|
|
302
|
+
- Check Cloudflare's [Browser Rendering Docs](https://developers.cloudflare.com/browser-rendering/) for API details
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
2
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
3
|
+
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
|
|
4
|
+
// Root of the Cloudflare v4 REST API; all requests are built from this base.
const API_BASE = "https://api.cloudflare.com/client/v4";
// Maximum attempts made by fetchWithRetry() for rate-limited requests.
const MAX_RETRIES = 3;
// Default minimum spacing between consecutive API requests, in milliseconds.
const RATE_LIMIT_DELAY_MS = 10000;
// Mutable client-side rate-limiter state (shared across all requests).
let lastRequestTime = 0; // timestamp (ms) of the most recent request
let requestCount = 0; // requests issued in the current 60s window
let windowStart = Date.now(); // start (ms) of the current 60s window
|
10
|
+
/**
 * Read a required environment variable.
 * @param {string} key - Name of the environment variable.
 * @returns {string} The variable's value.
 * @throws {Error} If the variable is unset or empty.
 */
function getEnv(key) {
    const value = process.env[key];
    if (value) {
        return value;
    }
    throw new Error(`Missing required environment variable: ${key}`);
}
|
|
17
|
+
/**
 * Client-side rate limiter: keeps API requests within the per-minute budget
 * (CF_RATE_LIMIT, default 6/min for the Free plan) and spaces consecutive
 * requests evenly across the window.
 *
 * Fixes over the previous version:
 *  - The inter-request spacing is derived from the configured rate instead of
 *    a fixed 10s delay, which silently capped throughput at 6 req/min even
 *    when CF_RATE_LIMIT was raised (e.g. 600 on the Paid plan). The default
 *    is unchanged: 60000 / 6 = 10000 ms.
 *  - The clock is re-read after sleeping out the window, so the spacing
 *    calculation no longer uses a stale timestamp.
 *
 * Mutates module-level state: lastRequestTime, requestCount, windowStart.
 * @returns {Promise<void>} Resolves when it is safe to issue the request.
 */
async function enforceRateLimit() {
    const windowDuration = 60000; // 1-minute rate-limit window
    // `||` (not `??`) so an empty string also falls back to the default.
    const requestsPerMinute = Number.parseInt(process.env.CF_RATE_LIMIT || "6", 10);
    // Even spacing that actually honors the configured budget.
    const minSpacingMs = Math.ceil(windowDuration / Math.max(requestsPerMinute, 1));
    let now = Date.now();
    if (now - windowStart >= windowDuration) {
        // Window elapsed: start a fresh counting window.
        requestCount = 0;
        windowStart = now;
    }
    if (requestCount >= requestsPerMinute) {
        // Budget exhausted: sleep until the current window expires.
        const waitTime = windowDuration - (now - windowStart);
        console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
        await new Promise((resolve) => setTimeout(resolve, waitTime));
        requestCount = 0;
        windowStart = Date.now();
        now = windowStart; // refresh the clock after sleeping (was stale before)
    }
    const timeSinceLastRequest = now - lastRequestTime;
    if (requestCount > 0 && timeSinceLastRequest < minSpacingMs) {
        // Pace requests evenly instead of bursting at the window start.
        await new Promise((resolve) => setTimeout(resolve, minSpacingMs - timeSinceLastRequest));
    }
    lastRequestTime = Date.now();
    requestCount++;
}
|
|
40
|
+
/**
 * Run an async operation, retrying only on Cloudflare rate-limit errors.
 * Any other error — and a rate-limit error on the final attempt — is
 * rethrown immediately.
 * @param {() => Promise<any>} fn - Async operation to execute.
 * @param {number} [retries] - Maximum number of attempts.
 * @returns {Promise<any>} Result of the first successful attempt.
 */
async function fetchWithRetry(fn, retries = MAX_RETRIES) {
    let lastError = null;
    for (let attempt = 0; attempt < retries; attempt += 1) {
        try {
            return await fn();
        }
        catch (error) {
            lastError = error;
            const message = error.message || "";
            const rateLimited = message.includes("429") || message.includes("Rate limit");
            const finalAttempt = attempt === retries - 1;
            if (!rateLimited || finalAttempt) {
                throw error;
            }
            // Honor an explicit Retry-After hint embedded in the error
            // message; otherwise fall back to capped exponential backoff.
            const hint = message.match(/Retry-After[:\s]*(\d+)/i);
            const delay = hint
                ? Number.parseInt(hint[1], 10) * 1000
                : Math.min(1000 * 2 ** attempt, 30000);
            console.error(`Rate limited. Retrying in ${delay}ms...`);
            await new Promise((resolve) => setTimeout(resolve, delay));
        }
    }
    throw lastError;
}
|
|
64
|
+
/**
 * Start a crawl job via the Cloudflare Browser Rendering REST API.
 * Applies the client-side rate limiter, then POSTs to
 * /accounts/{accountId}/browser-rendering/crawl with retry-on-429 handling.
 * @param {string} accountId - Cloudflare account ID.
 * @param {string} apiToken - Bearer token with Browser Rendering permissions.
 * @param {object} options - Crawl options (url, limit, depth, formats, render,
 *   maxAge, source, options); defaults are applied below for missing fields.
 * @returns {Promise<string>} The Cloudflare crawl job ID.
 * @throws {Error} On HTTP failure (the message embeds status, body, and any
 *   Retry-After header so fetchWithRetry can parse the backoff hint) or when
 *   the API envelope reports success: false.
 */
async function initiateCrawl(accountId, apiToken, options) {
    // Wait for the per-minute budget before touching the network.
    await enforceRateLimit();
    return fetchWithRetry(async () => {
        const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl`, {
            method: "POST",
            headers: {
                Authorization: `Bearer ${apiToken}`,
                "Content-Type": "application/json",
            },
            // `??` keeps explicit falsy values (e.g. render: false) intact
            // while filling in server-friendly defaults for omitted fields.
            body: JSON.stringify({
                url: options.url,
                limit: options.limit ?? 10,
                depth: options.depth ?? 1,
                formats: options.formats ?? ["markdown"],
                render: options.render ?? true,
                maxAge: options.maxAge,
                source: options.source ?? "all",
                options: options.options ?? {},
            }),
        });
        if (!response.ok) {
            const error = await response.text();
            // Surface Retry-After in the message: fetchWithRetry regex-parses
            // it from the error string to size its backoff.
            const retryAfter = response.headers.get("Retry-After");
            const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
            throw new Error(errorMsg);
        }
        const data = await response.json();
        if (!data.success) {
            // HTTP 200 but API-level failure (Cloudflare envelope format).
            throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
        }
        return data.result.id;
    });
}
|
|
97
|
+
/**
 * Poll a crawl job until it leaves the "running" state.
 * Defaults give a 10-minute polling budget (120 attempts x 5s).
 * @param {string} accountId - Cloudflare account ID.
 * @param {string} apiToken - Bearer token with Browser Rendering permissions.
 * @param {string} jobId - Crawl job ID returned by initiateCrawl().
 * @param {number} [maxAttempts] - Maximum number of status checks.
 * @param {number} [delayMs] - Delay between status checks, in ms.
 * @returns {Promise<object>} The job's result payload once it is no longer running.
 * @throws {Error} On a failed status request or when the polling budget runs out.
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
    // limit=1 keeps the polling response small; full records are in the
    // final payload once the job settles.
    const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;
    let attempt = 0;
    while (attempt < maxAttempts) {
        const response = await fetch(statusUrl, {
            headers: {
                Authorization: `Bearer ${apiToken}`,
            },
        });
        if (!response.ok) {
            const body = await response.text();
            throw new Error(`Failed to check crawl status: ${response.status} ${body}`);
        }
        const payload = await response.json();
        // Any non-running status (completed, errored, cancelled_*) is terminal
        // from this function's point of view; the caller classifies it.
        if (payload.result.status !== "running") {
            return payload.result;
        }
        await new Promise((resolve) => setTimeout(resolve, delayMs));
        attempt += 1;
    }
    throw new Error("Crawl job did not complete within timeout");
}
|
|
117
|
+
/**
 * Assemble the request payload for initiateCrawl() from raw tool arguments.
 * Top-level crawl settings stay flat; link-following filters are nested under
 * `options` as the Cloudflare API expects.
 * @param {object} args - Raw tool arguments (url, limit, depth, render, filters).
 * @param {string[]} formats - Output formats to request (e.g. ["markdown"]).
 * @returns {object} Crawl options in the shape initiateCrawl() consumes.
 */
function buildCrawlOptions(args, formats) {
    const {
        url,
        limit,
        depth,
        render,
        includeExternalLinks,
        includeSubdomains,
        includePatterns,
        excludePatterns,
    } = args;
    return {
        url,
        limit,
        depth,
        formats,
        render,
        options: {
            includeExternalLinks,
            includeSubdomains,
            includePatterns,
            excludePatterns,
        },
    };
}
|
|
132
|
+
/**
 * Render completed crawl records as a single Markdown document.
 * Skips records whose per-page status is not "completed".
 * @param {object} result - Crawl result with `records` and `total`.
 * @returns {string} A summary line followed by one Markdown section per page.
 */
function formatMarkdownResult(result) {
    const done = (result.records || []).filter((record) => record.status === "completed");
    const sections = [];
    for (const record of done) {
        // Fall back to the URL when the page exposed no title.
        const heading = record.metadata?.title || record.url;
        sections.push(`## ${heading}\n\nURL: ${record.url}\n\n${record.markdown || ""}\n\n---\n`);
    }
    return `Crawl completed: ${done.length} of ${result.total} pages crawled successfully.\n\n${sections.join("\n")}`;
}
|
|
143
|
+
/**
 * Render completed crawl records as HTML <article> sections.
 * Skips records whose per-page status is not "completed".
 * @param {object} result - Crawl result with `records` and `total`.
 * @returns {string} A summary line followed by one <article> per page.
 */
function formatHtmlResult(result) {
    const records = result.records || [];
    // Only pages that finished successfully appear in the output.
    const completedRecords = records.filter((r) => r.status === "completed");
    const content = completedRecords
        .map((record) => {
            // Fall back to the URL when the page exposed no title.
            const title = record.metadata?.title || record.url;
            // NOTE(review): title and html are interpolated without escaping;
            // the content originates from crawled pages — confirm downstream
            // consumers treat this output as untrusted HTML.
            return `<article>\n  <h2>${title}</h2>\n  <p>Source: <a href="${record.url}">${record.url}</a></p>\n  <div class="content">${record.html || ""}</div>\n</article>\n`;
        })
        .join("\n");
    return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
|
|
154
|
+
/**
 * Render completed crawl records as a pretty-printed JSON string.
 * Skips records whose per-page status is not "completed".
 * @param {object} result - Crawl result with `records`, `total`, and `status`.
 * @returns {string} JSON with a `summary` object and a `pages` array.
 */
function formatJsonResult(result) {
    const pages = (result.records || [])
        .filter((record) => record.status === "completed")
        .map((record) => ({
            url: record.url,
            title: record.metadata?.title,
            status: record.metadata?.status,
            markdown: record.markdown,
            html: record.html,
            json: record.json,
        }));
    const report = {
        summary: {
            total: result.total,
            completed: pages.length,
            status: result.status,
        },
        pages,
    };
    return JSON.stringify(report, null, 2);
}
|
|
174
|
+
/**
 * Build an MCP error response for a crawl job that ended in a failure state.
 * Known terminal statuses map to specific messages; anything else falls back
 * to a generic message that still includes the status and job ID.
 * @param {object} result - Crawl result; `result.status` names the failure.
 * @param {string} jobId - Cloudflare crawl job ID, included for follow-up.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 *   MCP tool error payload.
 */
function handleErrorResult(result, jobId) {
    const errorMessages = {
        errored: `Crawl job errored. Job ID: ${jobId}`,
        cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
        cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
        cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
    };
    // Fix: guard the lookup with Object.hasOwn so inherited Object.prototype
    // members (e.g. a status of "constructor") can never be mistaken for a
    // message; such statuses now correctly hit the generic fallback.
    const message = Object.hasOwn(errorMessages, result.status)
        ? errorMessages[result.status]
        : `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
    return {
        content: [{ type: "text", text: message }],
        isError: true,
    };
}
|
|
187
|
+
// MCP server instance: advertises only the `tools` capability; the request
// handlers are registered further down in this file.
// NOTE(review): this version string (1.0.0) does not match the published
// package version (0.3.1) — confirm which is intended.
const server = new Server({
    name: "cloudflare-crawl-mcp",
    version: "1.0.0",
}, {
    capabilities: {
        tools: {},
    },
});
|
|
195
|
+
// JSON Schema shared by all three crawl tools; `url` is the only required
// argument. Defaults mentioned in the descriptions are applied server-side
// (see initiateCrawl), not by this schema.
const baseToolSchema = {
    type: "object",
    properties: {
        url: {
            type: "string",
            description: "The starting URL to crawl",
        },
        limit: {
            type: "number",
            description: "Maximum number of pages to crawl (default: 10, max: 100000)",
        },
        depth: {
            type: "number",
            description: "Maximum link depth to crawl from the starting URL (default: 1)",
        },
        includeSubdomains: {
            type: "boolean",
            description: "If true, follows links to subdomains of the starting URL (default: false)",
        },
        includeExternalLinks: {
            type: "boolean",
            description: "If true, follows links to external domains (default: false)",
        },
        includePatterns: {
            type: "array",
            items: { type: "string" },
            description: "Only visits URLs that match one of these wildcard patterns",
        },
        excludePatterns: {
            type: "array",
            items: { type: "string" },
            description: "Does not visit URLs that match any of these wildcard patterns",
        },
        render: {
            type: "boolean",
            description: "If false, does a fast HTML fetch without executing JavaScript (default: true)",
        },
    },
    required: ["url"],
};
|
|
235
|
+
const RATE_LIMIT_INFO = `
|
|
236
|
+
---
|
|
237
|
+
**Cloudflare Browser Rendering Limits:**
|
|
238
|
+
|
|
239
|
+
| Plan | Concurrent Browsers | Browser Time | REST API Rate |
|
|
240
|
+
|------|---------------------|--------------|---------------|
|
|
241
|
+
| Free | 3 | 10 min/day | 6 req/min |
|
|
242
|
+
| Paid | 10 | 10 hours/month | 600 req/min |
|
|
243
|
+
|
|
244
|
+
**Environment Variables:**
|
|
245
|
+
- CF_RATE_LIMIT: Override REST API requests per minute (default: 6 for Free, 600 for Paid)
|
|
246
|
+
|
|
247
|
+
**Tips:**
|
|
248
|
+
- Use \`render: false\` for static content to avoid browser time usage
|
|
249
|
+
- Use \`maxAge\` to cache results and reduce API calls
|
|
250
|
+
- Set \`limit\` and \`depth\` appropriately to stay within limits
|
|
251
|
+
---`;
|
|
252
|
+
// Advertise the three crawl tools (one per output format). All share the
// same input schema, and each description embeds the Cloudflare limits
// footer so clients can plan usage.
server.setRequestHandler(ListToolsRequestSchema, async () => {
    return {
        tools: [
            {
                name: "crawl_url_markdown",
                description: `Crawl a website using Cloudflare Browser Rendering and return content in Markdown format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
                // Consistency fix: the previous spread-copy of baseToolSchema
                // (and a re-spread of its properties) produced an identical
                // schema object; use the shared schema directly, as the other
                // two tools already do.
                inputSchema: baseToolSchema,
            },
            {
                name: "crawl_url_html",
                description: `Crawl a website using Cloudflare Browser Rendering and return content in HTML format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
                inputSchema: baseToolSchema,
            },
            {
                name: "crawl_url_json",
                description: `Crawl a website using Cloudflare Browser Rendering and return content in JSON format. This uses Workers AI for data extraction. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
                inputSchema: baseToolSchema,
            },
        ],
    };
});
|
|
278
|
+
// Tool dispatcher: validates the tool name, builds crawl options from the
// tool arguments, runs the crawl end-to-end (initiate -> poll -> format),
// and maps every failure onto an MCP error response instead of throwing.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const { name, arguments: args } = request.params;
    // Tool names encode the output format: crawl_url_<format>.
    const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
    if (!toolMatch) {
        return {
            content: [{ type: "text", text: `Unknown tool: ${name}` }],
            isError: true,
        };
    }
    const format = toolMatch[1];
    // Formats requested from the Cloudflare API for each tool.
    const formatMap = {
        markdown: ["markdown"],
        html: ["html"],
        json: ["json"],
    };
    const formats = formatMap[format];
    try {
        // Credentials are resolved per-call so a missing variable surfaces
        // as a tool error rather than crashing the server at startup.
        const apiToken = getEnv("CF_API_TOKEN");
        const accountId = getEnv("CF_ACCOUNT_ID");
        // Whitelist the supported arguments; anything else the client sent
        // is silently dropped.
        const crawlArgs = {
            url: args.url,
            limit: args.limit,
            depth: args.depth,
            includeSubdomains: args.includeSubdomains,
            includeExternalLinks: args.includeExternalLinks,
            includePatterns: args.includePatterns,
            excludePatterns: args.excludePatterns,
            render: args.render,
        };
        const options = buildCrawlOptions(crawlArgs, formats);
        const jobId = await initiateCrawl(accountId, apiToken, options);
        // Blocks until the job leaves "running" (default budget ~10 min).
        const result = await waitForCrawl(accountId, apiToken, jobId);
        // Failure statuses get a dedicated error response with the job ID.
        const terminalStatuses = ["errored", "cancelled_due_to_timeout", "cancelled_due_to_limits", "cancelled_by_user"];
        if (terminalStatuses.includes(result.status)) {
            return handleErrorResult(result, jobId);
        }
        const formatterMap = {
            markdown: formatMarkdownResult,
            html: formatHtmlResult,
            json: formatJsonResult,
        };
        const formattedContent = formatterMap[format](result);
        return {
            content: [{ type: "text", text: formattedContent }],
        };
    }
    catch (error) {
        // Convert any thrown error (missing env var, HTTP failure, polling
        // timeout) into an MCP error payload for the client.
        const message = error instanceof Error ? error.message : String(error);
        return {
            content: [{ type: "text", text: `Error: ${message}` }],
            isError: true,
        };
    }
});
|
|
332
|
+
/**
 * Entry point: attach the MCP server to a stdio transport and start serving.
 * @returns {Promise<void>} Resolves once the transport is connected.
 */
async function main() {
    await server.connect(new StdioServerTransport());
}

main().catch((error) => {
    console.error("Server error:", error);
    process.exit(1);
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|